BinKhoaLe1812 committed
Commit 58c8a97 · verified · 1 Parent(s): 1b4459c

Upd multilingual search strat

search/__init__.py ADDED
@@ -0,0 +1,21 @@
+ # Search package
+ from .search import WebSearcher, search_web, search_web_with_content, search_medical, search_multilingual_medical
+ from .coordinator import SearchCoordinator
+ from .engines import DuckDuckGoEngine, MedicalSearchEngine, MultilingualMedicalEngine
+ from .extractors import ContentExtractor
+ from .processors import MedicalSearchProcessor, LanguageProcessor
+
+ __all__ = [
+     'WebSearcher',
+     'search_web',
+     'search_web_with_content',
+     'search_medical',
+     'search_multilingual_medical',
+     'SearchCoordinator',
+     'DuckDuckGoEngine',
+     'MedicalSearchEngine',
+     'MultilingualMedicalEngine',
+     'ContentExtractor',
+     'MedicalSearchProcessor',
+     'LanguageProcessor'
+ ]
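
A minimal usage sketch of the public API (not part of the commit; it assumes the `search` package is importable and its third-party dependencies, requests, beautifulsoup4, and langdetect, are installed):

    # Hypothetical driver script; live network access is required at run time.
    from search import SearchCoordinator

    coordinator = SearchCoordinator(max_workers=3)
    summary, url_mapping = coordinator.search("migraine treatment", num_results=10)
    print(summary)
    for idx, url in url_mapping.items():
        print(f"[{idx}] {url}")
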
search/coordinator.py ADDED
@@ -0,0 +1,203 @@
+ import logging
+ from typing import List, Dict, Tuple
+ import time
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+
+ from .engines.duckduckgo import DuckDuckGoEngine
+ from .engines.medical import MedicalSearchEngine
+ from .engines.multilingual import MultilingualMedicalEngine
+ from .extractors.content import ContentExtractor
+ from .processors.medical import MedicalSearchProcessor
+ from .processors.language import LanguageProcessor
+
+ logger = logging.getLogger(__name__)
+
+ class SearchCoordinator:
+     """Coordinate multiple search strategies for comprehensive medical information"""
+
+     def __init__(self, max_workers: int = 3):
+         self.max_workers = max_workers
+
+         # Initialize search engines
+         self.duckduckgo_engine = DuckDuckGoEngine()
+         self.medical_engine = MedicalSearchEngine()
+         self.multilingual_engine = MultilingualMedicalEngine()
+
+         # Initialize processors
+         self.content_extractor = ContentExtractor()
+         self.medical_processor = MedicalSearchProcessor()
+         self.language_processor = LanguageProcessor()
+
+         # Search strategies
+         self.strategies = [
+             self._search_multilingual,
+             self._search_duckduckgo,
+             self._search_medical_sources
+         ]
+
+     def search(self, query: str, num_results: int = 10, target_language: str = None) -> Tuple[str, Dict[int, str]]:
+         """Execute comprehensive multilingual search with multiple strategies"""
+         logger.info(f"Starting comprehensive multilingual search for: {query}")
+
+         # Detect and enhance query for multiple languages
+         enhanced_queries = self.language_processor.enhance_query(query, target_language)
+         logger.info(f"Enhanced queries: {list(enhanced_queries.keys())}")
+
+         # Execute search strategies in parallel
+         all_results = []
+
+         with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+             # Submit search tasks for each language
+             future_to_strategy = {}
+
+             for lang, enhanced_query in enhanced_queries.items():
+                 for strategy in self.strategies:
+                     future = executor.submit(strategy, enhanced_query, num_results // len(enhanced_queries), lang)
+                     future_to_strategy[future] = f"{strategy.__name__}_{lang}"
+
+             # Collect results
+             for future in as_completed(future_to_strategy):
+                 strategy_name = future_to_strategy[future]
+                 try:
+                     results = future.result()
+                     if results:
+                         all_results.extend(results)
+                         logger.info(f"{strategy_name} found {len(results)} results")
+                 except Exception as e:
+                     logger.error(f"{strategy_name} failed: {e}")
+
+         # Remove duplicates and filter by language preference
+         unique_results = self._remove_duplicates(all_results)
+         if target_language:
+             unique_results = self.language_processor.filter_by_language(unique_results, target_language)
+
+         logger.info(f"Total unique results: {len(unique_results)}")
+
+         # Extract content from URLs
+         enriched_results = self._enrich_with_content(unique_results)
+
+         # Process results into comprehensive summary
+         summary, url_mapping = self.medical_processor.process_results(enriched_results, query)
+
+         logger.info(f"Multilingual search completed: {len(url_mapping)} sources processed")
+         return summary, url_mapping
+
+     def _search_multilingual(self, query: str, num_results: int, language: str = None) -> List[Dict]:
+         """Search using multilingual medical engine"""
+         try:
+             if language:
+                 results = self.multilingual_engine.search_by_language(query, language, num_results)
+             else:
+                 results = self.multilingual_engine.search(query, num_results)
+             return results
+         except Exception as e:
+             logger.error(f"Multilingual search failed: {e}")
+             return []
+
+     def _search_duckduckgo(self, query: str, num_results: int, language: str = None) -> List[Dict]:
+         """Search using DuckDuckGo engine"""
+         try:
+             results = self.duckduckgo_engine.search(query, num_results)
+             return results
+         except Exception as e:
+             logger.error(f"DuckDuckGo search failed: {e}")
+             return []
+
+     def _search_medical_sources(self, query: str, num_results: int, language: str = None) -> List[Dict]:
+         """Search using medical sources engine"""
+         try:
+             results = self.medical_engine.search(query, num_results)
+             return results
+         except Exception as e:
+             logger.error(f"Medical sources search failed: {e}")
+             return []
+
+     def _remove_duplicates(self, results: List[Dict]) -> List[Dict]:
+         """Remove duplicate results based on URL"""
+         seen_urls = set()
+         unique_results = []
+
+         for result in results:
+             url = result.get('url', '')
+             if url and url not in seen_urls:
+                 seen_urls.add(url)
+                 unique_results.append(result)
+
+         return unique_results
+
+     def _enrich_with_content(self, results: List[Dict]) -> List[Dict]:
+         """Enrich results with extracted content"""
+         enriched_results = []
+
+         # Extract content in parallel
+         with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+             # Submit content extraction tasks
+             future_to_result = {
+                 executor.submit(self.content_extractor.extract, result['url']): result
+                 for result in results
+             }
+
+             # Collect enriched results
+             for future in as_completed(future_to_result):
+                 original_result = future_to_result[future]
+                 try:
+                     content = future.result()
+                     if content:
+                         enriched_result = original_result.copy()
+                         enriched_result['content'] = content
+                         enriched_results.append(enriched_result)
+                 except Exception as e:
+                     logger.warning(f"Content extraction failed for {original_result['url']}: {e}")
+                     # Still include result without content
+                     enriched_results.append(original_result)
+
+         return enriched_results
+
+     def quick_search(self, query: str, num_results: int = 5) -> List[Dict]:
+         """Quick search for basic results without content extraction"""
+         logger.info(f"Quick search for: {query}")
+
+         # Use only DuckDuckGo for speed
+         results = self.duckduckgo_engine.search(query, num_results)
+
+         # Remove duplicates
+         unique_results = self._remove_duplicates(results)
+
+         logger.info(f"Quick search completed: {len(unique_results)} results")
+         return unique_results
+
+     def medical_focus_search(self, query: str, num_results: int = 8) -> Tuple[str, Dict[int, str]]:
+         """Medical-focused search with enhanced processing"""
+         logger.info(f"Medical focus search for: {query}")
+
+         # Use medical engine primarily
+         medical_results = self.medical_engine.search(query, num_results)
+
+         # Add some general results for context
+         general_results = self.duckduckgo_engine.search(query, 3)
+
+         # Combine and deduplicate
+         all_results = self._remove_duplicates(medical_results + general_results)
+
+         # Enrich with content
+         enriched_results = self._enrich_with_content(all_results)
+
+         # Process with medical focus
+         summary, url_mapping = self.medical_processor.process_results(enriched_results, query)
+
+         logger.info(f"Medical focus search completed: {len(url_mapping)} sources")
+         return summary, url_mapping
+
+     def multilingual_medical_search(self, query: str, num_results: int = 10, target_language: str = None) -> Tuple[str, Dict[int, str]]:
+         """Comprehensive multilingual medical search"""
+         logger.info(f"Multilingual medical search for: {query} (target: {target_language})")
+
+         # Detect source language
+         source_language = self.language_processor.detect_language(query)
+         logger.info(f"Detected source language: {source_language}")
+
+         # Use multilingual search with language preference
+         summary, url_mapping = self.search(query, num_results, target_language)
+
+         logger.info(f"Multilingual medical search completed: {len(url_mapping)} sources")
+         return summary, url_mapping
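
The coordinator exposes three entry points with different cost/coverage trade-offs. A hedged sketch of how they might be called (illustrative queries; results depend on live network access):

    from search.coordinator import SearchCoordinator

    coord = SearchCoordinator(max_workers=3)

    # Fast path: DuckDuckGo only, no content extraction
    links = coord.quick_search("dengue fever symptoms", num_results=5)

    # Curated medical sources plus a few general results, with summarization
    summary, sources = coord.medical_focus_search("dengue fever symptoms")

    # Full multilingual pipeline with a Vietnamese language preference
    summary_vi, sources_vi = coord.multilingual_medical_search(
        "sốt xuất huyết", target_language="vi"
    )
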
search/engines/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from .duckduckgo import DuckDuckGoEngine
+ from .medical import MedicalSearchEngine
+ from .multilingual import MultilingualMedicalEngine
+
+ __all__ = ['DuckDuckGoEngine', 'MedicalSearchEngine', 'MultilingualMedicalEngine']
search/engines/duckduckgo.py ADDED
@@ -0,0 +1,200 @@
+ import requests
+ from bs4 import BeautifulSoup
+ import logging
+ from typing import List, Dict
+ import time
+
+ logger = logging.getLogger(__name__)
+
+ class DuckDuckGoEngine:
+     """DuckDuckGo search engine with multiple strategies"""
+
+     def __init__(self, timeout: int = 15):
+         self.session = requests.Session()
+         self.session.headers.update({
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+             'Accept-Language': 'en-US,en;q=0.5',
+             'Accept-Encoding': 'gzip, deflate',
+             'Connection': 'keep-alive',
+             'Upgrade-Insecure-Requests': '1',
+         })
+         self.timeout = timeout
+
+     def search(self, query: str, num_results: int = 10) -> List[Dict]:
+         """Search with multiple DuckDuckGo strategies"""
+         results = []
+
+         # Strategy 1: HTML interface
+         html_results = self._search_html(query, num_results)
+         if html_results:
+             results.extend(html_results)
+             logger.info(f"DuckDuckGo HTML found {len(html_results)} results")
+
+         # Strategy 2: Instant Answer API
+         if len(results) < num_results:
+             api_results = self._search_api(query, num_results - len(results))
+             if api_results:
+                 results.extend(api_results)
+                 logger.info(f"DuckDuckGo API found {len(api_results)} results")
+
+         # Strategy 3: Lite interface (mobile-friendly)
+         if len(results) < num_results:
+             lite_results = self._search_lite(query, num_results - len(results))
+             if lite_results:
+                 results.extend(lite_results)
+                 logger.info(f"DuckDuckGo Lite found {len(lite_results)} results")
+
+         return results[:num_results]
+
+     def _search_html(self, query: str, num_results: int) -> List[Dict]:
+         """Search using DuckDuckGo HTML interface"""
+         try:
+             url = "https://html.duckduckgo.com/html/"
+             params = {
+                 'q': query,
+                 'kl': 'us-en',
+                 's': '0',      # Start from first result
+                 'dc': '1',     # Disable auto-complete
+                 'v': 'l',      # Lite version
+                 'o': 'json',   # JSON output
+                 'api': 'd.js'  # API format
+             }
+
+             response = self.session.get(url, params=params, timeout=self.timeout)
+             response.raise_for_status()
+
+             soup = BeautifulSoup(response.content, 'html.parser')
+             results = []
+
+             # Multiple selectors for different DDG layouts
+             selectors = [
+                 'a.result__a',
+                 'a[data-testid="result-title-a"]',
+                 '.result__title a',
+                 '.web-result a',
+                 '.result a',
+                 'a[href*="http"]:not([href*="duckduckgo.com"])'
+             ]
+
+             for selector in selectors:
+                 links = soup.select(selector)
+                 if links:
+                     logger.info(f"Using selector: {selector} - found {len(links)} links")
+                     break
+
+             for link in links[:num_results]:
+                 try:
+                     href = link.get('href')
+                     if not href or href.startswith('#') or 'duckduckgo.com' in href:
+                         continue
+
+                     # Clean up DDG redirect URLs
+                     if href.startswith('/l/?uddg='):
+                         import urllib.parse
+                         href = urllib.parse.unquote(href.split('uddg=')[1])
+
+                     title = link.get_text(strip=True)
+                     if title and href.startswith('http'):
+                         results.append({
+                             'url': href,
+                             'title': title,
+                             'source': 'duckduckgo_html'
+                         })
+                 except Exception as e:
+                     logger.debug(f"Error parsing link: {e}")
+                     continue
+
+             return results
+
+         except Exception as e:
+             logger.warning(f"DuckDuckGo HTML search failed: {e}")
+             return []
+
+     def _search_api(self, query: str, num_results: int) -> List[Dict]:
+         """Search using DuckDuckGo Instant Answer API"""
+         try:
+             url = "https://api.duckduckgo.com/"
+             params = {
+                 'q': query,
+                 'format': 'json',
+                 'no_html': '1',
+                 'skip_disambig': '1',
+                 't': 'MedicalChatbot'
+             }
+
+             response = self.session.get(url, params=params, timeout=self.timeout)
+             response.raise_for_status()
+             data = response.json()
+
+             results = []
+
+             # Abstract result
+             if data.get('AbstractURL') and data.get('Abstract'):
+                 results.append({
+                     'url': data['AbstractURL'],
+                     'title': data.get('Heading', query),
+                     'content': data.get('Abstract', ''),
+                     'source': 'duckduckgo_api'
+                 })
+
+             # Related topics
+             for topic in data.get('RelatedTopics', []):
+                 if len(results) >= num_results:
+                     break
+
+                 if isinstance(topic, dict) and topic.get('FirstURL'):
+                     text = topic.get('Text', '')
+                     title = text.split(' - ')[0] if ' - ' in text else text[:50]
+
+                     results.append({
+                         'url': topic['FirstURL'],
+                         'title': title,
+                         'content': text,
+                         'source': 'duckduckgo_api'
+                     })
+
+             return results
+
+         except Exception as e:
+             logger.warning(f"DuckDuckGo API search failed: {e}")
+             return []
+
+     def _search_lite(self, query: str, num_results: int) -> List[Dict]:
+         """Search using DuckDuckGo Lite interface"""
+         try:
+             url = "https://lite.duckduckgo.com/lite/"
+             params = {
+                 'q': query,
+                 'kl': 'us-en'
+             }
+
+             response = self.session.get(url, params=params, timeout=self.timeout)
+             response.raise_for_status()
+
+             soup = BeautifulSoup(response.content, 'html.parser')
+             results = []
+
+             # Lite interface selectors
+             links = soup.select('a[href*="http"]:not([href*="duckduckgo.com"])')
+
+             for link in links[:num_results]:
+                 try:
+                     href = link.get('href')
+                     title = link.get_text(strip=True)
+
+                     if href and title and href.startswith('http'):
+                         results.append({
+                             'url': href,
+                             'title': title,
+                             'source': 'duckduckgo_lite'
+                         })
+                 except Exception as e:
+                     logger.debug(f"Error parsing lite link: {e}")
+                     continue
+
+             return results
+
+         except Exception as e:
+             logger.warning(f"DuckDuckGo Lite search failed: {e}")
+             return []
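
The engine cascades through the HTML interface, the Instant Answer API, and the Lite interface until it has enough results. A standalone sketch (assumes requests and beautifulsoup4 are installed and DuckDuckGo is reachable):

    from search.engines.duckduckgo import DuckDuckGoEngine

    engine = DuckDuckGoEngine(timeout=10)
    for hit in engine.search("hypertension diet", num_results=5):
        # Each hit is a dict with at least 'url', 'title', and 'source'
        print(hit['source'], '|', hit['title'], '|', hit['url'])
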
search/engines/medical.py ADDED
@@ -0,0 +1,195 @@
+ import requests
+ from bs4 import BeautifulSoup
+ import logging
+ from typing import List, Dict
+ import time
+
+ logger = logging.getLogger(__name__)
+
+ class MedicalSearchEngine:
+     """Specialized medical search engine with curated sources"""
+
+     def __init__(self, timeout: int = 15):
+         self.session = requests.Session()
+         self.session.headers.update({
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+         })
+         self.timeout = timeout
+
+         # Curated medical sources
+         self.medical_sources = {
+             'mayo_clinic': {
+                 'base_url': 'https://www.mayoclinic.org',
+                 'search_url': 'https://www.mayoclinic.org/search/search-results',
+                 'domains': ['mayoclinic.org']
+             },
+             'webmd': {
+                 'base_url': 'https://www.webmd.com',
+                 'search_url': 'https://www.webmd.com/search/search_results/default.aspx',
+                 'domains': ['webmd.com']
+             },
+             'healthline': {
+                 'base_url': 'https://www.healthline.com',
+                 'search_url': 'https://www.healthline.com/search',
+                 'domains': ['healthline.com']
+             },
+             'medlineplus': {
+                 'base_url': 'https://medlineplus.gov',
+                 'search_url': 'https://medlineplus.gov/search',
+                 'domains': ['medlineplus.gov']
+             },
+             'nih': {
+                 'base_url': 'https://www.nih.gov',
+                 'search_url': 'https://search.nih.gov/search',
+                 'domains': ['nih.gov', 'nlm.nih.gov']
+             }
+         }
+
+     def search(self, query: str, num_results: int = 10) -> List[Dict]:
+         """Search medical sources for relevant information"""
+         results = []
+
+         # Strategy 1: Direct medical source searches
+         for source_name, source_config in self.medical_sources.items():
+             if len(results) >= num_results:
+                 break
+
+             source_results = self._search_medical_source(query, source_name, source_config)
+             results.extend(source_results)
+
+             # Add delay between requests
+             time.sleep(0.5)
+
+         # Strategy 2: Medical fallback sources
+         if len(results) < num_results:
+             fallback_results = self._get_fallback_sources(query, num_results - len(results))
+             results.extend(fallback_results)
+
+         return results[:num_results]
+
+     def _search_medical_source(self, query: str, source_name: str, source_config: Dict) -> List[Dict]:
+         """Search a specific medical source"""
+         try:
+             search_url = source_config.get('search_url')
+             if not search_url:
+                 return []
+
+             params = {
+                 'q': query,
+                 'query': query,
+                 'search': query
+             }
+
+             response = self.session.get(search_url, params=params, timeout=self.timeout)
+             response.raise_for_status()
+
+             soup = BeautifulSoup(response.content, 'html.parser')
+             results = []
+
+             # Source-specific selectors
+             selectors = self._get_source_selectors(source_name)
+
+             for selector in selectors:
+                 links = soup.select(selector)
+                 if links:
+                     logger.info(f"{source_name} found {len(links)} results with selector: {selector}")
+                     break
+
+             for link in links[:3]:  # Limit per source
+                 try:
+                     href = link.get('href')
+                     if not href:
+                         continue
+
+                     # Make absolute URL
+                     if href.startswith('/'):
+                         href = source_config['base_url'] + href
+
+                     title = link.get_text(strip=True)
+                     if title and href.startswith('http'):
+                         results.append({
+                             'url': href,
+                             'title': title,
+                             'source': source_name,
+                             'domain': source_config['domains'][0]
+                         })
+                 except Exception as e:
+                     logger.debug(f"Error parsing {source_name} link: {e}")
+                     continue
+
+             return results
+
+         except Exception as e:
+             logger.warning(f"Medical source {source_name} search failed: {e}")
+             return []
+
+     def _get_source_selectors(self, source_name: str) -> List[str]:
+         """Get CSS selectors for specific medical sources"""
+         selectors_map = {
+             'mayo_clinic': [
+                 'a[href*="/diseases-conditions/"]',
+                 'a[href*="/symptoms/"]',
+                 '.search-result a',
+                 '.result-title a'
+             ],
+             'webmd': [
+                 'a[href*="/default.htm"]',
+                 '.search-result a',
+                 '.result-title a',
+                 'a[href*="/content/"]'
+             ],
+             'healthline': [
+                 'a[href*="/health/"]',
+                 '.search-result a',
+                 '.result-title a',
+                 'a[href*="/conditions/"]'
+             ],
+             'medlineplus': [
+                 'a[href*="/healthtopics/"]',
+                 '.search-result a',
+                 '.result-title a'
+             ],
+             'nih': [
+                 'a[href*="/health/"]',
+                 '.search-result a',
+                 '.result-title a'
+             ]
+         }
+         return selectors_map.get(source_name, ['a[href*="http"]'])
+
+     def _get_fallback_sources(self, query: str, num_results: int) -> List[Dict]:
+         """Get fallback medical sources when direct search fails"""
+         fallback_sources = [
+             {
+                 'url': 'https://www.mayoclinic.org/diseases-conditions',
+                 'title': f'Mayo Clinic: {query}',
+                 'source': 'mayo_fallback',
+                 'domain': 'mayoclinic.org'
+             },
+             {
+                 'url': 'https://www.webmd.com/default.htm',
+                 'title': f'WebMD: {query}',
+                 'source': 'webmd_fallback',
+                 'domain': 'webmd.com'
+             },
+             {
+                 'url': 'https://www.healthline.com/health',
+                 'title': f'Healthline: {query}',
+                 'source': 'healthline_fallback',
+                 'domain': 'healthline.com'
+             },
+             {
+                 'url': 'https://medlineplus.gov/healthtopics.html',
+                 'title': f'MedlinePlus: {query}',
+                 'source': 'medlineplus_fallback',
+                 'domain': 'medlineplus.gov'
+             },
+             {
+                 'url': 'https://www.cdc.gov',
+                 'title': f'CDC: {query}',
+                 'source': 'cdc_fallback',
+                 'domain': 'cdc.gov'
+             }
+         ]
+
+         return fallback_sources[:num_results]
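
Used on its own, the curated engine returns at most three links per source and falls back to static landing pages when scraping yields too little. An illustrative call (live results depend on the sites' current markup matching the selectors above):

    from search.engines.medical import MedicalSearchEngine

    engine = MedicalSearchEngine()
    for hit in engine.search("type 2 diabetes", num_results=6):
        print(f"{hit['domain']:<20} {hit['title']}")
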
search/engines/multilingual.py ADDED
@@ -0,0 +1,320 @@
+ import requests
+ from bs4 import BeautifulSoup
+ import logging
+ from typing import List, Dict, Optional
+ import time
+ import re
+ from urllib.parse import urlparse, quote
+
+ logger = logging.getLogger(__name__)
+
+ class MultilingualMedicalEngine:
+     """Multilingual medical search engine supporting English, Vietnamese, and Chinese sources"""
+
+     def __init__(self, timeout: int = 15):
+         self.session = requests.Session()
+         self.session.headers.update({
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+             'Accept-Language': 'en-US,en;q=0.5,vi;q=0.3,zh-CN;q=0.3',
+             'Accept-Encoding': 'gzip, deflate',
+             'Connection': 'keep-alive',
+         })
+         self.timeout = timeout
+
+         # Comprehensive medical sources by language
+         self.medical_sources = {
+             'en': {
+                 'mayo_clinic': {
+                     'base_url': 'https://www.mayoclinic.org',
+                     'search_url': 'https://www.mayoclinic.org/search/search-results',
+                     'domains': ['mayoclinic.org'],
+                     'selectors': ['a[href*="/diseases-conditions/"]', 'a[href*="/symptoms/"]', '.search-result a']
+                 },
+                 'webmd': {
+                     'base_url': 'https://www.webmd.com',
+                     'search_url': 'https://www.webmd.com/search/search_results/default.aspx',
+                     'domains': ['webmd.com'],
+                     'selectors': ['a[href*="/default.htm"]', '.search-result a', 'a[href*="/content/"]']
+                 },
+                 'healthline': {
+                     'base_url': 'https://www.healthline.com',
+                     'search_url': 'https://www.healthline.com/search',
+                     'domains': ['healthline.com'],
+                     'selectors': ['a[href*="/health/"]', 'a[href*="/conditions/"]', '.search-result a']
+                 },
+                 'medlineplus': {
+                     'base_url': 'https://medlineplus.gov',
+                     'search_url': 'https://medlineplus.gov/search',
+                     'domains': ['medlineplus.gov'],
+                     'selectors': ['a[href*="/healthtopics/"]', '.search-result a']
+                 },
+                 'nih': {
+                     'base_url': 'https://www.nih.gov',
+                     'search_url': 'https://search.nih.gov/search',
+                     'domains': ['nih.gov', 'nlm.nih.gov'],
+                     'selectors': ['a[href*="/health/"]', '.search-result a']
+                 },
+                 'cdc': {
+                     'base_url': 'https://www.cdc.gov',
+                     'search_url': 'https://www.cdc.gov/search/index.html',
+                     'domains': ['cdc.gov'],
+                     'selectors': ['a[href*="/health/"]', '.search-result a']
+                 }
+             },
+             'vi': {
+                 'hello_bacsi': {
+                     'base_url': 'https://hellobacsi.com',
+                     'search_url': 'https://hellobacsi.com/tim-kiem',
+                     'domains': ['hellobacsi.com'],
+                     'selectors': ['a[href*="/suc-khoe/"]', 'a[href*="/benh/"]', '.search-result a', '.article-title a']
+                 },
+                 'alo_bacsi': {
+                     'base_url': 'https://alobacsi.com',
+                     'search_url': 'https://alobacsi.com/tim-kiem',
+                     'domains': ['alobacsi.com'],
+                     'selectors': ['a[href*="/suc-khoe/"]', 'a[href*="/benh/"]', '.search-result a']
+                 },
+                 'vinmec': {
+                     'base_url': 'https://www.vinmec.com',
+                     'search_url': 'https://www.vinmec.com/vi/tim-kiem',
+                     'domains': ['vinmec.com'],
+                     'selectors': ['a[href*="/suc-khoe/"]', 'a[href*="/benh/"]', '.search-result a']
+                 },
+                 'tam_anh': {
+                     'base_url': 'https://tamanhhospital.vn',
+                     'search_url': 'https://tamanhhospital.vn/tim-kiem',
+                     'domains': ['tamanhhospital.vn'],
+                     'selectors': ['a[href*="/suc-khoe/"]', 'a[href*="/benh/"]', '.search-result a']
+                 },
+                 'medlatec': {
+                     'base_url': 'https://medlatec.vn',
+                     'search_url': 'https://medlatec.vn/tim-kiem',
+                     'domains': ['medlatec.vn'],
+                     'selectors': ['a[href*="/suc-khoe/"]', 'a[href*="/benh/"]', '.search-result a']
+                 },
+                 'suckhoe_doisong': {
+                     'base_url': 'https://suckhoedoisong.vn',
+                     'search_url': 'https://suckhoedoisong.vn/tim-kiem',
+                     'domains': ['suckhoedoisong.vn'],
+                     'selectors': ['a[href*="/suc-khoe/"]', 'a[href*="/benh/"]', '.search-result a']
+                 },
+                 'vien_dinh_duong': {
+                     'base_url': 'https://viendinhduong.vn',
+                     'search_url': 'https://viendinhduong.vn/tim-kiem',
+                     'domains': ['viendinhduong.vn'],
+                     'selectors': ['a[href*="/dinh-duong/"]', 'a[href*="/suc-khoe/"]', '.search-result a']
+                 }
+             },
+             'zh': {
+                 'haodf': {
+                     'base_url': 'https://www.haodf.com',
+                     'search_url': 'https://www.haodf.com/search',
+                     'domains': ['haodf.com'],
+                     'selectors': ['a[href*="/jibing/"]', 'a[href*="/zixun/"]', '.search-result a']
+                 },
+                 'dxy': {
+                     'base_url': 'https://www.dxy.cn',
+                     'search_url': 'https://www.dxy.cn/search',
+                     'domains': ['dxy.cn'],
+                     'selectors': ['a[href*="/article/"]', 'a[href*="/jibing/"]', '.search-result a']
+                 },
+                 'chunyuyisheng': {
+                     'base_url': 'https://www.chunyuyisheng.com',
+                     'search_url': 'https://www.chunyuyisheng.com/search',
+                     'domains': ['chunyuyisheng.com'],
+                     'selectors': ['a[href*="/article/"]', 'a[href*="/jibing/"]', '.search-result a']
+                 },
+                 'xywy': {
+                     'base_url': 'https://www.xywy.com',
+                     'search_url': 'https://www.xywy.com/search',
+                     'domains': ['xywy.com'],
+                     'selectors': ['a[href*="/jibing/"]', 'a[href*="/article/"]', '.search-result a']
+                 },
+                 'jiankang': {
+                     'base_url': 'https://www.jiankang.com',
+                     'search_url': 'https://www.jiankang.com/search',
+                     'domains': ['jiankang.com'],
+                     'selectors': ['a[href*="/article/"]', 'a[href*="/jibing/"]', '.search-result a']
+                 },
+                 'familydoctor': {
+                     'base_url': 'https://www.familydoctor.com.cn',
+                     'search_url': 'https://www.familydoctor.com.cn/search',
+                     'domains': ['familydoctor.com.cn'],
+                     'selectors': ['a[href*="/article/"]', 'a[href*="/jibing/"]', '.search-result a']
+                 }
+             }
+         }
+
+     def search(self, query: str, num_results: int = 10, languages: List[str] = None) -> List[Dict]:
+         """Search across multiple languages and medical sources"""
+         if languages is None:
+             languages = ['en', 'vi', 'zh']
+
+         all_results = []
+
+         for lang in languages:
+             if lang in self.medical_sources:
+                 lang_results = self._search_language_sources(query, lang, num_results // len(languages))
+                 all_results.extend(lang_results)
+                 time.sleep(0.5)  # Be respectful to servers
+
+         # Remove duplicates and sort by relevance
+         unique_results = self._remove_duplicates(all_results)
+         return unique_results[:num_results]
+
+     def _search_language_sources(self, query: str, language: str, num_results: int) -> List[Dict]:
+         """Search sources for a specific language"""
+         results = []
+         sources = self.medical_sources.get(language, {})
+
+         for source_name, source_config in sources.items():
+             if len(results) >= num_results:
+                 break
+
+             source_results = self._search_source(query, source_name, source_config, language)
+             results.extend(source_results)
+             time.sleep(0.3)  # Rate limiting
+
+         return results
+
+     def _search_source(self, query: str, source_name: str, source_config: Dict, language: str) -> List[Dict]:
+         """Search a specific medical source"""
+         try:
+             search_url = source_config.get('search_url')
+             if not search_url:
+                 return []
+
+             # Prepare search parameters based on language
+             params = self._prepare_search_params(query, language)
+
+             response = self.session.get(search_url, params=params, timeout=self.timeout)
+             response.raise_for_status()
+
+             soup = BeautifulSoup(response.content, 'html.parser')
+             results = []
+
+             # Try source-specific selectors
+             selectors = source_config.get('selectors', ['a[href*="http"]'])
+
+             for selector in selectors:
+                 links = soup.select(selector)
+                 if links:
+                     logger.info(f"{source_name} ({language}) found {len(links)} results with selector: {selector}")
+                     break
+
+             for link in links[:3]:  # Limit per source
+                 try:
+                     href = link.get('href')
+                     if not href:
+                         continue
+
+                     # Make absolute URL
+                     if href.startswith('/'):
+                         href = source_config['base_url'] + href
+
+                     title = link.get_text(strip=True)
+                     if title and href.startswith('http'):
+                         results.append({
+                             'url': href,
+                             'title': title,
+                             'source': source_name,
+                             'language': language,
+                             'domain': source_config['domains'][0]
+                         })
+                 except Exception as e:
+                     logger.debug(f"Error parsing {source_name} link: {e}")
+                     continue
+
+             return results
+
+         except Exception as e:
+             logger.warning(f"Medical source {source_name} ({language}) search failed: {e}")
+             return []
+
+     def _prepare_search_params(self, query: str, language: str) -> Dict[str, str]:
+         """Prepare search parameters based on language"""
+         # Common parameter names across different languages
+         param_mappings = {
+             'en': {'q': query, 'query': query, 'search': query},
+             'vi': {'q': query, 'query': query, 'search': query, 'tu-khoa': query, 'tim-kiem': query},
+             'zh': {'q': query, 'query': query, 'search': query, 'keyword': query, 'sousuo': query}
+         }
+
+         return param_mappings.get(language, {'q': query})
+
+     def _remove_duplicates(self, results: List[Dict]) -> List[Dict]:
+         """Remove duplicate results based on URL"""
+         seen_urls = set()
+         unique_results = []
+
+         for result in results:
+             url = result.get('url', '')
+             if url and url not in seen_urls:
+                 seen_urls.add(url)
+                 unique_results.append(result)
+
+         return unique_results
+
+     def search_by_language(self, query: str, language: str, num_results: int = 10) -> List[Dict]:
+         """Search sources for a specific language only"""
+         if language not in self.medical_sources:
+             logger.warning(f"Language {language} not supported")
+             return []
+
+         return self._search_language_sources(query, language, num_results)
+
+     def get_fallback_sources(self, query: str, language: str, num_results: int) -> List[Dict]:
+         """Get fallback sources when direct search fails"""
+         fallback_sources = {
+             'en': [
+                 {
+                     'url': 'https://www.mayoclinic.org/diseases-conditions',
+                     'title': f'Mayo Clinic: {query}',
+                     'source': 'mayo_fallback',
+                     'language': 'en',
+                     'domain': 'mayoclinic.org'
+                 },
+                 {
+                     'url': 'https://www.webmd.com/default.htm',
+                     'title': f'WebMD: {query}',
+                     'source': 'webmd_fallback',
+                     'language': 'en',
+                     'domain': 'webmd.com'
+                 }
+             ],
+             'vi': [
+                 {
+                     'url': 'https://hellobacsi.com/suc-khoe',
+                     'title': f'Hello Bacsi: {query}',
+                     'source': 'hello_bacsi_fallback',
+                     'language': 'vi',
+                     'domain': 'hellobacsi.com'
+                 },
+                 {
+                     'url': 'https://www.vinmec.com/vi/suc-khoe',
+                     'title': f'Vinmec: {query}',
+                     'source': 'vinmec_fallback',
+                     'language': 'vi',
+                     'domain': 'vinmec.com'
+                 }
+             ],
+             'zh': [
+                 {
+                     'url': 'https://www.haodf.com/jibing',
+                     'title': f'好大夫在线: {query}',
+                     'source': 'haodf_fallback',
+                     'language': 'zh',
+                     'domain': 'haodf.com'
+                 },
+                 {
+                     'url': 'https://www.dxy.cn/article',
+                     'title': f'丁香园: {query}',
+                     'source': 'dxy_fallback',
+                     'language': 'zh',
+                     'domain': 'dxy.cn'
+                 }
+             ]
+         }
+
+         return fallback_sources.get(language, [])[:num_results]
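
A sketch of the two query styles the multilingual engine supports: a single-language search and a mixed search that splits the result budget across languages (illustrative; the scraped sites may change their markup at any time):

    from search.engines.multilingual import MultilingualMedicalEngine

    engine = MultilingualMedicalEngine()

    # Vietnamese-only sources
    vi_hits = engine.search_by_language("đau nửa đầu", "vi", num_results=5)

    # All three languages, budget split evenly (3 results per language here)
    mixed = engine.search("migraine", num_results=9, languages=['en', 'vi', 'zh'])
    for hit in mixed:
        print(hit['language'], hit['source'], hit['url'])
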
search/extractors/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .content import ContentExtractor
+
+ __all__ = ['ContentExtractor']
search/extractors/content.py ADDED
@@ -0,0 +1,211 @@
+ import requests
+ from bs4 import BeautifulSoup
+ import logging
+ from typing import Dict, Optional
+ import re
+ from urllib.parse import urlparse
+ import time
+
+ logger = logging.getLogger(__name__)
+
+ class ContentExtractor:
+     """Extract and clean content from web pages"""
+
+     def __init__(self, timeout: int = 15):
+         self.session = requests.Session()
+         self.session.headers.update({
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+             'Accept-Language': 'en-US,en;q=0.5',
+             'Accept-Encoding': 'gzip, deflate',
+             'Connection': 'keep-alive',
+         })
+         self.timeout = timeout
+
+         # Medical content indicators
+         self.medical_indicators = [
+             'symptom', 'treatment', 'diagnosis', 'medicine', 'medication',
+             'therapy', 'condition', 'disease', 'health', 'medical',
+             'doctor', 'physician', 'patient', 'clinical', 'study'
+         ]
+
+     def extract(self, url: str, max_length: int = 2000) -> Optional[str]:
+         """Extract content from a URL with medical focus"""
+         try:
+             response = self.session.get(url, timeout=self.timeout)
+             response.raise_for_status()
+
+             soup = BeautifulSoup(response.content, 'html.parser')
+
+             # Remove unwanted elements
+             self._remove_unwanted_elements(soup)
+
+             # Extract main content
+             content = self._extract_main_content(soup)
+
+             if not content:
+                 return None
+
+             # Clean and process content
+             cleaned_content = self._clean_content(content)
+
+             # Focus on medical content if possible
+             medical_content = self._extract_medical_content(cleaned_content)
+
+             # Truncate to max length
+             final_content = self._truncate_content(medical_content or cleaned_content, max_length)
+
+             return final_content if final_content else None
+
+         except Exception as e:
+             logger.warning(f"Content extraction failed for {url}: {e}")
+             return None
+
+     def _remove_unwanted_elements(self, soup: BeautifulSoup):
+         """Remove unwanted HTML elements"""
+         unwanted_tags = [
+             'script', 'style', 'nav', 'header', 'footer', 'aside',
+             'advertisement', 'ads', 'sidebar', 'menu', 'navigation',
+             'social', 'share', 'comment', 'comments', 'related',
+             'cookie', 'privacy', 'terms', 'disclaimer'
+         ]
+
+         for tag in unwanted_tags:
+             for element in soup.find_all(tag):
+                 element.decompose()
+
+         # Remove elements with unwanted classes/ids
+         unwanted_selectors = [
+             '[class*="ad"]', '[class*="advertisement"]', '[class*="sidebar"]',
+             '[class*="menu"]', '[class*="nav"]', '[class*="social"]',
+             '[class*="share"]', '[class*="comment"]', '[class*="related"]',
+             '[id*="ad"]', '[id*="sidebar"]', '[id*="menu"]', '[id*="nav"]'
+         ]
+
+         for selector in unwanted_selectors:
+             for element in soup.select(selector):
+                 element.decompose()
+
+     def _extract_main_content(self, soup: BeautifulSoup) -> str:
+         """Extract main content from the page"""
+         # Priority order for content extraction
+         content_selectors = [
+             'article',
+             'main',
+             '[role="main"]',
+             '.content',
+             '.main-content',
+             '.article-content',
+             '.post-content',
+             '.entry-content',
+             '.page-content',
+             'body'
+         ]
+
+         for selector in content_selectors:
+             elements = soup.select(selector)
+             if elements:
+                 # Get the largest content element
+                 largest_element = max(elements, key=lambda x: len(x.get_text()))
+                 content = largest_element.get_text(separator=' ', strip=True)
+                 if len(content) > 100:  # Minimum content length
+                     return content
+
+         # Fallback: get all text
+         return soup.get_text(separator=' ', strip=True)
+
+     def _clean_content(self, content: str) -> str:
+         """Clean and normalize content"""
+         if not content:
+             return ""
+
+         # Remove excessive whitespace
+         content = re.sub(r'\s+', ' ', content)
+
+         # Remove common web artifacts
+         artifacts = [
+             r'Cookie\s+Policy',
+             r'Privacy\s+Policy',
+             r'Terms\s+of\s+Service',
+             r'Subscribe\s+to\s+our\s+newsletter',
+             r'Follow\s+us\s+on',
+             r'Share\s+this\s+article',
+             r'Related\s+articles',
+             r'Advertisement',
+             r'Ad\s+content'
+         ]
+
+         for artifact in artifacts:
+             content = re.sub(artifact, '', content, flags=re.IGNORECASE)
+
+         # Remove excessive punctuation
+         content = re.sub(r'[.]{3,}', '...', content)
+         content = re.sub(r'[!]{2,}', '!', content)
+         content = re.sub(r'[?]{2,}', '?', content)
+
+         return content.strip()
+
+     def _extract_medical_content(self, content: str) -> Optional[str]:
+         """Extract medical-focused content from the text"""
+         if not content:
+             return None
+
+         # Split content into sentences
+         sentences = re.split(r'[.!?]+', content)
+         medical_sentences = []
+
+         for sentence in sentences:
+             sentence = sentence.strip()
+             if len(sentence) < 20:  # Skip very short sentences
+                 continue
+
+             # Check if sentence contains medical indicators
+             sentence_lower = sentence.lower()
+             if any(indicator in sentence_lower for indicator in self.medical_indicators):
+                 medical_sentences.append(sentence)
+
+         if medical_sentences:
+             # Return medical sentences, prioritizing longer ones
+             medical_sentences.sort(key=len, reverse=True)
+             return '. '.join(medical_sentences[:10]) + '.'
+
+         return None
+
+     def _truncate_content(self, content: str, max_length: int) -> str:
+         """Truncate content to max length while preserving sentences"""
+         if len(content) <= max_length:
+             return content
+
+         # Try to truncate at sentence boundary
+         truncated = content[:max_length]
+         last_period = truncated.rfind('.')
+         last_exclamation = truncated.rfind('!')
+         last_question = truncated.rfind('?')
+
+         last_sentence_end = max(last_period, last_exclamation, last_question)
+
+         if last_sentence_end > max_length * 0.7:  # If we can find a good break point
+             return content[:last_sentence_end + 1]
+
+         # Fallback: truncate at word boundary
+         words = truncated.split()
+         if len(words) > 1:
+             return ' '.join(words[:-1]) + '...'
+
+         return truncated + '...'
+
+     def extract_multiple(self, urls: list, max_length: int = 2000) -> Dict[str, str]:
+         """Extract content from multiple URLs"""
+         results = {}
+
+         for url in urls:
+             try:
+                 content = self.extract(url, max_length)
+                 if content:
+                     results[url] = content
+                 time.sleep(0.5)  # Be respectful to servers
+             except Exception as e:
+                 logger.warning(f"Failed to extract content from {url}: {e}")
+                 continue
+
+         return results
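
A short sketch of the extractor in isolation (the URL is a placeholder; extraction quality depends on the page's structure matching the selectors above):

    from search.extractors.content import ContentExtractor

    extractor = ContentExtractor(timeout=10)
    text = extractor.extract("https://medlineplus.gov/headache.html", max_length=1500)
    if text:
        print(text[:300])
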
search/processors/__init__.py ADDED
@@ -0,0 +1,4 @@
+ from .medical import MedicalSearchProcessor
+ from .language import LanguageProcessor
+
+ __all__ = ['MedicalSearchProcessor', 'LanguageProcessor']
search/processors/language.py ADDED
@@ -0,0 +1,266 @@
+ import re
+ import logging
+ from typing import List, Dict, Tuple, Optional
+ from langdetect import detect, DetectorFactory
+ from langdetect.lang_detect_exception import LangDetectException
+
+ logger = logging.getLogger(__name__)
+
+ # Set seed for consistent language detection
+ DetectorFactory.seed = 0
+
+ class LanguageProcessor:
+     """Process and enhance queries for multilingual medical search"""
+
+     def __init__(self):
+         # Medical keywords in different languages
+         self.medical_keywords = {
+             'en': [
+                 'symptom', 'symptoms', 'pain', 'headache', 'migraine', 'fever', 'cough',
+                 'treatment', 'treatments', 'medicine', 'medication', 'drug', 'therapy',
+                 'diagnosis', 'diagnose', 'condition', 'disease', 'disorder', 'syndrome',
+                 'doctor', 'physician', 'medical', 'health', 'clinical', 'patient',
+                 'blood pressure', 'heart', 'lung', 'stomach', 'back', 'neck', 'chest',
+                 'allergy', 'allergies', 'infection', 'inflammation', 'swelling', 'rash',
+                 'sleep', 'insomnia', 'anxiety', 'depression', 'stress', 'mental health',
+                 'pregnancy', 'baby', 'child', 'elderly', 'senior', 'age', 'covid',
+                 'vaccine', 'immunization', 'surgery', 'operation', 'hospital', 'clinic'
+             ],
+             'vi': [
+                 'triệu chứng', 'đau', 'đau đầu', 'đau nửa đầu', 'sốt', 'ho',
+                 'điều trị', 'thuốc', 'dược phẩm', 'liệu pháp', 'chẩn đoán',
+                 'bệnh', 'tình trạng', 'rối loạn', 'hội chứng', 'bác sĩ', 'y tế',
+                 'sức khỏe', 'lâm sàng', 'bệnh nhân', 'huyết áp', 'tim', 'phổi',
+                 'dạ dày', 'lưng', 'cổ', 'ngực', 'dị ứng', 'nhiễm trùng',
+                 'viêm', 'sưng', 'phát ban', 'ngủ', 'mất ngủ', 'lo âu',
+                 'trầm cảm', 'căng thẳng', 'sức khỏe tâm thần', 'mang thai',
+                 'em bé', 'trẻ em', 'người già', 'tuổi tác', 'covid', 'vaccine',
+                 'tiêm chủng', 'phẫu thuật', 'bệnh viện', 'phòng khám'
+             ],
+             'zh': [
+                 '症状', '疼痛', '头痛', '偏头痛', '发烧', '咳嗽', '治疗', '药物',
+                 '药品', '疗法', '诊断', '疾病', '状况', '紊乱', '综合征', '医生',
+                 '医疗', '健康', '临床', '患者', '血压', '心脏', '肺', '胃',
+                 '背部', '颈部', '胸部', '过敏', '感染', '炎症', '肿胀', '皮疹',
+                 '睡眠', '失眠', '焦虑', '抑郁', '压力', '心理健康', '怀孕',
+                 '婴儿', '儿童', '老年人', '年龄', '新冠', '疫苗', '免疫',
+                 '手术', '医院', '诊所'
+             ]
+         }
+
+         # Language-specific search enhancements
+         self.language_enhancements = {
+             'vi': {
+                 'common_terms': ['là gì', 'nguyên nhân', 'cách điều trị', 'triệu chứng'],
+                 'medical_context': ['y tế', 'sức khỏe', 'bệnh viện', 'bác sĩ']
+             },
+             'zh': {
+                 'common_terms': ['是什么', '原因', '治疗方法', '症状'],
+                 'medical_context': ['医疗', '健康', '医院', '医生']
+             },
+             'en': {
+                 'common_terms': ['what is', 'causes', 'treatment', 'symptoms'],
+                 'medical_context': ['medical', 'health', 'hospital', 'doctor']
+             }
+         }
+
+     def detect_language(self, text: str) -> str:
+         """Detect the language of the input text"""
+         if not text or not text.strip():
+             return 'en'  # Default to English
+
+         try:
+             # Clean text for better detection
+             cleaned_text = re.sub(r'[^\w\s]', ' ', text)
+             cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
+
+             if len(cleaned_text) < 3:
+                 return 'en'
+
+             detected = detect(cleaned_text)
+
+             # Map detected language to our supported languages
+             language_mapping = {
+                 'vi': 'vi',      # Vietnamese
+                 'zh-cn': 'zh',   # Chinese Simplified
+                 'zh-tw': 'zh',   # Chinese Traditional
+                 'zh': 'zh',      # Chinese
+                 'en': 'en'       # English
+             }
+
+             return language_mapping.get(detected, 'en')
+
+         except LangDetectException as e:
+             logger.warning(f"Language detection failed: {e}")
+             return 'en'
+
+     def enhance_query(self, query: str, target_language: str = None) -> Dict[str, str]:
+         """Enhance query for better search results in multiple languages"""
+         if not query or not query.strip():
+             return {}
+
+         # Detect source language
+         source_language = self.detect_language(query)
+
+         # If target language not specified, use source language
+         if target_language is None:
+             target_language = source_language
+
+         enhanced_queries = {}
+
+         # Original query
+         enhanced_queries[source_language] = query
+
+         # Enhance for source language
+         if source_language in self.language_enhancements:
+             enhanced_queries[source_language] = self._enhance_for_language(
+                 query, source_language
+             )
+
+         # Create translations for other languages if needed
+         if target_language != source_language:
+             enhanced_queries[target_language] = self._translate_query(
+                 query, source_language, target_language
+             )
+
+         # Add English version for comprehensive search
+         if 'en' not in enhanced_queries:
+             if source_language != 'en':
+                 enhanced_queries['en'] = self._translate_query(query, source_language, 'en')
+             else:
+                 enhanced_queries['en'] = query
+
+         return enhanced_queries
+
+     def _enhance_for_language(self, query: str, language: str) -> str:
+         """Enhance query for a specific language"""
+         enhancements = self.language_enhancements.get(language, {})
+         common_terms = enhancements.get('common_terms', [])
+         medical_context = enhancements.get('medical_context', [])
+
+         # Check if query already contains medical context
+         query_lower = query.lower()
+         has_medical_context = any(term in query_lower for term in medical_context)
+
+         # If no medical context, add it
+         if not has_medical_context and medical_context:
+             # Add the most relevant medical context term
+             query += f" {medical_context[0]}"
+
+         # Check if query is a question and add relevant terms
+         if any(term in query_lower for term in ['là gì', '是什么', 'what is', 'how', 'tại sao', '为什么', 'why']):
+             if common_terms:
+                 query += f" {common_terms[0]}"  # Add "causes" or equivalent
+
+         return query.strip()
+
+     def _translate_query(self, query: str, source_lang: str, target_lang: str) -> str:
+         """Simple keyword-based translation for medical terms"""
+         # This is a basic implementation - in production, you'd use a proper translation service
+
+         # Medical term translations
+         translations = {
+             ('vi', 'en'): {
+                 'triệu chứng': 'symptoms',
+                 'đau': 'pain',
+                 'đau đầu': 'headache',
+                 'sốt': 'fever',
+                 'ho': 'cough',
+                 'điều trị': 'treatment',
+                 'thuốc': 'medicine',
+                 'bệnh': 'disease',
+                 'bác sĩ': 'doctor',
+                 'sức khỏe': 'health',
+                 'bệnh viện': 'hospital'
+             },
+             ('zh', 'en'): {
+                 '症状': 'symptoms',
+                 '疼痛': 'pain',
+                 '头痛': 'headache',
+                 '发烧': 'fever',
+                 '咳嗽': 'cough',
+                 '治疗': 'treatment',
+                 '药物': 'medicine',
+                 '疾病': 'disease',
+                 '医生': 'doctor',
+                 '健康': 'health',
+                 '医院': 'hospital'
+             },
+             ('en', 'vi'): {
+                 'symptoms': 'triệu chứng',
+                 'pain': 'đau',
+                 'headache': 'đau đầu',
+                 'fever': 'sốt',
+                 'cough': 'ho',
+                 'treatment': 'điều trị',
+                 'medicine': 'thuốc',
+                 'disease': 'bệnh',
+                 'doctor': 'bác sĩ',
+                 'health': 'sức khỏe',
+                 'hospital': 'bệnh viện'
+             },
+             ('en', 'zh'): {
+                 'symptoms': '症状',
+                 'pain': '疼痛',
+                 'headache': '头痛',
+                 'fever': '发烧',
+                 'cough': '咳嗽',
+                 'treatment': '治疗',
+                 'medicine': '药物',
+                 'disease': '疾病',
+                 'doctor': '医生',
+                 'health': '健康',
+                 'hospital': '医院'
+             }
+         }
+
+         translation_map = translations.get((source_lang, target_lang), {})
+
+         # Simple word-by-word translation
+         translated_query = query
+         for source_term, target_term in translation_map.items():
+             translated_query = translated_query.replace(source_term, target_term)
+
+         return translated_query
+
+     def get_medical_relevance_score(self, text: str, language: str) -> float:
+         """Calculate medical relevance score for text in a specific language"""
+         if not text:
+             return 0.0
+
+         keywords = self.medical_keywords.get(language, [])
+         if not keywords:
+             return 0.0
+
+         text_lower = text.lower()
+         matches = sum(1 for keyword in keywords if keyword in text_lower)
+
+         # Normalize by text length and keyword count
+         score = matches / max(len(keywords), 1)
+
+         # Boost score for longer matches
+         if matches > 0:
+             score *= (1 + matches * 0.1)
+
+         return min(score, 1.0)
+
+     def filter_by_language(self, results: List[Dict], target_language: str) -> List[Dict]:
+         """Filter results by language preference"""
+         if not results:
+             return results
+
+         # Score results by language match
+         scored_results = []
+         for result in results:
+             result_language = result.get('language', 'en')
+             language_score = 1.0 if result_language == target_language else 0.5
+
+             # Add language score to result
+             result_copy = result.copy()
+             result_copy['language_score'] = language_score
+             scored_results.append(result_copy)
+
+         # Sort by language score (prefer target language)
+         scored_results.sort(key=lambda x: x.get('language_score', 0), reverse=True)
+
+         return scored_results
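
A sketch of the processor's two main calls (langdetect output can vary on very short strings, so the detected language below is only the typical result):

    from search.processors.language import LanguageProcessor

    lp = LanguageProcessor()
    print(lp.detect_language("đau đầu kéo dài"))   # typically 'vi'

    queries = lp.enhance_query("đau đầu", target_language="vi")
    print(queries)   # per-language variants, e.g. {'vi': 'đau đầu y tế', 'en': 'headache'}
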
search/processors/medical.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import List, Dict, Tuple
3
+ from models.summarizer import summarizer
4
+ import re
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+ class MedicalSearchProcessor:
9
+ """Process and enhance medical search results"""
10
+
11
+ def __init__(self):
12
+ self.medical_keywords = [
13
+ 'symptom', 'symptoms', 'pain', 'headache', 'migraine', 'fever', 'cough',
14
+ 'treatment', 'treatments', 'medicine', 'medication', 'drug', 'therapy',
15
+ 'diagnosis', 'diagnose', 'condition', 'disease', 'disorder', 'syndrome',
16
+ 'doctor', 'physician', 'medical', 'health', 'clinical', 'patient',
17
+ 'blood pressure', 'heart', 'lung', 'stomach', 'back', 'neck', 'chest',
18
+ 'allergy', 'allergies', 'infection', 'inflammation', 'swelling', 'rash',
19
+ 'sleep', 'insomnia', 'anxiety', 'depression', 'stress', 'mental health',
20
+ 'pregnancy', 'baby', 'child', 'elderly', 'senior', 'age', 'covid',
21
+ 'vaccine', 'immunization', 'surgery', 'operation', 'hospital', 'clinic'
22
+ ]
23
+
24
+ def process_results(self, results: List[Dict], user_query: str) -> Tuple[str, Dict[int, str]]:
25
+ """Process search results and create comprehensive medical summary"""
26
+ if not results:
27
+ return "", {}
28
+
29
+ # Filter and rank results by medical relevance
30
+ relevant_results = self._filter_medical_results(results, user_query)
31
+
32
+ if not relevant_results:
33
+ logger.warning("No medically relevant results found")
34
+ return "", {}
35
+
36
+ # Extract and summarize content
37
+ summarized_results = self._summarize_results(relevant_results, user_query)
38
+
39
+ # Create comprehensive summary
40
+ combined_summary = self._create_combined_summary(summarized_results, user_query)
41
+
42
+ # Create URL mapping for citations
43
+ url_mapping = self._create_url_mapping(relevant_results)
44
+
45
+ return combined_summary, url_mapping
46
+
47
+ def _filter_medical_results(self, results: List[Dict], user_query: str) -> List[Dict]:
48
+ """Filter results by medical relevance"""
49
+ relevant_results = []
50
+
51
+ for result in results:
52
+ relevance_score = self._calculate_relevance_score(result, user_query)
53
+
54
+ if relevance_score > 0.3: # Threshold for medical relevance
55
+ result['relevance_score'] = relevance_score
56
+ relevant_results.append(result)
57
+
58
+ # Sort by relevance score
59
+ relevant_results.sort(key=lambda x: x.get('relevance_score', 0), reverse=True)
60
+
61
+ # Limit to top results
62
+ return relevant_results[:10]
63
+
64
+ def _calculate_relevance_score(self, result: Dict, user_query: str) -> float:
65
+ """Calculate medical relevance score for a result"""
66
+ score = 0.0
67
+
68
+ # Check title relevance
69
+ title = result.get('title', '').lower()
70
+ query_lower = user_query.lower()
71
+
72
+ # Direct query match in title
73
+ if any(word in title for word in query_lower.split()):
74
+ score += 0.4
75
+
76
+ # Medical keyword match in title
77
+ medical_matches = sum(1 for keyword in self.medical_keywords if keyword in title)
78
+ score += min(medical_matches * 0.1, 0.3)
79
+
80
+ # Domain credibility
81
+ url = result.get('url', '').lower()
82
+ credible_domains = [
83
+ 'mayoclinic.org', 'webmd.com', 'healthline.com', 'medlineplus.gov',
84
+ 'nih.gov', 'cdc.gov', 'who.int', 'pubmed.ncbi.nlm.nih.gov',
85
+ 'uptodate.com', 'merckmanuals.com', 'medscape.com'
86
+ ]
87
+
88
+ if any(domain in url for domain in credible_domains):
89
+ score += 0.3
90
+
91
+ # Source type bonus
92
+ source = result.get('source', '')
93
+ if 'medical' in source or any(domain in source for domain in credible_domains):
94
+ score += 0.2
95
+
96
+ return min(score, 1.0)
+
+     def _summarize_results(self, results: List[Dict], user_query: str) -> List[Dict]:
+         """Summarize content from search results"""
+         summarized_results = []
+
+         for i, result in enumerate(results):
+             try:
+                 content = result.get('content', '')
+                 if not content:
+                     continue
+
+                 # Create focused summary
+                 summary = summarizer.summarize_for_query(content, user_query, max_length=300)
+
+                 if summary:
+                     summarized_results.append({
+                         'id': i + 1,
+                         'url': result['url'],
+                         'title': result['title'],
+                         'summary': summary,
+                         'relevance_score': result.get('relevance_score', 0)
+                     })
+
+             except Exception as e:
+                 logger.warning(f"Failed to summarize result {i}: {e}")
+                 continue
+
+         return summarized_results
+
+     def _create_combined_summary(self, summarized_results: List[Dict], user_query: str) -> str:
+         """Create a comprehensive summary from all results"""
+         if not summarized_results:
+             return ""
+
+         # Group by topic/similarity
+         topic_groups = self._group_by_topic(summarized_results)
+
+         summary_parts = []
+
+         for topic, results in topic_groups.items():
+             if not results:
+                 continue
+
+             # Create topic summary
+             topic_summary = self._create_topic_summary(topic, results, user_query)
+             if topic_summary:
+                 summary_parts.append(topic_summary)
+
+         # Combine all parts
+         combined_summary = "\n\n".join(summary_parts)
+
+         # Final summarization to ensure conciseness
+         if len(combined_summary) > 1500:
+             combined_summary = summarizer.summarize_text(combined_summary, max_length=1500)
+
+         return combined_summary
+
+     def _group_by_topic(self, results: List[Dict]) -> Dict[str, List[Dict]]:
+         """Group results by medical topic"""
+         topics = {
+             'symptoms': [],
+             'treatments': [],
+             'diagnosis': [],
+             'general': []
+         }
+
+         for result in results:
+             title_lower = result['title'].lower()
+             summary_lower = result.get('summary', '').lower()
+             content_lower = f"{title_lower} {summary_lower}"
+
+             # Categorize by content
+             if any(word in content_lower for word in ['symptom', 'sign', 'pain', 'ache']):
+                 topics['symptoms'].append(result)
+             elif any(word in content_lower for word in ['treatment', 'therapy', 'medicine', 'medication']):
+                 topics['treatments'].append(result)
+             elif any(word in content_lower for word in ['diagnosis', 'test', 'examination', 'evaluation']):
+                 topics['diagnosis'].append(result)
+             else:
+                 topics['general'].append(result)
+
+         return topics
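Note that the if/elif chain above is first-match-wins, so a result spanning several topics is filed under the earliest matching bucket. A hypothetical illustration:

# Inputs assumed for illustration; 'summary' may be omitted thanks to .get()
results = [
    {'title': 'Physical therapy for chronic back pain'},  # 'pain' matches before 'therapy' is checked -> symptoms
    {'title': 'Medication options for hypertension'},     # 'medication' -> treatments
    {'title': 'Living with seasonal allergies'},          # no keyword hit -> general
]
# _group_by_topic(results) leaves 'diagnosis' empty and fills the other three buckets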
+
+     def _create_topic_summary(self, topic: str, results: List[Dict], user_query: str) -> str:
+         """Create summary for a specific topic"""
+         if not results:
+             return ""
+
+         # Combine summaries for this topic
+         combined_text = " ".join([r.get('summary', '') for r in results])
+
+         if not combined_text:
+             return ""
+
+         # Create focused summary for this topic
+         topic_summary = summarizer.summarize_for_query(combined_text, user_query, max_length=400)
+
+         if topic_summary:
+             # Add topic header
+             topic_headers = {
+                 'symptoms': "**Symptoms and Signs:**",
+                 'treatments': "**Treatment Options:**",
+                 'diagnosis': "**Diagnosis and Testing:**",
+                 'general': "**General Information:**"
+             }
+
+             header = topic_headers.get(topic, "**Information:**")
+             return f"{header}\n{topic_summary}"
+
+         return ""
+
+     def _create_url_mapping(self, results: List[Dict]) -> Dict[int, str]:
+         """Create URL mapping for citations"""
+         url_mapping = {}
+
+         for i, result in enumerate(results):
+             url_mapping[i + 1] = result['url']
+
+         return url_mapping
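End to end, the processor takes raw result dicts (title, url, optional source, extracted content) and returns a topic-grouped summary plus a citation map. A usage sketch, again assuming the class above is the package's MedicalSearchProcessor:

from search.processors.medical import MedicalSearchProcessor  # path assumed

processor = MedicalSearchProcessor()
raw = [{
    'title': 'High blood pressure (hypertension) - Diagnosis and treatment',
    'url': 'https://www.mayoclinic.org/diseases-conditions/high-blood-pressure',
    'source': 'medical',
    'content': '...extracted page text...',  # placeholder; normally produced by ContentExtractor
}]
summary, url_mapping = processor.process_results(raw, 'how is high blood pressure treated')
# summary: sections such as "**Treatment Options:** ..." joined by blank lines
# url_mapping: {1: 'https://www.mayoclinic.org/...'} for numbered citations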
search/search.py ADDED
@@ -0,0 +1,106 @@
+ import logging
+ from typing import List, Dict, Tuple, Optional
+ from .coordinator import SearchCoordinator
+
+ logger = logging.getLogger(__name__)
+
+ # Global search coordinator instance
+ _search_coordinator = None
+
+ def get_search_coordinator() -> SearchCoordinator:
+     """Get or create the global search coordinator instance"""
+     global _search_coordinator
+     if _search_coordinator is None:
+         _search_coordinator = SearchCoordinator()
+     return _search_coordinator
+
+ class WebSearcher:
+     """Legacy wrapper for backward compatibility"""
+     def __init__(self):
+         self.coordinator = get_search_coordinator()
+         self.max_results = 10
+         self.timeout = 10
+
+     def search_google(self, query: str, num_results: int = 10) -> List[Dict]:
+         """Search using the new coordinator system"""
+         try:
+             return self.coordinator.quick_search(query, num_results)
+         except Exception as e:
+             logger.error(f"Search failed: {e}")
+             return []
+
+     def search_duckduckgo(self, query: str, num_results: int = 10) -> List[Dict]:
+         """Search using DuckDuckGo engine"""
+         try:
+             return self.coordinator.quick_search(query, num_results)
+         except Exception as e:
+             logger.error(f"DuckDuckGo search failed: {e}")
+             return []
+
+     def extract_content(self, url: str) -> str:
+         """Extract content using the new content extractor"""
+         try:
+             return self.coordinator.content_extractor.extract(url)
+         except Exception as e:
+             logger.error(f"Content extraction failed: {e}")
+             return ""
+
+     def search_and_extract(self, query: str, num_results: int = 10) -> List[Dict]:
+         """Search and extract content using the new system"""
+         try:
+             # Get search results
+             results = self.coordinator.quick_search(query, num_results)
+
+             # Extract content for each result
+             enriched_results = []
+             for result in results:
+                 content = self.extract_content(result['url'])
+                 if content:
+                     enriched_result = result.copy()
+                     enriched_result['content'] = content
+                     enriched_results.append(enriched_result)
+
+             return enriched_results
+         except Exception as e:
+             logger.error(f"Search and extract failed: {e}")
+             return []
+
+ # Main search function for backward compatibility
+ def search_web(query: str, num_results: int = 10) -> List[Dict]:
+     """Main search function using the new coordinator system"""
+     try:
+         coordinator = get_search_coordinator()
+         return coordinator.quick_search(query, num_results)
+     except Exception as e:
+         logger.error(f"Web search failed: {e}")
+         return []
+
+ # Enhanced search function with content extraction
+ def search_web_with_content(query: str, num_results: int = 10) -> Tuple[str, Dict[int, str]]:
+     """Enhanced search with content extraction and summarization"""
+     try:
+         coordinator = get_search_coordinator()
+         return coordinator.search(query, num_results)
+     except Exception as e:
+         logger.error(f"Enhanced web search failed: {e}")
+         return "", {}
+
+ # Medical-focused search function
+ def search_medical(query: str, num_results: int = 8) -> Tuple[str, Dict[int, str]]:
+     """Medical-focused search with enhanced processing"""
+     try:
+         coordinator = get_search_coordinator()
+         return coordinator.medical_focus_search(query, num_results)
+     except Exception as e:
+         logger.error(f"Medical search failed: {e}")
+         return "", {}
+
+ # Multilingual medical search function
+ def search_multilingual_medical(query: str, num_results: int = 10, target_language: Optional[str] = None) -> Tuple[str, Dict[int, str]]:
+     """Comprehensive multilingual medical search supporting English, Vietnamese, and Chinese"""
+     try:
+         coordinator = get_search_coordinator()
+         return coordinator.multilingual_medical_search(query, num_results, target_language)
+     except Exception as e:
+         logger.error(f"Multilingual medical search failed: {e}")
+         return "", {}