Update multilingual search strategy
- search/__init__.py +21 -0
- search/coordinator.py +203 -0
- search/engines/__init__.py +5 -0
- search/engines/duckduckgo.py +200 -0
- search/engines/medical.py +195 -0
- search/engines/multilingual.py +320 -0
- search/extractors/__init__.py +3 -0
- search/extractors/content.py +211 -0
- search/processors/__init__.py +4 -0
- search/processors/language.py +266 -0
- search/processors/medical.py +215 -0
- search/search.py +106 -0
search/__init__.py  ADDED  @@ -0,0 +1,21 @@
# Search package
from .search import WebSearcher, search_web, search_web_with_content, search_medical, search_multilingual_medical
from .coordinator import SearchCoordinator
from .engines import DuckDuckGoEngine, MedicalSearchEngine, MultilingualMedicalEngine
from .extractors import ContentExtractor
from .processors import MedicalSearchProcessor, LanguageProcessor

__all__ = [
    'WebSearcher',
    'search_web',
    'search_web_with_content',
    'search_medical',
    'search_multilingual_medical',
    'SearchCoordinator',
    'DuckDuckGoEngine',
    'MedicalSearchEngine',
    'MultilingualMedicalEngine',
    'ContentExtractor',
    'MedicalSearchProcessor',
    'LanguageProcessor'
]
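All public names resolve from the package root, so callers never import submodules directly. A minimal sketch of that import surface (search/search.py defines the module-level helper functions; only the class-based API is exercised here):

from search import SearchCoordinator, ContentExtractor, LanguageProcessor

# The coordinator wires the engines, extractor, and processors together internally
coordinator = SearchCoordinator(max_workers=3)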
search/coordinator.py  ADDED  @@ -0,0 +1,203 @@
import logging
from typing import List, Dict, Tuple
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

from .engines.duckduckgo import DuckDuckGoEngine
from .engines.medical import MedicalSearchEngine
from .engines.multilingual import MultilingualMedicalEngine
from .extractors.content import ContentExtractor
from .processors.medical import MedicalSearchProcessor
from .processors.language import LanguageProcessor

logger = logging.getLogger(__name__)

class SearchCoordinator:
    """Coordinate multiple search strategies for comprehensive medical information"""

    def __init__(self, max_workers: int = 3):
        self.max_workers = max_workers

        # Initialize search engines
        self.duckduckgo_engine = DuckDuckGoEngine()
        self.medical_engine = MedicalSearchEngine()
        self.multilingual_engine = MultilingualMedicalEngine()

        # Initialize processors
        self.content_extractor = ContentExtractor()
        self.medical_processor = MedicalSearchProcessor()
        self.language_processor = LanguageProcessor()

        # Search strategies
        self.strategies = [
            self._search_multilingual,
            self._search_duckduckgo,
            self._search_medical_sources
        ]

    def search(self, query: str, num_results: int = 10, target_language: str = None) -> Tuple[str, Dict[int, str]]:
        """Execute comprehensive multilingual search with multiple strategies"""
        logger.info(f"Starting comprehensive multilingual search for: {query}")

        # Detect and enhance query for multiple languages
        enhanced_queries = self.language_processor.enhance_query(query, target_language)
        logger.info(f"Enhanced queries: {list(enhanced_queries.keys())}")

        # Execute search strategies in parallel
        all_results = []

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit search tasks for each language
            future_to_strategy = {}

            for lang, enhanced_query in enhanced_queries.items():
                for strategy in self.strategies:
                    # max(1, ...) keeps the per-language quota nonzero when
                    # query variants outnumber the requested results
                    per_lang = max(1, num_results // len(enhanced_queries))
                    future = executor.submit(strategy, enhanced_query, per_lang, lang)
                    future_to_strategy[future] = f"{strategy.__name__}_{lang}"

            # Collect results
            for future in as_completed(future_to_strategy):
                strategy_name = future_to_strategy[future]
                try:
                    results = future.result()
                    if results:
                        all_results.extend(results)
                        logger.info(f"{strategy_name} found {len(results)} results")
                except Exception as e:
                    logger.error(f"{strategy_name} failed: {e}")

        # Remove duplicates and filter by language preference
        unique_results = self._remove_duplicates(all_results)
        if target_language:
            unique_results = self.language_processor.filter_by_language(unique_results, target_language)

        logger.info(f"Total unique results: {len(unique_results)}")

        # Extract content from URLs
        enriched_results = self._enrich_with_content(unique_results)

        # Process results into comprehensive summary
        summary, url_mapping = self.medical_processor.process_results(enriched_results, query)

        logger.info(f"Multilingual search completed: {len(url_mapping)} sources processed")
        return summary, url_mapping

    def _search_multilingual(self, query: str, num_results: int, language: str = None) -> List[Dict]:
        """Search using multilingual medical engine"""
        try:
            if language:
                results = self.multilingual_engine.search_by_language(query, language, num_results)
            else:
                results = self.multilingual_engine.search(query, num_results)
            return results
        except Exception as e:
            logger.error(f"Multilingual search failed: {e}")
            return []

    def _search_duckduckgo(self, query: str, num_results: int, language: str = None) -> List[Dict]:
        """Search using DuckDuckGo engine"""
        try:
            results = self.duckduckgo_engine.search(query, num_results)
            return results
        except Exception as e:
            logger.error(f"DuckDuckGo search failed: {e}")
            return []

    def _search_medical_sources(self, query: str, num_results: int, language: str = None) -> List[Dict]:
        """Search using medical sources engine"""
        try:
            results = self.medical_engine.search(query, num_results)
            return results
        except Exception as e:
            logger.error(f"Medical sources search failed: {e}")
            return []

    def _remove_duplicates(self, results: List[Dict]) -> List[Dict]:
        """Remove duplicate results based on URL"""
        seen_urls = set()
        unique_results = []

        for result in results:
            url = result.get('url', '')
            if url and url not in seen_urls:
                seen_urls.add(url)
                unique_results.append(result)

        return unique_results

    def _enrich_with_content(self, results: List[Dict]) -> List[Dict]:
        """Enrich results with extracted content"""
        enriched_results = []

        # Extract content in parallel
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit content extraction tasks
            future_to_result = {
                executor.submit(self.content_extractor.extract, result['url']): result
                for result in results
            }

            # Collect enriched results
            for future in as_completed(future_to_result):
                original_result = future_to_result[future]
                try:
                    content = future.result()
                    if content:
                        enriched_result = original_result.copy()
                        enriched_result['content'] = content
                        enriched_results.append(enriched_result)
                    else:
                        # Keep the result even when no content could be extracted
                        enriched_results.append(original_result)
                except Exception as e:
                    logger.warning(f"Content extraction failed for {original_result['url']}: {e}")
                    # Still include result without content
                    enriched_results.append(original_result)

        return enriched_results

    def quick_search(self, query: str, num_results: int = 5) -> List[Dict]:
        """Quick search for basic results without content extraction"""
        logger.info(f"Quick search for: {query}")

        # Use only DuckDuckGo for speed
        results = self.duckduckgo_engine.search(query, num_results)

        # Remove duplicates
        unique_results = self._remove_duplicates(results)

        logger.info(f"Quick search completed: {len(unique_results)} results")
        return unique_results

    def medical_focus_search(self, query: str, num_results: int = 8) -> Tuple[str, Dict[int, str]]:
        """Medical-focused search with enhanced processing"""
        logger.info(f"Medical focus search for: {query}")

        # Use medical engine primarily
        medical_results = self.medical_engine.search(query, num_results)

        # Add some general results for context
        general_results = self.duckduckgo_engine.search(query, 3)

        # Combine and deduplicate
        all_results = self._remove_duplicates(medical_results + general_results)

        # Enrich with content
        enriched_results = self._enrich_with_content(all_results)

        # Process with medical focus
        summary, url_mapping = self.medical_processor.process_results(enriched_results, query)

        logger.info(f"Medical focus search completed: {len(url_mapping)} sources")
        return summary, url_mapping

    def multilingual_medical_search(self, query: str, num_results: int = 10, target_language: str = None) -> Tuple[str, Dict[int, str]]:
        """Comprehensive multilingual medical search"""
        logger.info(f"Multilingual medical search for: {query} (target: {target_language})")

        # Detect source language
        source_language = self.language_processor.detect_language(query)
        logger.info(f"Detected source language: {source_language}")

        # Use multilingual search with language preference
        summary, url_mapping = self.search(query, num_results, target_language)

        logger.info(f"Multilingual medical search completed: {len(url_mapping)} sources")
        return summary, url_mapping
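A usage sketch for the coordinator above. The query string is hypothetical; per the type hints, `search` returns a prose summary plus an index-to-URL map suitable for citations:

coordinator = SearchCoordinator(max_workers=3)
summary, url_mapping = coordinator.search(
    "persistent migraine with aura",  # hypothetical query
    num_results=9,
    target_language='vi'
)

print(summary)
for idx, url in url_mapping.items():
    print(f"[{idx}] {url}")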
search/engines/__init__.py  ADDED  @@ -0,0 +1,5 @@
from .duckduckgo import DuckDuckGoEngine
from .medical import MedicalSearchEngine
from .multilingual import MultilingualMedicalEngine

__all__ = ['DuckDuckGoEngine', 'MedicalSearchEngine', 'MultilingualMedicalEngine']
search/engines/duckduckgo.py  ADDED  @@ -0,0 +1,200 @@
import requests
from bs4 import BeautifulSoup
import logging
from typing import List, Dict
import time

logger = logging.getLogger(__name__)

class DuckDuckGoEngine:
    """DuckDuckGo search engine with multiple strategies"""

    def __init__(self, timeout: int = 15):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })
        self.timeout = timeout

    def search(self, query: str, num_results: int = 10) -> List[Dict]:
        """Search with multiple DuckDuckGo strategies"""
        results = []

        # Strategy 1: HTML interface
        html_results = self._search_html(query, num_results)
        if html_results:
            results.extend(html_results)
            logger.info(f"DuckDuckGo HTML found {len(html_results)} results")

        # Strategy 2: Instant Answer API
        if len(results) < num_results:
            api_results = self._search_api(query, num_results - len(results))
            if api_results:
                results.extend(api_results)
                logger.info(f"DuckDuckGo API found {len(api_results)} results")

        # Strategy 3: Lite interface (mobile-friendly)
        if len(results) < num_results:
            lite_results = self._search_lite(query, num_results - len(results))
            if lite_results:
                results.extend(lite_results)
                logger.info(f"DuckDuckGo Lite found {len(lite_results)} results")

        return results[:num_results]

    def _search_html(self, query: str, num_results: int) -> List[Dict]:
        """Search using DuckDuckGo HTML interface"""
        try:
            url = "https://html.duckduckgo.com/html/"
            params = {
                'q': query,
                'kl': 'us-en',
                's': '0',      # Start from first result
                'dc': '1',     # Disable auto-complete
                'v': 'l',      # Lite version
                'o': 'json',   # JSON output
                'api': 'd.js'  # API format
            }

            response = self.session.get(url, params=params, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            results = []

            # Multiple selectors for different DDG layouts
            selectors = [
                'a.result__a',
                'a[data-testid="result-title-a"]',
                '.result__title a',
                '.web-result a',
                '.result a',
                'a[href*="http"]:not([href*="duckduckgo.com"])'
            ]

            links = []
            for selector in selectors:
                links = soup.select(selector)
                if links:
                    logger.info(f"Using selector: {selector} - found {len(links)} links")
                    break

            for link in links[:num_results]:
                try:
                    href = link.get('href')
                    if not href or href.startswith('#') or 'duckduckgo.com' in href:
                        continue

                    # Clean up DDG redirect URLs
                    if href.startswith('/l/?uddg='):
                        import urllib.parse
                        href = urllib.parse.unquote(href.split('uddg=')[1])

                    title = link.get_text(strip=True)
                    if title and href.startswith('http'):
                        results.append({
                            'url': href,
                            'title': title,
                            'source': 'duckduckgo_html'
                        })
                except Exception as e:
                    logger.debug(f"Error parsing link: {e}")
                    continue

            return results

        except Exception as e:
            logger.warning(f"DuckDuckGo HTML search failed: {e}")
            return []

    def _search_api(self, query: str, num_results: int) -> List[Dict]:
        """Search using DuckDuckGo Instant Answer API"""
        try:
            url = "https://api.duckduckgo.com/"
            params = {
                'q': query,
                'format': 'json',
                'no_html': '1',
                'skip_disambig': '1',
                't': 'MedicalChatbot'
            }

            response = self.session.get(url, params=params, timeout=self.timeout)
            response.raise_for_status()
            data = response.json()

            results = []

            # Abstract result
            if data.get('AbstractURL') and data.get('Abstract'):
                results.append({
                    'url': data['AbstractURL'],
                    'title': data.get('Heading', query),
                    'content': data.get('Abstract', ''),
                    'source': 'duckduckgo_api'
                })

            # Related topics
            for topic in data.get('RelatedTopics', []):
                if len(results) >= num_results:
                    break

                if isinstance(topic, dict) and topic.get('FirstURL'):
                    text = topic.get('Text', '')
                    title = text.split(' - ')[0] if ' - ' in text else text[:50]

                    results.append({
                        'url': topic['FirstURL'],
                        'title': title,
                        'content': text,
                        'source': 'duckduckgo_api'
                    })

            return results

        except Exception as e:
            logger.warning(f"DuckDuckGo API search failed: {e}")
            return []

    def _search_lite(self, query: str, num_results: int) -> List[Dict]:
        """Search using DuckDuckGo Lite interface"""
        try:
            url = "https://lite.duckduckgo.com/lite/"
            params = {
                'q': query,
                'kl': 'us-en'
            }

            response = self.session.get(url, params=params, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            results = []

            # Lite interface selectors
            links = soup.select('a[href*="http"]:not([href*="duckduckgo.com"])')

            for link in links[:num_results]:
                try:
                    href = link.get('href')
                    title = link.get_text(strip=True)

                    if href and title and href.startswith('http'):
                        results.append({
                            'url': href,
                            'title': title,
                            'source': 'duckduckgo_lite'
                        })
                except Exception as e:
                    logger.debug(f"Error parsing lite link: {e}")
                    continue

            return results

        except Exception as e:
            logger.warning(f"DuckDuckGo Lite search failed: {e}")
            return []
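The redirect cleanup in `_search_html` is the one non-obvious step: DuckDuckGo's HTML endpoint wraps outbound links as `/l/?uddg=<percent-encoded-url>`. A standalone sketch of the same unwrapping (live redirects can also carry trailing parameters such as `&rut=...`, which this simple split does not strip):

import urllib.parse

redirect = '/l/?uddg=https%3A%2F%2Fwww.mayoclinic.org%2Fdiseases-conditions%2Fmigraine'
target = urllib.parse.unquote(redirect.split('uddg=')[1])
print(target)  # https://www.mayoclinic.org/diseases-conditions/migraine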
search/engines/medical.py  ADDED  @@ -0,0 +1,195 @@
import requests
from bs4 import BeautifulSoup
import logging
from typing import List, Dict
import time

logger = logging.getLogger(__name__)

class MedicalSearchEngine:
    """Specialized medical search engine with curated sources"""

    def __init__(self, timeout: int = 15):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        })
        self.timeout = timeout

        # Curated medical sources
        self.medical_sources = {
            'mayo_clinic': {
                'base_url': 'https://www.mayoclinic.org',
                'search_url': 'https://www.mayoclinic.org/search/search-results',
                'domains': ['mayoclinic.org']
            },
            'webmd': {
                'base_url': 'https://www.webmd.com',
                'search_url': 'https://www.webmd.com/search/search_results/default.aspx',
                'domains': ['webmd.com']
            },
            'healthline': {
                'base_url': 'https://www.healthline.com',
                'search_url': 'https://www.healthline.com/search',
                'domains': ['healthline.com']
            },
            'medlineplus': {
                'base_url': 'https://medlineplus.gov',
                'search_url': 'https://medlineplus.gov/search',
                'domains': ['medlineplus.gov']
            },
            'nih': {
                'base_url': 'https://www.nih.gov',
                'search_url': 'https://search.nih.gov/search',
                'domains': ['nih.gov', 'nlm.nih.gov']
            }
        }

    def search(self, query: str, num_results: int = 10) -> List[Dict]:
        """Search medical sources for relevant information"""
        results = []

        # Strategy 1: Direct medical source searches
        for source_name, source_config in self.medical_sources.items():
            if len(results) >= num_results:
                break

            source_results = self._search_medical_source(query, source_name, source_config)
            results.extend(source_results)

            # Add delay between requests
            time.sleep(0.5)

        # Strategy 2: Medical fallback sources
        if len(results) < num_results:
            fallback_results = self._get_fallback_sources(query, num_results - len(results))
            results.extend(fallback_results)

        return results[:num_results]

    def _search_medical_source(self, query: str, source_name: str, source_config: Dict) -> List[Dict]:
        """Search a specific medical source"""
        try:
            search_url = source_config.get('search_url')
            if not search_url:
                return []

            params = {
                'q': query,
                'query': query,
                'search': query
            }

            response = self.session.get(search_url, params=params, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            results = []

            # Source-specific selectors
            selectors = self._get_source_selectors(source_name)

            links = []
            for selector in selectors:
                links = soup.select(selector)
                if links:
                    logger.info(f"{source_name} found {len(links)} results with selector: {selector}")
                    break

            for link in links[:3]:  # Limit per source
                try:
                    href = link.get('href')
                    if not href:
                        continue

                    # Make absolute URL
                    if href.startswith('/'):
                        href = source_config['base_url'] + href

                    title = link.get_text(strip=True)
                    if title and href.startswith('http'):
                        results.append({
                            'url': href,
                            'title': title,
                            'source': source_name,
                            'domain': source_config['domains'][0]
                        })
                except Exception as e:
                    logger.debug(f"Error parsing {source_name} link: {e}")
                    continue

            return results

        except Exception as e:
            logger.warning(f"Medical source {source_name} search failed: {e}")
            return []

    def _get_source_selectors(self, source_name: str) -> List[str]:
        """Get CSS selectors for specific medical sources"""
        selectors_map = {
            'mayo_clinic': [
                'a[href*="/diseases-conditions/"]',
                'a[href*="/symptoms/"]',
                '.search-result a',
                '.result-title a'
            ],
            'webmd': [
                'a[href*="/default.htm"]',
                '.search-result a',
                '.result-title a',
                'a[href*="/content/"]'
            ],
            'healthline': [
                'a[href*="/health/"]',
                '.search-result a',
                '.result-title a',
                'a[href*="/conditions/"]'
            ],
            'medlineplus': [
                'a[href*="/healthtopics/"]',
                '.search-result a',
                '.result-title a'
            ],
            'nih': [
                'a[href*="/health/"]',
                '.search-result a',
                '.result-title a'
            ]
        }
        return selectors_map.get(source_name, ['a[href*="http"]'])

    def _get_fallback_sources(self, query: str, num_results: int) -> List[Dict]:
        """Get fallback medical sources when direct search fails"""
        fallback_sources = [
            {
                'url': 'https://www.mayoclinic.org/diseases-conditions',
                'title': f'Mayo Clinic: {query}',
                'source': 'mayo_fallback',
                'domain': 'mayoclinic.org'
            },
            {
                'url': 'https://www.webmd.com/default.htm',
                'title': f'WebMD: {query}',
                'source': 'webmd_fallback',
                'domain': 'webmd.com'
            },
            {
                'url': 'https://www.healthline.com/health',
                'title': f'Healthline: {query}',
                'source': 'healthline_fallback',
                'domain': 'healthline.com'
            },
            {
                'url': 'https://medlineplus.gov/healthtopics.html',
                'title': f'MedlinePlus: {query}',
                'source': 'medlineplus_fallback',
                'domain': 'medlineplus.gov'
            },
            {
                'url': 'https://www.cdc.gov',
                'title': f'CDC: {query}',
                'source': 'cdc_fallback',
                'domain': 'cdc.gov'
            }
        ]

        return fallback_sources[:num_results]
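Note that the fallback entries above point at section landing pages rather than query-specific results, so `*_fallback` sources carry less evidential weight downstream. A quick usage sketch with a hypothetical query:

engine = MedicalSearchEngine(timeout=10)
for result in engine.search("type 2 diabetes diet", num_results=5):
    print(result['source'], result['title'], result['url'])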
search/engines/multilingual.py  ADDED  @@ -0,0 +1,320 @@
import requests
from bs4 import BeautifulSoup
import logging
from typing import List, Dict, Optional
import time
import re
from urllib.parse import urlparse, quote

logger = logging.getLogger(__name__)

class MultilingualMedicalEngine:
    """Multilingual medical search engine supporting English, Vietnamese, and Chinese sources"""

    def __init__(self, timeout: int = 15):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5,vi;q=0.3,zh-CN;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        })
        self.timeout = timeout

        # Comprehensive medical sources by language
        self.medical_sources = {
            'en': {
                'mayo_clinic': {
                    'base_url': 'https://www.mayoclinic.org',
                    'search_url': 'https://www.mayoclinic.org/search/search-results',
                    'domains': ['mayoclinic.org'],
                    'selectors': ['a[href*="/diseases-conditions/"]', 'a[href*="/symptoms/"]', '.search-result a']
                },
                'webmd': {
                    'base_url': 'https://www.webmd.com',
                    'search_url': 'https://www.webmd.com/search/search_results/default.aspx',
                    'domains': ['webmd.com'],
                    'selectors': ['a[href*="/default.htm"]', '.search-result a', 'a[href*="/content/"]']
                },
                'healthline': {
                    'base_url': 'https://www.healthline.com',
                    'search_url': 'https://www.healthline.com/search',
                    'domains': ['healthline.com'],
                    'selectors': ['a[href*="/health/"]', 'a[href*="/conditions/"]', '.search-result a']
                },
                'medlineplus': {
                    'base_url': 'https://medlineplus.gov',
                    'search_url': 'https://medlineplus.gov/search',
                    'domains': ['medlineplus.gov'],
                    'selectors': ['a[href*="/healthtopics/"]', '.search-result a']
                },
                'nih': {
                    'base_url': 'https://www.nih.gov',
                    'search_url': 'https://search.nih.gov/search',
                    'domains': ['nih.gov', 'nlm.nih.gov'],
                    'selectors': ['a[href*="/health/"]', '.search-result a']
                },
                'cdc': {
                    'base_url': 'https://www.cdc.gov',
                    'search_url': 'https://www.cdc.gov/search/index.html',
                    'domains': ['cdc.gov'],
                    'selectors': ['a[href*="/health/"]', '.search-result a']
                }
            },
            'vi': {
                'hello_bacsi': {
                    'base_url': 'https://hellobacsi.com',
                    'search_url': 'https://hellobacsi.com/tim-kiem',
                    'domains': ['hellobacsi.com'],
                    'selectors': ['a[href*="/suc-khoe/"]', 'a[href*="/benh/"]', '.search-result a', '.article-title a']
                },
                'alo_bacsi': {
                    'base_url': 'https://alobacsi.com',
                    'search_url': 'https://alobacsi.com/tim-kiem',
                    'domains': ['alobacsi.com'],
                    'selectors': ['a[href*="/suc-khoe/"]', 'a[href*="/benh/"]', '.search-result a']
                },
                'vinmec': {
                    'base_url': 'https://www.vinmec.com',
                    'search_url': 'https://www.vinmec.com/vi/tim-kiem',
                    'domains': ['vinmec.com'],
                    'selectors': ['a[href*="/suc-khoe/"]', 'a[href*="/benh/"]', '.search-result a']
                },
                'tam_anh': {
                    'base_url': 'https://tamanhhospital.vn',
                    'search_url': 'https://tamanhhospital.vn/tim-kiem',
                    'domains': ['tamanhhospital.vn'],
                    'selectors': ['a[href*="/suc-khoe/"]', 'a[href*="/benh/"]', '.search-result a']
                },
                'medlatec': {
                    'base_url': 'https://medlatec.vn',
                    'search_url': 'https://medlatec.vn/tim-kiem',
                    'domains': ['medlatec.vn'],
                    'selectors': ['a[href*="/suc-khoe/"]', 'a[href*="/benh/"]', '.search-result a']
                },
                'suckhoe_doisong': {
                    'base_url': 'https://suckhoedoisong.vn',
                    'search_url': 'https://suckhoedoisong.vn/tim-kiem',
                    'domains': ['suckhoedoisong.vn'],
                    'selectors': ['a[href*="/suc-khoe/"]', 'a[href*="/benh/"]', '.search-result a']
                },
                'vien_dinh_duong': {
                    'base_url': 'https://viendinhduong.vn',
                    'search_url': 'https://viendinhduong.vn/tim-kiem',
                    'domains': ['viendinhduong.vn'],
                    'selectors': ['a[href*="/dinh-duong/"]', 'a[href*="/suc-khoe/"]', '.search-result a']
                }
            },
            'zh': {
                'haodf': {
                    'base_url': 'https://www.haodf.com',
                    'search_url': 'https://www.haodf.com/search',
                    'domains': ['haodf.com'],
                    'selectors': ['a[href*="/jibing/"]', 'a[href*="/zixun/"]', '.search-result a']
                },
                'dxy': {
                    'base_url': 'https://www.dxy.cn',
                    'search_url': 'https://www.dxy.cn/search',
                    'domains': ['dxy.cn'],
                    'selectors': ['a[href*="/article/"]', 'a[href*="/jibing/"]', '.search-result a']
                },
                'chunyuyisheng': {
                    'base_url': 'https://www.chunyuyisheng.com',
                    'search_url': 'https://www.chunyuyisheng.com/search',
                    'domains': ['chunyuyisheng.com'],
                    'selectors': ['a[href*="/article/"]', 'a[href*="/jibing/"]', '.search-result a']
                },
                'xywy': {
                    'base_url': 'https://www.xywy.com',
                    'search_url': 'https://www.xywy.com/search',
                    'domains': ['xywy.com'],
                    'selectors': ['a[href*="/jibing/"]', 'a[href*="/article/"]', '.search-result a']
                },
                'jiankang': {
                    'base_url': 'https://www.jiankang.com',
                    'search_url': 'https://www.jiankang.com/search',
                    'domains': ['jiankang.com'],
                    'selectors': ['a[href*="/article/"]', 'a[href*="/jibing/"]', '.search-result a']
                },
                'familydoctor': {
                    'base_url': 'https://www.familydoctor.com.cn',
                    'search_url': 'https://www.familydoctor.com.cn/search',
                    'domains': ['familydoctor.com.cn'],
                    'selectors': ['a[href*="/article/"]', 'a[href*="/jibing/"]', '.search-result a']
                }
            }
        }

    def search(self, query: str, num_results: int = 10, languages: List[str] = None) -> List[Dict]:
        """Search across multiple languages and medical sources"""
        if languages is None:
            languages = ['en', 'vi', 'zh']

        all_results = []

        for lang in languages:
            if lang in self.medical_sources:
                # max(1, ...) keeps the per-language quota nonzero for small num_results
                lang_results = self._search_language_sources(query, lang, max(1, num_results // len(languages)))
                all_results.extend(lang_results)
                time.sleep(0.5)  # Be respectful to servers

        # Remove duplicates, then trim to the requested count
        unique_results = self._remove_duplicates(all_results)
        return unique_results[:num_results]

    def _search_language_sources(self, query: str, language: str, num_results: int) -> List[Dict]:
        """Search sources for a specific language"""
        results = []
        sources = self.medical_sources.get(language, {})

        for source_name, source_config in sources.items():
            if len(results) >= num_results:
                break

            source_results = self._search_source(query, source_name, source_config, language)
            results.extend(source_results)
            time.sleep(0.3)  # Rate limiting

        return results

    def _search_source(self, query: str, source_name: str, source_config: Dict, language: str) -> List[Dict]:
        """Search a specific medical source"""
        try:
            search_url = source_config.get('search_url')
            if not search_url:
                return []

            # Prepare search parameters based on language
            params = self._prepare_search_params(query, language)

            response = self.session.get(search_url, params=params, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')
            results = []

            # Try source-specific selectors
            selectors = source_config.get('selectors', ['a[href*="http"]'])

            links = []
            for selector in selectors:
                links = soup.select(selector)
                if links:
                    logger.info(f"{source_name} ({language}) found {len(links)} results with selector: {selector}")
                    break

            for link in links[:3]:  # Limit per source
                try:
                    href = link.get('href')
                    if not href:
                        continue

                    # Make absolute URL
                    if href.startswith('/'):
                        href = source_config['base_url'] + href

                    title = link.get_text(strip=True)
                    if title and href.startswith('http'):
                        results.append({
                            'url': href,
                            'title': title,
                            'source': source_name,
                            'language': language,
                            'domain': source_config['domains'][0]
                        })
                except Exception as e:
                    logger.debug(f"Error parsing {source_name} link: {e}")
                    continue

            return results

        except Exception as e:
            logger.warning(f"Medical source {source_name} ({language}) search failed: {e}")
            return []

    def _prepare_search_params(self, query: str, language: str) -> Dict[str, str]:
        """Prepare search parameters based on language"""
        # Common parameter names across different languages
        param_mappings = {
            'en': {'q': query, 'query': query, 'search': query},
            'vi': {'q': query, 'query': query, 'search': query, 'tu-khoa': query, 'tim-kiem': query},
            'zh': {'q': query, 'query': query, 'search': query, 'keyword': query, 'sousuo': query}
        }

        return param_mappings.get(language, {'q': query})

    def _remove_duplicates(self, results: List[Dict]) -> List[Dict]:
        """Remove duplicate results based on URL"""
        seen_urls = set()
        unique_results = []

        for result in results:
            url = result.get('url', '')
            if url and url not in seen_urls:
                seen_urls.add(url)
                unique_results.append(result)

        return unique_results

    def search_by_language(self, query: str, language: str, num_results: int = 10) -> List[Dict]:
        """Search sources for a specific language only"""
        if language not in self.medical_sources:
            logger.warning(f"Language {language} not supported")
            return []

        return self._search_language_sources(query, language, num_results)

    def get_fallback_sources(self, query: str, language: str, num_results: int) -> List[Dict]:
        """Get fallback sources when direct search fails"""
        fallback_sources = {
            'en': [
                {
                    'url': 'https://www.mayoclinic.org/diseases-conditions',
                    'title': f'Mayo Clinic: {query}',
                    'source': 'mayo_fallback',
                    'language': 'en',
                    'domain': 'mayoclinic.org'
                },
                {
                    'url': 'https://www.webmd.com/default.htm',
                    'title': f'WebMD: {query}',
                    'source': 'webmd_fallback',
                    'language': 'en',
                    'domain': 'webmd.com'
                }
            ],
            'vi': [
                {
                    'url': 'https://hellobacsi.com/suc-khoe',
                    'title': f'Hello Bacsi: {query}',
                    'source': 'hello_bacsi_fallback',
                    'language': 'vi',
                    'domain': 'hellobacsi.com'
                },
                {
                    'url': 'https://www.vinmec.com/vi/suc-khoe',
                    'title': f'Vinmec: {query}',
                    'source': 'vinmec_fallback',
                    'language': 'vi',
                    'domain': 'vinmec.com'
                }
            ],
            'zh': [
                {
                    'url': 'https://www.haodf.com/jibing',
                    'title': f'好大夫在线: {query}',
                    'source': 'haodf_fallback',
                    'language': 'zh',
                    'domain': 'haodf.com'
                },
                {
                    'url': 'https://www.dxy.cn/article',
                    'title': f'丁香园: {query}',
                    'source': 'dxy_fallback',
                    'language': 'zh',
                    'domain': 'dxy.cn'
                }
            ]
        }

        return fallback_sources.get(language, [])[:num_results]
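A usage sketch for the language-scoped entry point, using a hypothetical Vietnamese query ("đau nửa đầu" means migraine); each result dict carries the `language` and `domain` fields set in `_search_source`:

engine = MultilingualMedicalEngine()
for r in engine.search_by_language("đau nửa đầu", "vi", num_results=5):
    print(r['language'], r['domain'], r['title'])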
search/extractors/__init__.py  ADDED  @@ -0,0 +1,3 @@
from .content import ContentExtractor

__all__ = ['ContentExtractor']
search/extractors/content.py  ADDED  @@ -0,0 +1,211 @@
import requests
from bs4 import BeautifulSoup
import logging
from typing import Dict, Optional
import re
from urllib.parse import urlparse
import time

logger = logging.getLogger(__name__)

class ContentExtractor:
    """Extract and clean content from web pages"""

    def __init__(self, timeout: int = 15):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        })
        self.timeout = timeout

        # Medical content indicators
        self.medical_indicators = [
            'symptom', 'treatment', 'diagnosis', 'medicine', 'medication',
            'therapy', 'condition', 'disease', 'health', 'medical',
            'doctor', 'physician', 'patient', 'clinical', 'study'
        ]

    def extract(self, url: str, max_length: int = 2000) -> Optional[str]:
        """Extract content from a URL with medical focus"""
        try:
            response = self.session.get(url, timeout=self.timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, 'html.parser')

            # Remove unwanted elements
            self._remove_unwanted_elements(soup)

            # Extract main content
            content = self._extract_main_content(soup)

            if not content:
                return None

            # Clean and process content
            cleaned_content = self._clean_content(content)

            # Focus on medical content if possible
            medical_content = self._extract_medical_content(cleaned_content)

            # Truncate to max length
            final_content = self._truncate_content(medical_content or cleaned_content, max_length)

            return final_content if final_content else None

        except Exception as e:
            logger.warning(f"Content extraction failed for {url}: {e}")
            return None

    def _remove_unwanted_elements(self, soup: BeautifulSoup):
        """Remove unwanted HTML elements"""
        unwanted_tags = [
            'script', 'style', 'nav', 'header', 'footer', 'aside',
            'advertisement', 'ads', 'sidebar', 'menu', 'navigation',
            'social', 'share', 'comment', 'comments', 'related',
            'cookie', 'privacy', 'terms', 'disclaimer'
        ]

        for tag in unwanted_tags:
            for element in soup.find_all(tag):
                element.decompose()

        # Remove elements with unwanted classes/ids
        unwanted_selectors = [
            '[class*="ad"]', '[class*="advertisement"]', '[class*="sidebar"]',
            '[class*="menu"]', '[class*="nav"]', '[class*="social"]',
            '[class*="share"]', '[class*="comment"]', '[class*="related"]',
            '[id*="ad"]', '[id*="sidebar"]', '[id*="menu"]', '[id*="nav"]'
        ]

        for selector in unwanted_selectors:
            for element in soup.select(selector):
                element.decompose()

    def _extract_main_content(self, soup: BeautifulSoup) -> str:
        """Extract main content from the page"""
        # Priority order for content extraction
        content_selectors = [
            'article',
            'main',
            '[role="main"]',
            '.content',
            '.main-content',
            '.article-content',
            '.post-content',
            '.entry-content',
            '.page-content',
            'body'
        ]

        for selector in content_selectors:
            elements = soup.select(selector)
            if elements:
                # Get the largest content element
                largest_element = max(elements, key=lambda x: len(x.get_text()))
                content = largest_element.get_text(separator=' ', strip=True)
                if len(content) > 100:  # Minimum content length
                    return content

        # Fallback: get all text
        return soup.get_text(separator=' ', strip=True)

    def _clean_content(self, content: str) -> str:
        """Clean and normalize content"""
        if not content:
            return ""

        # Remove excessive whitespace
        content = re.sub(r'\s+', ' ', content)

        # Remove common web artifacts
        artifacts = [
            r'Cookie\s+Policy',
            r'Privacy\s+Policy',
            r'Terms\s+of\s+Service',
            r'Subscribe\s+to\s+our\s+newsletter',
            r'Follow\s+us\s+on',
            r'Share\s+this\s+article',
            r'Related\s+articles',
            r'Advertisement',
            r'Ad\s+content'
        ]

        for artifact in artifacts:
            content = re.sub(artifact, '', content, flags=re.IGNORECASE)

        # Remove excessive punctuation
        content = re.sub(r'[.]{3,}', '...', content)
        content = re.sub(r'[!]{2,}', '!', content)
        content = re.sub(r'[?]{2,}', '?', content)

        return content.strip()

    def _extract_medical_content(self, content: str) -> Optional[str]:
        """Extract medical-focused content from the text"""
        if not content:
            return None

        # Split content into sentences
        sentences = re.split(r'[.!?]+', content)
        medical_sentences = []

        for sentence in sentences:
            sentence = sentence.strip()
            if len(sentence) < 20:  # Skip very short sentences
                continue

            # Check if sentence contains medical indicators
            sentence_lower = sentence.lower()
            if any(indicator in sentence_lower for indicator in self.medical_indicators):
                medical_sentences.append(sentence)

        if medical_sentences:
            # Return medical sentences, prioritizing longer ones
            medical_sentences.sort(key=len, reverse=True)
            return '. '.join(medical_sentences[:10]) + '.'

        return None

    def _truncate_content(self, content: str, max_length: int) -> str:
        """Truncate content to max length while preserving sentences"""
        if len(content) <= max_length:
            return content

        # Try to truncate at sentence boundary
        truncated = content[:max_length]
        last_period = truncated.rfind('.')
        last_exclamation = truncated.rfind('!')
        last_question = truncated.rfind('?')

        last_sentence_end = max(last_period, last_exclamation, last_question)

        if last_sentence_end > max_length * 0.7:  # If we can find a good break point
            return content[:last_sentence_end + 1]

        # Fallback: truncate at word boundary
        words = truncated.split()
        if len(words) > 1:
            return ' '.join(words[:-1]) + '...'

        return truncated + '...'

    def extract_multiple(self, urls: list, max_length: int = 2000) -> Dict[str, str]:
        """Extract content from multiple URLs"""
        results = {}

        for url in urls:
            try:
                content = self.extract(url, max_length)
                if content:
                    results[url] = content
                time.sleep(0.5)  # Be respectful to servers
            except Exception as e:
                logger.warning(f"Failed to extract content from {url}: {e}")
                continue

        return results
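A usage sketch for the extractor (illustrative URL; `extract` returns None when the request fails or no usable text survives cleaning):

extractor = ContentExtractor(timeout=10)
text = extractor.extract('https://medlineplus.gov/migraine.html', max_length=500)
if text:
    print(text)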
search/processors/__init__.py  ADDED  @@ -0,0 +1,4 @@
from .medical import MedicalSearchProcessor
from .language import LanguageProcessor

__all__ = ['MedicalSearchProcessor', 'LanguageProcessor']
search/processors/language.py  ADDED  @@ -0,0 +1,266 @@
import re
import logging
from typing import List, Dict, Tuple, Optional
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

logger = logging.getLogger(__name__)

# Set seed for consistent language detection
DetectorFactory.seed = 0

class LanguageProcessor:
    """Process and enhance queries for multilingual medical search"""

    def __init__(self):
        # Medical keywords in different languages
        self.medical_keywords = {
            'en': [
                'symptom', 'symptoms', 'pain', 'headache', 'migraine', 'fever', 'cough',
                'treatment', 'treatments', 'medicine', 'medication', 'drug', 'therapy',
                'diagnosis', 'diagnose', 'condition', 'disease', 'disorder', 'syndrome',
                'doctor', 'physician', 'medical', 'health', 'clinical', 'patient',
                'blood pressure', 'heart', 'lung', 'stomach', 'back', 'neck', 'chest',
                'allergy', 'allergies', 'infection', 'inflammation', 'swelling', 'rash',
                'sleep', 'insomnia', 'anxiety', 'depression', 'stress', 'mental health',
                'pregnancy', 'baby', 'child', 'elderly', 'senior', 'age', 'covid',
                'vaccine', 'immunization', 'surgery', 'operation', 'hospital', 'clinic'
            ],
            'vi': [
                'triệu chứng', 'đau', 'đau đầu', 'đau nửa đầu', 'sốt', 'ho',
                'điều trị', 'thuốc', 'dược phẩm', 'liệu pháp', 'chẩn đoán',
                'bệnh', 'tình trạng', 'rối loạn', 'hội chứng', 'bác sĩ', 'y tế',
                'sức khỏe', 'lâm sàng', 'bệnh nhân', 'huyết áp', 'tim', 'phổi',
                'dạ dày', 'lưng', 'cổ', 'ngực', 'dị ứng', 'nhiễm trùng',
                'viêm', 'sưng', 'phát ban', 'ngủ', 'mất ngủ', 'lo âu',
                'trầm cảm', 'căng thẳng', 'sức khỏe tâm thần', 'mang thai',
                'em bé', 'trẻ em', 'người già', 'tuổi tác', 'covid', 'vaccine',
                'tiêm chủng', 'phẫu thuật', 'bệnh viện', 'phòng khám'
            ],
            'zh': [
                '症状', '疼痛', '头痛', '偏头痛', '发烧', '咳嗽', '治疗', '药物',
                '药品', '疗法', '诊断', '疾病', '状况', '紊乱', '综合征', '医生',
                '医疗', '健康', '临床', '患者', '血压', '心脏', '肺', '胃',
                '背部', '颈部', '胸部', '过敏', '感染', '炎症', '肿胀', '皮疹',
                '睡眠', '失眠', '焦虑', '抑郁', '压力', '心理健康', '怀孕',
                '婴儿', '儿童', '老年人', '年龄', '新冠', '疫苗', '免疫',
                '手术', '医院', '诊所'
            ]
        }

        # Language-specific search enhancements
        self.language_enhancements = {
            'vi': {
                'common_terms': ['là gì', 'nguyên nhân', 'cách điều trị', 'triệu chứng'],
                'medical_context': ['y tế', 'sức khỏe', 'bệnh viện', 'bác sĩ']
            },
            'zh': {
                'common_terms': ['是什么', '原因', '治疗方法', '症状'],
                'medical_context': ['医疗', '健康', '医院', '医生']
            },
            'en': {
                'common_terms': ['what is', 'causes', 'treatment', 'symptoms'],
                'medical_context': ['medical', 'health', 'hospital', 'doctor']
            }
        }

    def detect_language(self, text: str) -> str:
        """Detect the language of the input text"""
        if not text or not text.strip():
            return 'en'  # Default to English

        try:
            # Clean text for better detection
            cleaned_text = re.sub(r'[^\w\s]', ' ', text)
            cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

            if len(cleaned_text) < 3:
                return 'en'

            detected = detect(cleaned_text)

            # Map detected language to our supported languages
            language_mapping = {
                'vi': 'vi',     # Vietnamese
                'zh-cn': 'zh',  # Chinese Simplified
                'zh-tw': 'zh',  # Chinese Traditional
                'zh': 'zh',     # Chinese
                'en': 'en'      # English
            }

            return language_mapping.get(detected, 'en')

        except LangDetectException as e:
            logger.warning(f"Language detection failed: {e}")
            return 'en'

    def enhance_query(self, query: str, target_language: str = None) -> Dict[str, str]:
+
"""Enhance query for better search results in multiple languages"""
|
| 99 |
+
if not query or not query.strip():
|
| 100 |
+
return {}
|
| 101 |
+
|
| 102 |
+
# Detect source language
|
| 103 |
+
source_language = self.detect_language(query)
|
| 104 |
+
|
| 105 |
+
# If target language not specified, use source language
|
| 106 |
+
if target_language is None:
|
| 107 |
+
target_language = source_language
|
| 108 |
+
|
| 109 |
+
enhanced_queries = {}
|
| 110 |
+
|
| 111 |
+
# Original query
|
| 112 |
+
enhanced_queries[source_language] = query
|
| 113 |
+
|
| 114 |
+
# Enhance for source language
|
| 115 |
+
if source_language in self.language_enhancements:
|
| 116 |
+
enhanced_queries[source_language] = self._enhance_for_language(
|
| 117 |
+
query, source_language
|
| 118 |
+
)
|
| 119 |
+
|
| 120 |
+
# Create translations for other languages if needed
|
| 121 |
+
if target_language != source_language:
|
| 122 |
+
enhanced_queries[target_language] = self._translate_query(
|
| 123 |
+
query, source_language, target_language
|
| 124 |
+
)
|
| 125 |
+
|
| 126 |
+
# Add English version for comprehensive search
|
| 127 |
+
if 'en' not in enhanced_queries:
|
| 128 |
+
if source_language != 'en':
|
| 129 |
+
enhanced_queries['en'] = self._translate_query(query, source_language, 'en')
|
| 130 |
+
else:
|
| 131 |
+
enhanced_queries['en'] = query
|
| 132 |
+
|
| 133 |
+
return enhanced_queries
|
| 134 |
+
|
| 135 |
+
def _enhance_for_language(self, query: str, language: str) -> str:
|
| 136 |
+
"""Enhance query for a specific language"""
|
| 137 |
+
enhancements = self.language_enhancements.get(language, {})
|
| 138 |
+
common_terms = enhancements.get('common_terms', [])
|
| 139 |
+
medical_context = enhancements.get('medical_context', [])
|
| 140 |
+
|
| 141 |
+
# Check if query already contains medical context
|
| 142 |
+
query_lower = query.lower()
|
| 143 |
+
has_medical_context = any(term in query_lower for term in medical_context)
|
| 144 |
+
|
| 145 |
+
# If no medical context, add it
|
| 146 |
+
if not has_medical_context and medical_context:
|
| 147 |
+
# Add the most relevant medical context term
|
| 148 |
+
query += f" {medical_context[0]}"
|
| 149 |
+
|
| 150 |
+
# Check if query is a question and add relevant terms
|
| 151 |
+
if any(term in query_lower for term in ['là gì', '是什么', 'what is', 'how', 'tại sao', '为什么', 'why']):
|
| 152 |
+
if common_terms:
|
| 153 |
+
query += f" {common_terms[0]}" # Add "causes" or equivalent
|
| 154 |
+
|
| 155 |
+
return query.strip()
|
| 156 |
+
|
| 157 |
+
def _translate_query(self, query: str, source_lang: str, target_lang: str) -> str:
|
| 158 |
+
"""Simple keyword-based translation for medical terms"""
|
| 159 |
+
# This is a basic implementation - in production, you'd use a proper translation service
|
| 160 |
+
|
| 161 |
+
# Medical term translations
|
| 162 |
+
translations = {
|
| 163 |
+
('vi', 'en'): {
|
| 164 |
+
'triệu chứng': 'symptoms',
|
| 165 |
+
'đau': 'pain',
|
| 166 |
+
'đau đầu': 'headache',
|
| 167 |
+
'sốt': 'fever',
|
| 168 |
+
'ho': 'cough',
|
| 169 |
+
'điều trị': 'treatment',
|
| 170 |
+
'thuốc': 'medicine',
|
| 171 |
+
'bệnh': 'disease',
|
| 172 |
+
'bác sĩ': 'doctor',
|
| 173 |
+
'sức khỏe': 'health',
|
| 174 |
+
'bệnh viện': 'hospital'
|
| 175 |
+
},
|
| 176 |
+
('zh', 'en'): {
|
| 177 |
+
'症状': 'symptoms',
|
| 178 |
+
'疼痛': 'pain',
|
| 179 |
+
'头痛': 'headache',
|
| 180 |
+
'发烧': 'fever',
|
| 181 |
+
'咳嗽': 'cough',
|
| 182 |
+
'治疗': 'treatment',
|
| 183 |
+
'药物': 'medicine',
|
| 184 |
+
'疾病': 'disease',
|
| 185 |
+
'医生': 'doctor',
|
| 186 |
+
'健康': 'health',
|
| 187 |
+
'医院': 'hospital'
|
| 188 |
+
},
|
| 189 |
+
('en', 'vi'): {
|
| 190 |
+
'symptoms': 'triệu chứng',
|
| 191 |
+
'pain': 'đau',
|
| 192 |
+
'headache': 'đau đầu',
|
| 193 |
+
'fever': 'sốt',
|
| 194 |
+
'cough': 'ho',
|
| 195 |
+
'treatment': 'điều trị',
|
| 196 |
+
'medicine': 'thuốc',
|
| 197 |
+
'disease': 'bệnh',
|
| 198 |
+
'doctor': 'bác sĩ',
|
| 199 |
+
'health': 'sức khỏe',
|
| 200 |
+
'hospital': 'bệnh viện'
|
| 201 |
+
},
|
| 202 |
+
('en', 'zh'): {
|
| 203 |
+
'symptoms': '症状',
|
| 204 |
+
'pain': '疼痛',
|
| 205 |
+
'headache': '头痛',
|
| 206 |
+
'fever': '发烧',
|
| 207 |
+
'cough': '咳嗽',
|
| 208 |
+
'treatment': '治疗',
|
| 209 |
+
'medicine': '药物',
|
| 210 |
+
'disease': '疾病',
|
| 211 |
+
'doctor': '医生',
|
| 212 |
+
'health': '健康',
|
| 213 |
+
'hospital': '医院'
|
| 214 |
+
}
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
translation_map = translations.get((source_lang, target_lang), {})
|
| 218 |
+
|
| 219 |
+
# Simple word-by-word translation
|
| 220 |
+
translated_query = query
|
| 221 |
+
for source_term, target_term in translation_map.items():
|
| 222 |
+
translated_query = translated_query.replace(source_term, target_term)
|
| 223 |
+
|
| 224 |
+
return translated_query
|
| 225 |
+
|
| 226 |
+
def get_medical_relevance_score(self, text: str, language: str) -> float:
|
| 227 |
+
"""Calculate medical relevance score for text in a specific language"""
|
| 228 |
+
if not text:
|
| 229 |
+
return 0.0
|
| 230 |
+
|
| 231 |
+
keywords = self.medical_keywords.get(language, [])
|
| 232 |
+
if not keywords:
|
| 233 |
+
return 0.0
|
| 234 |
+
|
| 235 |
+
text_lower = text.lower()
|
| 236 |
+
matches = sum(1 for keyword in keywords if keyword in text_lower)
|
| 237 |
+
|
| 238 |
+
# Normalize by text length and keyword count
|
| 239 |
+
score = matches / max(len(keywords), 1)
|
| 240 |
+
|
| 241 |
+
# Boost score for longer matches
|
| 242 |
+
if matches > 0:
|
| 243 |
+
score *= (1 + matches * 0.1)
|
| 244 |
+
|
| 245 |
+
return min(score, 1.0)
|
| 246 |
+
|
| 247 |
+
def filter_by_language(self, results: List[Dict], target_language: str) -> List[Dict]:
|
| 248 |
+
"""Filter results by language preference"""
|
| 249 |
+
if not results:
|
| 250 |
+
return results
|
| 251 |
+
|
| 252 |
+
# Score results by language match
|
| 253 |
+
scored_results = []
|
| 254 |
+
for result in results:
|
| 255 |
+
result_language = result.get('language', 'en')
|
| 256 |
+
language_score = 1.0 if result_language == target_language else 0.5
|
| 257 |
+
|
| 258 |
+
# Add language score to result
|
| 259 |
+
result_copy = result.copy()
|
| 260 |
+
result_copy['language_score'] = language_score
|
| 261 |
+
scored_results.append(result_copy)
|
| 262 |
+
|
| 263 |
+
# Sort by language score (prefer target language)
|
| 264 |
+
scored_results.sort(key=lambda x: x.get('language_score', 0), reverse=True)
|
| 265 |
+
|
| 266 |
+
return scored_results
|
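
Usage sketch (illustrative, not part of this commit): exercising detection, query enhancement, and relevance scoring, assuming langdetect is installed. Outputs are indicative only.

# Hypothetical caller for LanguageProcessor
from search.processors.language import LanguageProcessor

lp = LanguageProcessor()
print(lp.detect_language('đau đầu kéo dài'))   # expected to map to 'vi'
print(lp.enhance_query('what is a migraine'))  # appends medical context plus 'causes'
print(lp.get_medical_relevance_score('fever and cough treatment', 'en'))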
search/processors/medical.py
ADDED
@@ -0,0 +1,215 @@
import logging
from typing import List, Dict, Tuple
from models.summarizer import summarizer

logger = logging.getLogger(__name__)

class MedicalSearchProcessor:
    """Process and enhance medical search results"""

    def __init__(self):
        self.medical_keywords = [
            'symptom', 'symptoms', 'pain', 'headache', 'migraine', 'fever', 'cough',
            'treatment', 'treatments', 'medicine', 'medication', 'drug', 'therapy',
            'diagnosis', 'diagnose', 'condition', 'disease', 'disorder', 'syndrome',
            'doctor', 'physician', 'medical', 'health', 'clinical', 'patient',
            'blood pressure', 'heart', 'lung', 'stomach', 'back', 'neck', 'chest',
            'allergy', 'allergies', 'infection', 'inflammation', 'swelling', 'rash',
            'sleep', 'insomnia', 'anxiety', 'depression', 'stress', 'mental health',
            'pregnancy', 'baby', 'child', 'elderly', 'senior', 'age', 'covid',
            'vaccine', 'immunization', 'surgery', 'operation', 'hospital', 'clinic'
        ]

    def process_results(self, results: List[Dict], user_query: str) -> Tuple[str, Dict[int, str]]:
        """Process search results and create a comprehensive medical summary"""
        if not results:
            return "", {}

        # Filter and rank results by medical relevance
        relevant_results = self._filter_medical_results(results, user_query)

        if not relevant_results:
            logger.warning("No medically relevant results found")
            return "", {}

        # Extract and summarize content
        summarized_results = self._summarize_results(relevant_results, user_query)

        # Create a comprehensive summary
        combined_summary = self._create_combined_summary(summarized_results, user_query)

        # Create a URL mapping for citations
        url_mapping = self._create_url_mapping(relevant_results)

        return combined_summary, url_mapping

    def _filter_medical_results(self, results: List[Dict], user_query: str) -> List[Dict]:
        """Filter results by medical relevance"""
        relevant_results = []

        for result in results:
            relevance_score = self._calculate_relevance_score(result, user_query)

            if relevance_score > 0.3:  # Threshold for medical relevance
                result['relevance_score'] = relevance_score
                relevant_results.append(result)

        # Sort by relevance score, highest first
        relevant_results.sort(key=lambda x: x.get('relevance_score', 0), reverse=True)

        # Keep only the top results
        return relevant_results[:10]

    def _calculate_relevance_score(self, result: Dict, user_query: str) -> float:
        """Calculate a medical relevance score for a single result"""
        score = 0.0

        # Title relevance
        title = result.get('title', '').lower()
        query_lower = user_query.lower()

        # Direct query-word match in the title
        if any(word in title for word in query_lower.split()):
            score += 0.4

        # Medical keyword matches in the title
        medical_matches = sum(1 for keyword in self.medical_keywords if keyword in title)
        score += min(medical_matches * 0.1, 0.3)

        # Domain credibility
        url = result.get('url', '').lower()
        credible_domains = [
            'mayoclinic.org', 'webmd.com', 'healthline.com', 'medlineplus.gov',
            'nih.gov', 'cdc.gov', 'who.int', 'pubmed.ncbi.nlm.nih.gov',
            'uptodate.com', 'merckmanuals.com', 'medscape.com'
        ]

        if any(domain in url for domain in credible_domains):
            score += 0.3

        # Source type bonus
        source = result.get('source', '')
        if 'medical' in source or any(domain in source for domain in credible_domains):
            score += 0.2

        return min(score, 1.0)

    def _summarize_results(self, results: List[Dict], user_query: str) -> List[Dict]:
        """Summarize content from search results"""
        summarized_results = []

        for i, result in enumerate(results):
            try:
                content = result.get('content', '')
                if not content:
                    continue

                # Create a query-focused summary
                summary = summarizer.summarize_for_query(content, user_query, max_length=300)

                if summary:
                    summarized_results.append({
                        'id': i + 1,
                        'url': result['url'],
                        'title': result['title'],
                        'summary': summary,
                        'relevance_score': result.get('relevance_score', 0)
                    })

            except Exception as e:
                logger.warning(f"Failed to summarize result {i}: {e}")
                continue

        return summarized_results

    def _create_combined_summary(self, summarized_results: List[Dict], user_query: str) -> str:
        """Create a comprehensive summary from all results"""
        if not summarized_results:
            return ""

        # Group results by topic
        topic_groups = self._group_by_topic(summarized_results)

        summary_parts = []

        for topic, results in topic_groups.items():
            if not results:
                continue

            # Summarize each topic group
            topic_summary = self._create_topic_summary(topic, results, user_query)
            if topic_summary:
                summary_parts.append(topic_summary)

        # Combine all parts
        combined_summary = "\n\n".join(summary_parts)

        # Final summarization pass to keep the result concise
        if len(combined_summary) > 1500:
            combined_summary = summarizer.summarize_text(combined_summary, max_length=1500)

        return combined_summary

    def _group_by_topic(self, results: List[Dict]) -> Dict[str, List[Dict]]:
        """Group results by medical topic"""
        topics = {
            'symptoms': [],
            'treatments': [],
            'diagnosis': [],
            'general': []
        }

        for result in results:
            title_lower = result['title'].lower()
            summary_lower = result.get('summary', '').lower()
            content_lower = f"{title_lower} {summary_lower}"

            # Categorize by content
            if any(word in content_lower for word in ['symptom', 'sign', 'pain', 'ache']):
                topics['symptoms'].append(result)
            elif any(word in content_lower for word in ['treatment', 'therapy', 'medicine', 'medication']):
                topics['treatments'].append(result)
            elif any(word in content_lower for word in ['diagnosis', 'test', 'examination', 'evaluation']):
                topics['diagnosis'].append(result)
            else:
                topics['general'].append(result)

        return topics

    def _create_topic_summary(self, topic: str, results: List[Dict], user_query: str) -> str:
        """Create a summary for a specific topic"""
        if not results:
            return ""

        # Combine the summaries for this topic
        combined_text = " ".join([r.get('summary', '') for r in results]).strip()

        if not combined_text:
            return ""

        # Create a query-focused summary for this topic
        topic_summary = summarizer.summarize_for_query(combined_text, user_query, max_length=400)

        if topic_summary:
            # Prepend a topic header
            topic_headers = {
                'symptoms': "**Symptoms and Signs:**",
                'treatments': "**Treatment Options:**",
                'diagnosis': "**Diagnosis and Testing:**",
                'general': "**General Information:**"
            }

            header = topic_headers.get(topic, "**Information:**")
            return f"{header}\n{topic_summary}"

        return ""

    def _create_url_mapping(self, results: List[Dict]) -> Dict[int, str]:
        """Create a URL mapping for citations"""
        url_mapping = {}

        for i, result in enumerate(results):
            url_mapping[i + 1] = result['url']

        return url_mapping
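
Usage sketch (illustrative, not part of this commit): probing the relevance heuristic with a fabricated result. The private scorer is called directly only to show how the weights combine, and importing this module assumes models.summarizer resolves in this repository layout.

# Hypothetical caller for MedicalSearchProcessor's scoring heuristic
from search.processors.medical import MedicalSearchProcessor

proc = MedicalSearchProcessor()
fake_result = {
    'title': 'Migraine symptoms and treatment',
    'url': 'https://www.mayoclinic.org/diseases-conditions/migraine',
    'source': 'medical',
}
score = proc._calculate_relevance_score(fake_result, 'migraine treatment')
print(score)  # title match + keyword hits + credible domain + source bonus, capped at 1.0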
search/search.py
ADDED
@@ -0,0 +1,106 @@
import logging
from typing import List, Dict, Tuple, Optional
from .coordinator import SearchCoordinator

logger = logging.getLogger(__name__)

# Global search coordinator instance
_search_coordinator = None

def get_search_coordinator() -> SearchCoordinator:
    """Get or create the global search coordinator instance"""
    global _search_coordinator
    if _search_coordinator is None:
        _search_coordinator = SearchCoordinator()
    return _search_coordinator

class WebSearcher:
    """Legacy wrapper kept for backward compatibility"""
    def __init__(self):
        self.coordinator = get_search_coordinator()
        self.max_results = 10
        self.timeout = 10

    def search_google(self, query: str, num_results: int = 10) -> List[Dict]:
        """Search using the new coordinator system"""
        try:
            return self.coordinator.quick_search(query, num_results)
        except Exception as e:
            logger.error(f"Search failed: {e}")
            return []

    def search_duckduckgo(self, query: str, num_results: int = 10) -> List[Dict]:
        """Search using the DuckDuckGo engine"""
        try:
            return self.coordinator.quick_search(query, num_results)
        except Exception as e:
            logger.error(f"DuckDuckGo search failed: {e}")
            return []

    def extract_content(self, url: str) -> str:
        """Extract content using the new content extractor"""
        try:
            return self.coordinator.content_extractor.extract(url)
        except Exception as e:
            logger.error(f"Content extraction failed: {e}")
            return ""

    def search_and_extract(self, query: str, num_results: int = 10) -> List[Dict]:
        """Search and extract content using the new system"""
        try:
            # Get search results
            results = self.coordinator.quick_search(query, num_results)

            # Extract content for each result
            enriched_results = []
            for result in results:
                content = self.extract_content(result['url'])
                if content:
                    enriched_result = result.copy()
                    enriched_result['content'] = content
                    enriched_results.append(enriched_result)

            return enriched_results
        except Exception as e:
            logger.error(f"Search and extract failed: {e}")
            return []

# Main search function for backward compatibility
def search_web(query: str, num_results: int = 10) -> List[Dict]:
    """Main search function using the new coordinator system"""
    try:
        coordinator = get_search_coordinator()
        return coordinator.quick_search(query, num_results)
    except Exception as e:
        logger.error(f"Web search failed: {e}")
        return []

# Enhanced search with content extraction
def search_web_with_content(query: str, num_results: int = 10) -> Tuple[str, Dict[int, str]]:
    """Enhanced search with content extraction and summarization"""
    try:
        coordinator = get_search_coordinator()
        return coordinator.search(query, num_results)
    except Exception as e:
        logger.error(f"Enhanced web search failed: {e}")
        return "", {}

# Medical-focused search
def search_medical(query: str, num_results: int = 8) -> Tuple[str, Dict[int, str]]:
    """Medical-focused search with enhanced processing"""
    try:
        coordinator = get_search_coordinator()
        return coordinator.medical_focus_search(query, num_results)
    except Exception as e:
        logger.error(f"Medical search failed: {e}")
        return "", {}

# Multilingual medical search
def search_multilingual_medical(query: str, num_results: int = 10, target_language: Optional[str] = None) -> Tuple[str, Dict[int, str]]:
    """Comprehensive multilingual medical search supporting English, Vietnamese, and Chinese"""
    try:
        coordinator = get_search_coordinator()
        return coordinator.multilingual_medical_search(query, num_results, target_language)
    except Exception as e:
        logger.error(f"Multilingual medical search failed: {e}")
        return "", {}
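
Usage sketch (illustrative, not part of this commit): the module-level entry points, assuming the package is importable and the coordinator's engines have network access.

# Hypothetical caller for the public search API
from search import search_web, search_medical, search_multilingual_medical

results = search_web('migraine treatment', num_results=5)
for r in results:
    print(r.get('title'), r.get('url'))

summary, citations = search_medical('persistent headache causes')
print(summary)    # topic-grouped summary text
print(citations)  # {1: url, 2: url, ...} for citation markers

summary, citations = search_multilingual_medical('đau nửa đầu', target_language='en')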