BinKhoaLe1812 committed
Commit 1b4459c · verified · 1 Parent(s): 2600d9f

Delete search

Files changed (2)
  1. search/__init__.py +0 -2
  2. search/search.py +0 -304
search/__init__.py DELETED
@@ -1,2 +0,0 @@
- # Search package
- from .search import WebSearcher, search_web

search/search.py DELETED
@@ -1,304 +0,0 @@
- import requests
- from bs4 import BeautifulSoup
- import re
- from urllib.parse import urljoin, urlparse
- import time
- import logging
- from typing import List, Dict, Tuple, Set
- from models import summarizer
- import os
-
- logger = logging.getLogger(__name__)
-
- class WebSearcher:
-     def __init__(self):
-         self.session = requests.Session()
-         self.session.headers.update({
-             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
-         })
-         self.max_results = 10
-         self.timeout = 10
-
-     def search_google(self, query: str, num_results: int = 10) -> List[Dict]:
-         """Search Google and return results with URLs and titles"""
-         try:
-             # Use DuckDuckGo as it's more reliable for scraping
-             return self.search_duckduckgo(query, num_results)
-         except Exception as e:
-             logger.error(f"Google search failed: {e}")
-             return []
-
-     def search_duckduckgo(self, query: str, num_results: int = 10) -> List[Dict]:
-         """Search DuckDuckGo with multiple fallback strategies"""
-         results = []
-
-         # Strategy 1: Try DuckDuckGo HTML interface
-         try:
-             url = "https://html.duckduckgo.com/html/"
-             params = {
-                 'q': query,
-                 'kl': 'us-en'
-             }
-
-             response = self.session.get(url, params=params, timeout=self.timeout)
-             response.raise_for_status()
-
-             soup = BeautifulSoup(response.content, 'html.parser')
-
-             # Try multiple selectors for result links
-             selectors = [
-                 'a.result__a',
-                 'a[data-testid="result-title-a"]',
-                 '.result__title a',
-                 '.web-result a',
-                 'a[href*="http"]'
-             ]
-
-             for selector in selectors:
-                 result_links = soup.select(selector)
-                 if result_links:
-                     logger.info(f"Found {len(result_links)} results with selector: {selector}")
-                     break
-
-             for link in result_links[:num_results]:
-                 try:
-                     href = link.get('href')
-                     if href and href.startswith('http') and 'duckduckgo.com' not in href:
-                         title = link.get_text(strip=True)
-                         if title and href:
-                             results.append({
-                                 'url': href,
-                                 'title': title,
-                                 'content': ''
-                             })
-                 except Exception as e:
-                     logger.warning(f"Error parsing result: {e}")
-                     continue
-
-             if results:
-                 logger.info(f"DuckDuckGo HTML search found {len(results)} results")
-                 return results
-
-         except Exception as e:
-             logger.warning(f"DuckDuckGo HTML search failed: {e}")
-
-         # Strategy 2: Try DuckDuckGo Instant Answer API
-         try:
-             api_url = "https://api.duckduckgo.com/"
-             params = {
-                 'q': query,
-                 'format': 'json',
-                 'no_html': '1',
-                 'skip_disambig': '1'
-             }
-
-             response = self.session.get(api_url, params=params, timeout=self.timeout)
-             response.raise_for_status()
-             data = response.json()
-
-             # Extract results from API response
-             if data.get('AbstractURL'):
-                 results.append({
-                     'url': data['AbstractURL'],
-                     'title': data.get('Heading', query),
-                     'content': ''
-                 })
-
-             # Add related topics
-             for topic in data.get('RelatedTopics', [])[:num_results-1]:
-                 if isinstance(topic, dict) and topic.get('FirstURL'):
-                     results.append({
-                         'url': topic['FirstURL'],
-                         'title': topic.get('Text', '').split(' - ')[0] if topic.get('Text') else query,
-                         'content': ''
-                     })
-
-             if results:
-                 logger.info(f"DuckDuckGo API search found {len(results)} results")
-                 return results
-
-         except Exception as e:
-             logger.warning(f"DuckDuckGo API search failed: {e}")
-
-         # Strategy 3: Fallback to mock medical results for testing
-         if not results:
-             logger.warning("All search strategies failed, using fallback medical sources")
-             fallback_sources = [
-                 {
-                     'url': 'https://www.mayoclinic.org/diseases-conditions/migraine/symptoms-causes/syc-20360201',
-                     'title': f'Mayo Clinic: {query}',
-                     'content': ''
-                 },
-                 {
-                     'url': 'https://www.webmd.com/migraines-headaches/default.htm',
-                     'title': f'WebMD: {query}',
-                     'content': ''
-                 },
-                 {
-                     'url': 'https://www.healthline.com/health/migraine',
-                     'title': f'Healthline: {query}',
-                     'content': ''
-                 }
-             ]
-             results = fallback_sources[:min(num_results, len(fallback_sources))]
-             logger.info(f"Using {len(results)} fallback medical sources")
-
-         return results
-
-     def extract_content(self, url: str) -> str:
-         """Extract text content from a webpage"""
-         try:
-             response = self.session.get(url, timeout=self.timeout)
-             response.raise_for_status()
-
-             soup = BeautifulSoup(response.content, 'html.parser')
-
-             # Remove script and style elements
-             for script in soup(["script", "style"]):
-                 script.decompose()
-
-             # Prefer main/article content if available
-             main = soup.find('main') or soup.find('article') or soup.find('div', {'role': 'main'})
-             text = (main.get_text(separator=' ') if main else soup.get_text(separator=' '))
-
-             # Clean up text
-             lines = (line.strip() for line in text.splitlines())
-             chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
-             text = ' '.join(chunk for chunk in chunks if chunk)
-
-             # Keep larger text; we'll chunk later
-             if len(text) > 20000:
-                 text = text[:20000]
-
-             return text
-
-         except Exception as e:
-             logger.warning(f"Failed to extract content from {url}: {e}")
-             return ""
-
-     def _chunk_text(self, text: str, chunk_size: int = 1200, overlap: int = 150) -> List[str]:
-         """Chunk large texts with overlap for LLM summarization."""
-         chunks = []
-         start = 0
-         n = len(text)
-         while start < n:
-             end = min(start + chunk_size, n)
-             chunk = text[start:end]
-             chunks.append(chunk)
-             if end == n:
-                 break
-             start = end - overlap
-             if start < 0:
-                 start = 0
-         return chunks
-
-     def _summarize_relevant(self, text: str, query: str) -> str:
-         """Summarize only query-relevant facts from a text chunk using NVIDIA Llama."""
-         return summarizer.summarize_for_query(text, query, max_length=260)
-
-     def _expand_intrasite_links(self, base_url: str, soup: BeautifulSoup, limit: int = 3) -> List[str]:
-         """Collect a few same-domain links for deeper crawl (e.g., subpages, sections)."""
-         try:
-             parsed_base = urlparse(base_url)
-             base_domain = parsed_base.netloc
-             links: List[str] = []
-             seen: Set[str] = set()
-             for a in soup.find_all('a', href=True):
-                 href = a['href']
-                 if href.startswith('#'):
-                     continue
-                 abs_url = urljoin(base_url, href)
-                 parsed = urlparse(abs_url)
-                 if parsed.netloc != base_domain:
-                     continue
-                 if abs_url in seen:
-                     continue
-                 seen.add(abs_url)
-                 links.append(abs_url)
-                 if len(links) >= limit:
-                     break
-             return links
-         except Exception:
-             return []
-
-     def search_and_extract(self, query: str, num_results: int = 10) -> List[Dict]:
-         """Search for query and extract content from top results"""
-         logger.info(f"Searching for: {query}")
-
-         # Aggregate from multiple sources to hit more sites
-         ddg_results = self.search_duckduckgo(query, min(num_results * 2, 20))
-         # TODO: add additional engines/APIs if available in env (e.g., SerpAPI, Bing). For now, DDG only.
-         search_results = ddg_results
-
-         # Extract content from each result, one at a time
-         enriched_results = []
-         failed_count = 0
-         max_failures = 5  # Stop after 5 consecutive failures
-
-         for i, result in enumerate(search_results):
-             if len(enriched_results) >= num_results:
-                 break
-
-             if failed_count >= max_failures:
-                 logger.warning(f"Too many failures ({failed_count}), stopping extraction")
-                 break
-
-             try:
-                 logger.info(f"Extracting content from {result['url']}")
-                 # Fetch HTML once to support intrasite expansion
-                 resp = self.session.get(result['url'], timeout=self.timeout)
-                 resp.raise_for_status()
-                 soup = BeautifulSoup(resp.content, 'html.parser')
-                 content = self.extract_content(result['url'])
-
-                 relevant_snippets: List[str] = []
-                 if content and len(content.strip()) > 50:
-                     # Chunk and summarize only relevant parts
-                     for chunk in self._chunk_text(content, chunk_size=1400, overlap=200):
-                         rel = self._summarize_relevant(chunk, query)
-                         if rel:
-                             relevant_snippets.append(rel)
-
-                 # Try a few intrasite links (same domain) to gather more context
-                 for extra_url in self._expand_intrasite_links(result['url'], soup, limit=2):
-                     try:
-                         extra_text = self.extract_content(extra_url)
-                         if not extra_text:
-                             continue
-                         for chunk in self._chunk_text(extra_text, chunk_size=1200, overlap=150):
-                             rel = self._summarize_relevant(chunk, query)
-                             if rel:
-                                 relevant_snippets.append(rel)
-                         # Be polite between requests
-                         time.sleep(0.3)
-                     except Exception as e:
-                         logger.debug(f"Failed extra link {extra_url}: {e}")
-
-                 # Only keep entries with relevant snippets
-                 if relevant_snippets:
-                     enriched_results.append({
-                         'id': len(enriched_results) + 1,
-                         'url': result['url'],
-                         'title': result['title'],
-                         'content': " \n".join(relevant_snippets)[:2000]
-                     })
-                     failed_count = 0  # Reset failure counter
-                 else:
-                     failed_count += 1
-                     logger.warning(f"No query-relevant content from {result['url']}")
-
-                 # Add delay to be respectful
-                 time.sleep(0.4)
-
-             except Exception as e:
-                 failed_count += 1
-                 logger.warning(f"Failed to process {result['url']}: {e}")
-                 continue
-
-         logger.info(f"Successfully processed {len(enriched_results)} results out of {len(search_results)} attempted")
-         return enriched_results
-
- def search_web(query: str, num_results: int = 10) -> List[Dict]:
-     """Main function to search the web and return enriched results"""
-     searcher = WebSearcher()
-     return searcher.search_and_extract(query, num_results)
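
For reference, the deleted module exposed search_web (re-exported from search/__init__.py) as its public entry point. The following is only a minimal usage sketch of how a caller would have invoked it before this commit; it assumes the package was still importable as search and that models.summarizer was available, and the query string is purely illustrative.

    from search import search_web

    # Each enriched result is a dict with 'id', 'url', 'title', and 'content',
    # where 'content' holds the query-relevant summary snippets (capped at 2000 chars).
    results = search_web("migraine symptoms and treatment", num_results=5)
    for r in results:
        print(r['id'], r['title'], r['url'])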