BinKhoaLe1812 committed on
Commit c54a81c · verified · 1 Parent(s): 87e9584

Update search chunker

Files changed (1)
  1. search/search.py +91 -14
search/search.py CHANGED
@@ -4,7 +4,8 @@ import re
 from urllib.parse import urljoin, urlparse
 import time
 import logging
-from typing import List, Dict, Tuple
+from typing import List, Dict, Tuple, Set
+from models import summarizer
 import os
 
 logger = logging.getLogger(__name__)
@@ -78,17 +79,18 @@ class WebSearcher:
             for script in soup(["script", "style"]):
                 script.decompose()
 
-            # Get text content
-            text = soup.get_text()
+            # Prefer main/article content if available
+            main = soup.find('main') or soup.find('article') or soup.find('div', {'role': 'main'})
+            text = (main.get_text(separator=' ') if main else soup.get_text(separator=' '))
 
             # Clean up text
             lines = (line.strip() for line in text.splitlines())
             chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
             text = ' '.join(chunk for chunk in chunks if chunk)
 
-            # Limit content length
-            if len(text) > 2000:
-                text = text[:2000] + "..."
+            # Keep larger text; we'll chunk later
+            if len(text) > 20000:
+                text = text[:20000]
 
             return text
 
@@ -96,12 +98,59 @@ class WebSearcher:
             logger.warning(f"Failed to extract content from {url}: {e}")
             return ""
 
+    def _chunk_text(self, text: str, chunk_size: int = 1200, overlap: int = 150) -> List[str]:
+        """Chunk large texts with overlap for LLM summarization."""
+        chunks = []
+        start = 0
+        n = len(text)
+        while start < n:
+            end = min(start + chunk_size, n)
+            chunk = text[start:end]
+            chunks.append(chunk)
+            if end == n:
+                break
+            start = end - overlap
+            if start < 0:
+                start = 0
+        return chunks
+
+    def _summarize_relevant(self, text: str, query: str) -> str:
+        """Summarize only query-relevant facts from a text chunk using NVIDIA Llama."""
+        return summarizer.summarize_for_query(text, query, max_length=260)
+
+    def _expand_intrasite_links(self, base_url: str, soup: BeautifulSoup, limit: int = 3) -> List[str]:
+        """Collect a few same-domain links for deeper crawl (e.g., subpages, sections)."""
+        try:
+            parsed_base = urlparse(base_url)
+            base_domain = parsed_base.netloc
+            links: List[str] = []
+            seen: Set[str] = set()
+            for a in soup.find_all('a', href=True):
+                href = a['href']
+                if href.startswith('#'):
+                    continue
+                abs_url = urljoin(base_url, href)
+                parsed = urlparse(abs_url)
+                if parsed.netloc != base_domain:
+                    continue
+                if abs_url in seen:
+                    continue
+                seen.add(abs_url)
+                links.append(abs_url)
+                if len(links) >= limit:
+                    break
+            return links
+        except Exception:
+            return []
+
     def search_and_extract(self, query: str, num_results: int = 10) -> List[Dict]:
         """Search for query and extract content from top results"""
         logger.info(f"Searching for: {query}")
 
-        # Get search results (fetch more than needed for filtering)
-        search_results = self.search_duckduckgo(query, min(num_results * 2, 20))
+        # Aggregate from multiple sources to hit more sites
+        ddg_results = self.search_duckduckgo(query, min(num_results * 2, 20))
+        # TODO: add additional engines/APIs if available in env (e.g., SerpAPI, Bing). For now, DDG only.
+        search_results = ddg_results
 
         # Extract content from each result with parallel processing
         enriched_results = []
@@ -118,22 +167,50 @@ class WebSearcher:
 
             try:
                 logger.info(f"Extracting content from {result['url']}")
+                # Fetch HTML once to support intrasite expansion
+                resp = self.session.get(result['url'], timeout=self.timeout)
+                resp.raise_for_status()
+                soup = BeautifulSoup(resp.content, 'html.parser')
                 content = self.extract_content(result['url'])
 
-                if content and len(content.strip()) > 50:  # Only include substantial content
+                relevant_snippets: List[str] = []
+                if content and len(content.strip()) > 50:
+                    # Chunk and summarize only relevant parts
+                    for chunk in self._chunk_text(content, chunk_size=1400, overlap=200):
+                        rel = self._summarize_relevant(chunk, query)
+                        if rel:
+                            relevant_snippets.append(rel)
+
+                    # Try a few intrasite links (same domain) to gather more context
+                    for extra_url in self._expand_intrasite_links(result['url'], soup, limit=2):
+                        try:
+                            extra_text = self.extract_content(extra_url)
+                            if not extra_text:
+                                continue
+                            for chunk in self._chunk_text(extra_text, chunk_size=1200, overlap=150):
+                                rel = self._summarize_relevant(chunk, query)
+                                if rel:
+                                    relevant_snippets.append(rel)
+                            # Be polite between requests
+                            time.sleep(0.3)
+                        except Exception as e:
+                            logger.debug(f"Failed extra link {extra_url}: {e}")
+
+                # Only keep entries with relevant snippets
+                if relevant_snippets:
                     enriched_results.append({
-                        'id': len(enriched_results) + 1,  # Sequential ID
+                        'id': len(enriched_results) + 1,
                         'url': result['url'],
                         'title': result['title'],
-                        'content': content
+                        'content': " \n".join(relevant_snippets)[:2000]
                     })
                     failed_count = 0  # Reset failure counter
                 else:
                     failed_count += 1
-                    logger.warning(f"Insufficient content from {result['url']}")
+                    logger.warning(f"No query-relevant content from {result['url']}")
 
                 # Add delay to be respectful
-                time.sleep(0.5)  # Reduced delay for better performance
+                time.sleep(0.4)
 
             except Exception as e:
                 failed_count += 1
@@ -146,4 +223,4 @@ class WebSearcher:
 def search_web(query: str, num_results: int = 10) -> List[Dict]:
     """Main function to search the web and return enriched results"""
     searcher = WebSearcher()
-    return searcher.search_and_extract(query, num_results)
+    return searcher.search_and_extract(query, num_results)
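Note for reviewers: the new chunk-and-summarize path imports `summarizer` from `models` and calls `summarizer.summarize_for_query(text, query, max_length=260)`, but that module is not part of this diff. A minimal sketch of what the assumed interface could look like follows; the NVIDIA endpoint, model id, and prompt wording are illustrative assumptions, not code from this repository.

# models.py (sketch) -- assumed interface behind `from models import summarizer`.
# Only summarize_for_query(text, query, max_length) is implied by the diff;
# the endpoint, model id, and prompt below are assumptions for illustration.
import os
from openai import OpenAI  # assumption: NVIDIA NIM's OpenAI-compatible endpoint

class _Summarizer:
    def __init__(self) -> None:
        self.client = OpenAI(
            base_url="https://integrate.api.nvidia.com/v1",  # assumed endpoint
            api_key=os.getenv("NVIDIA_API_KEY", ""),
        )

    def summarize_for_query(self, text: str, query: str, max_length: int = 260) -> str:
        """Return only the facts in `text` relevant to `query`; empty string if none."""
        prompt = (
            f"Question: {query}\n\nText:\n{text}\n\n"
            f"Extract only facts relevant to the question, in at most {max_length} characters. "
            "If nothing is relevant, reply with an empty string."
        )
        resp = self.client.chat.completions.create(
            model="meta/llama-3.1-8b-instruct",  # assumed model id
            messages=[{"role": "user", "content": prompt}],
            max_tokens=200,
        )
        return (resp.choices[0].message.content or "").strip()

summarizer = _Summarizer()

The only contract the diff actually relies on is that summarize_for_query returns an empty string when a chunk has nothing relevant, since falsy results are skipped before the snippets are joined.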
 
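A quick way to exercise the updated chunker and the enriched result shape, assuming the package is importable as `search.search` and that `WebSearcher()` still takes no constructor arguments (neither is shown in this diff):

# Sketch: exercise the overlap chunker and the end-to-end helper.
from search.search import WebSearcher, search_web

searcher = WebSearcher()

# With chunk_size=1200 and overlap=150 the step is 1050 characters, so a
# 3000-character text splits into [0:1200], [1050:2250], [2100:3000].
chunks = searcher._chunk_text("x" * 3000, chunk_size=1200, overlap=150)
print([len(c) for c in chunks])  # [1200, 1200, 900]

# Each enriched result carries a sequential id, the source url and title, and
# the joined query-relevant snippets capped at 2000 characters.
for r in search_web("what is retrieval augmented generation", num_results=3):
    print(r['id'], r['title'], len(r['content']))

One cost note: with the main-page settings (chunk_size=1400, overlap=200, step 1200) a 20000-character page yields roughly 17 chunks, so each result can trigger that many summarizer calls before intrasite expansion.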