BinKhoaLe1812 committed on
Commit c54a81c · verified · 1 Parent(s): 87e9584

Update search chunker

Files changed (1)
  1. search/search.py +91 -14
search/search.py CHANGED
@@ -4,7 +4,8 @@ import re
 from urllib.parse import urljoin, urlparse
 import time
 import logging
-from typing import List, Dict, Tuple
+from typing import List, Dict, Tuple, Set
+from models import summarizer
 import os
 
 logger = logging.getLogger(__name__)
@@ -78,17 +79,18 @@ class WebSearcher:
             for script in soup(["script", "style"]):
                 script.decompose()
 
-            # Get text content
-            text = soup.get_text()
+            # Prefer main/article content if available
+            main = soup.find('main') or soup.find('article') or soup.find('div', {'role': 'main'})
+            text = (main.get_text(separator=' ') if main else soup.get_text(separator=' '))
 
             # Clean up text
             lines = (line.strip() for line in text.splitlines())
             chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
             text = ' '.join(chunk for chunk in chunks if chunk)
 
-            # Limit content length
-            if len(text) > 2000:
-                text = text[:2000] + "..."
+            # Keep larger text; we'll chunk later
+            if len(text) > 20000:
+                text = text[:20000]
 
             return text
 
@@ -96,12 +98,59 @@ class WebSearcher:
             logger.warning(f"Failed to extract content from {url}: {e}")
             return ""
 
+    def _chunk_text(self, text: str, chunk_size: int = 1200, overlap: int = 150) -> List[str]:
+        """Chunk large texts with overlap for LLM summarization."""
+        chunks = []
+        start = 0
+        n = len(text)
+        while start < n:
+            end = min(start + chunk_size, n)
+            chunk = text[start:end]
+            chunks.append(chunk)
+            if end == n:
+                break
+            start = end - overlap
+            if start < 0:
+                start = 0
+        return chunks
+
+    def _summarize_relevant(self, text: str, query: str) -> str:
+        """Summarize only query-relevant facts from a text chunk using NVIDIA Llama."""
+        return summarizer.summarize_for_query(text, query, max_length=260)
+
+    def _expand_intrasite_links(self, base_url: str, soup: BeautifulSoup, limit: int = 3) -> List[str]:
+        """Collect a few same-domain links for deeper crawl (e.g., subpages, sections)."""
+        try:
+            parsed_base = urlparse(base_url)
+            base_domain = parsed_base.netloc
+            links: List[str] = []
+            seen: Set[str] = set()
+            for a in soup.find_all('a', href=True):
+                href = a['href']
+                if href.startswith('#'):
+                    continue
+                abs_url = urljoin(base_url, href)
+                parsed = urlparse(abs_url)
+                if parsed.netloc != base_domain:
+                    continue
+                if abs_url in seen:
+                    continue
+                seen.add(abs_url)
+                links.append(abs_url)
+                if len(links) >= limit:
+                    break
+            return links
+        except Exception:
+            return []
+
     def search_and_extract(self, query: str, num_results: int = 10) -> List[Dict]:
         """Search for query and extract content from top results"""
         logger.info(f"Searching for: {query}")
 
-        # Get search results (fetch more than needed for filtering)
-        search_results = self.search_duckduckgo(query, min(num_results * 2, 20))
+        # Aggregate from multiple sources to hit more sites
+        ddg_results = self.search_duckduckgo(query, min(num_results * 2, 20))
+        # TODO: add additional engines/APIs if available in env (e.g., SerpAPI, Bing). For now, DDG only.
+        search_results = ddg_results
 
         # Extract content from each result with parallel processing
         enriched_results = []
@@ -118,22 +167,50 @@ class WebSearcher:
 
             try:
                 logger.info(f"Extracting content from {result['url']}")
+                # Fetch HTML once to support intrasite expansion
+                resp = self.session.get(result['url'], timeout=self.timeout)
+                resp.raise_for_status()
+                soup = BeautifulSoup(resp.content, 'html.parser')
                 content = self.extract_content(result['url'])
 
-                if content and len(content.strip()) > 50:  # Only include substantial content
+                relevant_snippets: List[str] = []
+                if content and len(content.strip()) > 50:
+                    # Chunk and summarize only relevant parts
+                    for chunk in self._chunk_text(content, chunk_size=1400, overlap=200):
+                        rel = self._summarize_relevant(chunk, query)
+                        if rel:
+                            relevant_snippets.append(rel)
+
+                    # Try a few intrasite links (same domain) to gather more context
+                    for extra_url in self._expand_intrasite_links(result['url'], soup, limit=2):
+                        try:
+                            extra_text = self.extract_content(extra_url)
+                            if not extra_text:
+                                continue
+                            for chunk in self._chunk_text(extra_text, chunk_size=1200, overlap=150):
+                                rel = self._summarize_relevant(chunk, query)
+                                if rel:
+                                    relevant_snippets.append(rel)
+                            # Be polite between requests
+                            time.sleep(0.3)
+                        except Exception as e:
+                            logger.debug(f"Failed extra link {extra_url}: {e}")
+
+                # Only keep entries with relevant snippets
+                if relevant_snippets:
                     enriched_results.append({
-                        'id': len(enriched_results) + 1,  # Sequential ID
+                        'id': len(enriched_results) + 1,
                         'url': result['url'],
                         'title': result['title'],
-                        'content': content
+                        'content': " \n".join(relevant_snippets)[:2000]
                     })
                     failed_count = 0  # Reset failure counter
                 else:
                     failed_count += 1
-                    logger.warning(f"Insufficient content from {result['url']}")
+                    logger.warning(f"No query-relevant content from {result['url']}")
 
                 # Add delay to be respectful
-                time.sleep(0.5)  # Reduced delay for better performance
+                time.sleep(0.4)
 
             except Exception as e:
                 failed_count += 1
@@ -146,4 +223,4 @@ class WebSearcher:
 def search_web(query: str, num_results: int = 10) -> List[Dict]:
     """Main function to search the web and return enriched results"""
     searcher = WebSearcher()
-    return searcher.search_and_extract(query, num_results)
+    return searcher.search_and_extract(query, num_results)
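Note for reviewers: the new chunk-and-summarize path imports `summarizer` from `models` and calls `summarizer.summarize_for_query(text, query, max_length=260)`, but that module is not part of this diff. A minimal sketch of what the assumed interface could look like follows; the NVIDIA endpoint, model id, and prompt wording are illustrative assumptions, not code from this repository.

# models.py (sketch) -- assumed interface behind `from models import summarizer`.
# Only summarize_for_query(text, query, max_length) is implied by the diff;
# the endpoint, model id, and prompt below are assumptions for illustration.
import os
from openai import OpenAI  # assumption: NVIDIA NIM's OpenAI-compatible endpoint

class _Summarizer:
    def __init__(self) -> None:
        self.client = OpenAI(
            base_url="https://integrate.api.nvidia.com/v1",  # assumed endpoint
            api_key=os.getenv("NVIDIA_API_KEY", ""),
        )

    def summarize_for_query(self, text: str, query: str, max_length: int = 260) -> str:
        """Return only the facts in `text` relevant to `query`; empty string if none."""
        prompt = (
            f"Question: {query}\n\nText:\n{text}\n\n"
            f"Extract only facts relevant to the question, in at most {max_length} characters. "
            "If nothing is relevant, reply with an empty string."
        )
        resp = self.client.chat.completions.create(
            model="meta/llama-3.1-8b-instruct",  # assumed model id
            messages=[{"role": "user", "content": prompt}],
            max_tokens=200,
        )
        return (resp.choices[0].message.content or "").strip()

summarizer = _Summarizer()

The only contract the diff actually relies on is that summarize_for_query returns an empty string when a chunk has nothing relevant, since falsy results are skipped before the snippets are joined.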
 
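A quick way to exercise the updated chunker and the enriched result shape, assuming the package is importable as `search.search` and that `WebSearcher()` still takes no constructor arguments (neither is shown in this diff):

# Sketch: exercise the overlap chunker and the end-to-end helper.
from search.search import WebSearcher, search_web

searcher = WebSearcher()

# With chunk_size=1200 and overlap=150 the step is 1050 characters, so a
# 3000-character text splits into [0:1200], [1050:2250], [2100:3000].
chunks = searcher._chunk_text("x" * 3000, chunk_size=1200, overlap=150)
print([len(c) for c in chunks])  # [1200, 1200, 900]

# Each enriched result carries a sequential id, the source url and title, and
# the joined query-relevant snippets capped at 2000 characters.
for r in search_web("what is retrieval augmented generation", num_results=3):
    print(r['id'], r['title'], len(r['content']))

One cost note: with the main-page settings (chunk_size=1400, overlap=200, step 1200) a 20000-character page yields roughly 17 chunks, so each result can trigger that many summarizer calls before intrasite expansion.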