Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -45,7 +45,7 @@ PLAYWRIGHT_STATE: Dict = {}
|
|
| 45 |
REVOLVER = CredentialRevolver(os.getenv("PROXY_LIST", ""))
|
| 46 |
|
| 47 |
SEARCH_ENGINES = {
|
| 48 |
-
"Google": "https://www.google.com/search?q={query}",
|
| 49 |
"DuckDuckGo": "https://duckduckgo.com/html/?q={query}",
|
| 50 |
"Bing": "https://www.bing.com/search?q={query}",
|
| 51 |
"Brave": "https://search.brave.com/search?q={query}",
|
|
@@ -151,9 +151,7 @@ async def perform_web_browse(query: str, browser_name: str, search_engine: str):
|
|
| 151 |
|
| 152 |
context_args = {
|
| 153 |
'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
|
| 154 |
-
'java_script_enabled': True,
|
| 155 |
-
'ignore_https_errors': True,
|
| 156 |
-
'bypass_csp': True
|
| 157 |
}
|
| 158 |
if proxy_config:
|
| 159 |
context_args['proxy'] = proxy_config
|
|
@@ -162,27 +160,31 @@ async def perform_web_browse(query: str, browser_name: str, search_engine: str):
|
|
| 162 |
page = await context.new_page()
|
| 163 |
|
| 164 |
try:
|
| 165 |
-
response = await page.goto(url, wait_until='
|
| 166 |
-
|
| 167 |
final_url = page.url
|
| 168 |
-
|
| 169 |
html_content = await page.content()
|
| 170 |
soup = BeautifulSoup(html_content, 'lxml')
|
| 171 |
-
|
| 172 |
converter = HTML_TO_MARKDOWN_CONVERTER(soup, base_url=final_url)
|
| 173 |
markdown_text = converter.convert()
|
| 174 |
-
status_code = response.status if response else 0
|
| 175 |
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
|
| 180 |
return {
|
| 181 |
"status": "success", "query": query, "final_url": final_url, "page_title": title,
|
| 182 |
"http_status": status_code, "proxy_used": proxy_server_used, "markdown_content": markdown_text,
|
| 183 |
}
|
| 184 |
except PlaywrightTimeoutError:
|
| 185 |
-
return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": f"Navigation timed out after
|
| 186 |
except Exception as e:
|
| 187 |
return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": str(e).splitlines()[0]}
|
| 188 |
finally:
|
|
|
|
| 45 |
REVOLVER = CredentialRevolver(os.getenv("PROXY_LIST", ""))
|
| 46 |
|
| 47 |
SEARCH_ENGINES = {
|
| 48 |
+
"Google": "https://www.google.com/search?q={query}&hl=en",
|
| 49 |
"DuckDuckGo": "https://duckduckgo.com/html/?q={query}",
|
| 50 |
"Bing": "https://www.bing.com/search?q={query}",
|
| 51 |
"Brave": "https://search.brave.com/search?q={query}",
|
|
|
|
| 151 |
|
| 152 |
context_args = {
|
| 153 |
'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
|
| 154 |
+
'java_script_enabled': True, 'ignore_https_errors': True, 'bypass_csp': True
|
|
|
|
|
|
|
| 155 |
}
|
| 156 |
if proxy_config:
|
| 157 |
context_args['proxy'] = proxy_config
|
|
|
|
| 160 |
page = await context.new_page()
|
| 161 |
|
| 162 |
try:
|
| 163 |
+
response = await page.goto(url, wait_until='domcontentloaded', timeout=25000)
|
|
|
|
| 164 |
final_url = page.url
|
| 165 |
+
|
| 166 |
html_content = await page.content()
|
| 167 |
soup = BeautifulSoup(html_content, 'lxml')
|
|
|
|
| 168 |
converter = HTML_TO_MARKDOWN_CONVERTER(soup, base_url=final_url)
|
| 169 |
markdown_text = converter.convert()
|
|
|
|
| 170 |
|
| 171 |
+
# HYBRID STRATEGY: If content is empty/trivial, wait briefly for JS to render.
|
| 172 |
+
if len(markdown_text.split()) < 20:
|
| 173 |
+
await page.wait_for_timeout(3000)
|
| 174 |
+
html_content = await page.content()
|
| 175 |
+
soup = BeautifulSoup(html_content, 'lxml')
|
| 176 |
+
converter = HTML_TO_MARKDOWN_CONVERTER(soup, base_url=final_url)
|
| 177 |
+
markdown_text = converter.convert()
|
| 178 |
+
|
| 179 |
+
title = await page.title() or "No Title"
|
| 180 |
+
status_code = response.status if response else 0
|
| 181 |
|
| 182 |
return {
|
| 183 |
"status": "success", "query": query, "final_url": final_url, "page_title": title,
|
| 184 |
"http_status": status_code, "proxy_used": proxy_server_used, "markdown_content": markdown_text,
|
| 185 |
}
|
| 186 |
except PlaywrightTimeoutError:
|
| 187 |
+
return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": f"Navigation timed out after 25s. Site is likely too slow or blocking requests."}
|
| 188 |
except Exception as e:
|
| 189 |
return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": str(e).splitlines()[0]}
|
| 190 |
finally:
|