Update app.py
app.py
CHANGED
@@ -8,7 +8,7 @@ from itertools import cycle
 
 import gradio as gr
 from bs4 import BeautifulSoup, NavigableString
-from playwright.async_api import async_playwright
+from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError
 
 class CredentialRevolver:
     def __init__(self, proxy_string: str):
@@ -27,9 +27,9 @@ class CredentialRevolver:
                 server = f"http://{parsed.hostname}:{parsed.port}"
                 proxy_dict = {"server": server}
                 if parsed.username:
-                    proxy_dict["username"] = parsed.username
+                    proxy_dict["username"] = urllib.parse.unquote(parsed.username)
                 if parsed.password:
-                    proxy_dict["password"] = parsed.password
+                    proxy_dict["password"] = urllib.parse.unquote(parsed.password)
                 proxies.append(proxy_dict)
             except Exception:
                 pass
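The switch to urllib.parse.unquote matters when proxy credentials contain characters that must be percent-encoded inside a URL (for example "@" or ":"). A minimal standalone sketch of the idea, using a hypothetical proxy URL rather than the app's CredentialRevolver:

from urllib.parse import unquote, urlparse

# Hypothetical proxy URL: the password "p@ss" is percent-encoded as "p%40ss"
# so that urlparse() does not mistake the "@" for the host separator.
proxy_url = "http://user:p%40ss@proxy.example.com:8080"
parsed = urlparse(proxy_url)

proxy_dict = {"server": f"http://{parsed.hostname}:{parsed.port}"}
if parsed.username:
    proxy_dict["username"] = unquote(parsed.username)  # -> "user"
if parsed.password:
    proxy_dict["password"] = unquote(parsed.password)  # -> "p@ss", not "p%40ss"

print(proxy_dict)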
@@ -150,9 +150,10 @@ async def perform_web_browse(query: str, browser_name: str, search_engine: str):
     proxy_server_used = proxy_config["server"] if proxy_config else "Direct Connection"
 
     context_args = {
-        'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/
+        'user_agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
         'java_script_enabled': True,
-        'ignore_https_errors': True
+        'ignore_https_errors': True,
+        'bypass_csp': True
     }
     if proxy_config:
         context_args['proxy'] = proxy_config
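These context_args are keyword arguments for Playwright's browser.new_context(): bypass_csp lets injected scripts run on pages that set a strict Content-Security-Policy, and ignore_https_errors tolerates invalid certificates. A minimal sketch of how the dict is consumed, assuming Chromium and a placeholder URL:

import asyncio
from playwright.async_api import async_playwright

async def main():
    context_args = {
        "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        "java_script_enabled": True,
        "ignore_https_errors": True,  # tolerate bad TLS certificates on scraped sites
        "bypass_csp": True,           # allow injected scripts despite Content-Security-Policy
    }
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(**context_args)
        page = await context.new_page()
        await page.goto("https://example.com")  # placeholder URL
        print(await page.title())
        await browser.close()

asyncio.run(main())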
@@ -161,27 +162,27 @@ async def perform_web_browse(query: str, browser_name: str, search_engine: str):
     page = await context.new_page()
 
     try:
-        response = await page.goto(url, wait_until='
-
-        current_url = page.url
-        if "google.com" in current_url:
-            await page.wait_for_selector('div#rso, div#search, body[jsmodel]', timeout=15000)
-        elif "perplexity.ai" in current_url or "you.com" in current_url:
-            await page.wait_for_timeout(4000)
-
-        final_url, title = page.url, await page.title() or "No Title"
+        response = await page.goto(url, wait_until='networkidle', timeout=35000)
 
+        final_url = page.url
+        title = await page.title() or "No Title"
         html_content = await page.content()
         soup = BeautifulSoup(html_content, 'lxml')
 
         converter = HTML_TO_MARKDOWN_CONVERTER(soup, base_url=final_url)
         markdown_text = converter.convert()
-        status_code = response.status if response else
-
+        status_code = response.status if response else 0
+
+        if status_code not in {200, 204} and not markdown_text:
+            error_info = f"Page loaded with non-2xx status code: {status_code}. Content may be empty or an error page."
+            return {"status": "error", "query": query, "final_url": final_url, "http_status": status_code, "error_message": error_info}
+
         return {
             "status": "success", "query": query, "final_url": final_url, "page_title": title,
             "http_status": status_code, "proxy_used": proxy_server_used, "markdown_content": markdown_text,
         }
+    except PlaywrightTimeoutError:
+        return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": f"Navigation timed out after 35s. The site may be slow, blocking automation, or the proxy ({proxy_server_used}) may have failed."}
     except Exception as e:
         return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": str(e).splitlines()[0]}
     finally:
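The dedicated except PlaywrightTimeoutError branch separates slow or blocked navigations from other failures, and status_code falls back to 0 when page.goto() returns no Response object. A condensed, self-contained sketch of the same pattern, with a placeholder URL and without the app's markdown conversion:

import asyncio
from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError

async def fetch(url: str) -> dict:
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        try:
            # 'networkidle' waits until network activity settles; 35s cap as in the diff.
            response = await page.goto(url, wait_until="networkidle", timeout=35000)
            status_code = response.status if response else 0
            return {"status": "success", "http_status": status_code, "page_title": await page.title()}
        except PlaywrightTimeoutError:
            return {"status": "error", "error_message": "Navigation timed out after 35s."}
        except Exception as e:
            return {"status": "error", "error_message": str(e).splitlines()[0]}
        finally:
            await browser.close()

print(asyncio.run(fetch("https://example.com")))  # placeholder URL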