Update app.py

app.py CHANGED
@@ -40,21 +40,21 @@ PLAYWRIGHT_STATE: Dict = {}
 REVOLVER = CredentialRevolver(os.getenv("PROXY_LIST", ""))
 
 SEARCH_ENGINES = {
-    "Google":
-    "DuckDuckGo":
-    "Bing":
-    "Brave":
-    "Ecosia":
-    "Yahoo":
-    "Startpage":
-    "Qwant":
-    "Swisscows":
-    "You.com":
-    "SearXNG":
-    "MetaGer":
-    "Yandex":
-    "Baidu":
-    "Perplexity":
+    "Google": "https://www.google.com/search?q={query}&hl=en",
+    "DuckDuckGo": "https://duckduckgo.com/html/?q={query}",
+    "Bing": "https://www.bing.com/search?q={query}",
+    "Brave": "https://search.brave.com/search?q={query}",
+    "Ecosia": "https://www.ecosia.org/search?q={query}",
+    "Yahoo": "https://search.yahoo.com/search?p={query}",
+    "Startpage": "https://www.startpage.com/sp/search?q={query}",
+    "Qwant": "https://www.qwant.com/?q={query}",
+    "Swisscows": "https://swisscows.com/web?query={query}",
+    "You.com": "https://you.com/search?q={query}",
+    "SearXNG": "https://searx.be/search?q={query}",
+    "MetaGer": "https://metager.org/meta/meta.ger-en?eingabe={query}",
+    "Yandex": "https://yandex.com/search/?text={query}",
+    "Baidu": "https://www.baidu.com/s?wd={query}",
+    "Perplexity": "https://www.perplexity.ai/search?q={query}",
 }
 
 class HTML_TO_MARKDOWN_CONVERTER:
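For reference, each entry above maps an engine name to a URL template with a {query} placeholder. A minimal sketch of how such a template expands once the query is percent-encoded (build_search_url is a hypothetical helper, not part of app.py, and the engine and query values are only illustrative):

import urllib.parse

SEARCH_ENGINES = {"DuckDuckGo": "https://duckduckgo.com/html/?q={query}"}

def build_search_url(engine_name: str, query: str) -> str:
    # Substitute the URL-encoded query into the engine's template.
    template = SEARCH_ENGINES[engine_name]
    return template.format(query=urllib.parse.quote_plus(query))

print(build_search_url("DuckDuckGo", "playwright anti-bot detection"))
# -> https://duckduckgo.com/html/?q=playwright+anti-bot+detection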
@@ -130,13 +130,11 @@ async def perform_web_browse(query: str, browser_name: str, search_engine_name:
 
     if is_direct_url:
         url = query
-        content_selector = 'body'
     else:
-
-        if not
+        url_template = SEARCH_ENGINES.get(search_engine_name)
+        if not url_template:
             return {"status": "error", "query": query, "error_message": f"Invalid search engine: '{search_engine_name}'."}
-        url
-        url = url.format(query=urllib.parse.quote_plus(query))
+        url = url_template.format(query=urllib.parse.quote_plus(query))
 
     proxy_config = REVOLVER.get_next()
     proxy_server_used = proxy_config["server"] if proxy_config else "Direct Connection"
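The diff does not show how is_direct_url is computed; a common approach, assumed here purely for illustration, is to treat input that parses with an http(s) scheme and a host as a direct URL:

from urllib.parse import urlparse

def looks_like_direct_url(query: str) -> bool:
    # Hypothetical check, not taken from app.py: a query counts as a
    # direct URL if it carries an http/https scheme and a host.
    parsed = urlparse(query.strip())
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)

print(looks_like_direct_url("https://example.com/page"))  # True
print(looks_like_direct_url("best rust web frameworks"))  # False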
@@ -148,17 +146,12 @@ async def perform_web_browse(query: str, browser_name: str, search_engine_name:
     page = await context.new_page()
 
     try:
-        response = await page.goto(url, wait_until='
-
-        await asyncio.wait([
-            page.wait_for_load_state('domcontentloaded', timeout=15000),
-            page.wait_for_selector(content_selector, timeout=15000),
-        ], return_when=asyncio.FIRST_COMPLETED)
+        response = await page.goto(url, wait_until='domcontentloaded', timeout=25000)
 
         html_content = await page.content()
 
-        if any(phrase in html_content for phrase in ["unusual traffic", "CAPTCHA", "
-            raise Exception("Anti-bot measure detected. Try another search engine or proxy.")
+        if any(phrase in html_content for phrase in ["unusual traffic", "CAPTCHA", "are you human", "not a robot"]):
+            raise Exception(f"Anti-bot measure detected on {page.url}. Try another search engine or proxy.")
 
         final_url, title = page.url, await page.title() or "No Title"
         soup = BeautifulSoup(html_content, 'lxml')
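The replacement waits only for wait_until='domcontentloaded' with a 25-second timeout instead of racing wait_for_load_state against wait_for_selector. A standalone sketch of that strategy using Playwright's async API (fetch_page, the URL, and the phrase list are illustrative, not the app's perform_web_browse):

import asyncio
from playwright.async_api import async_playwright

BLOCK_PHRASES = ["unusual traffic", "CAPTCHA", "are you human", "not a robot"]

async def fetch_page(url: str) -> str:
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        try:
            # Return as soon as the DOM is parsed; do not wait for every subresource.
            await page.goto(url, wait_until="domcontentloaded", timeout=25000)
            html = await page.content()
            if any(phrase in html for phrase in BLOCK_PHRASES):
                raise Exception(f"Anti-bot measure detected on {page.url}.")
            return html
        finally:
            await browser.close()

print(len(asyncio.run(fetch_page("https://example.com"))))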
@@ -170,7 +163,7 @@ async def perform_web_browse(query: str, browser_name: str, search_engine_name:
     except Exception as e:
         error_message = str(e).splitlines()[0]
         if "Timeout" in error_message:
-            return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": f"
+            return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": f"Navigation Timeout: The page for '{query}' took too long to load."}
         return {"status": "error", "query": query, "proxy_used": proxy_server_used, "error_message": error_message}
     finally:
         if 'page' in locals() and not page.is_closed(): await page.close()
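A caller can branch on the returned dict rather than on exceptions. A small usage sketch; report is a hypothetical consumer, and the success keys (title, final_url) are assumptions not shown in this diff:

def report(result: dict) -> str:
    # Hypothetical consumer of the dicts returned by perform_web_browse.
    if result.get("status") == "error":
        return f"[{result.get('proxy_used', 'Direct Connection')}] {result['error_message']}"
    # The success keys below are assumed for illustration only.
    return f"Fetched '{result.get('title', 'No Title')}' from {result.get('final_url', '?')}"

print(report({
    "status": "error",
    "query": "example",
    "proxy_used": "Direct Connection",
    "error_message": "Navigation Timeout: The page for 'example' took too long to load.",
}))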