Upload folder using huggingface_hub
- __pycache__/models.cpython-310.pyc +0 -0
- models.py +15 -3
- requirements.txt +16 -12
__pycache__/models.cpython-310.pyc
CHANGED
Binary files a/__pycache__/models.cpython-310.pyc and b/__pycache__/models.cpython-310.pyc differ
models.py
CHANGED
@@ -8,7 +8,7 @@ import tldextract
 from urllib.parse import urlparse, urljoin, urlunparse
 from datetime import datetime
 from typing import Dict, List, Any, Optional, Set, Tuple
-from pydantic import BaseModel, Field, HttpUrl, validator
+from pydantic import BaseModel, Field, HttpUrl, field_validator
 from enum import Enum
 import logging

@@ -50,14 +50,14 @@ class URL(BaseModel):
     error: Optional[str] = None  # Error message if failed
     metadata: Dict[str, Any] = Field(default_factory=dict)  # Additional metadata

-    @validator("normalized_url", pre=True)
+    @field_validator("normalized_url", mode="before")
     def set_normalized_url(cls, v, values):
         """Normalize the URL if not already set"""
         if not v and "url" in values:
             return normalize_url(values["url"])
         return v

-    @validator("domain", pre=True)
+    @field_validator("domain", mode="before")
     def set_domain(cls, v, values):
         """Extract domain from URL if not already set"""
         if not v and "url" in values:
@@ -65,6 +65,9 @@ class URL(BaseModel):
             return f"{parsed.domain}.{parsed.suffix}" if parsed.suffix else parsed.domain
         return v

+    class Config:
+        arbitrary_types_allowed = True
+

 class RobotsInfo(BaseModel):
     """Information from robots.txt for a domain"""
@@ -75,6 +78,9 @@ class RobotsInfo(BaseModel):
     user_agents: Dict[str, Dict[str, Any]] = Field(default_factory=dict)  # Info per user agent
     status_code: Optional[int] = None  # HTTP status code when fetching robots.txt

+    class Config:
+        arbitrary_types_allowed = True
+

 class Page(BaseModel):
     """Web page model with content and metadata"""
@@ -92,6 +98,9 @@ class Page(BaseModel):
     is_duplicate: bool = False  # Whether this is duplicate content
     metadata: Dict[str, Any] = Field(default_factory=dict)  # Additional metadata

+    class Config:
+        arbitrary_types_allowed = True
+

 class DomainStats(BaseModel):
     """Statistics for a domain"""
@@ -104,6 +113,9 @@ class DomainStats(BaseModel):
     crawl_times: List[float] = Field(default_factory=list)  # Recent crawl times
     errors: Dict[int, int] = Field(default_factory=dict)  # Status code counts for errors

+    class Config:
+        arbitrary_types_allowed = True
+

 def normalize_url(url: str) -> str:
     """
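Note on the models.py change: it migrates the validators from the Pydantic v1 API (@validator with pre=True) to the v2 API (@field_validator with mode="before"). Two details are worth flagging, hedged since only the decorator lines are visible in this diff: under Pydantic v2 the extra argument passed to a field validator is a ValidationInfo object rather than the v1 values dict, so the unchanged (cls, v, values) bodies need to read previously validated fields via .data; and while class Config is still honored in v2, model_config = ConfigDict(...) is the preferred spelling. A minimal sketch of the v2 idiom, using the field names the validators imply (url, normalized_url, domain) and a stand-in normalize_url():

# Sketch only -- a trimmed stand-in for the URL model in models.py, assuming
# fields named url, normalized_url, and domain as the validators above imply.
from typing import Optional

import tldextract
from pydantic import BaseModel, ConfigDict, ValidationInfo, field_validator


def normalize_url(url: str) -> str:
    # Placeholder for the real normalize_url() defined later in models.py.
    return url.strip()


class URLSketch(BaseModel):
    # Pydantic v2 spelling of `class Config: arbitrary_types_allowed = True`.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    url: str  # declared first so the validators below can see it in info.data
    normalized_url: Optional[str] = None
    domain: Optional[str] = None

    @field_validator("normalized_url", mode="before")
    @classmethod
    def set_normalized_url(cls, v, info: ValidationInfo):
        """Normalize the URL if not already set."""
        # v2 passes ValidationInfo; previously validated fields live in info.data.
        if not v and "url" in info.data:
            return normalize_url(info.data["url"])
        return v

    @field_validator("domain", mode="before")
    @classmethod
    def set_domain(cls, v, info: ValidationInfo):
        """Extract the registered domain from the URL if not already set."""
        if not v and "url" in info.data:
            parsed = tldextract.extract(info.data["url"])
            return f"{parsed.domain}.{parsed.suffix}" if parsed.suffix else parsed.domain
        return v


# Pass None explicitly so the before-validators fire; omitted fields skip
# validation in v2 unless Field(..., validate_default=True) is set.
print(URLSketch(url="https://sub.example.com/path", normalized_url=None, domain=None).model_dump())

As in v1, a field validator only sees fields declared before it, so url must precede normalized_url and domain for info.data["url"] to be populated.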
requirements.txt
CHANGED
@@ -1,24 +1,24 @@
 # Core dependencies
-requests
-beautifulsoup4
-aiohttp
+requests>=2.31.0
+beautifulsoup4>=4.12.0
+aiohttp>=3.9.0
 lxml==4.9.2
 html5lib==1.1
-pydantic
-pymongo
-redis
+pydantic>=2.0,<3.0
+pymongo>=4.6.0
+redis>=5.0.0
 boto3==1.26.123
 docopt==0.6.2

 # URL and DNS handling
 dnspython==2.3.0
-tldextract
+tldextract>=5.1.1
 validators==0.20.0
 robotexclusionrulesparser==1.7.1
 urllib3==1.26.15

 # Monitoring and metrics
-prometheus-client
+prometheus-client>=0.19.0

 # HTML processing
 html2text==2020.1.16
@@ -28,16 +28,20 @@ anyio==3.6.2
 asyncio==3.4.3

 # Utilities
-python-dateutil
+python-dateutil>=2.8.2
 pytz==2023.3
 retry==0.9.2
 cryptography==40.0.1
 cachetools==5.3.0

 # Added from the code block
-openai
-gradio
+openai>=1.12.0
+gradio>=4.16.0
 chardet==5.2.0

 # Dotenv
-python-dotenv
+python-dotenv>=1.0.0
+
+# New dependencies
+mmh3>=4.0.0
+httpx>=0.26.0