Update services/evaluation.py
services/evaluation.py (CHANGED, +27 -15)
@@ -11,45 +11,58 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
 logger = logging.getLogger(__name__)
 
 # --- NLTK FIX START (Runtime Setup) ---
-#
-# This ensures the application looks in /app/nltk_data, where the Dockerfile installs the files.
+# Read the NLTK data path defined in the Dockerfile ENV variable (/app/nltk_data).
 NLTK_DATA_PATH = os.environ.get("NLTK_DATA", "/app/nltk_data")
 
-# 1. Append the custom data path to NLTK's search paths
+# 1. Append the custom data path to NLTK's search paths to ensure it finds the files.
 if NLTK_DATA_PATH not in nltk.data.path:
     nltk.data.path.append(NLTK_DATA_PATH)
 
-# 2.
-# Since the Dockerfile
-# the files
+# 2. Safety check/download calls for verification.
+# Since the Dockerfile installs the data, this step confirms existence without
+# crashing if the files are already there.
 try:
     # Ensure the target directory exists (safety check)
     os.makedirs(NLTK_DATA_PATH, exist_ok=True)
 
     # Run the downloads using the safe path (quiet=True suppresses console output)
-    # If the files are already installed by the Dockerfile, these finish quickly.
     nltk.download('stopwords', download_dir=NLTK_DATA_PATH, quiet=True)
     nltk.download('wordnet', download_dir=NLTK_DATA_PATH, quiet=True)
     nltk.download('punkt', download_dir=NLTK_DATA_PATH, quiet=True)
 
 except Exception as e:
-    # Log a warning, but allow the process to proceed since the files should be present
    logger.warning(f"NLTK data verification setup encountered an issue: {e}")
 
 # --- NLTK FIX END ---
 
+
+# --- HUGGING FACE CACHE FIX START ---
+# Read the cache path defined in the Dockerfile ENV variable (/app/.cache).
+HF_CACHE_PATH = os.environ.get("HF_HOME", "/app/.cache")
+# Ensure the directory exists before models try to write to it.
+os.makedirs(HF_CACHE_PATH, exist_ok=True)
+# --- HUGGING FACE CACHE FIX END ---
+
+
 # Initialize NLP tools
-# These lines now safely find the data because nltk.data.path was correctly set.
+# These lines now safely find the NLTK data because nltk.data.path was correctly set.
 lemmatizer = WordNetLemmatizer()
 stop_words = set(stopwords.words("english"))
 negation_words = {"not", "never", "no", "none", "cannot", "n't"}
 
-# SBERT Model for Similarity
-sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
+# SBERT Model for Similarity - Pass cache_folder to SBERT
+# This explicitly directs the model download/cache to the safe path.
+sbert_model = SentenceTransformer("all-MiniLM-L6-v2", cache_folder=HF_CACHE_PATH)
 
-# Cross-Encoder for Contextual Understanding
-cross_encoder_model = AutoModelForSequenceClassification.from_pretrained("cross-encoder/stsb-roberta-large")
-cross_encoder_tokenizer = AutoTokenizer.from_pretrained("cross-encoder/stsb-roberta-large")
+# Cross-Encoder for Contextual Understanding - Pass cache_dir to AutoModel/AutoTokenizer
+cross_encoder_model = AutoModelForSequenceClassification.from_pretrained(
+    "cross-encoder/stsb-roberta-large",
+    cache_dir=HF_CACHE_PATH
+)
+cross_encoder_tokenizer = AutoTokenizer.from_pretrained(
+    "cross-encoder/stsb-roberta-large",
+    cache_dir=HF_CACHE_PATH
+)
 
 
 # -------------------------------
@@ -57,7 +70,6 @@ cross_encoder_tokenizer = AutoTokenizer.from_pretrained("cross-encoder/stsb-roberta-large")
 # -------------------------------
 def preprocess_text(text: str):
     tokens = word_tokenize(text.lower())  # Lowercase & tokenize
-    # The required NLTK data is now guaranteed to be loaded or downloaded previously by the setup steps
     tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]  # Remove stopwords & lemmatize
     return " ".join(tokens)
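
Note on the NLTK fix: the download calls above are only a safety net. Whether the Dockerfile-installed data is actually picked up can be verified with a small sketch like the one below (illustrative, not part of the commit). It reuses the same NLTK_DATA variable and relies on nltk.data.find(), which raises LookupError when a resource is not on the search path.

import os
import nltk

NLTK_DATA_PATH = os.environ.get("NLTK_DATA", "/app/nltk_data")
if NLTK_DATA_PATH not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_PATH)

# Resolve each resource from the configured search paths; a LookupError here
# means the Dockerfile did not install it and nltk.download() will fetch it.
for resource in ("corpora/stopwords", "corpora/wordnet", "tokenizers/punkt"):
    try:
        print(resource, "->", nltk.data.find(resource))
    except LookupError:
        print(resource, "missing; will be downloaded at startup")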
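
Note on the Hugging Face cache fix: Spaces containers commonly run as a non-root user that cannot write to the default cache under the home directory, which is why the code reads HF_HOME and falls back to /app/.cache. A quick writability probe (illustrative, not part of the commit):

import os
import tempfile

HF_CACHE_PATH = os.environ.get("HF_HOME", "/app/.cache")
os.makedirs(HF_CACHE_PATH, exist_ok=True)

# Create and immediately remove a temp file to confirm write permission
# before the model downloads start writing into the cache.
with tempfile.NamedTemporaryFile(dir=HF_CACHE_PATH):
    print("HF cache is writable:", HF_CACHE_PATH)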
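
The diff only loads the models; how they are used presumably appears later in services/evaluation.py and is outside this change. As a rough sketch with the objects defined above (sbert_similarity and cross_encoder_score are illustrative names, not taken from the file): the SBERT bi-encoder compares independently encoded sentences by cosine similarity, while the cross-encoder reads the pair jointly and emits a single STS-B regression score.

import torch
from sentence_transformers import util

def sbert_similarity(a: str, b: str) -> float:
    # Encode both sentences independently, then compare with cosine similarity.
    emb = sbert_model.encode([a, b], convert_to_tensor=True)
    return util.cos_sim(emb[0], emb[1]).item()

def cross_encoder_score(a: str, b: str) -> float:
    # Feed the pair jointly; stsb-roberta-large has a single regression head.
    inputs = cross_encoder_tokenizer(a, b, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = cross_encoder_model(**inputs).logits
    return logits.squeeze().item()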
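
And a small usage example for the preprocess_text() helper in the second hunk: NLTK's English stopword list removes function words, and WordNetLemmatizer.lemmatize() defaults to noun lemmas.

print(preprocess_text("The cats were not running quickly"))
# Prints roughly: cat running quickly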