Update services/evaluation.py
services/evaluation.py (CHANGED, +27 -15)
@@ -11,45 +11,58 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
 logger = logging.getLogger(__name__)
 
 # --- NLTK FIX START (Runtime Setup) ---
-#
-# This ensures the application looks in /app/nltk_data, where the Dockerfile installs the files.
+# Read the NLTK data path defined in the Dockerfile ENV variable (/app/nltk_data).
 NLTK_DATA_PATH = os.environ.get("NLTK_DATA", "/app/nltk_data")
 
-# 1. Append the custom data path to NLTK's search paths
+# 1. Append the custom data path to NLTK's search paths to ensure it finds the files.
 if NLTK_DATA_PATH not in nltk.data.path:
     nltk.data.path.append(NLTK_DATA_PATH)
 
-# 2.
-# Since the Dockerfile
-# the files
+# 2. Safety check/download calls for verification.
+# Since the Dockerfile installs the data, this step confirms existence without
+# crashing if the files are already there.
 try:
     # Ensure the target directory exists (safety check)
     os.makedirs(NLTK_DATA_PATH, exist_ok=True)
 
     # Run the downloads using the safe path (quiet=True suppresses console output)
-    # If the files are already installed by the Dockerfile, these finish quickly.
     nltk.download('stopwords', download_dir=NLTK_DATA_PATH, quiet=True)
     nltk.download('wordnet', download_dir=NLTK_DATA_PATH, quiet=True)
     nltk.download('punkt', download_dir=NLTK_DATA_PATH, quiet=True)
 
 except Exception as e:
-    # Log a warning, but allow the process to proceed since the files should be present
    logger.warning(f"NLTK data verification setup encountered an issue: {e}")
 
 # --- NLTK FIX END ---
 
+
+# --- HUGGING FACE CACHE FIX START ---
+# Read the cache path defined in the Dockerfile ENV variable (/app/.cache).
+HF_CACHE_PATH = os.environ.get("HF_HOME", "/app/.cache")
+# Ensure the directory exists before models try to write to it.
+os.makedirs(HF_CACHE_PATH, exist_ok=True)
+# --- HUGGING FACE CACHE FIX END ---
+
+
 # Initialize NLP tools
-# These lines now safely find the data because nltk.data.path was correctly set.
+# These lines now safely find the NLTK data because nltk.data.path was correctly set.
 lemmatizer = WordNetLemmatizer()
 stop_words = set(stopwords.words("english"))
 negation_words = {"not", "never", "no", "none", "cannot", "n't"}
 
-# SBERT Model for Similarity
-sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
+# SBERT Model for Similarity - Pass cache_folder to SBERT
+# This explicitly directs the model download/cache to the safe path.
+sbert_model = SentenceTransformer("all-MiniLM-L6-v2", cache_folder=HF_CACHE_PATH)
 
-# Cross-Encoder for Contextual Understanding
-cross_encoder_model = AutoModelForSequenceClassification.from_pretrained("cross-encoder/stsb-roberta-large")
-cross_encoder_tokenizer = AutoTokenizer.from_pretrained("cross-encoder/stsb-roberta-large")
+# Cross-Encoder for Contextual Understanding - Pass cache_dir to AutoModel/AutoTokenizer
+cross_encoder_model = AutoModelForSequenceClassification.from_pretrained(
+    "cross-encoder/stsb-roberta-large",
+    cache_dir=HF_CACHE_PATH
+)
+cross_encoder_tokenizer = AutoTokenizer.from_pretrained(
+    "cross-encoder/stsb-roberta-large",
+    cache_dir=HF_CACHE_PATH
+)
 
 
 # -------------------------------
@@ -57,7 +70,6 @@ cross_encoder_tokenizer = AutoTokenizer.from_pretrained("cross-encoder/stsb-roberta-large")
 # -------------------------------
 def preprocess_text(text: str):
     tokens = word_tokenize(text.lower())  # Lowercase & tokenize
-    # The required NLTK data is now guaranteed to be loaded or downloaded previously by the setup steps
     tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]  # Remove stopwords & lemmatize
     return " ".join(tokens)
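
Note on the NLTK fix: the download calls above are only a safety net. Whether the Dockerfile-installed data is actually picked up can be verified with a small sketch like the one below (illustrative, not part of the commit). It reuses the same NLTK_DATA variable and relies on nltk.data.find(), which raises LookupError when a resource is not on the search path.

import os
import nltk

NLTK_DATA_PATH = os.environ.get("NLTK_DATA", "/app/nltk_data")
if NLTK_DATA_PATH not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_PATH)

# Resolve each resource from the configured search paths; a LookupError here
# means the Dockerfile did not install it and nltk.download() will fetch it.
for resource in ("corpora/stopwords", "corpora/wordnet", "tokenizers/punkt"):
    try:
        print(resource, "->", nltk.data.find(resource))
    except LookupError:
        print(resource, "missing; will be downloaded at startup")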
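
Note on the Hugging Face cache fix: Spaces containers commonly run as a non-root user that cannot write to the default cache under the home directory, which is why the code reads HF_HOME and falls back to /app/.cache. A quick writability probe (illustrative, not part of the commit):

import os
import tempfile

HF_CACHE_PATH = os.environ.get("HF_HOME", "/app/.cache")
os.makedirs(HF_CACHE_PATH, exist_ok=True)

# Create and immediately remove a temp file to confirm write permission
# before the model downloads start writing into the cache.
with tempfile.NamedTemporaryFile(dir=HF_CACHE_PATH):
    print("HF cache is writable:", HF_CACHE_PATH)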
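
The diff only loads the models; how they are used presumably appears later in services/evaluation.py and is outside this change. As a rough sketch with the objects defined above (sbert_similarity and cross_encoder_score are illustrative names, not taken from the file): the SBERT bi-encoder compares independently encoded sentences by cosine similarity, while the cross-encoder reads the pair jointly and emits a single STS-B regression score.

import torch
from sentence_transformers import util

def sbert_similarity(a: str, b: str) -> float:
    # Encode both sentences independently, then compare with cosine similarity.
    emb = sbert_model.encode([a, b], convert_to_tensor=True)
    return util.cos_sim(emb[0], emb[1]).item()

def cross_encoder_score(a: str, b: str) -> float:
    # Feed the pair jointly; stsb-roberta-large has a single regression head.
    inputs = cross_encoder_tokenizer(a, b, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = cross_encoder_model(**inputs).logits
    return logits.squeeze().item()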
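
And a small usage example for the preprocess_text() helper in the second hunk: NLTK's English stopword list removes function words, and WordNetLemmatizer.lemmatize() defaults to noun lemmas.

print(preprocess_text("The cats were not running quickly"))
# Prints roughly: cat running quickly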