sailajaai committed
Commit 72275a6 · verified
1 Parent(s): 8e474de

Update services/evaluation.py

Files changed (1)
  1. services/evaluation.py +27 -15
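Both fixes in the diff below lean on environment variables that the Dockerfile is expected to export: NLTK_DATA pointing at /app/nltk_data and HF_HOME pointing at /app/.cache (the defaults hard-coded in the os.environ.get() calls). As a hedged sketch that is not part of this commit, the same safeguard can be expressed in Python by defaulting those variables before nltk and the Hugging Face libraries are imported:

# Sketch only: mirror the Dockerfile ENV assumptions in-process.
# Must run before `import nltk` / `import transformers`, since both libraries
# read these environment variables when they are first imported or download data.
import os

os.environ.setdefault("NLTK_DATA", "/app/nltk_data")
os.environ.setdefault("HF_HOME", "/app/.cache")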
services/evaluation.py CHANGED
@@ -11,45 +11,58 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
 logger = logging.getLogger(__name__)
 
 # --- NLTK FIX START (Runtime Setup) ---
-# Set the NLTK_DATA path to the local directory used by the Dockerfile.
-# This ensures the application looks in /app/nltk_data, where the Dockerfile installs the files.
+# Read the NLTK data path defined in the Dockerfile ENV variable (/app/nltk_data).
 NLTK_DATA_PATH = os.environ.get("NLTK_DATA", "/app/nltk_data")
 
-# 1. Append the custom data path to NLTK's search paths first
+# 1. Append the custom data path to NLTK's search paths to ensure it finds the files.
 if NLTK_DATA_PATH not in nltk.data.path:
     nltk.data.path.append(NLTK_DATA_PATH)
 
-# 2. The download calls are now used for VERIFICATION/FALLBACK.
-# Since the Dockerfile is designed to install the data, this step should confirm
-# the files exist without requiring a new, unauthorized download.
+# 2. Safety check/download calls for verification.
+# Since the Dockerfile installs the data, this step confirms existence without
+# crashing if the files are already there.
 try:
     # Ensure the target directory exists (safety check)
     os.makedirs(NLTK_DATA_PATH, exist_ok=True)
 
     # Run the downloads using the safe path (quiet=True suppresses console output)
-    # If the files are already installed by the Dockerfile, these finish quickly.
     nltk.download('stopwords', download_dir=NLTK_DATA_PATH, quiet=True)
     nltk.download('wordnet', download_dir=NLTK_DATA_PATH, quiet=True)
     nltk.download('punkt', download_dir=NLTK_DATA_PATH, quiet=True)
 
 except Exception as e:
-    # Log a warning, but allow the process to proceed since the files should be present
    logger.warning(f"NLTK data verification setup encountered an issue: {e}")
 
 # --- NLTK FIX END ---
 
+
+# --- HUGGING FACE CACHE FIX START ---
+# Read the cache path defined in the Dockerfile ENV variable (/app/.cache).
+HF_CACHE_PATH = os.environ.get("HF_HOME", "/app/.cache")
+# Ensure the directory exists before models try to write to it.
+os.makedirs(HF_CACHE_PATH, exist_ok=True)
+# --- HUGGING FACE CACHE FIX END ---
+
+
 # Initialize NLP tools
-# These lines now safely find the data because nltk.data.path was correctly set.
+# These lines now safely find the NLTK data because nltk.data.path was correctly set.
 lemmatizer = WordNetLemmatizer()
 stop_words = set(stopwords.words("english"))
 negation_words = {"not", "never", "no", "none", "cannot", "n't"}
 
-# SBERT Model for Similarity
-sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
+# SBERT Model for Similarity - Pass cache_folder to SBERT
+# This explicitly directs the model download/cache to the safe path.
+sbert_model = SentenceTransformer("all-MiniLM-L6-v2", cache_folder=HF_CACHE_PATH)
 
-# Cross-Encoder for Contextual Understanding
-cross_encoder_model = AutoModelForSequenceClassification.from_pretrained("cross-encoder/stsb-roberta-large")
-cross_encoder_tokenizer = AutoTokenizer.from_pretrained("cross-encoder/stsb-roberta-large")
+# Cross-Encoder for Contextual Understanding - Pass cache_dir to AutoModel/AutoTokenizer
+cross_encoder_model = AutoModelForSequenceClassification.from_pretrained(
+    "cross-encoder/stsb-roberta-large",
+    cache_dir=HF_CACHE_PATH
+)
+cross_encoder_tokenizer = AutoTokenizer.from_pretrained(
+    "cross-encoder/stsb-roberta-large",
+    cache_dir=HF_CACHE_PATH
+)
 
 
 # -------------------------------
@@ -57,7 +70,6 @@ cross_encoder_tokenizer = AutoTokenizer.from_pretrained("cross-encoder/stsb-robe
 # -------------------------------
 def preprocess_text(text: str):
     tokens = word_tokenize(text.lower()) # Lowercase & tokenize
-    # The required NLTK data is now guaranteed to be loaded or downloaded previously by the setup steps
     tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words] # Remove stopwords & lemmatize
     return " ".join(tokens)
 
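The NLTK block above only appends NLTK_DATA_PATH to nltk.data.path and re-runs nltk.download() as a quiet verification/fallback; the data itself is expected to be baked in by the Dockerfile. A quick way to check that the three resources actually resolve from that path is nltk.data.find(). This is a minimal debugging sketch under that assumption, not code from the repository:

# Sketch: confirm the NLTK resources resolve from the configured path.
import nltk

NLTK_DATA_PATH = "/app/nltk_data"  # assumed value of the NLTK_DATA ENV variable
if NLTK_DATA_PATH not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_PATH)

for resource in ("corpora/stopwords", "corpora/wordnet", "tokenizers/punkt"):
    try:
        print(resource, "->", nltk.data.find(resource))
    except LookupError:
        print(resource, "missing; the runtime fallback download would fetch it")

If any of these raise LookupError at runtime, the quiet nltk.download() calls in the diff are what paper over the gap.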
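The hunks above only construct the models; the scoring logic that uses sbert_model, cross_encoder_model and cross_encoder_tokenizer lives elsewhere in services/evaluation.py and is not shown here. As a rough, hedged illustration of how a bi-encoder plus an STS-B cross-encoder are typically combined (the function name and the returned dictionary are assumptions, not this repository's API):

# Usage sketch only; assumes the module-level objects defined in the diff.
import torch
from sentence_transformers import util

def sketch_similarity(reference: str, answer: str) -> dict:
    # Bi-encoder: embed both texts and compare with cosine similarity.
    embeddings = sbert_model.encode([reference, answer], convert_to_tensor=True)
    sbert_score = util.cos_sim(embeddings[0], embeddings[1]).item()

    # Cross-encoder: score the pair jointly. cross-encoder/stsb-roberta-large
    # emits a single regression logit, roughly a 0-1 similarity score.
    inputs = cross_encoder_tokenizer(reference, answer, return_tensors="pt",
                                     truncation=True, padding=True)
    with torch.no_grad():
        cross_score = cross_encoder_model(**inputs).logits.squeeze().item()

    return {"sbert_cosine": sbert_score, "cross_encoder": cross_score}

Passing cache_folder / cache_dir explicitly, as the diff does, keeps both downloads under HF_CACHE_PATH even if HF_HOME is not set in the runtime environment.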