Hoang Kha committed on
Commit
6c510a2
·
1 Parent(s): 0514b67

Fix huggingface cache to /data for permission issues

Browse files
Files changed (2) hide show
  1. Dockerfile +2 -1
  2. main.py +23 -27
Dockerfile CHANGED
@@ -10,7 +10,8 @@ RUN pip install --no-cache-dir -r requirements.txt
10
  EXPOSE 7860
11
 
12
  ENV HF_HUB_DISABLE_CACHE=1
 
 
13
  ENV HF_HUB_DISABLE_SYMLINKS_WARNING=1
14
- ENV TRANSFORMERS_OFFLINE=0
15
 
16
  CMD ["python", "main.py"]
 
10
  EXPOSE 7860
11
 
12
  ENV HF_HUB_DISABLE_CACHE=1
13
+ ENV TRANSFORMERS_CACHE=/dev/null
14
+ ENV HF_HOME=/dev/null
15
  ENV HF_HUB_DISABLE_SYMLINKS_WARNING=1
 
16
 
17
  CMD ["python", "main.py"]
main.py CHANGED
@@ -201,15 +201,14 @@ import os
201
  from flask import Flask, render_template, request, jsonify
202
  from langdetect import detect
203
  import torch
204
- import torch.nn.functional as F
205
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
206
 
207
- # Tắt toàn bộ cache ghi đĩa
208
  os.environ["HF_HUB_DISABLE_CACHE"] = "1"
209
- os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
 
210
  os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
211
  os.environ["TRANSFORMERS_OFFLINE"] = "0"
212
- os.environ["HF_DATASETS_OFFLINE"] = "1"
213
  os.environ["DISABLE_TELEMETRY"] = "1"
214
 
215
  app = Flask(__name__)
@@ -220,38 +219,38 @@ EN_MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"
220
 
221
  device = "cuda" if torch.cuda.is_available() else "cpu"
222
 
223
- print("🔄 Loading Vietnamese model (memory-only mode)...")
224
  vi_tokenizer = AutoTokenizer.from_pretrained(
225
- VI_MODEL_NAME, use_fast=False, cache_dir=None, local_files_only=False
 
 
 
226
  )
227
  vi_model = AutoModelForSequenceClassification.from_pretrained(
228
- VI_MODEL_NAME, cache_dir=None, local_files_only=False
 
 
229
  ).to(device)
230
  vi_model.eval()
231
  sentiment_pipeline = pipeline("sentiment-analysis", model=vi_model, tokenizer=vi_tokenizer)
232
- print("✅ Vietnamese model loaded!")
233
 
234
- print("🔄 Loading English model (memory-only mode)...")
235
  en_tokenizer = AutoTokenizer.from_pretrained(
236
- EN_MODEL_NAME, cache_dir=None, local_files_only=False
 
 
237
  )
238
  en_model = AutoModelForSequenceClassification.from_pretrained(
239
- EN_MODEL_NAME, cache_dir=None, local_files_only=False
 
 
240
  ).to(device)
241
  en_model.eval()
242
- print("✅ English model loaded!")
243
 
244
  # -----------------------------
245
- # Label mapping
246
- # -----------------------------
247
- vi_label_map = {
248
- "POS": "Tích cực",
249
- "NEG": "Tiêu cực",
250
- "NEU": "Trung tính"
251
- }
252
-
253
- # -----------------------------
254
- # Language detection
255
  # -----------------------------
256
  def detect_lang(text: str) -> str:
257
  try:
@@ -270,14 +269,13 @@ def detect_lang(text: str) -> str:
270
  # Vietnamese analysis
271
  # -----------------------------
272
  def analyze_vi(text: str):
273
- if not text.strip():
274
- return {"error": "Empty text."}
275
  result = sentiment_pipeline(text)[0]
 
276
  label = result["label"]
277
  score = round(result["score"], 3)
278
  return {
279
  "language": "vi",
280
- "label": vi_label_map.get(label, label),
281
  "english_label": label,
282
  "score": score,
283
  "scores": {
@@ -318,10 +316,8 @@ def analyze():
318
  lang = (data.get("lang") or "auto").lower()
319
  if not text:
320
  return jsonify({"error": "Text is empty."}), 400
321
-
322
  if lang == "auto":
323
  lang = detect_lang(text)
324
-
325
  result = analyze_vi(text) if lang == "vi" else analyze_en(text)
326
  return jsonify({"ok": True, "input": {"text": text, "lang": lang}, "result": result})
327
 
 
201
  from flask import Flask, render_template, request, jsonify
202
  from langdetect import detect
203
  import torch
 
204
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
205
 
206
+ # ⚙️ Ép Hugging Face không ghi cache, chỉ load vào RAM
207
  os.environ["HF_HUB_DISABLE_CACHE"] = "1"
208
+ os.environ["TRANSFORMERS_CACHE"] = "/dev/null" # ⛔ cache về null
209
+ os.environ["HF_HOME"] = "/dev/null" # ⛔ home cache về null
210
  os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
211
  os.environ["TRANSFORMERS_OFFLINE"] = "0"
 
212
  os.environ["DISABLE_TELEMETRY"] = "1"
213
 
214
  app = Flask(__name__)
 
219
 
220
  device = "cuda" if torch.cuda.is_available() else "cpu"
221
 
222
+ print("🔄 Loading Vietnamese model (RAM-only mode)...")
223
  vi_tokenizer = AutoTokenizer.from_pretrained(
224
+ VI_MODEL_NAME,
225
+ use_fast=False,
226
+ local_files_only=False,
227
+ cache_dir=None,
228
  )
229
  vi_model = AutoModelForSequenceClassification.from_pretrained(
230
+ VI_MODEL_NAME,
231
+ local_files_only=False,
232
+ cache_dir=None,
233
  ).to(device)
234
  vi_model.eval()
235
  sentiment_pipeline = pipeline("sentiment-analysis", model=vi_model, tokenizer=vi_tokenizer)
236
+ print("✅ Vietnamese model loaded successfully.")
237
 
238
+ print("🔄 Loading English model (RAM-only mode)...")
239
  en_tokenizer = AutoTokenizer.from_pretrained(
240
+ EN_MODEL_NAME,
241
+ local_files_only=False,
242
+ cache_dir=None,
243
  )
244
  en_model = AutoModelForSequenceClassification.from_pretrained(
245
+ EN_MODEL_NAME,
246
+ local_files_only=False,
247
+ cache_dir=None,
248
  ).to(device)
249
  en_model.eval()
250
+ print("✅ English model loaded successfully.")
251
 
252
  # -----------------------------
253
+ # Detect language
 
 
 
 
 
 
 
 
 
254
  # -----------------------------
255
  def detect_lang(text: str) -> str:
256
  try:
 
269
  # Vietnamese analysis
270
  # -----------------------------
271
  def analyze_vi(text: str):
 
 
272
  result = sentiment_pipeline(text)[0]
273
+ label_map = {"POS": "Tích cực", "NEG": "Tiêu cực", "NEU": "Trung tính"}
274
  label = result["label"]
275
  score = round(result["score"], 3)
276
  return {
277
  "language": "vi",
278
+ "label": label_map.get(label, label),
279
  "english_label": label,
280
  "score": score,
281
  "scores": {
 
316
  lang = (data.get("lang") or "auto").lower()
317
  if not text:
318
  return jsonify({"error": "Text is empty."}), 400
 
319
  if lang == "auto":
320
  lang = detect_lang(text)
 
321
  result = analyze_vi(text) if lang == "vi" else analyze_en(text)
322
  return jsonify({"ok": True, "input": {"text": text, "lang": lang}, "result": result})
323