Hoang Kha committed on
Commit
0514b67
·
1 Parent(s): 1482be8

Fix huggingface cache to /data for permission issues

Browse files
Files changed (2) hide show
  1. Dockerfile +3 -3
  2. main.py +250 -117
Dockerfile CHANGED
@@ -9,8 +9,8 @@ RUN pip install --no-cache-dir -r requirements.txt
9
 
10
  EXPOSE 7860
11
 
12
- ENV HF_HOME=/data/huggingface
13
- ENV TRANSFORMERS_CACHE=/data/huggingface
14
- RUN mkdir -p /data/huggingface
15
 
16
  CMD ["python", "main.py"]
 
9
 
10
  EXPOSE 7860
11
 
12
+ ENV HF_HUB_DISABLE_CACHE=1
13
+ ENV HF_HUB_DISABLE_SYMLINKS_WARNING=1
14
+ ENV TRANSFORMERS_OFFLINE=0
15
 
16
  CMD ["python", "main.py"]
main.py CHANGED
@@ -1,15 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  from flask import Flask, render_template, request, jsonify
3
  from langdetect import detect
4
  import torch
5
  import torch.nn.functional as F
6
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
7
- os.environ["HF_HOME"] = "/data/huggingface"
8
- os.environ["TRANSFORMERS_CACHE"] = "/data/huggingface"
9
- os.makedirs("/data/huggingface", exist_ok=True)
 
10
  os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
11
  os.environ["TRANSFORMERS_OFFLINE"] = "0"
12
- os.environ["HF_HUB_DISABLE_CACHE"] = "1"
 
13
 
14
  app = Flask(__name__)
15
 
@@ -19,47 +220,38 @@ EN_MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"
19
 
20
  device = "cuda" if torch.cuda.is_available() else "cpu"
21
 
22
- # Vietnamese model
23
- # vi_tokenizer = AutoTokenizer.from_pretrained(VI_MODEL_NAME, use_fast=False)
24
- # vi_model = AutoModelForSequenceClassification.from_pretrained(VI_MODEL_NAME).to(device)
25
- # vi_model.eval()
26
- # vi_tokenizer = AutoTokenizer.from_pretrained(VI_MODEL_NAME, use_fast=False)
27
- # vi_model = AutoModelForSequenceClassification.from_pretrained(VI_MODEL_NAME)
28
- # vi_model.eval()
29
- # sentiment_pipeline = pipeline("sentiment-analysis", model=vi_model, tokenizer=vi_tokenizer)
30
-
31
-
32
- # # English model
33
- # en_tokenizer = AutoTokenizer.from_pretrained(EN_MODEL_NAME)
34
- # en_model = AutoModelForSequenceClassification.from_pretrained(EN_MODEL_NAME).to(device)
35
- # en_model.eval()
36
-
37
- print("🔄 Loading Vietnamese model from Hugging Face Hub (no cache)...")
38
- vi_tokenizer = AutoTokenizer.from_pretrained(VI_MODEL_NAME, use_fast=False, local_files_only=False)
39
- vi_model = AutoModelForSequenceClassification.from_pretrained(VI_MODEL_NAME, local_files_only=False)
40
  vi_model.eval()
41
  sentiment_pipeline = pipeline("sentiment-analysis", model=vi_model, tokenizer=vi_tokenizer)
42
-
43
- print("🔄 Loading English model from Hugging Face Hub (no cache)...")
44
- en_tokenizer = AutoTokenizer.from_pretrained(EN_MODEL_NAME, local_files_only=False)
45
- en_model = AutoModelForSequenceClassification.from_pretrained(EN_MODEL_NAME, local_files_only=False)
 
 
 
 
 
46
  en_model.eval()
47
- # Label mapping cho PhoBERT
48
- vi_label_map = {
49
- 0: ("NEGATIVE", "Tiêu cực"),
50
- 1: ("NEUTRAL", "Trung tính"),
51
- 2: ("POSITIVE", "Tích cực")
52
- }
53
 
54
- # Label mapping cho tiếng Anh
55
- en_label_map = {
56
- 0: ("NEGATIVE", "Negative"),
57
- 1: ("POSITIVE", "Positive")
 
 
 
58
  }
59
 
60
-
61
  # -----------------------------
62
- # Ngôn ngữ nhận diện
63
  # -----------------------------
64
  def detect_lang(text: str) -> str:
65
  try:
@@ -68,99 +260,50 @@ def detect_lang(text: str) -> str:
68
  return "vi"
69
  elif lang.startswith("en"):
70
  return "en"
71
- else:
72
- if any(ch in text for ch in "ăâđêôơưáàạảãấầậẩẫắằặẳẵéèẹẻẽếềệểễóòọỏõốồộổỗớờợởỡíìịỉĩúùụủũứừựửữýỳỵỷỹ"):
73
- return "vi"
74
- return "en"
75
  except Exception:
76
- if any(ch in text for ch in "ăâđêôơưáàạảãấầậẩẫắằặẳẵéèẹẻẽếềệểễóòọỏõốồộổỗớờợởỡíìịỉĩúùụủũứừựửữýỳỵỷỹ"):
77
- return "vi"
78
- return "en"
79
-
80
 
81
  # -----------------------------
82
- # Phân tích tiếng Việt (PhoBERT)
83
  # -----------------------------
84
- # def analyze_vi(text: str):
85
- # inputs = vi_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
86
- # with torch.no_grad():
87
- # outputs = vi_model(**inputs)
88
- # logits = outputs.logits.squeeze(0)
89
- # probs = torch.softmax(logits, dim=-1)
90
-
91
- # label_idx = int(torch.argmax(probs).item())
92
- # eng_label, vi_label = vi_label_map[label_idx]
93
- # confidence = float(probs[label_idx].item())
94
-
95
- # scores = {
96
- # vi_label_map[i][1]: round(float(probs[i].item()), 3) for i in range(3)
97
- # }
98
-
99
- # return {
100
- # "language": "vi",
101
- # "label": vi_label,
102
- # "english_label": eng_label,
103
- # "score": round(confidence, 3),
104
- # "scores": scores
105
- # }
106
-
107
  def analyze_vi(text: str):
108
  if not text.strip():
109
- return {"error": "Text is empty."}
110
-
111
- # Dùng pipeline của transformers
112
  result = sentiment_pipeline(text)[0]
113
  label = result["label"]
114
  score = round(result["score"], 3)
115
-
116
- # Map nhãn tiếng Việt
117
- label_map = {
118
- "POS": "Tích cực",
119
- "NEG": "Tiêu cực",
120
- "NEU": "Trung tính"
121
- }
122
-
123
- vi_label = label_map.get(label, label)
124
-
125
- # Trả kết quả tương thích với frontend
126
  return {
127
  "language": "vi",
128
- "label": vi_label,
129
- "english_label": label, # Giữ nhãn gốc POS/NEG/NEU
130
  "score": score,
131
  "scores": {
132
  "Tích cực": score if label == "POS" else 0.0,
133
  "Trung tính": score if label == "NEU" else 0.0,
134
- "Tiêu cực": score if label == "NEG" else 0.0
135
- }
136
  }
 
137
  # -----------------------------
138
- # Phân tích tiếng Anh
139
  # -----------------------------
140
  def analyze_en(text: str):
141
  inputs = en_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
142
  with torch.no_grad():
143
- outputs = en_model(**inputs)
144
- logits = outputs.logits.squeeze(0)
145
  probs = torch.softmax(logits, dim=-1)
146
-
147
- label_idx = int(torch.argmax(probs).item())
148
- eng_label, vi_label = en_label_map[label_idx]
149
- confidence = float(probs[label_idx].item())
150
-
151
- scores = {
152
- en_label_map[i][1]: round(float(probs[i].item()), 3) for i in range(2)
153
- }
154
-
155
  return {
156
  "language": "en",
157
- "label": vi_label, # Giữ English, có thể đổi sang tiếng Việt nếu muốn
158
- "english_label": eng_label,
159
- "score": round(confidence, 3),
160
- "scores": scores
161
  }
162
 
163
-
164
  # -----------------------------
165
  # Flask routes
166
  # -----------------------------
@@ -168,7 +311,6 @@ def analyze_en(text: str):
168
  def home():
169
  return render_template("index.html")
170
 
171
-
172
  @app.route("/analyze", methods=["POST"])
173
  def analyze():
174
  data = request.get_json(force=True)
@@ -180,17 +322,8 @@ def analyze():
180
  if lang == "auto":
181
  lang = detect_lang(text)
182
 
183
- if lang == "vi":
184
- result = analyze_vi(text)
185
- else:
186
- result = analyze_en(text)
187
-
188
- return jsonify({
189
- "ok": True,
190
- "input": {"text": text, "lang": lang},
191
- "result": result
192
- })
193
-
194
 
195
  if __name__ == "__main__":
196
  port = int(os.environ.get("PORT", 7860))
 
1
+ # import os
2
+ # from flask import Flask, render_template, request, jsonify
3
+ # from langdetect import detect
4
+ # import torch
5
+ # import torch.nn.functional as F
6
+ # from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
7
+ # os.environ["HF_HOME"] = "/data/huggingface"
8
+ # os.environ["TRANSFORMERS_CACHE"] = "/data/huggingface"
9
+ # os.makedirs("/data/huggingface", exist_ok=True)
10
+ # os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
11
+ # os.environ["TRANSFORMERS_OFFLINE"] = "0"
12
+ # os.environ["HF_HUB_DISABLE_CACHE"] = "1"
13
+
14
+ # app = Flask(__name__)
15
+
16
+ # # --------- Models ----------
17
+ # VI_MODEL_NAME = "wonrax/phobert-base-vietnamese-sentiment"
18
+ # EN_MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"
19
+
20
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
21
+
22
+ # # Vietnamese model
23
+ # # vi_tokenizer = AutoTokenizer.from_pretrained(VI_MODEL_NAME, use_fast=False)
24
+ # # vi_model = AutoModelForSequenceClassification.from_pretrained(VI_MODEL_NAME).to(device)
25
+ # # vi_model.eval()
26
+ # # vi_tokenizer = AutoTokenizer.from_pretrained(VI_MODEL_NAME, use_fast=False)
27
+ # # vi_model = AutoModelForSequenceClassification.from_pretrained(VI_MODEL_NAME)
28
+ # # vi_model.eval()
29
+ # # sentiment_pipeline = pipeline("sentiment-analysis", model=vi_model, tokenizer=vi_tokenizer)
30
+
31
+
32
+ # # # English model
33
+ # # en_tokenizer = AutoTokenizer.from_pretrained(EN_MODEL_NAME)
34
+ # # en_model = AutoModelForSequenceClassification.from_pretrained(EN_MODEL_NAME).to(device)
35
+ # # en_model.eval()
36
+
37
+ # print("Loading Vietnamese model from Hugging Face Hub (no cache)...")
38
+ # vi_tokenizer = AutoTokenizer.from_pretrained(VI_MODEL_NAME, use_fast=False, local_files_only=False)
39
+ # vi_model = AutoModelForSequenceClassification.from_pretrained(VI_MODEL_NAME, local_files_only=False)
40
+ # vi_model.eval()
41
+ # sentiment_pipeline = pipeline("sentiment-analysis", model=vi_model, tokenizer=vi_tokenizer)
42
+
43
+ # print("Loading English model from Hugging Face Hub (no cache)...")
44
+ # en_tokenizer = AutoTokenizer.from_pretrained(EN_MODEL_NAME, local_files_only=False)
45
+ # en_model = AutoModelForSequenceClassification.from_pretrained(EN_MODEL_NAME, local_files_only=False)
46
+ # en_model.eval()
47
+ # # Label mapping cho PhoBERT
48
+ # vi_label_map = {
49
+ # 0: ("NEGATIVE", "Tiêu cực"),
50
+ # 1: ("NEUTRAL", "Trung tính"),
51
+ # 2: ("POSITIVE", "Tích cực")
52
+ # }
53
+
54
+ # # Label mapping cho tiếng Anh
55
+ # en_label_map = {
56
+ # 0: ("NEGATIVE", "Negative"),
57
+ # 1: ("POSITIVE", "Positive")
58
+ # }
59
+
60
+
61
+ # # -----------------------------
62
+ # # Ngôn ngữ nhận diện
63
+ # # -----------------------------
64
+ # def detect_lang(text: str) -> str:
65
+ # try:
66
+ # lang = detect(text)
67
+ # if lang.startswith("vi"):
68
+ # return "vi"
69
+ # elif lang.startswith("en"):
70
+ # return "en"
71
+ # else:
72
+ # if any(ch in text for ch in "ăâđêôơưáàạảãấầậẩẫắằặẳẵéèẹẻẽếềệểễóòọỏõốồộổỗớờợởỡíìịỉĩúùụủũứừựửữýỳỵỷỹ"):
73
+ # return "vi"
74
+ # return "en"
75
+ # except Exception:
76
+ # if any(ch in text for ch in "ăâđêôơưáàạảãấầậẩẫắằặẳẵéèẹẻẽếềệểễóòọỏõốồộổỗớờợởỡíìịỉĩúùụủũứừựửữýỳỵỷỹ"):
77
+ # return "vi"
78
+ # return "en"
79
+
80
+
81
+ # # -----------------------------
82
+ # # Phân tích tiếng Việt (PhoBERT)
83
+ # # -----------------------------
84
+ # # def analyze_vi(text: str):
85
+ # # inputs = vi_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
86
+ # # with torch.no_grad():
87
+ # # outputs = vi_model(**inputs)
88
+ # # logits = outputs.logits.squeeze(0)
89
+ # # probs = torch.softmax(logits, dim=-1)
90
+
91
+ # # label_idx = int(torch.argmax(probs).item())
92
+ # # eng_label, vi_label = vi_label_map[label_idx]
93
+ # # confidence = float(probs[label_idx].item())
94
+
95
+ # # scores = {
96
+ # # vi_label_map[i][1]: round(float(probs[i].item()), 3) for i in range(3)
97
+ # # }
98
+
99
+ # # return {
100
+ # # "language": "vi",
101
+ # # "label": vi_label,
102
+ # # "english_label": eng_label,
103
+ # # "score": round(confidence, 3),
104
+ # # "scores": scores
105
+ # # }
106
+
107
+ # def analyze_vi(text: str):
108
+ # if not text.strip():
109
+ # return {"error": "Text is empty."}
110
+
111
+ # # Dùng pipeline của transformers
112
+ # result = sentiment_pipeline(text)[0]
113
+ # label = result["label"]
114
+ # score = round(result["score"], 3)
115
+
116
+ # # Map nhãn tiếng Việt
117
+ # label_map = {
118
+ # "POS": "Tích cực",
119
+ # "NEG": "Tiêu cực",
120
+ # "NEU": "Trung tính"
121
+ # }
122
+
123
+ # vi_label = label_map.get(label, label)
124
+
125
+ # # Trả kết quả tương thích với frontend
126
+ # return {
127
+ # "language": "vi",
128
+ # "label": vi_label,
129
+ # "english_label": label, # Giữ nhãn gốc POS/NEG/NEU
130
+ # "score": score,
131
+ # "scores": {
132
+ # "Tích cực": score if label == "POS" else 0.0,
133
+ # "Trung tính": score if label == "NEU" else 0.0,
134
+ # "Tiêu cực": score if label == "NEG" else 0.0
135
+ # }
136
+ # }
137
+ # # -----------------------------
138
+ # # Phân tích tiếng Anh
139
+ # # -----------------------------
140
+ # def analyze_en(text: str):
141
+ # inputs = en_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
142
+ # with torch.no_grad():
143
+ # outputs = en_model(**inputs)
144
+ # logits = outputs.logits.squeeze(0)
145
+ # probs = torch.softmax(logits, dim=-1)
146
+
147
+ # label_idx = int(torch.argmax(probs).item())
148
+ # eng_label, vi_label = en_label_map[label_idx]
149
+ # confidence = float(probs[label_idx].item())
150
+
151
+ # scores = {
152
+ # en_label_map[i][1]: round(float(probs[i].item()), 3) for i in range(2)
153
+ # }
154
+
155
+ # return {
156
+ # "language": "en",
157
+ # "label": vi_label, # Giữ English, có thể đổi sang tiếng Việt nếu muốn
158
+ # "english_label": eng_label,
159
+ # "score": round(confidence, 3),
160
+ # "scores": scores
161
+ # }
162
+
163
+
164
+ # # -----------------------------
165
+ # # Flask routes
166
+ # # -----------------------------
167
+ # @app.route("/", methods=["GET"])
168
+ # def home():
169
+ # return render_template("index.html")
170
+
171
+
172
+ # @app.route("/analyze", methods=["POST"])
173
+ # def analyze():
174
+ # data = request.get_json(force=True)
175
+ # text = (data.get("text") or "").strip()
176
+ # lang = (data.get("lang") or "auto").lower()
177
+ # if not text:
178
+ # return jsonify({"error": "Text is empty."}), 400
179
+
180
+ # if lang == "auto":
181
+ # lang = detect_lang(text)
182
+
183
+ # if lang == "vi":
184
+ # result = analyze_vi(text)
185
+ # else:
186
+ # result = analyze_en(text)
187
+
188
+ # return jsonify({
189
+ # "ok": True,
190
+ # "input": {"text": text, "lang": lang},
191
+ # "result": result
192
+ # })
193
+
194
+
195
+ # if __name__ == "__main__":
196
+ # port = int(os.environ.get("PORT", 7860))
197
+ # app.run(host="0.0.0.0", port=port)
198
+
199
+
200
  import os
201
  from flask import Flask, render_template, request, jsonify
202
  from langdetect import detect
203
  import torch
204
  import torch.nn.functional as F
205
  from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
206
+
207
+ # Tắt toàn bộ cache và ghi đĩa
208
+ os.environ["HF_HUB_DISABLE_CACHE"] = "1"
209
+ os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
210
  os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
211
  os.environ["TRANSFORMERS_OFFLINE"] = "0"
212
+ os.environ["HF_DATASETS_OFFLINE"] = "1"
213
+ os.environ["DISABLE_TELEMETRY"] = "1"
214
 
215
  app = Flask(__name__)
216
 
 
220
 
221
  device = "cuda" if torch.cuda.is_available() else "cpu"
222
 
223
def _load_classifier(model_name, **tokenizer_options):
    """Load the tokenizer and sequence-classification model for *model_name*.

    The model is moved to ``device`` and switched to eval mode before being
    returned.  Extra keyword arguments are forwarded to the tokenizer loader
    only.

    NOTE(review): ``cache_dir=None`` and ``local_files_only=False`` are passed
    through exactly as before; they presumably equal the library defaults, so
    nothing here disables on-disk caching despite the "memory-only" wording in
    the log messages below - confirm against the transformers documentation.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, cache_dir=None, local_files_only=False, **tokenizer_options
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, cache_dir=None, local_files_only=False
    )
    model = model.to(device)
    model.eval()
    return tokenizer, model


# Vietnamese: slow tokenizer, wrapped in a ready-to-call sentiment pipeline.
print("🔄 Loading Vietnamese model (memory-only mode)...")
vi_tokenizer, vi_model = _load_classifier(VI_MODEL_NAME, use_fast=False)
sentiment_pipeline = pipeline("sentiment-analysis", model=vi_model, tokenizer=vi_tokenizer)
print("✅ Vietnamese model loaded!")

# English: tokenizer and model are used directly by analyze_en.
print("🔄 Loading English model (memory-only mode)...")
en_tokenizer, en_model = _load_classifier(EN_MODEL_NAME)
print("✅ English model loaded!")
 
 
 
 
 
243
 
244
+ # -----------------------------
245
+ # Label mapping
246
+ # -----------------------------
247
+ vi_label_map = {
248
+ "POS": "Tích cực",
249
+ "NEG": "Tiêu cực",
250
+ "NEU": "Trung tính"
251
  }
252
 
 
253
  # -----------------------------
254
+ # Language detection
255
  # -----------------------------
256
  def detect_lang(text: str) -> str:
257
  try:
 
260
  return "vi"
261
  elif lang.startswith("en"):
262
  return "en"
 
 
 
 
263
  except Exception:
264
+ pass
265
+ if any(ch in text for ch in "ăâđêôơưáàạảãấầậẩẫắằặẳẵéèẹẻẽếềệểễóòọỏõốồộổỗớờợởỡíìịỉĩúùụủũứừựửữýỳỵỷỹ"):
266
+ return "vi"
267
+ return "en"
268
 
269
  # -----------------------------
270
+ # Vietnamese analysis
271
  # -----------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  def analyze_vi(text: str):
273
  if not text.strip():
274
+ return {"error": "Empty text."}
 
 
275
  result = sentiment_pipeline(text)[0]
276
  label = result["label"]
277
  score = round(result["score"], 3)
 
 
 
 
 
 
 
 
 
 
 
278
  return {
279
  "language": "vi",
280
+ "label": vi_label_map.get(label, label),
281
+ "english_label": label,
282
  "score": score,
283
  "scores": {
284
  "Tích cực": score if label == "POS" else 0.0,
285
  "Trung tính": score if label == "NEU" else 0.0,
286
+ "Tiêu cực": score if label == "NEG" else 0.0,
287
+ },
288
  }
289
+
290
  # -----------------------------
291
+ # English analysis
292
  # -----------------------------
293
  def analyze_en(text: str):
294
  inputs = en_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
295
  with torch.no_grad():
296
+ logits = en_model(**inputs).logits.squeeze(0)
 
297
  probs = torch.softmax(logits, dim=-1)
298
+ label_idx = int(torch.argmax(probs))
299
+ labels = ["Negative", "Positive"]
 
 
 
 
 
 
 
300
  return {
301
  "language": "en",
302
+ "label": labels[label_idx],
303
+ "score": round(float(probs[label_idx]), 3),
304
+ "scores": {labels[i]: round(float(probs[i]), 3) for i in range(2)},
 
305
  }
306
 
 
307
  # -----------------------------
308
  # Flask routes
309
  # -----------------------------
 
311
  def home():
312
  return render_template("index.html")
313
 
 
314
  @app.route("/analyze", methods=["POST"])
315
  def analyze():
316
  data = request.get_json(force=True)
 
322
  if lang == "auto":
323
  lang = detect_lang(text)
324
 
325
+ result = analyze_vi(text) if lang == "vi" else analyze_en(text)
326
+ return jsonify({"ok": True, "input": {"text": text, "lang": lang}, "result": result})
 
 
 
 
 
 
 
 
 
327
 
328
  if __name__ == "__main__":
329
  port = int(os.environ.get("PORT", 7860))