# Hoang Kha
# Fix huggingface cache to /data for permission issues
# 6c510a2
# import os
# from flask import Flask, render_template, request, jsonify
# from langdetect import detect
# import torch
# import torch.nn.functional as F
# from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
# os.environ["HF_HOME"] = "/data/huggingface"
# os.environ["TRANSFORMERS_CACHE"] = "/data/huggingface"
# os.makedirs("/data/huggingface", exist_ok=True)
# os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
# os.environ["TRANSFORMERS_OFFLINE"] = "0"
# os.environ["HF_HUB_DISABLE_CACHE"] = "1"
# app = Flask(__name__)
# # --------- Models ----------
# VI_MODEL_NAME = "wonrax/phobert-base-vietnamese-sentiment"
# EN_MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"
# device = "cuda" if torch.cuda.is_available() else "cpu"
# # Vietnamese model
# # vi_tokenizer = AutoTokenizer.from_pretrained(VI_MODEL_NAME, use_fast=False)
# # vi_model = AutoModelForSequenceClassification.from_pretrained(VI_MODEL_NAME).to(device)
# # vi_model.eval()
# # vi_tokenizer = AutoTokenizer.from_pretrained(VI_MODEL_NAME, use_fast=False)
# # vi_model = AutoModelForSequenceClassification.from_pretrained(VI_MODEL_NAME)
# # vi_model.eval()
# # sentiment_pipeline = pipeline("sentiment-analysis", model=vi_model, tokenizer=vi_tokenizer)
# # # English model
# # en_tokenizer = AutoTokenizer.from_pretrained(EN_MODEL_NAME)
# # en_model = AutoModelForSequenceClassification.from_pretrained(EN_MODEL_NAME).to(device)
# # en_model.eval()
# print("Loading Vietnamese model from Hugging Face Hub (no cache)...")
# vi_tokenizer = AutoTokenizer.from_pretrained(VI_MODEL_NAME, use_fast=False, local_files_only=False)
# vi_model = AutoModelForSequenceClassification.from_pretrained(VI_MODEL_NAME, local_files_only=False)
# vi_model.eval()
# sentiment_pipeline = pipeline("sentiment-analysis", model=vi_model, tokenizer=vi_tokenizer)
# print("Loading English model from Hugging Face Hub (no cache)...")
# en_tokenizer = AutoTokenizer.from_pretrained(EN_MODEL_NAME, local_files_only=False)
# en_model = AutoModelForSequenceClassification.from_pretrained(EN_MODEL_NAME, local_files_only=False)
# en_model.eval()
# # Label mapping cho PhoBERT
# vi_label_map = {
# 0: ("NEGATIVE", "Tiêu cực"),
# 1: ("NEUTRAL", "Trung tính"),
# 2: ("POSITIVE", "Tích cực")
# }
# # Label mapping cho tiếng Anh
# en_label_map = {
# 0: ("NEGATIVE", "Negative"),
# 1: ("POSITIVE", "Positive")
# }
# # -----------------------------
# # Ngôn ngữ nhận diện
# # -----------------------------
# def detect_lang(text: str) -> str:
# try:
# lang = detect(text)
# if lang.startswith("vi"):
# return "vi"
# elif lang.startswith("en"):
# return "en"
# else:
# if any(ch in text for ch in "ăâđêôơưáàạảãấầậẩẫắằặẳẵéèẹẻẽếềệểễóòọỏõốồộổỗớờợởỡíìịỉĩúùụủũứừựửữýỳỵỷỹ"):
# return "vi"
# return "en"
# except Exception:
# if any(ch in text for ch in "ăâđêôơưáàạảãấầậẩẫắằặẳẵéèẹẻẽếềệểễóòọỏõốồộổỗớờợởỡíìịỉĩúùụủũứừựửữýỳỵỷỹ"):
# return "vi"
# return "en"
# # -----------------------------
# # Phân tích tiếng Việt (PhoBERT)
# # -----------------------------
# # def analyze_vi(text: str):
# # inputs = vi_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
# # with torch.no_grad():
# # outputs = vi_model(**inputs)
# # logits = outputs.logits.squeeze(0)
# # probs = torch.softmax(logits, dim=-1)
# # label_idx = int(torch.argmax(probs).item())
# # eng_label, vi_label = vi_label_map[label_idx]
# # confidence = float(probs[label_idx].item())
# # scores = {
# # vi_label_map[i][1]: round(float(probs[i].item()), 3) for i in range(3)
# # }
# # return {
# # "language": "vi",
# # "label": vi_label,
# # "english_label": eng_label,
# # "score": round(confidence, 3),
# # "scores": scores
# # }
# def analyze_vi(text: str):
# if not text.strip():
# return {"error": "Text is empty."}
# # Dùng pipeline của transformers
# result = sentiment_pipeline(text)[0]
# label = result["label"]
# score = round(result["score"], 3)
# # Map nhãn tiếng Việt
# label_map = {
# "POS": "Tích cực",
# "NEG": "Tiêu cực",
# "NEU": "Trung tính"
# }
# vi_label = label_map.get(label, label)
# # Trả kết quả tương thích với frontend
# return {
# "language": "vi",
# "label": vi_label,
# "english_label": label, # Giữ nhãn gốc POS/NEG/NEU
# "score": score,
# "scores": {
# "Tích cực": score if label == "POS" else 0.0,
# "Trung tính": score if label == "NEU" else 0.0,
# "Tiêu cực": score if label == "NEG" else 0.0
# }
# }
# # -----------------------------
# # Phân tích tiếng Anh
# # -----------------------------
# def analyze_en(text: str):
# inputs = en_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
# with torch.no_grad():
# outputs = en_model(**inputs)
# logits = outputs.logits.squeeze(0)
# probs = torch.softmax(logits, dim=-1)
# label_idx = int(torch.argmax(probs).item())
# eng_label, vi_label = en_label_map[label_idx]
# confidence = float(probs[label_idx].item())
# scores = {
# en_label_map[i][1]: round(float(probs[i].item()), 3) for i in range(2)
# }
# return {
# "language": "en",
# "label": vi_label, # Giữ English, có thể đổi sang tiếng Việt nếu muốn
# "english_label": eng_label,
# "score": round(confidence, 3),
# "scores": scores
# }
# # -----------------------------
# # Flask routes
# # -----------------------------
# @app.route("/", methods=["GET"])
# def home():
# return render_template("index.html")
# @app.route("/analyze", methods=["POST"])
# def analyze():
# data = request.get_json(force=True)
# text = (data.get("text") or "").strip()
# lang = (data.get("lang") or "auto").lower()
# if not text:
# return jsonify({"error": "Text is empty."}), 400
# if lang == "auto":
# lang = detect_lang(text)
# if lang == "vi":
# result = analyze_vi(text)
# else:
# result = analyze_en(text)
# return jsonify({
# "ok": True,
# "input": {"text": text, "lang": lang},
# "result": result
# })
# if __name__ == "__main__":
# port = int(os.environ.get("PORT", 7860))
# app.run(host="0.0.0.0", port=port)
import os
import tempfile

# ---------------------------------------------------------------------------
# Hugging Face cache configuration.
#
# NOTE(review): these environment variables MUST be set BEFORE `transformers`
# / `huggingface_hub` are imported — both read them at import time, so the
# previous revision (which assigned them after the imports) had no effect.
#
# The previous revision also pointed HF_HOME / TRANSFORMERS_CACHE at
# /dev/null; the hub then fails when it tries to create sub-directories
# under the cache root. Use a guaranteed-writable temp directory instead
# (a pre-set HF_HOME from the environment is respected).
# ---------------------------------------------------------------------------
_HF_CACHE_DIR = os.environ.get("HF_HOME") or os.path.join(
    tempfile.gettempdir(), "huggingface"
)
os.makedirs(_HF_CACHE_DIR, exist_ok=True)
os.environ["HF_HOME"] = _HF_CACHE_DIR
os.environ["TRANSFORMERS_CACHE"] = _HF_CACHE_DIR
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "0"
os.environ["DISABLE_TELEMETRY"] = "1"

# Imported after the cache env vars are in place (see note above).
from flask import Flask, render_template, request, jsonify
from langdetect import detect
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

app = Flask(__name__)

# --------- Models ----------
VI_MODEL_NAME = "wonrax/phobert-base-vietnamese-sentiment"
EN_MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"
device = "cuda" if torch.cuda.is_available() else "cpu"

print("🔄 Loading Vietnamese model...")
vi_tokenizer = AutoTokenizer.from_pretrained(
    VI_MODEL_NAME,
    use_fast=False,  # PhoBERT needs the slow tokenizer implementation
    local_files_only=False,
)
vi_model = AutoModelForSequenceClassification.from_pretrained(
    VI_MODEL_NAME,
    local_files_only=False,
).to(device)
vi_model.eval()
sentiment_pipeline = pipeline("sentiment-analysis", model=vi_model, tokenizer=vi_tokenizer)
print("✅ Vietnamese model loaded successfully.")

print("🔄 Loading English model...")
en_tokenizer = AutoTokenizer.from_pretrained(EN_MODEL_NAME, local_files_only=False)
en_model = AutoModelForSequenceClassification.from_pretrained(
    EN_MODEL_NAME,
    local_files_only=False,
).to(device)
en_model.eval()
print("✅ English model loaded successfully.")
# -----------------------------
# Detect language
# -----------------------------
def detect_lang(text: str) -> str:
    """Guess whether *text* is Vietnamese ("vi") or English ("en").

    Tries langdetect first; when it returns something other than vi/en, or
    raises (e.g. on very short input), falls back to scanning for Vietnamese
    diacritic characters.
    """
    vi_diacritics = "ăâđêôơưáàạảãấầậẩẫắằặẳẵéèẹẻẽếềệểễóòọỏõốồộổỗớờợởỡíìịỉĩúùụủũứừựửữýỳỵỷỹ"
    guess = None
    try:
        guess = detect(text)
    except Exception:
        guess = None  # langdetect can fail on short/ambiguous text
    if guess is not None:
        for code in ("vi", "en"):
            if guess.startswith(code):
                return code
    # Fallback heuristic: any Vietnamese diacritic marks the text as "vi".
    return "vi" if any(ch in vi_diacritics for ch in text) else "en"
# -----------------------------
# Vietnamese analysis
# -----------------------------
def analyze_vi(text: str):
    """Classify Vietnamese *text* with the PhoBERT sentiment pipeline.

    Returns a dict with the detected language, a Vietnamese display label,
    the raw model label, the top-class probability, and a per-label score map.
    """
    prediction = sentiment_pipeline(text)[0]
    raw_label = prediction["label"]
    confidence = round(prediction["score"], 3)
    display_names = {"POS": "Tích cực", "NEG": "Tiêu cực", "NEU": "Trung tính"}
    # NOTE(review): the pipeline reports only the winning class, so the two
    # losing entries in `scores` are zeroed rather than true probabilities.
    return {
        "language": "vi",
        "label": display_names.get(raw_label, raw_label),
        "english_label": raw_label,
        "score": confidence,
        "scores": {
            "Tích cực": confidence if raw_label == "POS" else 0.0,
            "Trung tính": confidence if raw_label == "NEU" else 0.0,
            "Tiêu cực": confidence if raw_label == "NEG" else 0.0,
        },
    }
# -----------------------------
# English analysis
# -----------------------------
def analyze_en(text: str):
    """Classify English *text* with the DistilBERT SST-2 model.

    Returns a dict shaped like analyze_vi()'s result so the frontend can
    treat both languages uniformly: language, display label, english_label
    (canonical upper-case tag, restored for schema consistency with
    analyze_vi), top-class probability, and the full score distribution.
    """
    inputs = en_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        logits = en_model(**inputs).logits.squeeze(0)
    probs = torch.softmax(logits, dim=-1)
    label_idx = int(torch.argmax(probs))
    labels = ["Negative", "Positive"]
    return {
        "language": "en",
        "label": labels[label_idx],
        # analyze_vi() includes "english_label"; include it here too so the
        # /analyze response schema does not depend on the input language.
        "english_label": labels[label_idx].upper(),
        "score": round(float(probs[label_idx]), 3),
        "scores": {labels[i]: round(float(probs[i]), 3) for i in range(2)},
    }
# -----------------------------
# Flask routes
# -----------------------------
@app.route("/", methods=["GET"])
def home():
    """Serve the single-page frontend."""
    page = render_template("index.html")
    return page
@app.route("/analyze", methods=["POST"])
def analyze():
    """Analyze the sentiment of a JSON payload: {"text": ..., "lang": ...}.

    "lang" may be "vi", "en", or "auto" (default); on "auto" the language is
    detected from the text. Returns 400 when the text is empty.
    """
    payload = request.get_json(force=True)
    text = (payload.get("text") or "").strip()
    lang = (payload.get("lang") or "auto").lower()
    if not text:
        return jsonify({"error": "Text is empty."}), 400
    if lang == "auto":
        lang = detect_lang(text)
    if lang == "vi":
        result = analyze_vi(text)
    else:
        result = analyze_en(text)
    return jsonify({
        "ok": True,
        "input": {"text": text, "lang": lang},
        "result": result,
    })
if __name__ == "__main__":
    # 7860 is the usual Hugging Face Spaces port — presumably deployed there;
    # the PORT env var overrides it.
    listen_port = int(os.environ.get("PORT", 7860))
    app.run(host="0.0.0.0", port=listen_port)