Spaces:

intelli-zen
/

language_identification

Sleeping

App Files Files Community

HoneyTian committed on Apr 28, 2024

Commit

e6fd0e8

1 Parent(s): 820797e

update

Browse files

Files changed (6) hide show

.gitignore +1 -1
Dockerfile +3 -1
install.sh +56 -0
language_identification.md +13 -0
main.py +39 -58
requirements.txt +3 -2

.gitignore CHANGED Viewed

@@ -3,7 +3,7 @@
 .idea/
 #data/
-#pretrained_models/
 temp/
 **/cache/

 .idea/
 #data/
+pretrained_models/
 temp/
 **/cache/

Dockerfile CHANGED Viewed

@@ -1,7 +1,7 @@
 # read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
 # you will also find guides on how best to write your Dockerfile
-FROM python:3.8
 WORKDIR /code
@@ -27,4 +27,6 @@ WORKDIR $HOME/app
 # Copy the current directory contents into the container at $HOME/app setting the owner to the user
 COPY --chown=user . $HOME/app
 CMD ["python", "main.py"]

 # read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
 # you will also find guides on how best to write your Dockerfile
+FROM python:3.6
 WORKDIR /code
 # Copy the current directory contents into the container at $HOME/app setting the owner to the user
 COPY --chown=user . $HOME/app
+RUN bash -c 'bash install.sh --stage 1 --stop_stage 1 --system_version ubuntu'
 CMD ["python", "main.py"]

install.sh ADDED Viewed

	@@ -0,0 +1,56 @@

#!/usr/bin/env bash
# Download the pretrained fastText language-identification models into
# ./pretrained_models (relative to the directory the script is run from).
#
# Usage:
#   bash install.sh --stage 1 --stop_stage 1 --system_version centos
#
# Every --option must name a variable that is declared in the defaults
# section below; anything else is rejected as an invalid option.

# defaults (overridable from the command line)
verbose=true;
stage=-1
stop_stage=2
# Must be declared here: the option parser only accepts options whose variable
# already exists, and callers pass --system_version.
system_version="centos"

work_dir="$(pwd)"

# parse options
while true; do
  [ -z "${1:-}" ] && break;  # break if there are no arguments
  case "$1" in
    --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
      # Reject options whose variable has no default declared above.
      eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
      # `shift 2` fails without shifting when the value is missing, which
      # would make this loop spin forever on the same option.
      if [ $# -lt 2 ]; then
        echo "$0: option $1 requires a value" 1>&2
        exit 1;
      fi
      # Current value of the target variable (indirect expansion); used to
      # detect Boolean-valued options.
      old_value="${!name}";
      if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then
        was_bool=true;
      else
        was_bool=false;
      fi
      # Set the variable to the right value-- the escaped quotes make it work if
      # the option had spaces, like --cmd "queue.pl -sync y"
      eval "${name}=\"$2\"";
      # Check that Boolean-valued arguments are really Boolean.
      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1;
      fi
      shift 2;
      ;;
    *) break;
  esac
done

$verbose && echo "system_version: ${system_version}"

pretrained_models_dir="$(pwd)/pretrained_models"
mkdir -p "${pretrained_models_dir}"

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  $verbose && echo "stage 1: download fasttext models"
  cd "${pretrained_models_dir}" || exit 1;
  wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
  wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz
fi

language_identification.md CHANGED Viewed

@@ -16,3 +16,16 @@ https://github.com/saffsd/langid.py/tree/master/langid/train
 4. 训练 NB (Naive Bayes) 概率模型, 即每个 item 对每个类型的概率贡献.
 ```

 4. 训练 NB (Naive Bayes) 概率模型, 即每个 item 对每个类型的概率贡献.
 ```
+### fasttext
+识别 176 种语言。
+https://fasttext.cc/docs/en/language-identification.html
+### 参考
+```text
+https://zhuanlan.zhihu.com/p/600245782
+```

main.py CHANGED Viewed

@@ -6,14 +6,12 @@ https://huggingface.co/spaces/sayakpaul/demo-docker-gradio
 import argparse
 import json
 import platform
-from typing import Tuple
 import gradio as gr
-import langid
 from langid.langid import LanguageIdentifier, model
-import matplotlib.pyplot as plt
-import numpy as np
-from PIL import Image
 from project_settings import project_path, temp_directory
@@ -30,29 +28,40 @@ def get_args():
         default=(project_path / "lang_id_examples.json").as_posix(),
         type=str
     )
     args = parser.parse_args()
     return args
-lang_id_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
 def click_lang_id_button(text: str, ground_true: str, model_name: str):
     global lang_id_identifier
     if model_name == "langid":
         label, prob = lang_id_identifier.classify(text)
     else:
         label = "model_name not available."
-        prob = 0.0
-    return label, round(prob, 4)
 def main():
     args = get_args()
     brief_description = """
-    ### Language Identification
     """
     # description
@@ -63,56 +72,28 @@ def main():
     with open(args.lang_id_examples_file, "r", encoding="utf-8") as f:
         lang_id_examples = json.load(f)
-    # ui
-    with gr.Blocks() as blocks:
-        gr.Markdown(value=brief_description)
-        with gr.Row():
-            with gr.Column(scale=5):
-                with gr.Tabs():
-                    with gr.TabItem("lang_id"):
-                        gr.Markdown(value="")
-                        with gr.Row():
-                            with gr.Column(scale=1):
-                                lang_id_text = gr.Textbox(lines=2, max_lines=50, label="text")
-                                lang_id_ground_true = gr.Textbox(label="ground_true")
-                                lang_id_model_name = gr.Dropdown(choices=["langid"], value="langid", label="model_name")
-                                lang_id_button = gr.Button("run", variant="primary")
-                            with gr.Column(scale=1):
-                                lang_id_label = gr.Textbox(label="label")
-                                lang_id_prob = gr.Number(label="prob")
-                        gr.Examples(
-                            examples=lang_id_examples,
-                            inputs=[
-                                lang_id_text,
-                                lang_id_ground_true,
-                                lang_id_model_name,
-                            ],
-                            outputs=[lang_id_label, lang_id_prob],
-                            fn=click_lang_id_button
-                        )
-                        # click event
-                        lang_id_button.click(
-                            click_lang_id_button,
-                            inputs=[
-                                lang_id_text,
-                                lang_id_ground_true,
-                                lang_id_model_name,
-                            ],
-                            outputs=[lang_id_label, lang_id_prob],
-                        )
-                    gr.Markdown(value=description)
-    blocks.queue().launch(
         share=False if platform.system() == "Windows" else False,
-        server_name="127.0.0.1" if platform.system() == "Windows" else "0.0.0.0",
-        server_port=7860
     )
     return

 import argparse
 import json
 import platform
+import fasttext
+from fasttext.FastText import load_model, _FastText
 import gradio as gr
+from gradio import inputs, outputs
 from langid.langid import LanguageIdentifier, model
 from project_settings import project_path, temp_directory
         default=(project_path / "lang_id_examples.json").as_posix(),
         type=str
     )
+    parser.add_argument(
+        "--fasttext_model",
+        default=(project_path / "pretrained_models/lid.176.bin").as_posix(),
+        type=str
+    )
     args = parser.parse_args()
     return args
+lang_id_identifier: LanguageIdentifier = None
+fasttext_model: _FastText = None
 def click_lang_id_button(text: str, ground_true: str, model_name: str):
     """Identify the language of ``text`` with the selected model.

     Args:
         text: The text to classify. It may span several lines.
         ground_true: Expected label supplied by the UI/examples; it is not
             used for the prediction.
         model_name: ``"langid"`` or ``"fasttext"``.

     Returns:
         A ``(label, prob)`` tuple where ``prob`` is the probability rounded
         to four decimals and rendered as a string. For an unknown
         ``model_name`` the label is an explanatory message and ``prob`` is
         ``"-1"``.
     """
     global lang_id_identifier
     global fasttext_model

     fasttext_label_prefix = "__label__"

     if model_name == "langid":
         label, prob = lang_id_identifier.classify(text)
     elif model_name == "fasttext":
         # fastText's predict() handles one line at a time and raises
         # ValueError when the input contains a newline, so fold multi-line
         # input into a single line first.
         single_line = " ".join(text.splitlines())
         labels, probs = fasttext_model.predict(single_line, k=1)
         label = labels[0]
         # Predictions come back as "__label__<code>"; keep only the code.
         if label.startswith(fasttext_label_prefix):
             label = label[len(fasttext_label_prefix):]
         prob = float(probs[0])
     else:
         label = "model_name not available."
         prob = -1
     return label, str(round(prob, 4))
 def main():
     args = get_args()
     brief_description = """
+    Language Identification
     """
     # description
     with open(args.lang_id_examples_file, "r", encoding="utf-8") as f:
         lang_id_examples = json.load(f)
+    global lang_id_identifier
+    global fasttext_model
+    lang_id_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
+    fasttext_model = fasttext.load_model(args.fasttext_model)
+    blocks = gr.Interface(
+        click_lang_id_button,
+        inputs=[
+            inputs.Textbox(lines=3, label="text"),
+            inputs.Textbox(label="ground_true"),
+            inputs.Dropdown(choices=["langid", "fasttext"], default="langid", label="model_name"),
+        ],
+        outputs=[
+            outputs.Textbox(label="label"),
+            outputs.Textbox(label="prob"),
+        ],
+        examples=lang_id_examples,
+        description=brief_description
+    )
+    blocks.launch(
         share=False if platform.system() == "Windows" else False,
     )
     return

requirements.txt CHANGED Viewed

@@ -1,2 +1,3 @@
-gradio==4.28.3
-langid==1.1.6

+gradio==2.1.1
+langid==1.1.6
+fasttext==0.9.2