Spaces:
Sleeping
Sleeping
update
Browse files- .gitignore +1 -1
- Dockerfile +3 -1
- install.sh +56 -0
- language_identification.md +13 -0
- main.py +39 -58
- requirements.txt +3 -2
.gitignore
CHANGED
|
@@ -3,7 +3,7 @@
|
|
| 3 |
.idea/
|
| 4 |
|
| 5 |
#data/
|
| 6 |
-
|
| 7 |
temp/
|
| 8 |
|
| 9 |
**/cache/
|
|
|
|
| 3 |
.idea/
|
| 4 |
|
| 5 |
#data/
|
| 6 |
+
pretrained_models/
|
| 7 |
temp/
|
| 8 |
|
| 9 |
**/cache/
|
Dockerfile
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
|
| 2 |
# you will also find guides on how best to write your Dockerfile
|
| 3 |
|
| 4 |
-
FROM python:3.
|
| 5 |
|
| 6 |
WORKDIR /code
|
| 7 |
|
|
@@ -27,4 +27,6 @@ WORKDIR $HOME/app
|
|
| 27 |
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
|
| 28 |
COPY --chown=user . $HOME/app
|
| 29 |
|
|
|
|
|
|
|
| 30 |
CMD ["python", "main.py"]
|
|
|
|
| 1 |
# read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
|
| 2 |
# you will also find guides on how best to write your Dockerfile
|
| 3 |
|
| 4 |
+
FROM python:3.6
|
| 5 |
|
| 6 |
WORKDIR /code
|
| 7 |
|
|
|
|
| 27 |
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
|
| 28 |
COPY --chown=user . $HOME/app
|
| 29 |
|
| 30 |
+
RUN bash -c 'bash install.sh --stage 1 --stop_stage 1 --system_version ubuntu'
|
| 31 |
+
|
| 32 |
CMD ["python", "main.py"]
|
install.sh
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash

# Download the pretrained fastText language-identification models into
# ./pretrained_models (relative to the directory the script is run from).
#
# Usage:
#   bash install.sh --stage 1 --stop_stage 1 --system_version centos
#
# Every --option must correspond to a variable declared below, before the
# option parser runs; "-" in an option name is mapped to "_".

verbose=true;
stage=-1
stop_stage=2
# Declared so that "--system_version <name>" is accepted: the parser rejects
# any option whose variable is not already set. The default mirrors the usage
# example above.
system_version="centos"

work_dir="$(pwd)"


# parse options
while true; do
  [ -z "${1:-}" ] && break;  # break if there are no arguments
  case "$1" in
    --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
      # Reject options that do not map to a variable declared above.
      eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;

      # Every option takes exactly one value. Without this check "shift 2"
      # fails on a trailing option and the loop never terminates.
      if [ $# -lt 2 ]; then
        echo "$0: option $1 requires a value" 1>&2
        exit 1;
      fi

      # Current value of the variable named by $name (indirect expansion);
      # used to detect Boolean-valued options.
      old_value="${!name}";
      if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then
        was_bool=true;
      else
        was_bool=false;
      fi

      # Set the variable to the right value-- the escaped quotes make it work if
      # the option had spaces, like --cmd "queue.pl -sync y"
      eval "${name}=\"$2\"";

      # Check that Boolean-valued arguments are really Boolean.
      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1;
      fi
      shift 2;
      ;;

    *) break;
  esac
done


$verbose && echo "system_version: ${system_version}"

pretrained_models_dir="$(pwd)/pretrained_models"

mkdir -p "${pretrained_models_dir}"


if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  $verbose && echo "stage 1: download fasttext models"
  cd "${pretrained_models_dir}" || exit 1;

  # Abort on a failed download so the problem surfaces here (e.g. during the
  # Docker build) rather than later when the application loads the model.
  wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin || exit 1;
  wget https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz || exit 1;

fi
|
language_identification.md
CHANGED
|
@@ -16,3 +16,16 @@ https://github.com/saffsd/langid.py/tree/master/langid/train
|
|
| 16 |
4. 训练 NB (Naive Bayes) 概率模型, 即每个 item 对每个类型的概率贡献.
|
| 17 |
|
| 18 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
4. 训练 NB (Naive Bayes) 概率模型, 即每个 item 对每个类型的概率贡献.
|
| 17 |
|
| 18 |
```
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
### fasttext
|
| 22 |
+
|
| 23 |
+
识别 176 种语言。
|
| 24 |
+
https://fasttext.cc/docs/en/language-identification.html
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
### 参考
|
| 28 |
+
|
| 29 |
+
```text
|
| 30 |
+
https://zhuanlan.zhihu.com/p/600245782
|
| 31 |
+
```
|
main.py
CHANGED
|
@@ -6,14 +6,12 @@ https://huggingface.co/spaces/sayakpaul/demo-docker-gradio
|
|
| 6 |
import argparse
|
| 7 |
import json
|
| 8 |
import platform
|
| 9 |
-
from typing import Tuple
|
| 10 |
|
|
|
|
|
|
|
| 11 |
import gradio as gr
|
| 12 |
-
import
|
| 13 |
from langid.langid import LanguageIdentifier, model
|
| 14 |
-
import matplotlib.pyplot as plt
|
| 15 |
-
import numpy as np
|
| 16 |
-
from PIL import Image
|
| 17 |
|
| 18 |
from project_settings import project_path, temp_directory
|
| 19 |
|
|
@@ -30,29 +28,40 @@ def get_args():
|
|
| 30 |
default=(project_path / "lang_id_examples.json").as_posix(),
|
| 31 |
type=str
|
| 32 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
args = parser.parse_args()
|
| 34 |
return args
|
| 35 |
|
| 36 |
|
| 37 |
-
lang_id_identifier
|
|
|
|
| 38 |
|
| 39 |
|
| 40 |
def click_lang_id_button(text: str, ground_true: str, model_name: str):
|
| 41 |
global lang_id_identifier
|
|
|
|
| 42 |
|
| 43 |
if model_name == "langid":
|
| 44 |
label, prob = lang_id_identifier.classify(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
else:
|
| 46 |
label = "model_name not available."
|
| 47 |
-
prob =
|
| 48 |
-
return label, round(prob, 4)
|
| 49 |
|
| 50 |
|
| 51 |
def main():
|
| 52 |
args = get_args()
|
| 53 |
|
| 54 |
brief_description = """
|
| 55 |
-
|
| 56 |
"""
|
| 57 |
|
| 58 |
# description
|
|
@@ -63,56 +72,28 @@ def main():
|
|
| 63 |
with open(args.lang_id_examples_file, "r", encoding="utf-8") as f:
|
| 64 |
lang_id_examples = json.load(f)
|
| 65 |
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
gr.Examples(
|
| 89 |
-
examples=lang_id_examples,
|
| 90 |
-
inputs=[
|
| 91 |
-
lang_id_text,
|
| 92 |
-
lang_id_ground_true,
|
| 93 |
-
lang_id_model_name,
|
| 94 |
-
],
|
| 95 |
-
outputs=[lang_id_label, lang_id_prob],
|
| 96 |
-
fn=click_lang_id_button
|
| 97 |
-
)
|
| 98 |
-
|
| 99 |
-
# click event
|
| 100 |
-
lang_id_button.click(
|
| 101 |
-
click_lang_id_button,
|
| 102 |
-
inputs=[
|
| 103 |
-
lang_id_text,
|
| 104 |
-
lang_id_ground_true,
|
| 105 |
-
lang_id_model_name,
|
| 106 |
-
],
|
| 107 |
-
outputs=[lang_id_label, lang_id_prob],
|
| 108 |
-
)
|
| 109 |
-
|
| 110 |
-
gr.Markdown(value=description)
|
| 111 |
-
|
| 112 |
-
blocks.queue().launch(
|
| 113 |
share=False if platform.system() == "Windows" else False,
|
| 114 |
-
server_name="127.0.0.1" if platform.system() == "Windows" else "0.0.0.0",
|
| 115 |
-
server_port=7860
|
| 116 |
)
|
| 117 |
return
|
| 118 |
|
|
|
|
| 6 |
import argparse
|
| 7 |
import json
|
| 8 |
import platform
|
|
|
|
| 9 |
|
| 10 |
+
import fasttext
|
| 11 |
+
from fasttext.FastText import load_model, _FastText
|
| 12 |
import gradio as gr
|
| 13 |
+
from gradio import inputs, outputs
|
| 14 |
from langid.langid import LanguageIdentifier, model
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
from project_settings import project_path, temp_directory
|
| 17 |
|
|
|
|
| 28 |
default=(project_path / "lang_id_examples.json").as_posix(),
|
| 29 |
type=str
|
| 30 |
)
|
| 31 |
+
parser.add_argument(
|
| 32 |
+
"--fasttext_model",
|
| 33 |
+
default=(project_path / "pretrained_models/lid.176.bin").as_posix(),
|
| 34 |
+
type=str
|
| 35 |
+
)
|
| 36 |
args = parser.parse_args()
|
| 37 |
return args
|
| 38 |
|
| 39 |
|
| 40 |
+
lang_id_identifier: LanguageIdentifier = None
|
| 41 |
+
fasttext_model: _FastText = None
|
| 42 |
|
| 43 |
|
| 44 |
def click_lang_id_button(text: str, ground_true: str, model_name: str):
    """Identify the language of *text* with the selected model.

    Args:
        text: The text to classify; may span several lines.
        ground_true: Expected language label supplied alongside the text.
            It is not used by the prediction itself.
        model_name: ``"langid"`` or ``"fasttext"``.

    Returns:
        A ``(label, prob)`` tuple of strings: the predicted language label and
        the probability rounded to 4 decimal places. For an unknown
        ``model_name`` the label is an error message and prob is ``"-1"``.
    """
    global lang_id_identifier
    global fasttext_model

    if model_name == "langid":
        label, prob = lang_id_identifier.classify(text)
    elif model_name == "fasttext":
        # fastText predicts one line at a time and raises ValueError when the
        # text contains a newline, so fold multi-line input into one line.
        single_line = " ".join(text.splitlines())
        labels, probs = fasttext_model.predict(single_line, k=1)
        if len(labels) == 0:
            # No prediction came back (e.g. nothing left to classify).
            label = ""
            prob = 0.0
        else:
            label = labels[0]
            # fastText labels look like "__label__en"; keep only the code.
            prefix = "__label__"
            if label.startswith(prefix):
                label = label[len(prefix):]
            prob = float(probs[0])
    else:
        label = "model_name not available."
        prob = -1
    return label, str(round(prob, 4))
|
| 58 |
|
| 59 |
|
| 60 |
def main():
|
| 61 |
args = get_args()
|
| 62 |
|
| 63 |
brief_description = """
|
| 64 |
+
Language Identification
|
| 65 |
"""
|
| 66 |
|
| 67 |
# description
|
|
|
|
| 72 |
with open(args.lang_id_examples_file, "r", encoding="utf-8") as f:
|
| 73 |
lang_id_examples = json.load(f)
|
| 74 |
|
| 75 |
+
global lang_id_identifier
|
| 76 |
+
global fasttext_model
|
| 77 |
+
lang_id_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
|
| 78 |
+
fasttext_model = fasttext.load_model(args.fasttext_model)
|
| 79 |
+
|
| 80 |
+
blocks = gr.Interface(
|
| 81 |
+
click_lang_id_button,
|
| 82 |
+
inputs=[
|
| 83 |
+
inputs.Textbox(lines=3, label="text"),
|
| 84 |
+
inputs.Textbox(label="ground_true"),
|
| 85 |
+
inputs.Dropdown(choices=["langid", "fasttext"], default="langid", label="model_name"),
|
| 86 |
+
],
|
| 87 |
+
outputs=[
|
| 88 |
+
outputs.Textbox(label="label"),
|
| 89 |
+
outputs.Textbox(label="prob"),
|
| 90 |
+
],
|
| 91 |
+
examples=lang_id_examples,
|
| 92 |
+
description=brief_description
|
| 93 |
+
)
|
| 94 |
+
|
| 95 |
+
blocks.launch(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
share=False if platform.system() == "Windows" else False,
|
|
|
|
|
|
|
| 97 |
)
|
| 98 |
return
|
| 99 |
|
requirements.txt
CHANGED
|
@@ -1,2 +1,3 @@
|
|
| 1 |
-
gradio==
|
| 2 |
-
langid==1.1.6
|
|
|
|
|
|
| 1 |
+
gradio==2.1.1
|
| 2 |
+
langid==1.1.6
|
| 3 |
+
fasttext==0.9.2
|