Spaces:
Runtime error
Runtime error
Parallelize text cleaning
Browse files
- app/data.py +13 -4
- app/utils.py +1 -1
app/data.py
CHANGED
|
@@ -9,6 +9,7 @@ from typing import TYPE_CHECKING, Literal, Sequence
|
|
| 9 |
import emoji
|
| 10 |
import pandas as pd
|
| 11 |
import spacy
|
|
|
|
| 12 |
from tqdm import tqdm
|
| 13 |
|
| 14 |
from app.constants import (
|
|
@@ -160,16 +161,24 @@ def tokenize(
|
|
| 160 |
Returns:
|
| 161 |
Tokenized text data
|
| 162 |
"""
|
| 163 |
-
text_data = [
|
| 164 |
-
_clean(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
for text in tqdm(
|
| 166 |
text_data,
|
| 167 |
desc="Cleaning",
|
| 168 |
unit="doc",
|
| 169 |
disable=not show_progress,
|
| 170 |
)
|
| 171 |
-
]
|
| 172 |
-
|
| 173 |
return pd.Series(
|
| 174 |
[
|
| 175 |
_lemmatize(doc, character_threshold)
|
|
|
|
| 9 |
import emoji
|
| 10 |
import pandas as pd
|
| 11 |
import spacy
|
| 12 |
+
from joblib import Parallel, delayed
|
| 13 |
from tqdm import tqdm
|
| 14 |
|
| 15 |
from app.constants import (
|
|
|
|
| 161 |
Returns:
|
| 162 |
Tokenized text data
|
| 163 |
"""
|
| 164 |
+
# text_data = [
|
| 165 |
+
# _clean(text)
|
| 166 |
+
# for text in tqdm(
|
| 167 |
+
# text_data,
|
| 168 |
+
# desc="Cleaning",
|
| 169 |
+
# unit="doc",
|
| 170 |
+
# disable=not show_progress,
|
| 171 |
+
# )
|
| 172 |
+
# ]
|
| 173 |
+
text_data = Parallel(n_jobs=n_jobs)(
|
| 174 |
+
delayed(_clean)(text)
|
| 175 |
for text in tqdm(
|
| 176 |
text_data,
|
| 177 |
desc="Cleaning",
|
| 178 |
unit="doc",
|
| 179 |
disable=not show_progress,
|
| 180 |
)
|
| 181 |
+
)
|
|
|
|
| 182 |
return pd.Series(
|
| 183 |
[
|
| 184 |
_lemmatize(doc, character_threshold)
|
app/utils.py
CHANGED
|
@@ -11,7 +11,7 @@ if TYPE_CHECKING:
|
|
| 11 |
__all__ = ["serialize", "deserialize"]
|
| 12 |
|
| 13 |
|
| 14 |
-
def serialize(data: Sequence[str | int], path: Path, max_size: int =
|
| 15 |
"""Serialize data to a file
|
| 16 |
|
| 17 |
Args:
|
|
|
|
| 11 |
__all__ = ["serialize", "deserialize"]
|
| 12 |
|
| 13 |
|
| 14 |
+
def serialize(data: Sequence[str | int], path: Path, max_size: int = 100_000, show_progress: bool = False) -> None:
|
| 15 |
"""Serialize data to a file
|
| 16 |
|
| 17 |
Args:
|