Spaces:
Runtime error
Runtime error
Add min-df option
Browse files
- app/cli.py +9 -0
- app/model.py +7 -1
app/cli.py
CHANGED
|
@@ -215,6 +215,13 @@ def evaluate(
|
|
| 215 |
show_default=True,
|
| 216 |
type=click.IntRange(1, None),
|
| 217 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
@click.option(
|
| 219 |
"--cv",
|
| 220 |
default=5,
|
|
@@ -261,6 +268,7 @@ def train(
|
|
| 261 |
dataset: Literal["sentiment140", "amazonreviews", "imdb50k"],
|
| 262 |
vectorizer: Literal["tfidf", "count", "hashing"],
|
| 263 |
max_features: int,
|
|
|
|
| 264 |
cv: int,
|
| 265 |
token_batch_size: int,
|
| 266 |
token_jobs: int,
|
|
@@ -324,6 +332,7 @@ def train(
|
|
| 324 |
label_data,
|
| 325 |
vectorizer=vectorizer,
|
| 326 |
max_features=max_features,
|
|
|
|
| 327 |
folds=cv,
|
| 328 |
n_jobs=train_jobs,
|
| 329 |
seed=seed,
|
|
|
|
| 215 |
show_default=True,
|
| 216 |
type=click.IntRange(1, None),
|
| 217 |
)
|
| 218 |
+
@click.option(
|
| 219 |
+
"--min-df",
|
| 220 |
+
default=0.1,
|
| 221 |
+
help="Minimum document frequency for the vectorizer (ignored for hashing)",
|
| 222 |
+
show_default=True,
|
| 223 |
+
type=click.FloatRange(0, 1),
|
| 224 |
+
)
|
| 225 |
@click.option(
|
| 226 |
"--cv",
|
| 227 |
default=5,
|
|
|
|
| 268 |
dataset: Literal["sentiment140", "amazonreviews", "imdb50k"],
|
| 269 |
vectorizer: Literal["tfidf", "count", "hashing"],
|
| 270 |
max_features: int,
|
| 271 |
+
min_df: float,
|
| 272 |
cv: int,
|
| 273 |
token_batch_size: int,
|
| 274 |
token_jobs: int,
|
|
|
|
| 332 |
label_data,
|
| 333 |
vectorizer=vectorizer,
|
| 334 |
max_features=max_features,
|
| 335 |
+
min_df=min_df,
|
| 336 |
folds=cv,
|
| 337 |
n_jobs=train_jobs,
|
| 338 |
seed=seed,
|
app/model.py
CHANGED
|
@@ -36,6 +36,7 @@ def _identity(x: list[str]) -> list[str]:
|
|
| 36 |
def _get_vectorizer(
|
| 37 |
name: Literal["tfidf", "count", "hashing"],
|
| 38 |
n_features: int,
|
|
|
|
| 39 |
ngram: tuple[int, int] = (1, 2),
|
| 40 |
) -> TransformerMixin:
|
| 41 |
"""Get the appropriate vectorizer.
|
|
@@ -43,6 +44,7 @@ def _get_vectorizer(
|
|
| 43 |
Args:
|
| 44 |
name: Type of vectorizer
|
| 45 |
n_features: Maximum number of features
|
|
|
|
| 46 |
ngram: N-gram range [min_n, max_n]
|
| 47 |
|
| 48 |
Returns:
|
|
@@ -64,11 +66,13 @@ def _get_vectorizer(
|
|
| 64 |
case "tfidf":
|
| 65 |
return TfidfVectorizer(
|
| 66 |
max_features=n_features,
|
|
|
|
| 67 |
**shared_params,
|
| 68 |
)
|
| 69 |
case "count":
|
| 70 |
return CountVectorizer(
|
| 71 |
max_features=n_features,
|
|
|
|
| 72 |
**shared_params,
|
| 73 |
)
|
| 74 |
case "hashing":
|
|
@@ -92,6 +96,7 @@ def train_model(
|
|
| 92 |
label_data: list[int],
|
| 93 |
vectorizer: Literal["tfidf", "count", "hashing"],
|
| 94 |
max_features: int,
|
|
|
|
| 95 |
folds: int = 5,
|
| 96 |
n_jobs: int = 4,
|
| 97 |
seed: int = 42,
|
|
@@ -103,6 +108,7 @@ def train_model(
|
|
| 103 |
label_data: Label data
|
| 104 |
vectorizer: Which vectorizer to use
|
| 105 |
max_features: Maximum number of features
|
|
|
|
| 106 |
folds: Number of cross-validation folds
|
| 107 |
n_jobs: Number of parallel jobs
|
| 108 |
seed: Random seed (None for random seed)
|
|
@@ -122,7 +128,7 @@ def train_model(
|
|
| 122 |
random_state=rs,
|
| 123 |
)
|
| 124 |
|
| 125 |
-
vectorizer = _get_vectorizer(vectorizer, max_features)
|
| 126 |
classifiers = [
|
| 127 |
(LogisticRegression(max_iter=1000, random_state=rs), {"C": np.logspace(-4, 4, 20)}),
|
| 128 |
# (LinearSVC(max_iter=10000, random_state=rs), {"C": np.logspace(-4, 4, 20)}),
|
|
|
|
| 36 |
def _get_vectorizer(
|
| 37 |
name: Literal["tfidf", "count", "hashing"],
|
| 38 |
n_features: int,
|
| 39 |
+
min_df: float = 0.1,
|
| 40 |
ngram: tuple[int, int] = (1, 2),
|
| 41 |
) -> TransformerMixin:
|
| 42 |
"""Get the appropriate vectorizer.
|
|
|
|
| 44 |
Args:
|
| 45 |
name: Type of vectorizer
|
| 46 |
n_features: Maximum number of features
|
| 47 |
+
min_df: Minimum document frequency (ignored for hashing)
|
| 48 |
ngram: N-gram range [min_n, max_n]
|
| 49 |
|
| 50 |
Returns:
|
|
|
|
| 66 |
case "tfidf":
|
| 67 |
return TfidfVectorizer(
|
| 68 |
max_features=n_features,
|
| 69 |
+
min_df=min_df,
|
| 70 |
**shared_params,
|
| 71 |
)
|
| 72 |
case "count":
|
| 73 |
return CountVectorizer(
|
| 74 |
max_features=n_features,
|
| 75 |
+
min_df=min_df,
|
| 76 |
**shared_params,
|
| 77 |
)
|
| 78 |
case "hashing":
|
|
|
|
| 96 |
label_data: list[int],
|
| 97 |
vectorizer: Literal["tfidf", "count", "hashing"],
|
| 98 |
max_features: int,
|
| 99 |
+
min_df: float = 0.1,
|
| 100 |
folds: int = 5,
|
| 101 |
n_jobs: int = 4,
|
| 102 |
seed: int = 42,
|
|
|
|
| 108 |
label_data: Label data
|
| 109 |
vectorizer: Which vectorizer to use
|
| 110 |
max_features: Maximum number of features
|
| 111 |
+
min_df: Minimum document frequency (ignored for hashing)
|
| 112 |
folds: Number of cross-validation folds
|
| 113 |
n_jobs: Number of parallel jobs
|
| 114 |
seed: Random seed (None for random seed)
|
|
|
|
| 128 |
random_state=rs,
|
| 129 |
)
|
| 130 |
|
| 131 |
+
vectorizer = _get_vectorizer(vectorizer, max_features, min_df)
|
| 132 |
classifiers = [
|
| 133 |
(LogisticRegression(max_iter=1000, random_state=rs), {"C": np.logspace(-4, 4, 20)}),
|
| 134 |
# (LinearSVC(max_iter=10000, random_state=rs), {"C": np.logspace(-4, 4, 20)}),
|