Add amazonreviews model
- README.md +1 -2
- app/cli.py +3 -4
- app/model.py +25 -59
- models/amazonreviews_tfidf_ft20000.pkl +3 -0
README.md CHANGED
@@ -213,8 +213,7 @@ The following pre-trained models are available for use:
 | --- | --- | --- | --- | --- | --- | --- |
 | `imdb50k` | `tfidf` | `LinearRegression` | 20 000 | 83.24% ± 0.99% | 89.24% ± 0.13% | [Here](models/imdb50k_tfidf_ft20000.pkl) |
 | `sentiment140` | `tfidf` | `LinearRegression` | 20 000 | 83.24% ± 0.99% | 77.32% ± 0.28% | [Here](models/sentiment140_tfidf_ft20000.pkl) |
-| `amazonreviews` | `tfidf` | `LinearRegression` | 20 000 |
-
+| `amazonreviews` | `tfidf` | `LinearRegression` | 20 000 | 82.17% ± 0.85% | ❌ | [Here](models/amazonreviews_tfidf_ft20000.pkl) |
 
 ## License
 Distributed under the MIT License. See [LICENSE](LICENSE) for more information.
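For context, the linked `.pkl` files are loaded as scikit-learn pipelines. A minimal sketch, assuming the pickles are joblib-serialized `Pipeline` objects that expect pre-tokenized input (the `_identity` helper in app/model.py suggests the vectorizers are fed token lists); the token list below is an illustrative stand-in for `app.data.tokenize`, not the app's verified API:

```python
# Hypothetical usage of a pre-trained pipeline; the serialization format
# and input shape are assumptions, not verified against the app's CLI.
from joblib import load

pipeline = load("models/amazonreviews_tfidf_ft20000.pkl")
tokens = ["arrived", "quickly", "works", "great"]  # stand-in for tokenize(...)
print(pipeline.predict([tokens]))  # e.g. [1] for a positive prediction
```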
app/cli.py CHANGED
@@ -217,10 +217,9 @@ def evaluate(
 )
 @click.option(
     "--min-df",
-    default=…,
-    help="Minimum document frequency for the …",
+    default=5,
+    help="Minimum document frequency for the features (ignored for hashing)",
     show_default=True,
-    type=click.FloatRange(0, 1),
 )
 @click.option(
     "--cv",
@@ -268,7 +267,7 @@ def train(
     dataset: Literal["sentiment140", "amazonreviews", "imdb50k"],
     vectorizer: Literal["tfidf", "count", "hashing"],
     max_features: int,
-    min_df: float,
+    min_df: int,
     cv: int,
     token_batch_size: int,
     token_jobs: int,
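The switch from `click.FloatRange(0, 1)` to an integer default mirrors scikit-learn's `min_df` semantics: an int is an absolute document count, while a float in [0, 1] is a proportion of documents. A small illustration with `TfidfVectorizer`:

```python
# min_df as an int keeps only terms appearing in at least that many
# documents; a float would instead mean a fraction of all documents.
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["good movie", "good plot", "bad movie", "bad acting", "good acting"]
vec = TfidfVectorizer(min_df=2)  # drop terms seen in fewer than 2 docs
vec.fit(docs)
print(sorted(vec.vocabulary_))  # ['acting', 'bad', 'good', 'movie']
```

`HashingVectorizer` keeps no vocabulary and has no `min_df` parameter, which is why the new help text marks the option as ignored for hashing.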
app/model.py CHANGED
@@ -10,7 +10,6 @@ from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split
 from sklearn.pipeline import Pipeline
-from tqdm import tqdm
 
 from app.constants import CACHE_DIR
 from app.data import tokenize
@@ -36,7 +35,7 @@ def _identity(x: list[str]) -> list[str]:
 def _get_vectorizer(
     name: Literal["tfidf", "count", "hashing"],
     n_features: int,
-    min_df: float = …,
+    min_df: int = 5,
     ngram: tuple[int, int] = (1, 2),
 ) -> TransformerMixin:
     """Get the appropriate vectorizer.
@@ -96,7 +95,7 @@ def train_model(
     label_data: list[int],
     vectorizer: Literal["tfidf", "count", "hashing"],
     max_features: int,
-    min_df: float = …,
+    min_df: int = 5,
     folds: int = 5,
     n_jobs: int = 4,
     seed: int = 42,
@@ -129,66 +128,33 @@
     )
 
     vectorizer = _get_vectorizer(vectorizer, max_features, min_df)
-
-    classifiers = [
-        (LogisticRegression(max_iter=1000, random_state=rs), {"C": np.logspace(-4, 4, 20)}),
-        # (LinearSVC(max_iter=10000, random_state=rs), {"C": np.logspace(-4, 4, 20)}),
-        # (KNeighborsClassifier(), {"n_neighbors": np.arange(1, 10)}),
-        # (RandomForestClassifier(random_state=rs), {"n_estimators": np.arange(50, 500, 50)}),
-        # (
-        #     VotingClassifier(
-        #         estimators=[
-        #             ("lr", LogisticRegression(max_iter=1000, random_state=rs)),
-        #             ("knn", KNeighborsClassifier()),
-        #             ("rf", RandomForestClassifier(random_state=rs)),
-        #         ],
-        #     ),
-        #     {
-        #         "lr__C": np.logspace(-4, 4, 20),
-        #         "knn__n_neighbors": np.arange(1, 10),
-        #         "rf__n_estimators": np.arange(50, 500, 50),
-        #     },
-        # ),
-    ]
-
-    models = []
-    for clf, param_dist in (pbar := tqdm(classifiers, unit="clf")):
-        param_dist = {f"classifier__{k}": v for k, v in param_dist.items()}
-
-        model = Pipeline(
-            [("vectorizer", vectorizer), ("classifier", clf)],
-            memory=Memory(CACHE_DIR, verbose=0),
-        )
-
-        search = RandomizedSearchCV(
-            model,
-            param_dist,
-            cv=folds,
-            random_state=rs,
-            n_jobs=n_jobs,
-            # verbose=2,
-            scoring="accuracy",
-            n_iter=10,
-        )
-
-        …
-        warnings.filterwarnings("ignore", category=UserWarning, message="Persisting input arguments took")
-        …
-
-    print("--------------")
-    print("\n".join(f"{model.named_steps['classifier'].__class__.__name__}: {acc:.2%}" for model, acc in models))
-
-    …
-    return best_model, best_acc
+    classifier = LogisticRegression(max_iter=1000, random_state=rs)
+    param_dist = {"classifier__C": np.logspace(-4, 4, 20)}
 
+    model = Pipeline(
+        [("vectorizer", vectorizer), ("classifier", classifier)],
+        memory=Memory(CACHE_DIR, verbose=0),
+    )
 
+    search = RandomizedSearchCV(
+        model,
+        param_dist,
+        cv=folds,
+        random_state=rs,
+        n_jobs=n_jobs,
+        verbose=2,
+        scoring="accuracy",
+        n_iter=10,
+    )
 
+    with warnings.catch_warnings():
+        warnings.filterwarnings("once", category=ConvergenceWarning)
+        warnings.filterwarnings("ignore", category=UserWarning, message="Persisting input arguments took")
 
+        search.fit(text_train, label_train)
 
+    final_model = search.best_estimator_
+    return final_model, final_model.score(text_test, label_test)
 
 
 def evaluate_model(
@@ -211,7 +177,7 @@ def evaluate_model(
         Mean accuracy and standard deviation
     """
     with warnings.catch_warnings():
-        warnings.filterwarnings("ignore", category=UserWarning)
+        warnings.filterwarnings("ignore", category=UserWarning, message="Persisting input arguments took")
         scores = cross_val_score(
             model,
             token_data,
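The rewritten `train_model` drops the tqdm loop over candidate classifiers in favour of a single `LogisticRegression` pipeline tuned with `RandomizedSearchCV`. A self-contained sketch of the same pattern, with toy data standing in for the app's tokenized datasets and the `joblib.Memory` cache omitted:

```python
# Minimal version of the new training flow: one pipeline, one search.
# Toy texts/labels are illustrative; the real code feeds pre-tokenized
# input through _get_vectorizer and caches fitted steps on disk.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline

texts = ["great product", "terrible quality", "loved it", "waste of money"] * 25
labels = [1, 0, 1, 0] * 25

text_train, text_test, label_train, label_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

model = Pipeline([
    ("vectorizer", TfidfVectorizer()),
    ("classifier", LogisticRegression(max_iter=1000, random_state=42)),
])
param_dist = {"classifier__C": np.logspace(-4, 4, 20)}  # same grid as the commit

search = RandomizedSearchCV(
    model, param_dist, cv=5, n_iter=10, scoring="accuracy", random_state=42
)
search.fit(text_train, label_train)

# Return the refit best pipeline and its held-out accuracy, as the new code does.
final_model = search.best_estimator_
print(final_model.score(text_test, label_test))
```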
models/amazonreviews_tfidf_ft20000.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5ccc3156426d2086e10c241de5f186c756de550858ee2964471c26d0e24b8996
+size 442646
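The new `.pkl` is tracked with Git LFS, so the commit stores only this three-line pointer; `git lfs pull` fetches the actual 442 646-byte model. The pointer's `oid` is the SHA-256 of the file contents, so a download can be verified against it — a sketch:

```python
# Verify a downloaded LFS object against its pointer: the oid field is
# the SHA-256 hex digest of the file's bytes, and size is its byte count.
import hashlib
from pathlib import Path

path = Path("models/amazonreviews_tfidf_ft20000.pkl")
data = path.read_bytes()
assert hashlib.sha256(data).hexdigest() == (
    "5ccc3156426d2086e10c241de5f186c756de550858ee2964471c26d0e24b8996"
)
assert len(data) == 442646
```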