Spaces:
Runtime error
Runtime error
Ignore amazonreviews test
Browse files
- app/constants.py +1 -1
- app/data.py +9 -18
app/constants.py
CHANGED
|
@@ -10,7 +10,7 @@ MODELS_DIR = Path(os.getenv("MODELS_DIR", "models"))
|
|
| 10 |
SENTIMENT140_PATH = DATA_DIR / "sentiment140.csv"
|
| 11 |
SENTIMENT140_URL = "https://www.kaggle.com/datasets/kazanova/sentiment140"
|
| 12 |
|
| 13 |
-
AMAZONREVIEWS_PATH =
|
| 14 |
AMAZONREVIEWS_URL = "https://www.kaggle.com/datasets/bittlingmayer/amazonreviews"
|
| 15 |
|
| 16 |
IMDB50K_PATH = DATA_DIR / "imdb50k.csv"
|
|
|
|
| 10 |
SENTIMENT140_PATH = DATA_DIR / "sentiment140.csv"
|
| 11 |
SENTIMENT140_URL = "https://www.kaggle.com/datasets/kazanova/sentiment140"
|
| 12 |
|
| 13 |
+
AMAZONREVIEWS_PATH = DATA_DIR / "amazonreviews.train.txt.bz2"
|
| 14 |
AMAZONREVIEWS_URL = "https://www.kaggle.com/datasets/bittlingmayer/amazonreviews"
|
| 15 |
|
| 16 |
IMDB50K_PATH = DATA_DIR / "imdb50k.csv"
|
app/data.py
CHANGED
|
@@ -82,6 +82,7 @@ def tokenize(
|
|
| 82 |
nlp.pipe(text_data, batch_size=batch_size, n_process=n_jobs),
|
| 83 |
total=len(text_data),
|
| 84 |
disable=not show_progress,
|
|
|
|
| 85 |
)
|
| 86 |
]
|
| 87 |
|
|
@@ -138,12 +139,9 @@ def load_sentiment140(include_neutral: bool = False) -> tuple[list[str], list[in
|
|
| 138 |
return data["text"].tolist(), data["sentiment"].tolist()
|
| 139 |
|
| 140 |
|
| 141 |
-
def load_amazonreviews(merge: bool = True) -> tuple[list[str], list[int]]:
|
| 142 |
"""Load the amazonreviews dataset and make it suitable for use.
|
| 143 |
|
| 144 |
-
Args:
|
| 145 |
-
merge: Whether to merge the test and train datasets (otherwise ignore test)
|
| 146 |
-
|
| 147 |
Returns:
|
| 148 |
Text and label data
|
| 149 |
|
|
@@ -151,27 +149,20 @@ def load_amazonreviews(merge: bool = True) -> tuple[list[str], list[int]]:
|
|
| 151 |
FileNotFoundError: If the dataset is not found
|
| 152 |
"""
|
| 153 |
# Check if the dataset exists
|
| 154 |
-
|
| 155 |
-
train_exists = AMAZONREVIEWS_PATH[1].exists()
|
| 156 |
-
if not (test_exists and train_exists):
|
| 157 |
msg = (
|
| 158 |
-
f"Amazonreviews dataset not found at: '{AMAZONREVIEWS_PATH
|
| 159 |
"Please download the dataset from:\n"
|
| 160 |
f"{AMAZONREVIEWS_URL}"
|
| 161 |
)
|
| 162 |
raise FileNotFoundError(msg)
|
| 163 |
|
| 164 |
-
# Load the
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
dataset.extend([line.decode("utf-8") for line in train_file])
|
| 168 |
-
|
| 169 |
-
if merge:
|
| 170 |
-
with bz2.BZ2File(AMAZONREVIEWS_PATH[0]) as test_file:
|
| 171 |
-
dataset.extend([line.decode("utf-8") for line in test_file])
|
| 172 |
|
| 173 |
# Split the data into labels and text
|
| 174 |
-
labels, texts = zip(*(line.split(" ", 1) for line in dataset))
|
| 175 |
|
| 176 |
# Map sentiment values
|
| 177 |
sentiments = [int(label.split("__label__")[1]) - 1 for label in labels]
|
|
@@ -270,7 +261,7 @@ def load_data(dataset: Literal["sentiment140", "amazonreviews", "imdb50k", "test
|
|
| 270 |
case "sentiment140":
|
| 271 |
return load_sentiment140(include_neutral=False)
|
| 272 |
case "amazonreviews":
|
| 273 |
-
return load_amazonreviews(
|
| 274 |
case "imdb50k":
|
| 275 |
return load_imdb50k()
|
| 276 |
case "test":
|
|
|
|
| 82 |
nlp.pipe(text_data, batch_size=batch_size, n_process=n_jobs),
|
| 83 |
total=len(text_data),
|
| 84 |
disable=not show_progress,
|
| 85 |
+
unit="doc",
|
| 86 |
)
|
| 87 |
]
|
| 88 |
|
|
|
|
| 139 |
return data["text"].tolist(), data["sentiment"].tolist()
|
| 140 |
|
| 141 |
|
| 142 |
+
def load_amazonreviews() -> tuple[list[str], list[int]]:
|
| 143 |
"""Load the amazonreviews dataset and make it suitable for use.
|
| 144 |
|
|
|
|
|
|
|
|
|
|
| 145 |
Returns:
|
| 146 |
Text and label data
|
| 147 |
|
|
|
|
| 149 |
FileNotFoundError: If the dataset is not found
|
| 150 |
"""
|
| 151 |
# Check if the dataset exists
|
| 152 |
+
if not AMAZONREVIEWS_PATH.exists():
|
|
|
|
|
|
|
| 153 |
msg = (
|
| 154 |
+
f"Amazonreviews dataset not found at: '{AMAZONREVIEWS_PATH}'\n"
|
| 155 |
"Please download the dataset from:\n"
|
| 156 |
f"{AMAZONREVIEWS_URL}"
|
| 157 |
)
|
| 158 |
raise FileNotFoundError(msg)
|
| 159 |
|
| 160 |
+
# Load the dataset
|
| 161 |
+
with bz2.BZ2File(AMAZONREVIEWS_PATH) as f:
|
| 162 |
+
dataset = [line.decode("utf-8") for line in f]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
|
| 164 |
# Split the data into labels and text
|
| 165 |
+
labels, texts = zip(*(line.split(" ", 1) for line in dataset))
|
| 166 |
|
| 167 |
# Map sentiment values
|
| 168 |
sentiments = [int(label.split("__label__")[1]) - 1 for label in labels]
|
|
|
|
| 261 |
case "sentiment140":
|
| 262 |
return load_sentiment140(include_neutral=False)
|
| 263 |
case "amazonreviews":
|
| 264 |
+
return load_amazonreviews()
|
| 265 |
case "imdb50k":
|
| 266 |
return load_imdb50k()
|
| 267 |
case "test":
|