Update pages/1_TensorIntro.py
pages/1_TensorIntro.py  CHANGED  +42 -26
@@ -207,27 +207,44 @@ print("Normalized data:", normalized_data)
 "code": '''import torch
 import torch.nn as nn
 import torch.optim as optim
-from torchtext.
-
-
-
-
-
-#
-
-
-
-
-
-
-
-
-
-
-
-
-
+from torchtext.datasets import IMDB
+from torchtext.data.utils import get_tokenizer
+from torchtext.vocab import build_vocab_from_iterator
+from torch.utils.data import DataLoader
+from torch.nn.utils.rnn import pad_sequence
+
+# Define the tokenizer and vocabulary
+tokenizer = get_tokenizer('basic_english')
+train_iter = IMDB(split='train')
+
+def yield_tokens(data_iter):
+    for _, text in data_iter:
+        yield tokenizer(text)
+
+vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
+vocab.set_default_index(vocab["<unk>"])
+
+# Define the text and label preprocessing pipeline
+text_pipeline = lambda x: vocab(tokenizer(x))
+label_pipeline = lambda x: 1 if x == 'pos' else 0
+
+# Define the collate function for the DataLoader
+def collate_batch(batch):
+    label_list, text_list, lengths = [], [], []
+    for _label, _text in batch:
+        label_list.append(label_pipeline(_label))
+        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
+        text_list.append(processed_text)
+        lengths.append(processed_text.size(0))
+    label_list = torch.tensor(label_list, dtype=torch.float)
+    text_list = pad_sequence(text_list, batch_first=True)
+    lengths = torch.tensor(lengths, dtype=torch.int64)
+    return label_list, text_list, lengths
+
+# Create DataLoaders for training and testing
+train_iter, test_iter = IMDB()
+train_dataloader = DataLoader(list(train_iter), batch_size=8, shuffle=True, collate_fn=collate_batch)
+test_dataloader = DataLoader(list(test_iter), batch_size=8, shuffle=False, collate_fn=collate_batch)
 
 # Define the LSTM model
 class LSTM(nn.Module):
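Note (not part of the diff): a minimal, self-contained sketch of what the collate function added above produces, run on two made-up samples instead of the IMDB download. The toy pipelines are stand-ins for text_pipeline / label_pipeline and exist only for illustration.

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# Stand-in pipelines (illustrative only): encode each word by its length,
# mirroring the tensor shapes produced by text_pipeline / label_pipeline above.
toy_text_pipeline = lambda s: [len(w) for w in s.split()]
toy_label_pipeline = lambda l: 1 if l == 'pos' else 0

def collate_batch(batch):
    # Same structure as the collate_fn added in the diff.
    label_list, text_list, lengths = [], [], []
    for _label, _text in batch:
        label_list.append(toy_label_pipeline(_label))
        processed_text = torch.tensor(toy_text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    return (torch.tensor(label_list, dtype=torch.float),
            pad_sequence(text_list, batch_first=True),
            torch.tensor(lengths, dtype=torch.int64))

samples = [('pos', 'a fine little film'), ('neg', 'dull and far too long')]
loader = DataLoader(samples, batch_size=2, collate_fn=collate_batch)
labels, text, lengths = next(iter(loader))
print(labels.shape, text.shape, lengths)  # torch.Size([2]) torch.Size([2, 5]) tensor([4, 5])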
@@ -239,12 +256,12 @@ class LSTM(nn.Module):
 
     def forward(self, text, text_lengths):
         embedded = self.embedding(text)
-        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
+        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True, enforce_sorted=False)
         packed_output, (hidden, cell) = self.lstm(packed_embedded)
         return self.fc(hidden.squeeze(0))
 
 # Instantiate the model
-INPUT_DIM = len(
+INPUT_DIM = len(vocab)
 EMBEDDING_DIM = 100
 HIDDEN_DIM = 256
 OUTPUT_DIM = 1
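Note (not part of the diff): the pack_padded_sequence change adds batch_first=True, because the new collate_fn pads with batch_first=True, and enforce_sorted=False, because batches arrive from the DataLoader with unsorted lengths. A toy check with made-up dimensions:

import torch
import torch.nn as nn

batch_size, max_len, emb_dim, hidden_dim = 3, 6, 8, 16        # made-up sizes
embedded = torch.randn(batch_size, max_len, emb_dim)          # (batch, seq, emb), as the padded batches are laid out
lengths = torch.tensor([6, 2, 4])                             # unsorted true lengths, as produced by collate_batch

packed = nn.utils.rnn.pack_padded_sequence(
    embedded, lengths, batch_first=True, enforce_sorted=False)  # raises with the default enforce_sorted=True
lstm = nn.LSTM(emb_dim, hidden_dim)
packed_output, (hidden, cell) = lstm(packed)
print(hidden.shape)  # torch.Size([1, 3, 16]); hidden.squeeze(0) is (batch, hidden_dim)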
@@ -258,11 +275,10 @@ optimizer = optim.Adam(model.parameters())
 N_EPOCHS = 5
 for epoch in range(N_EPOCHS):
     model.train()
-    for
+    for labels, text, text_lengths in train_dataloader:
         optimizer.zero_grad()
-        text, text_lengths = batch.text
         predictions = model(text, text_lengths).squeeze(1)
-        loss = criterion(predictions,
+        loss = criterion(predictions, labels)
         loss.backward()
         optimizer.step()
 
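Note (not part of the diff): a sketch of an evaluation pass that mirrors the new training loop. It assumes the unchanged portion of the file defines model and a binary criterion along the lines of nn.BCEWithLogitsLoss(); the sigmoid/threshold step below reflects that assumption and may need adjusting.

model.eval()
correct, total, total_loss = 0, 0, 0.0
with torch.no_grad():                                    # no gradients needed at evaluation time
    for labels, text, text_lengths in test_dataloader:   # same unpacking as the new training loop
        predictions = model(text, text_lengths).squeeze(1)
        total_loss += criterion(predictions, labels).item()
        preds = (torch.sigmoid(predictions) > 0.5).float()  # assumes a logits-based binary criterion
        correct += (preds == labels).sum().item()
        total += labels.size(0)
print(f"test loss: {total_loss / len(test_dataloader):.3f}, accuracy: {correct / total:.3f}")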