import torch
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from datasets import load_dataset
from torch.utils.data import DataLoader
import streamlit as st
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

# Load pre-trained model and tokenizer from Hugging Face
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Streamlit interface
st.title("Sentiment Analysis with BERT")

# Training setup
st.sidebar.title("Training Setup")
num_epochs = st.sidebar.slider("Number of Epochs", 1, 5, 3)
batch_size = st.sidebar.slider("Batch Size", 4, 32, 8)
learning_rate = st.sidebar.slider("Learning Rate", 1e-6, 1e-3, 5e-5, format="%.6f")

# Load a small slice of the IMDb dataset and tokenize it for training
def load_and_preprocess_data():
    dataset = load_dataset("imdb", split="train[:1%]")

    def preprocess_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

    encoded_dataset = dataset.map(preprocess_function, batched=True)
    encoded_dataset = encoded_dataset.rename_column("label", "labels")  # The model expects a 'labels' column
    encoded_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    return DataLoader(encoded_dataset, shuffle=True, batch_size=batch_size)

train_dataloader = load_and_preprocess_data()
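# Each batch yielded by the DataLoader is a dict of tensors: input_ids and attention_mask
# of shape [batch_size, 128] (max_length padding above) and labels of shape [batch_size].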

# Initialize training status
training_completed = st.sidebar.empty()

# Training loop
if st.sidebar.button("Train"):
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
    )
    progress_bar = tqdm(range(num_training_steps))
    loss_values = []

    model.train()
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            loss_values.append(loss.item())

    training_completed.success("Training completed")

    # Plot loss values
    st.write("### Training Loss")
    fig = plt.figure(figsize=(10, 6))
    plt.plot(loss_values, label="Training Loss")
    plt.xlabel("Training Steps")
    plt.ylabel("Loss")
    plt.legend()
    st.pyplot(fig)

# Text input for prediction
st.write("### Predict Sentiment")
user_input = st.text_area("Enter text:", "I loved this movie!")
if user_input:
    inputs = tokenizer(user_input, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = outputs.logits.argmax(dim=-1).item()
    sentiment = "Positive" if prediction == 1 else "Negative"
    st.write(f"Sentiment: **{sentiment}**")