Spaces:

eaglelandsonce
/

pytorch

Running

App Files Files Community

eaglelandsonce committed on Jun 21, 2024

Commit

c4eca14

verified ·

1 Parent(s): f50c8e1

Update pages/21_NLP_Transformer.py

Browse files

Files changed (1) hide show

pages/21_NLP_Transformer.py +33 -193

pages/21_NLP_Transformer.py CHANGED Viewed

@@ -1,199 +1,39 @@
-import torch
-from torch.utils.data import DataLoader, Dataset
-from transformers import BertTokenizer, BertForSequenceClassification, AdamW
-from transformers import get_linear_schedule_with_warmup
-import numpy as np
-from datasets import load_dataset
 import streamlit as st
-# Load IMDb dataset
-dataset = load_dataset('imdb')
-train_df = dataset['train'].to_pandas()
-test_df = dataset['test'].to_pandas()
-# Preprocess the data
-train_df = train_df[['text', 'label']]
-test_df = test_df[['text', 'label']]
-class SentimentDataset(Dataset):
-    def __init__(self, dataframe, tokenizer, max_len):
-        self.tokenizer = tokenizer
-        self.data = dataframe
-        self.max_len = max_len
-    def __len__(self):
-        return len(self.data)
-    def __getitem__(self, index):
-        review = str(self.data.iloc[index, 0])
-        label = self.data.iloc[index, 1]
-        encoding = self.tokenizer.encode_plus(
-            review,
-            add_special_tokens=True,
-            max_length=self.max_len,
-            return_token_type_ids=False,
-            pad_to_max_length=True,
-            return_attention_mask=True,
-            return_tensors='pt',
-        )
-        return {
-            'review_text': review,
-            'input_ids': encoding['input_ids'].flatten(),
-            'attention_mask': encoding['attention_mask'].flatten(),
-            'labels': torch.tensor(label, dtype=torch.long)
-        }
-def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
-    model = model.train()
-    losses = []
-    correct_predictions = 0
-    for d in data_loader:
-        input_ids = d["input_ids"].to(device)
-        attention_mask = d["attention_mask"].to(device)
-        labels = d["labels"].to(device)
-        outputs = model(
-            input_ids=input_ids,
-            attention_mask=attention_mask
-        )
-        loss = loss_fn(outputs.logits, labels)
-        correct_predictions += torch.sum(torch.argmax(outputs.logits, dim=1) == labels)
-        losses.append(loss.item())
-        loss.backward()
-        optimizer.step()
-        scheduler.step()
-        optimizer.zero_grad()
-    return correct_predictions.double() / n_examples, np.mean(losses)
-def eval_model(model, data_loader, loss_fn, device, n_examples):
-    model = model.eval()
-    losses = []
-    correct_predictions = 0
-    with torch.no_grad():
-        for d in data_loader:
-            input_ids = d["input_ids"].to(device)
-            attention_mask = d["attention_mask"].to(device)
-            labels = d["labels"].to(device)
-            outputs = model(
-                input_ids=input_ids,
-                attention_mask=attention_mask
-            )
-            loss = loss_fn(outputs.logits, labels)
-            correct_predictions += torch.sum(torch.argmax(outputs.logits, dim=1) == labels)
-            losses.append(loss.item())
-    return correct_predictions.double() / n_examples, np.mean(losses)
-def create_data_loader(df, tokenizer, max_len, batch_size):
-    ds = SentimentDataset(
-        dataframe=df,
-        tokenizer=tokenizer,
-        max_len=max_len
-    )
-    return DataLoader(
-        ds,
-        batch_size=batch_size,
-        num_workers=4
-    )
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
-# Create data loaders
-BATCH_SIZE = 16
-MAX_LEN = 128
-train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)
-test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)
-EPOCHS = 2
-optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
-total_steps = len(train_data_loader) * EPOCHS
-scheduler = get_linear_schedule_with_warmup(
-    optimizer,
-    num_warmup_steps=0,
-    num_training_steps=total_steps
-)
-loss_fn = torch.nn.CrossEntropyLoss().to(device)
-model = model.to(device)
-# Streamlit app
-st.title("Sentiment Analysis with BERT")
-st.write("""
-This application allows you to train a BERT model for sentiment analysis on the IMDb dataset.
-You can input a movie review and the model will predict whether the sentiment is positive or negative.
-""")
-if st.button("Train Model"):
-    with st.spinner("Training the model..."):
-        # Training loop
-        for epoch in range(EPOCHS):
-            train_acc, train_loss = train_epoch(
-                model,
-                train_data_loader,
-                loss_fn,
-                optimizer,
-                device,
-                scheduler,
-                len(train_df)
-            )
-            st.write(f'Epoch {epoch + 1}/{EPOCHS}')
-            st.write(f'Train loss {train_loss} accuracy {train_acc}')
-            val_acc, val_loss = eval_model(
-                model,
-                test_data_loader,
-                loss_fn,
-                device,
-                len(test_df)
-            )
-            st.write(f'Val loss {val_loss} accuracy {val_acc}')
-        # Save the model
-        model.save_pretrained('bert-sentiment-model')
-        tokenizer.save_pretrained('bert-sentiment-model')
-    st.success("Model training complete!")
-model = BertForSequenceClassification.from_pretrained('bert-sentiment-model')
-tokenizer = BertTokenizer.from_pretrained('bert-sentiment-model')
-model = model.eval()
-def predict_sentiment(text):
-    encoding = tokenizer.encode_plus(
-        text,
-        add_special_tokens=True,
-        max_length=128,
-        return_token_type_ids=False,
-        pad_to_max_length=True,
-        return_attention_mask=True,
-        return_tensors='pt',
-    )
-    input_ids = encoding['input_ids']
-    attention_mask = encoding['attention_mask']
-    with torch.no_grad():
-        outputs = model(input_ids, attention_mask=attention_mask)
-        probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
-        predicted_class = torch.argmax(probabilities, dim=1).item()
-    return 'positive' if predicted_class == 1 else 'negative'
-st.title("Sentiment Analysis with BERT")
-user_input = st.text_area("Enter a movie review:")
 if st.button("Analyze"):
-    sentiment = predict_sentiment(user_input)
-    st.write(f'The sentiment of the review is: **{sentiment}**')

 import streamlit as st
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
+import matplotlib.pyplot as plt
+# Load model and tokenizer
+model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+
+
+@st.cache_resource
+def load_classifier(name):
+    """Build the sentiment-analysis pipeline once and reuse it across reruns.
+
+    Streamlit re-executes this script from the top on every widget
+    interaction; caching the loaded objects avoids re-reading the model
+    weights each time the user clicks a button.
+
+    Args:
+        name: Model identifier passed to ``from_pretrained``.
+
+    Returns:
+        A ``(model, tokenizer, classifier)`` tuple, where ``classifier`` is
+        the "sentiment-analysis" pipeline built from the other two.
+    """
+    loaded_model = AutoModelForSequenceClassification.from_pretrained(name)
+    loaded_tokenizer = AutoTokenizer.from_pretrained(name)
+    sentiment_pipeline = pipeline(
+        "sentiment-analysis", model=loaded_model, tokenizer=loaded_tokenizer
+    )
+    return loaded_model, loaded_tokenizer, sentiment_pipeline
+
+
+# Keep the original module-level names so the rest of the script is unchanged.
+model, tokenizer, classifier = load_classifier(model_name)
+# Streamlit interface
+st.title("Sentiment Analysis with Hugging Face Transformers")
+st.write("Enter text to analyze its sentiment:")
+input_text = st.text_area("Input Text", height=200)
 if st.button("Analyze"):
+    # Whitespace-only input has nothing to classify; treat it like empty input.
+    if input_text and input_text.strip():
+        # Perform sentiment analysis. truncation=True clips text that exceeds
+        # the model's maximum sequence length instead of letting the forward
+        # pass fail on over-long input.
+        results = classifier(input_text, truncation=True)
+        # Display results
+        st.write("Results:")
+        st.write(results)
+        # A single input yields a one-element list holding one label/score pair.
+        top_result = results[0]
+        label = top_result['label']
+        score = top_result['score']
+        # Plotting: one bar for the predicted label.
+        fig, ax = plt.subplots()
+        ax.bar([label], [score], color='skyblue')
+        ax.set_ylabel('Score')
+        ax.set_title('Sentiment Analysis Result')
+        st.pyplot(fig)
+        # pyplot keeps figures registered until they are closed; without this
+        # every click would leave another figure alive in the server process.
+        plt.close(fig)
+    else:
+        st.write("Please enter text to analyze.")