Spaces:
Sleeping
Sleeping
| from transformers import RobertaForSequenceClassification, AutoTokenizer | |
| import torch | |
| import docx2txt | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import openpyxl | |
| from openpyxl.styles import Font, Color, PatternFill | |
| from openpyxl.styles.colors import WHITE | |
| import gradio as gr | |
| import underthesea | |
# Hugging Face checkpoint shared by the classifier and its tokenizer.
_SENTIMENT_CHECKPOINT = "wonrax/phobert-base-vietnamese-sentiment"

# Both objects are created once at import time and reused by analyze().
senti_model = RobertaForSequenceClassification.from_pretrained(_SENTIMENT_CHECKPOINT)
# use_fast=False explicitly requests the slow (non-"fast") tokenizer class.
senti_tokenizer = AutoTokenizer.from_pretrained(_SENTIMENT_CHECKPOINT, use_fast=False)
def segmentation(sentences):
    """Word-segment each sentence for the PhoBERT-based sentiment model.

    ``underthesea.word_tokenize`` is called with ``format="text"`` so that the
    syllables of a multi-syllable word stay glued together with underscores
    in the returned string. Joining the default list output with spaces (as
    this function used to do) rebuilds the unsegmented sentence and throws
    the segmentation away.

    Args:
        sentences: Iterable of raw sentence strings.

    Returns:
        A list with one word-segmented string per input sentence, in the
        same order as the input.
    """
    return [underthesea.word_tokenize(sentence, format="text") for sentence in sentences]
def read_file(docx):
    """Extract the non-empty text lines of a .docx document.

    Args:
        docx: Value handed straight to ``docx2txt.process`` (the callers in
            this file pass a file path).

    Returns:
        A list of whitespace-stripped, non-empty lines. When the document
        cannot be read the error is printed and an empty list is returned,
        so callers can always iterate over the result.
    """
    try:
        text = docx2txt.process(docx)
    except Exception as e:
        # Best-effort boundary: report the problem and fall back to "no
        # lines" rather than implicitly returning None, which callers would
        # fail to iterate over.
        print(f"Error reading file: {e}")
        return []

    stripped_lines = (line.strip() for line in text.split('\n'))
    return [line for line in stripped_lines if line]
def analyze(sentence, max_length=256):
    """Score one sentence with the sentiment model.

    Args:
        sentence: Text to classify (expected to be word-segmented already).
        max_length: Maximum number of token ids, special tokens included,
            that are fed to the model. Longer inputs are truncated.
            NOTE(review): 256 is the input limit documented for
            PhoBERT-base; confirm it against the loaded checkpoint's config.

    Returns:
        A list of softmax probabilities, one per model class, for the single
        input sentence.
    """
    # Truncate so that a long paragraph cannot exceed the model's position
    # embeddings and crash the forward pass.
    token_ids = senti_tokenizer.encode(sentence, truncation=True, max_length=max_length)
    input_ids = torch.tensor([token_ids])  # batch of one
    with torch.no_grad():
        logits = senti_model(input_ids).logits
    return logits.softmax(dim=-1)[0].tolist()
def file_analysis(docx):
    """Run sentiment analysis on every text line of a .docx file.

    Args:
        docx: Document reference forwarded to ``read_file``.

    Returns:
        A list holding the result of ``analyze`` for each line, after the
        lines have been passed through ``segmentation``.
    """
    prepared = segmentation(read_file(docx))
    return [analyze(item) for item in prepared]
def generate_pie_chart(df):
    """Draw a pie chart of the mean sentiment scores and save it as a PNG.

    Args:
        df: DataFrame with 'Negative', 'Positive' and 'Neutral' score
            columns.

    Returns:
        The name of the saved image, 'pie_chart.png'. The name is relative,
        so the file is written to the current working directory.
    """
    sentiments = ['Negative', 'Positive', 'Neutral']
    # Slice colours, listed in the same order as the sentiments above.
    palette = ['#BDBDBD', '#9ACD32', '#87CEFA']

    # One row per sentiment with the mean of its score column.
    averages = pd.DataFrame({
        'Sentiment': sentiments,
        'Score': [df[name].mean() for name in sentiments],
    })

    plt.pie(averages['Score'], labels=averages['Sentiment'], colors=palette, autopct='%1.1f%%')
    plt.title('Average Scores by Sentiment')

    output_name = 'pie_chart.png'
    plt.savefig(output_name)
    # Close the current figure so the next chart starts from a clean canvas.
    plt.close()
    return output_name
def generate_excel_file(df):
    """Write the per-sentence scores to a colour-coded Excel workbook.

    Row 1 holds bold headers. Each following row holds one DataFrame row:
    the score cell(s) equal to the row's highest score and the text cell are
    filled with the colour of the winning sentiment.

    Args:
        df: DataFrame whose columns are, in order, 'Negative', 'Positive',
            'Neutral' and 'Text'.

    Returns:
        The path of the saved workbook, 'result.xlsx' (relative to the
        current working directory).
    """
    sentiment_cols = ['Negative', 'Positive', 'Neutral']
    headers = sentiment_cols + ['Text']
    text_col = len(headers)

    # Background colour used for each winning sentiment.
    fill_dict = {
        'Negative': PatternFill(start_color='BDBDBD', end_color='BDBDBD', fill_type='solid'),
        'Positive': PatternFill(start_color='9ACD32', end_color='9ACD32', fill_type='solid'),
        'Neutral': PatternFill(start_color='87CEFA', end_color='87CEFA', fill_type='solid'),
    }

    wb = openpyxl.Workbook()
    ws = wb.active

    for col_num, header in enumerate(headers, 1):
        cell = ws.cell(row=1, column=col_num)
        cell.value = header
        cell.font = Font(bold=True)

    # Data starts on worksheet row 2. Rows are counted here instead of being
    # derived from the DataFrame index, so any index works.
    for excel_row, (_, row_data) in enumerate(df.iterrows(), start=2):
        scores = [row_data[col] for col in sentiment_cols]
        max_score = max(scores)
        sentiment = sentiment_cols[scores.index(max_score)]
        fill = fill_dict[sentiment]

        for col_num, col_data in enumerate(row_data, 1):
            cell = ws.cell(row=excel_row, column=col_num)
            cell.value = col_data
            if col_num <= len(sentiment_cols):
                # Highlight only the score(s) equal to the row maximum.
                if col_data == max_score:
                    cell.fill = fill
            elif col_num == text_col:
                # White text on the grey "Negative" fill, black otherwise.
                # The decision is made on the sentiment name: openpyxl stores
                # colours as 8-digit aRGB, so comparing the fill's rgb value
                # with the 6-digit 'BDBDBD' never matched.
                font_color = WHITE if sentiment == 'Negative' else Color('000000')
                cell.fill = fill
                cell.font = Font(color=font_color)

    excel_file_path = 'result.xlsx'
    wb.save(excel_file_path)
    return excel_file_path
def process_file(docx):
    """Analyse a .docx file and produce the pie chart and the Excel report.

    Args:
        docx: Document reference forwarded to ``read_file``.

    Returns:
        A tuple ``(pie_chart_name, excel_file_path)`` with the values
        returned by ``generate_pie_chart`` and ``generate_excel_file``.

    Raises:
        ValueError: If no text lines could be extracted from the document.
    """
    # Parse the document once and reuse the same lines for scoring and for
    # the 'Text' column, instead of reading the file a second time.
    sentences = read_file(docx)
    if not sentences:
        # Covers both an unreadable file and a document without text; without
        # this guard the failure surfaces later as an obscure error.
        raise ValueError("No text could be read from the document.")

    results = [analyze(sentence) for sentence in segmentation(sentences)]

    df = pd.DataFrame(results, columns=['Negative', 'Positive', 'Neutral'])
    df['Text'] = sentences

    pie_chart_name = generate_pie_chart(df)
    excel_file_path = generate_excel_file(df)
    return pie_chart_name, excel_file_path
def analyze_file(docx_file):
    """Gradio callback: analyse the uploaded document.

    Args:
        docx_file: The upload as delivered by the file input. This may be an
            object exposing the file path as ``.name``, a plain path string,
            or ``None`` when nothing was uploaded.

    Returns:
        A tuple ``(excel_file_path, pie_chart_name)``; note the order is the
        reverse of what ``process_file`` returns. ``(None, None)`` is
        returned when no file was provided.
    """
    if docx_file is None:
        # Nothing uploaded: leave both outputs empty instead of failing on
        # the attribute access below.
        return None, None

    # Use .name when the upload object has one, otherwise treat the value
    # itself as the path.
    path = getattr(docx_file, "name", docx_file)
    pie_chart_name, excel_file_path = process_file(path)
    return excel_file_path, pie_chart_name
# Single upload widget for the document to analyse.
inputs = gr.File(label="Chọn Tệp Bạn Muốn Phân Tích")

# Output order follows the tuple returned by analyze_file:
# the Excel report first, then the pie-chart image.
outputs = [
    gr.File(label="Kết Quả Phân Tích Excel"),
    gr.Image(type="filepath", label="Thông Số Phân Tích"),
]

interface = gr.Interface(
    title="Sentiment Analysis",
    fn=analyze_file,
    inputs=inputs,
    outputs=outputs,
    allow_flagging="never",  # Disable flag button
)

# Start the web UI only when the file is run as a script.
if __name__ == "__main__":
    interface.launch()