| import streamlit as st | |
| import numpy as np | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| from scipy import linalg | |
| import regex as re | |
| from configs.db_configs import add_one_item | |
| from streamlit.components.v1 import html | |
| from configs.html_features import set_image | |
| from configs.download_files import FileDownloader | |
def preprocess_text(text):
    """Vectorise *text* and factor its term-count matrix with an SVD.

    The whole text is handed to the vectoriser as a single document, so
    the count matrix has exactly one row (``1 x n_terms``). English stop
    words are removed before counting.

    Args:
        text: Raw input text to analyse.

    Returns:
        A tuple ``(vocab, U, s, Vh)``: ``vocab`` is a NumPy array holding
        the terms kept by the vectoriser (in the column order of the count
        matrix) and ``U``, ``s``, ``Vh`` are the reduced SVD factors
        returned by ``scipy.linalg.svd(..., full_matrices=False)``.

    Raises:
        ValueError: Propagated from ``CountVectorizer`` when no term is
            left to count (e.g. the text consists only of stop words).
    """
    vectorizer = CountVectorizer(stop_words='english')
    # ``toarray()`` gives a plain ndarray; ``todense()`` would hand back
    # the discouraged ``np.matrix`` type.
    counts = vectorizer.fit_transform([text]).toarray()
    vocab = np.array(vectorizer.get_feature_names_out())
    U, s, Vh = linalg.svd(counts, full_matrices=False)
    return vocab, U, s, Vh
def show_topics(text, num_top_words):
    """Return the highest-weighted keywords of *text* as one string.

    The terms are ranked by their weight in the first right-singular
    vector produced by ``preprocess_text``; the ``num_top_words`` best
    ones are kept, digits are stripped from each, and the survivors are
    joined with single spaces.

    Args:
        text: Raw input text to analyse.
        num_top_words: Maximum number of terms to take from the ranking.

    Returns:
        A space-separated string of keywords, strongest first. Terms that
        consist only of digits vanish after stripping, so the string may
        contain fewer than ``num_top_words`` words.

    Raises:
        ValueError: Propagated from ``preprocess_text`` when the text
            yields no usable terms.
    """
    vocab, _, _, Vh = preprocess_text(text)

    weights = Vh[0]
    # Singular vectors are only defined up to a common sign flip. The
    # count matrix is non-negative, so orient the leading vector to be
    # non-negative too; otherwise the ranking could come out reversed.
    if weights.sum() < 0:
        weights = -weights

    # Indices of the largest weights, strongest first.
    ranked = np.argsort(weights)[::-1][:num_top_words]

    digits = r'\d+'
    stripped = (re.sub(digits, '', vocab[i]) for i in ranked)
    # Skip tokens that were digits only so no stray spaces are emitted.
    return ' '.join(word for word in stripped if word)
def main():
    """Render the topic-modelling page and handle the 'Find Topic' action.

    Draws the title, banner image, text input and the sidebar slider for
    the keyword count. When the button is pressed with non-blank text, the
    text is shown, passed to ``add_one_item``, and its top keywords are
    displayed and offered for download. Blank input, or input from which
    no keywords can be extracted, produces a warning instead of a
    traceback.
    """
    st.title('Topic Modeling by Top Keywords')

    # Only the middle column is used; the outer two stay empty
    # (presumably spacers that centre the banner).
    _, banner_col, _ = st.columns([1, 5.3, 1])
    with banner_col:
        url = "https://i.postimg.cc/jdF1hPng/combined.png"
        html(set_image(url), height=400, width=400)

    text = st.text_area(
        'Find Topic',
        placeholder='Enter your input text here ...',
        height=200,
        label_visibility='hidden',
    )
    num_top_words = st.sidebar.slider(
        'Number of Top Keywords', min_value=5, max_value=20, step=1, value=10
    )

    if not st.button('Find Topic'):
        return

    # Whitespace-only input has nothing to vectorise; treat it as empty.
    if not text.strip():
        st.warning('Please enter some text first.')
        return

    with st.expander('Original Text'):
        st.write(text)
    # NOTE(review): presumably records the submitted text; confirm
    # against configs.db_configs.
    add_one_item(text, 'Topic Modeling')

    try:
        topic_words = show_topics(text, num_top_words)
    except ValueError as exc:
        # The vectoriser raises ValueError when no terms remain, e.g. the
        # text contains nothing but stop words.
        st.warning(f'Could not extract keywords: {exc}')
        return

    with st.expander(f'Show Topic by {num_top_words} Top Keywords'):
        st.write(topic_words)
    with st.expander('Download Topic words'):
        FileDownloader(data=topic_words, file_ext='txt').download()
# Call main() only when this file is executed directly, not when imported.
if __name__ == '__main__':
    main()