Update pages/Analyze_Text.py
Browse files- pages/Analyze_Text.py +27 -22
pages/Analyze_Text.py
CHANGED
|
@@ -6,29 +6,19 @@ import plotly.express as px
|
|
| 6 |
from wordcloud.wordcloud import WordCloud
|
| 7 |
from configs.db_configs import add_one_item
|
| 8 |
from configs.html_features import set_image, HTML_WRAPPER
|
| 9 |
-
|
| 10 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 11 |
-
|
| 12 |
import torch
|
| 13 |
from torch.nn.functional import softmax
|
| 14 |
-
|
| 15 |
from spacy import displacy
|
| 16 |
import spacy
|
| 17 |
nlp = spacy.load('en_core_web_sm')
|
| 18 |
-
|
| 19 |
from collections import Counter
|
| 20 |
import neattext as nt
|
| 21 |
import neattext.functions as nfx
|
| 22 |
from textblob import TextBlob
|
| 23 |
import nltk
|
| 24 |
|
| 25 |
-
|
| 26 |
-
nltk.download('brown')
|
| 27 |
-
nltk.download('punkt')
|
| 28 |
-
nltk.download('wordnet')
|
| 29 |
-
nltk.download('averaged_perceptron_tagger')
|
| 30 |
-
nltk.download('conll2000')
|
| 31 |
-
nltk.download('movie_reviews')
|
| 32 |
|
| 33 |
def get_tokens_analysis(text):
|
| 34 |
doc_obj = nlp(text)
|
|
@@ -39,7 +29,6 @@ def get_tokens_analysis(text):
|
|
| 39 |
|
| 40 |
def get_entities_tokens(text):
|
| 41 |
doc_obj = nlp(text)
|
| 42 |
-
|
| 43 |
html = displacy.render(doc_obj, style='ent')
|
| 44 |
html = html.replace('\n\n', '\n')
|
| 45 |
entities_tokens_html = HTML_WRAPPER.format(html)
|
|
@@ -69,15 +58,29 @@ def plot_top_keywords_frequencies(text, n_top_keywords):
|
|
| 69 |
|
| 70 |
|
| 71 |
def get_sentence_stats(text):
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
|
| 83 |
def plot_tokens_pos(tokens_stats_df):
|
|
@@ -109,6 +112,7 @@ def plot_word_frequency(text):
|
|
| 109 |
plt.axis('off')
|
| 110 |
return fig
|
| 111 |
|
|
|
|
| 112 |
def main():
|
| 113 |
st.title('Text Analyzer')
|
| 114 |
im1, im2, im3 = st.columns([1, 5.3, 1])
|
|
@@ -122,6 +126,7 @@ def main():
|
|
| 122 |
|
| 123 |
text = st.text_area('Text Analyzer', placeholder='Enter your input text here ...', height=200, label_visibility='hidden')
|
| 124 |
n_top_keywords = st.sidebar.slider('n Top keywords', 5, 15, 5, 1)
|
|
|
|
| 125 |
if st.button('Analyze it'):
|
| 126 |
if text != '':
|
| 127 |
with st.expander('Original Text'):
|
|
@@ -157,7 +162,7 @@ def main():
|
|
| 157 |
st.write('Noun Phrases:\n', noun_phrases)
|
| 158 |
|
| 159 |
with col22:
|
| 160 |
-
with st.expander('The
|
| 161 |
figure = plot_tokens_pos(tokens_stats_df)
|
| 162 |
st.plotly_chart(figure)
|
| 163 |
|
|
|
|
| 6 |
from wordcloud.wordcloud import WordCloud
|
| 7 |
from configs.db_configs import add_one_item
|
| 8 |
from configs.html_features import set_image, HTML_WRAPPER
|
|
|
|
| 9 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
|
|
|
| 10 |
import torch
|
| 11 |
from torch.nn.functional import softmax
|
|
|
|
| 12 |
from spacy import displacy
|
| 13 |
import spacy
|
| 14 |
nlp = spacy.load('en_core_web_sm')
|
|
|
|
| 15 |
from collections import Counter
|
| 16 |
import neattext as nt
|
| 17 |
import neattext.functions as nfx
|
| 18 |
from textblob import TextBlob
|
| 19 |
import nltk
|
| 20 |
|
| 21 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
def get_tokens_analysis(text):
|
| 24 |
doc_obj = nlp(text)
|
|
|
|
| 29 |
|
| 30 |
def get_entities_tokens(text):
|
| 31 |
doc_obj = nlp(text)
|
|
|
|
| 32 |
html = displacy.render(doc_obj, style='ent')
|
| 33 |
html = html.replace('\n\n', '\n')
|
| 34 |
entities_tokens_html = HTML_WRAPPER.format(html)
|
|
|
|
| 58 |
|
| 59 |
|
| 60 |
def get_sentence_stats(text):
    """Split *text* into sentences and noun phrases and summarize the counts.

    Parameters
    ----------
    text : str
        Raw input text to analyze.

    Returns
    -------
    tuple
        ``(sentences, noun_phrases, sentence_stats_df)`` where ``sentences``
        is a list of sentence strings, ``noun_phrases`` is a list of noun
        phrase strings, and ``sentence_stats_df`` is a one-row DataFrame
        with the columns 'Number of Sentences' and 'Number of Noun Phrases'.
    """
    def _analyze(t):
        # One parse pass; kept in a helper so the corpus-download fallback
        # below can retry without duplicating this code.
        blob = TextBlob(t)
        return ([str(sentence) for sentence in blob.sentences],
                list(blob.noun_phrases))

    try:
        sentences, noun_phrases = _analyze(text)
    except LookupError:
        # NLTK raises LookupError when a required corpus is absent
        # (typical on a fresh machine). Fetch the corpora TextBlob needs
        # for tokenization, tagging and noun-phrase chunking, then retry.
        # Note: the original used a bare `except:` and returned from a
        # `finally:` block, which silently swallowed unrelated errors.
        for corpus in ('brown', 'punkt', 'wordnet',
                       'averaged_perceptron_tagger', 'conll2000',
                       'movie_reviews'):
            nltk.download(corpus)
        sentences, noun_phrases = _analyze(text)

    sentence_stats = {
        'Number of Sentences': len(sentences),
        'Number of Noun Phrases': len(noun_phrases),
    }
    sentence_stats_df = pd.DataFrame(sentence_stats, index=[0])
    return sentences, noun_phrases, sentence_stats_df
|
| 84 |
|
| 85 |
|
| 86 |
def plot_tokens_pos(tokens_stats_df):
|
|
|
|
| 112 |
plt.axis('off')
|
| 113 |
return fig
|
| 114 |
|
| 115 |
+
|
| 116 |
def main():
|
| 117 |
st.title('Text Analyzer')
|
| 118 |
im1, im2, im3 = st.columns([1, 5.3, 1])
|
|
|
|
| 126 |
|
| 127 |
text = st.text_area('Text Analyzer', placeholder='Enter your input text here ...', height=200, label_visibility='hidden')
|
| 128 |
n_top_keywords = st.sidebar.slider('n Top keywords', 5, 15, 5, 1)
|
| 129 |
+
|
| 130 |
if st.button('Analyze it'):
|
| 131 |
if text != '':
|
| 132 |
with st.expander('Original Text'):
|
|
|
|
| 162 |
st.write('Noun Phrases:\n', noun_phrases)
|
| 163 |
|
| 164 |
with col22:
|
| 165 |
+
with st.expander('The Distribution of different Parts of Speech'):
|
| 166 |
figure = plot_tokens_pos(tokens_stats_df)
|
| 167 |
st.plotly_chart(figure)
|
| 168 |
|