mihalykiss committed on
Commit
cd8fc19
·
verified ·
1 Parent(s): e74dcef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -19
app.py CHANGED
@@ -52,7 +52,6 @@ def clean_text(text: str) -> str:
52
  text = re.sub(r'\s+([,.;:?!])', r'\1', text)
53
  return text
54
 
55
- # --- Tokenizer Normalizer Configuration ---
56
  newline_to_space = Replace(Regex(r'\s*\n\s*'), " ")
57
  join_hyphen_break = Replace(Regex(r'(\w+)[--]\s*\n\s*(\w+)'), r"\1\2")
58
  tokenizer.backend_tokenizer.normalizer = Sequence([
@@ -82,27 +81,22 @@ def classify_text(text):
82
  logits_2 = model_2(**inputs).logits
83
  logits_3 = model_3(**inputs).logits
84
 
85
- # Apply softmax to get probabilities
86
  softmax_1 = torch.softmax(logits_1, dim=1)
87
  softmax_2 = torch.softmax(logits_2, dim=1)
88
  softmax_3 = torch.softmax(logits_3, dim=1)
89
 
90
- # Average the probabilities from the three models
91
  averaged_probabilities = (softmax_1 + softmax_2 + softmax_3) / 3
92
  probabilities = averaged_probabilities[0]
93
 
94
- # --- Generate Text Result ---
95
  human_prob = probabilities[24].item()
96
  ai_probs_clone = probabilities.clone()
97
- ai_probs_clone[24] = 0 # Exclude human probability for AI total
98
  ai_total_prob = ai_probs_clone.sum().item()
99
 
100
- # Normalize probabilities to get a percentage-based decision
101
  total_decision_prob = human_prob + ai_total_prob
102
  human_percentage = (human_prob / total_decision_prob) * 100
103
  ai_percentage = (ai_total_prob / total_decision_prob) * 100
104
 
105
- # Determine the most likely AI model
106
  ai_argmax_index = torch.argmax(ai_probs_clone).item()
107
  ai_argmax_model = label_mapping[ai_argmax_index]
108
 
@@ -116,38 +110,33 @@ def classify_text(text):
116
  f"**Identified LLM: {ai_argmax_model}**"
117
  )
118
 
119
- # --- Generate Plot ---
120
- # Find the top 5 AI models by probability
121
  ai_probs_for_plot = probabilities.clone()
122
- ai_probs_for_plot[24] = -1 # Ensure 'human' isn't in the top 5 AI list
123
  top_5_probs, top_5_indices = torch.topk(ai_probs_for_plot, 5)
124
 
125
- # Prepare data for plotting
126
  top_5_probs = top_5_probs.cpu().numpy()
127
  top_5_labels = [label_mapping[i.item()] for i in top_5_indices]
128
 
129
- # Create a horizontal bar plot
130
  fig, ax = plt.subplots(figsize=(10, 5))
131
  bars = ax.barh(top_5_labels, top_5_probs, color='#4CAF50', alpha=0.8)
132
  ax.set_xlabel('Probability', fontsize=12)
133
  ax.set_title('Top 5 Predicted AI Models', fontsize=14, fontweight='bold')
134
- ax.invert_yaxis() # Highest probability on top
135
  ax.grid(axis='x', linestyle='--', alpha=0.6)
136
 
137
- # Add percentage labels on the bars
138
  for bar in bars:
139
  width = bar.get_width()
140
  label_x_pos = width + 0.01
141
  ax.text(label_x_pos, bar.get_y() + bar.get_height() / 2, f'{width:.2%}', va='center')
142
 
143
- ax.set_xlim(0, max(top_5_probs) * 1.18) # Adjust x-axis limit for labels
144
  plt.tight_layout()
145
 
146
- # Return both the text message and the plot figure
147
  return result_message, fig
148
 
149
 
150
- # --- Gradio Interface Definition ---
151
 
152
  title = "AI Text Detector"
153
 
@@ -165,7 +154,6 @@ Paste your text below to analyze its origin.
165
  """
166
  bottom_text = "**Developed by SzegedAI**"
167
 
168
- # Example texts
169
  AI_texts = [
170
  "Camels are remarkable desert animals known for their unique adaptations to harsh, arid environments. Native to the Middle East, North Africa, and parts of Asia, camels have been essential to human life for centuries, serving as a mode of transportation, a source of food, and even a symbol of endurance and survival. There are two primary species of camels: the dromedary camel, which has a single hump and is commonly found in the Middle East and North Africa, and the Bactrian camel, which has two humps and is native to Central Asia. Their humps store fat, not water, as commonly believed, allowing them to survive long periods without food by metabolizing the stored fat for energy. Camels are highly adapted to desert life. They can go for weeks without water, and when they do drink, they can consume up to 40 gallons in one sitting. Their thick eyelashes, sealable nostrils, and wide, padded feet protect them from sand and help them walk easily on loose desert terrain.",
171
  "Wines are a fascinating reflection of culture, history, and craftsmanship. They embody a rich diversity shaped by the land, climate, and traditions where they are produced. From the bold reds of Bordeaux to the crisp whites of New Zealand, each bottle tells a unique story. What makes wine so special is its ability to connect people. Whether shared at a family dinner, a celebratory event, or a quiet evening with friends, wine enhances experiences and brings people together. The variety of flavors and aromas, influenced by grape type, fermentation techniques, and aging processes, make wine tasting a complex yet rewarding journey for the senses.",
@@ -177,7 +165,6 @@ Human_texts = [
177
  "Fats are rich in energy, build body cells, support brain development of infants, help body processes, and facilitate the absorption and use of fat-soluble vitamins A, D, E, and K. The major component of lipids is glycerol and fatty acids. According to chemical properties, fatty acids can be divided into saturated and unsaturated fatty acids. Generally lipids containing saturated fatty acids are solid at room temperature and include animal fats (butter, lard, tallow, ghee) and tropical oils (palm,coconut, palm kernel). Saturated fats increase the risk of heart disease.",
178
  "To make BERT handle a variety of down-stream tasks, our input representation is able to unambiguously represent both a single sentence and a pair of sentences (e.g., h Question, Answeri) in one token sequence. Throughout this work, a “sentence” can be an arbitrary span of contiguous text, rather than an actual linguistic sentence. A “sequence” refers to the input token sequence to BERT, which may be a single sentence or two sentences packed together. We use WordPiece embeddings (Wu et al., 2016) with a 30,000 token vocabulary. The first token of every sequence is always a special classification token ([CLS]). The final hidden state corresponding to this token is used as the aggregate sequence representation for classification tasks. Sentence pairs are packed together into a single sequence."]
179
 
180
- # Define the Gradio interface with CSS styling
181
  iface = gr.Blocks(css="""
182
  @import url('https://fonts.googleapis.com/css2?family=Roboto+Mono:wght@400;700&display=swap');
183
  #text_input_box { border-radius: 10px; border: 2px solid #4CAF50; font-size: 18px; padding: 15px; margin-bottom: 20px; width: 60%; box-sizing: border-box; margin: auto; }
 
52
  text = re.sub(r'\s+([,.;:?!])', r'\1', text)
53
  return text
54
 
 
55
  newline_to_space = Replace(Regex(r'\s*\n\s*'), " ")
56
  join_hyphen_break = Replace(Regex(r'(\w+)[--]\s*\n\s*(\w+)'), r"\1\2")
57
  tokenizer.backend_tokenizer.normalizer = Sequence([
 
81
  logits_2 = model_2(**inputs).logits
82
  logits_3 = model_3(**inputs).logits
83
 
 
84
  softmax_1 = torch.softmax(logits_1, dim=1)
85
  softmax_2 = torch.softmax(logits_2, dim=1)
86
  softmax_3 = torch.softmax(logits_3, dim=1)
87
 
 
88
  averaged_probabilities = (softmax_1 + softmax_2 + softmax_3) / 3
89
  probabilities = averaged_probabilities[0]
90
 
 
91
  human_prob = probabilities[24].item()
92
  ai_probs_clone = probabilities.clone()
93
+ ai_probs_clone[24] = 0
94
  ai_total_prob = ai_probs_clone.sum().item()
95
 
 
96
  total_decision_prob = human_prob + ai_total_prob
97
  human_percentage = (human_prob / total_decision_prob) * 100
98
  ai_percentage = (ai_total_prob / total_decision_prob) * 100
99
 
 
100
  ai_argmax_index = torch.argmax(ai_probs_clone).item()
101
  ai_argmax_model = label_mapping[ai_argmax_index]
102
 
 
110
  f"**Identified LLM: {ai_argmax_model}**"
111
  )
112
 
113
+
 
114
  ai_probs_for_plot = probabilities.clone()
 
115
  top_5_probs, top_5_indices = torch.topk(ai_probs_for_plot, 5)
116
 
 
117
  top_5_probs = top_5_probs.cpu().numpy()
118
  top_5_labels = [label_mapping[i.item()] for i in top_5_indices]
119
 
 
120
  fig, ax = plt.subplots(figsize=(10, 5))
121
  bars = ax.barh(top_5_labels, top_5_probs, color='#4CAF50', alpha=0.8)
122
  ax.set_xlabel('Probability', fontsize=12)
123
  ax.set_title('Top 5 Predicted AI Models', fontsize=14, fontweight='bold')
124
+ ax.invert_yaxis()
125
  ax.grid(axis='x', linestyle='--', alpha=0.6)
126
 
127
+
128
  for bar in bars:
129
  width = bar.get_width()
130
  label_x_pos = width + 0.01
131
  ax.text(label_x_pos, bar.get_y() + bar.get_height() / 2, f'{width:.2%}', va='center')
132
 
133
+ ax.set_xlim(0, max(top_5_probs) * 1.18)
134
  plt.tight_layout()
135
 
136
+
137
  return result_message, fig
138
 
139
 
 
140
 
141
  title = "AI Text Detector"
142
 
 
154
  """
155
  bottom_text = "**Developed by SzegedAI**"
156
 
 
157
  AI_texts = [
158
  "Camels are remarkable desert animals known for their unique adaptations to harsh, arid environments. Native to the Middle East, North Africa, and parts of Asia, camels have been essential to human life for centuries, serving as a mode of transportation, a source of food, and even a symbol of endurance and survival. There are two primary species of camels: the dromedary camel, which has a single hump and is commonly found in the Middle East and North Africa, and the Bactrian camel, which has two humps and is native to Central Asia. Their humps store fat, not water, as commonly believed, allowing them to survive long periods without food by metabolizing the stored fat for energy. Camels are highly adapted to desert life. They can go for weeks without water, and when they do drink, they can consume up to 40 gallons in one sitting. Their thick eyelashes, sealable nostrils, and wide, padded feet protect them from sand and help them walk easily on loose desert terrain.",
159
  "Wines are a fascinating reflection of culture, history, and craftsmanship. They embody a rich diversity shaped by the land, climate, and traditions where they are produced. From the bold reds of Bordeaux to the crisp whites of New Zealand, each bottle tells a unique story. What makes wine so special is its ability to connect people. Whether shared at a family dinner, a celebratory event, or a quiet evening with friends, wine enhances experiences and brings people together. The variety of flavors and aromas, influenced by grape type, fermentation techniques, and aging processes, make wine tasting a complex yet rewarding journey for the senses.",
 
165
  "Fats are rich in energy, build body cells, support brain development of infants, help body processes, and facilitate the absorption and use of fat-soluble vitamins A, D, E, and K. The major component of lipids is glycerol and fatty acids. According to chemical properties, fatty acids can be divided into saturated and unsaturated fatty acids. Generally lipids containing saturated fatty acids are solid at room temperature and include animal fats (butter, lard, tallow, ghee) and tropical oils (palm,coconut, palm kernel). Saturated fats increase the risk of heart disease.",
166
  "To make BERT handle a variety of down-stream tasks, our input representation is able to unambiguously represent both a single sentence and a pair of sentences (e.g., h Question, Answeri) in one token sequence. Throughout this work, a “sentence” can be an arbitrary span of contiguous text, rather than an actual linguistic sentence. A “sequence” refers to the input token sequence to BERT, which may be a single sentence or two sentences packed together. We use WordPiece embeddings (Wu et al., 2016) with a 30,000 token vocabulary. The first token of every sequence is always a special classification token ([CLS]). The final hidden state corresponding to this token is used as the aggregate sequence representation for classification tasks. Sentence pairs are packed together into a single sequence."]
167
 
 
168
  iface = gr.Blocks(css="""
169
  @import url('https://fonts.googleapis.com/css2?family=Roboto+Mono:wght@400;700&display=swap');
170
  #text_input_box { border-radius: 10px; border: 2px solid #4CAF50; font-size: 18px; padding: 15px; margin-bottom: 20px; width: 60%; box-sizing: border-box; margin: auto; }