Spaces:

orionweller
/

human-mlm-clm-predictor

Runtime error

App Files Community

orionweller committed on Mar 4

Commit

ea15511

verified ·

1 Parent(s): 9b671c4

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -4

app.py CHANGED Viewed

@@ -60,13 +60,21 @@ def prepare_mlm_sample(text, mask_ratio=0.15):
     global masked_indices, masked_tokens, original_text
     tokens = tokenizer.tokenize(text)
     # Only mask whole words, not special tokens or punctuation
     maskable_indices = [i for i, token in enumerate(tokens)
                         if not token.startswith("##") and not token.startswith("[") and not token.endswith("]")
                         and token not in [".", ",", "!", "?", ";", ":", "'", "\"", "-"]]
     # Calculate how many tokens to mask, but ensure at least 1 and at most 8
     num_to_mask = max(1, min(8, int(len(maskable_indices) * mask_ratio)))
     # Randomly select indices to mask
     indices_to_mask = random.sample(maskable_indices, min(num_to_mask, len(maskable_indices)))
     # Sort indices to ensure they're in order
@@ -101,15 +109,20 @@ def prepare_ntp_sample(text, cut_ratio=0.3):
     # Tokenize text to ensure reasonable cutting
     tokens = tokenizer.tokenize(text)
     # Ensure we have enough tokens
     if len(tokens) < 5:
         return text, ""  # Return original if too short
-    # Calculate cutoff point (70% of tokens if cut_ratio is 0.3)
-    # But make sure we have at least 3 tokens visible and 1 token hidden
     cutoff = max(3, int(len(tokens) * (1 - cut_ratio)))
     cutoff = min(cutoff, len(tokens) - 1)  # Ensure there's at least 1 token to predict
     # Get the visible part
     visible_tokens = tokens[:cutoff]
@@ -120,15 +133,24 @@ def prepare_ntp_sample(text, cut_ratio=0.3):
     visible_text = tokenizer.convert_tokens_to_string(visible_tokens)
     hidden_text = tokenizer.convert_tokens_to_string(hidden_tokens)
     return visible_text, hidden_text
 def get_new_sample(task, mask_ratio=0.15):
     """Get a new text sample based on the task."""
-    global current_sample, masked_text, masked_indices, masked_tokens, original_text, ntp_state
     # Select a random sample
     current_sample = random.choice(data_samples)
     if task == "mlm":
         # Prepare MLM sample
         masked_text, masked_indices, masked_tokens = prepare_mlm_sample(current_sample, mask_ratio)
@@ -373,7 +395,7 @@ with gr.Blocks(title="MLM and NTP Testing") as demo:
     )
     with gr.Row():
-        new_button = gr.Button("New Sample")
         reset_button = gr.Button("Reset Stats")
     # Consolidated input area - only one visible at a time
@@ -433,8 +455,21 @@ with gr.Blocks(title="MLM and NTP Testing") as demo:
         outputs=[mlm_instructions, ntp_instructions, answer_input, mask_count]
     )
     # Update the sample text and also update the mask count
     def new_sample_with_count(mask_ratio_pct, task):
         ratio = float(mask_ratio_pct) / 100.0
         sample = get_new_sample(task, ratio)
         mask_count_text = ""
@@ -442,8 +477,10 @@ with gr.Blocks(title="MLM and NTP Testing") as demo:
         if task == "mlm":
             count = len(masked_tokens)
             mask_count_text = f"**Number of [MASK] tokens to guess: {count}**"
         else:
             mask_count_text = "**Next Token Prediction mode - guess one token at a time**"
         return sample, mask_count_text, ""

     global masked_indices, masked_tokens, original_text
     tokens = tokenizer.tokenize(text)
+    print(f"Text length: {len(text)} characters, {len(tokens)} tokens")
     # Only mask whole words, not special tokens or punctuation
     maskable_indices = [i for i, token in enumerate(tokens)
                         if not token.startswith("##") and not token.startswith("[") and not token.endswith("]")
                         and token not in [".", ",", "!", "?", ";", ":", "'", "\"", "-"]]
+    print(f"Maskable indices count: {len(maskable_indices)}")
+    print(f"Mask ratio: {mask_ratio}")
     # Calculate how many tokens to mask, but ensure at least 1 and at most 8
+    # Use the maskable_indices length with the ratio
     num_to_mask = max(1, min(8, int(len(maskable_indices) * mask_ratio)))
+    print(f"Number of tokens to mask: {num_to_mask}")
     # Randomly select indices to mask
     indices_to_mask = random.sample(maskable_indices, min(num_to_mask, len(maskable_indices)))
     # Sort indices to ensure they're in order
     # Tokenize text to ensure reasonable cutting
     tokens = tokenizer.tokenize(text)
+    # Print debug info
+    print(f"NTP preparation - Text length: {len(text)} characters, {len(tokens)} tokens")
+    print(f"Cut ratio: {cut_ratio}")
     # Ensure we have enough tokens
     if len(tokens) < 5:
         return text, ""  # Return original if too short
+    # Calculate cutoff point based on the cut ratio
     cutoff = max(3, int(len(tokens) * (1 - cut_ratio)))
     cutoff = min(cutoff, len(tokens) - 1)  # Ensure there's at least 1 token to predict
+    print(f"Cutoff point: {cutoff} (keeping {cutoff} tokens, cutting {len(tokens) - cutoff} tokens)")
     # Get the visible part
     visible_tokens = tokens[:cutoff]
     visible_text = tokenizer.convert_tokens_to_string(visible_tokens)
     hidden_text = tokenizer.convert_tokens_to_string(hidden_tokens)
+    print(f"Visible text length: {len(visible_text)} chars")
+    print(f"Hidden text length: {len(hidden_text)} chars")
     return visible_text, hidden_text
 def get_new_sample(task, mask_ratio=0.15):
     """Get a new text sample based on the task."""
+    global current_sample, masked_text, masked_indices, masked_tokens, original_text, ntp_state, current_task
+    # Update current task
+    current_task = task
     # Select a random sample
     current_sample = random.choice(data_samples)
+    # Print debugging info
+    print(f"Getting new sample for task: {task} with mask ratio: {mask_ratio}")
     if task == "mlm":
         # Prepare MLM sample
         masked_text, masked_indices, masked_tokens = prepare_mlm_sample(current_sample, mask_ratio)
     )
     with gr.Row():
+        new_button = gr.Button("New Sample", variant="primary")
         reset_button = gr.Button("Reset Stats")
     # Consolidated input area - only one visible at a time
         outputs=[mlm_instructions, ntp_instructions, answer_input, mask_count]
     )
+    # Update the sample text when mask ratio changes (without clicking new sample)
+    def update_on_ratio_change(mask_ratio_pct, task):
+        print(f"Ratio changed to {mask_ratio_pct}%")
+        # Don't generate a new sample here, just update the UI to show the effect of ratio change
+        return f"Current mask/cut ratio: {mask_ratio_pct}%. Click 'New Sample' to apply."
+    mask_ratio.change(
+        update_on_ratio_change,
+        inputs=[mask_ratio, task_radio],
+        outputs=[result]
+    )
     # Update the sample text and also update the mask count
     def new_sample_with_count(mask_ratio_pct, task):
+        print(f"Generating new sample with mask ratio: {mask_ratio_pct}% for task: {task}")
         ratio = float(mask_ratio_pct) / 100.0
         sample = get_new_sample(task, ratio)
         mask_count_text = ""
         if task == "mlm":
             count = len(masked_tokens)
             mask_count_text = f"**Number of [MASK] tokens to guess: {count}**"
+            print(f"Generated MLM sample with {count} masks at ratio {ratio}")
         else:
             mask_count_text = "**Next Token Prediction mode - guess one token at a time**"
+            print(f"Generated NTP sample with cut ratio {ratio}")
         return sample, mask_count_text, ""