Spaces: Running on Zero
Changed initialization to MASK tokens instead of EOS tokens
app.py CHANGED
```diff
@@ -16,8 +16,8 @@ hf_token = os.getenv("HF_TOKEN")
 # --- Load tokenizer ---
 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B", use_fast=True, token=hf_token)
 vocab_size = len(tokenizer)
-
-
+eos_token_id = tokenizer.eos_token_id
+mask_token_id = tokenizer.encode('MASK', add_special_tokens=False)[0]
 assistant_marker_ids = tokenizer.encode("Assistant:", add_special_tokens=False)
 
 # def load_model():
```
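The two new module-level ids are worth unpacking: 'MASK' is not a reserved special token in the Llama-3.2 vocabulary, so `tokenizer.encode('MASK', add_special_tokens=False)[0]` just takes the id of the first sub-token of the literal string "MASK". A minimal standalone sketch (not part of the Space's code) of what the two assignments resolve to, assuming access to the same gated tokenizer:

```python
# Standalone check, assuming access to the gated meta-llama/Llama-3.2-3B repo.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B", use_fast=True)

eos_token_id = tokenizer.eos_token_id                          # built-in end-of-text id
mask_ids = tokenizer.encode("MASK", add_special_tokens=False)  # ids of the plain string "MASK"
mask_token_id = mask_ids[0]                                    # keep only the first sub-token

print(eos_token_id, mask_ids, tokenizer.decode([mask_token_id]))
```

If "MASK" splits into more than one sub-token, only the first is used as the placeholder, so a still-masked region decodes to that fragment repeated rather than to the full string "MASK".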
```diff
@@ -114,7 +114,6 @@ def confidence_guided_noising(input_ids, answer_start, confidences, noise_clippi
     noised = input_ids.copy()
     answer_len = len(input_ids) - answer_start
     num_to_noise = int(threshold * answer_len * noise_start)
-    mask_token_id = tokenizer.encode('MASK', add_special_tokens = False)[0]
 
     if num_to_noise == 0:
         return noised
```
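Only a few lines of `confidence_guided_noising` appear in this hunk; the commit simply hoists `mask_token_id` to module scope instead of re-encoding it on every call. For context, a hypothetical reconstruction of such a function, assuming it re-masks the lowest-confidence answer positions (the signature and the selection rule here are guesses, not the Space's actual code):

```python
import numpy as np

def confidence_guided_noising(input_ids, answer_start, confidences,
                              mask_token_id, threshold=1.0, noise_start=1.0):
    noised = list(input_ids)
    answer_len = len(input_ids) - answer_start
    num_to_noise = int(threshold * answer_len * noise_start)
    if num_to_noise == 0:
        return noised
    # Assumed rule: re-mask the least confident answer positions so the
    # next denoising iteration predicts them again.
    worst = np.argsort(np.asarray(confidences[answer_start:]))[:num_to_noise]
    for j in worst:
        noised[answer_start + int(j)] = mask_token_id
    return noised
```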
```diff
@@ -176,7 +175,7 @@ def diffusion_chat(question, max_it, pause_length, sharpness, clustering, noise_
         return
 
     if len(input_ids) < 256:
-        input_ids += [
+        input_ids += [mask_token_id] * (256 - len(input_ids))
     else:
         input_ids = input_ids[:256]
 
```
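This is the change the commit title refers to: the fixed 256-token context is now padded with MASK placeholders rather than EOS tokens, so the region to be generated starts out explicitly masked. A small sketch of the pad-or-truncate step (the helper name is hypothetical; the diff inlines this logic):

```python
CONTEXT_LEN = 256  # the diff uses the literal 256

def pad_or_truncate(input_ids, pad_id):
    # Pad the prompt out to a fixed length with pad_id (mask_token_id after
    # this commit, an EOS-based fill before it), or truncate if too long.
    if len(input_ids) < CONTEXT_LEN:
        return input_ids + [pad_id] * (CONTEXT_LEN - len(input_ids))
    return input_ids[:CONTEXT_LEN]
```

Under this sketch the call site would be `pad_or_truncate(input_ids, mask_token_id)`.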
```diff
@@ -203,7 +202,7 @@ def diffusion_chat(question, max_it, pause_length, sharpness, clustering, noise_
         highlighted = []
         for j, tok in enumerate(decoded_tokens):
             tok_id = tokenizer.convert_tokens_to_ids(tok)
-            if tok_id ==
+            if tok_id == eos_token_id:
                 continue
             token_str = tokenizer.convert_tokens_to_string([tok])
             if prev_decoded_tokens and j < len(prev_decoded_tokens) and tok != prev_decoded_tokens[j]:
```
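After the change, the display loop skips the module-level `eos_token_id`, so EOS ids never reach the streamed HTML output. A sketch of the full loop, with an assumed else-branch and markup (only the skip and the changed-token test come from the diff):

```python
def render_tokens(decoded_tokens, prev_decoded_tokens, tokenizer, eos_token_id):
    highlighted = []
    for j, tok in enumerate(decoded_tokens):
        tok_id = tokenizer.convert_tokens_to_ids(tok)
        if tok_id == eos_token_id:
            continue  # hide EOS padding in the displayed text
        token_str = tokenizer.convert_tokens_to_string([tok])
        if prev_decoded_tokens and j < len(prev_decoded_tokens) and tok != prev_decoded_tokens[j]:
            # Assumed markup: emphasize tokens that changed since the last iteration.
            highlighted.append(f"<b>{token_str}</b>")
        else:
            highlighted.append(token_str)
    return "".join(highlighted)
```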
```diff
@@ -245,7 +244,7 @@ def diffusion_chat(question, max_it, pause_length, sharpness, clustering, noise_
         highlighted = []
         for j, tok in enumerate(decoded_tokens):
             tok_id = tokenizer.convert_tokens_to_ids(tok)
-            if tok_id ==
+            if tok_id == eos_token_id:
                 continue
             token_str = tokenizer.convert_tokens_to_string([tok])
             abs_idx = answer_start + j
```
```diff
@@ -259,7 +258,7 @@ def diffusion_chat(question, max_it, pause_length, sharpness, clustering, noise_
 
 
     final_tokens = tokenizer.convert_ids_to_tokens(current_tokens[answer_start:])
-    final_tokens = [tok for tok in final_tokens if tokenizer.convert_tokens_to_ids(tok) !=
+    final_tokens = [tok for tok in final_tokens if tokenizer.convert_tokens_to_ids(tok) != eos_token_id]
    final_output = tokenizer.convert_tokens_to_string(final_tokens)
     print(final_output)
     yield f"<b>Final Output (after {i+1} iterations):</b><br>" + final_output.replace('\n', '<br>')
```
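The final step still filters on EOS: any EOS ids left in the answer region are dropped before the tokens are joined back into text. As a self-contained sketch (names follow the diff; the function wrapper is added for illustration):

```python
def final_text(current_tokens, answer_start, tokenizer, eos_token_id):
    # Drop EOS ids from the generated region, then join the rest into a string.
    toks = tokenizer.convert_ids_to_tokens(current_tokens[answer_start:])
    toks = [t for t in toks if tokenizer.convert_tokens_to_ids(t) != eos_token_id]
    return tokenizer.convert_tokens_to_string(toks)
```

Note that only EOS ids are filtered here; any MASK placeholders that survive the last iteration would still appear in the final output.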