Spaces:

shftan
/

llm-thinking

Sleeping

App Files Files Community

shftan committed on Sep 19

Commit

0f93598

1 Parent(s): 64ea65c

Fix cuda

Browse files

Files changed (2) hide show

app.py +16 -18
utils.py +5 -3

app.py CHANGED Viewed

@@ -1,25 +1,22 @@
 import gradio as gr
 import spaces
-from huggingface_hub import hf_hub_download
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import pyvene as pv
 from utils import get_tokens, select_concepts, get_concepts_dictionary, get_response, plot_tokens_with_highlights
-#zero = torch.Tensor([0]).cuda()
-#print(zero.device) # <-- 'cpu'
-#@spaces.GPU
-#def greet(n):
-#    print(zero.device) # <-- 'cuda:0'
-#    return f"Hello {zero + n} Tensor"
 @spaces.GPU
 def launch_app():
     @spaces.GPU
-    # Function to process user input to the app
     def process_user_input(prompt, concept):
         # Check if prompt or concept are empty
         if not prompt or not concept:
             return f"<h3>Please provide both a prompt and a concept</h3>"
@@ -66,10 +63,10 @@ def launch_app():
         # Combine HTMLs
         output_html = highlighted_tokens_html + concepts_html + "<p>&nbsp;</p>" + response_html + "<p>&nbsp;</p>" + documentation_html
-        return output_html
     # Set model, interpreter, dictionary choices
-    model_name = "google/gemma-2-2b-it"
     interpreter_name = "pyvene/gemma-reft-r1-2b-it-res"
     interpreter_path = "l20/weight.pt"
     interpreter_component = "model.layers[20].output"
@@ -85,14 +82,15 @@ def launch_app():
             return torch.relu(self.proj(base))
     # Load tokenizer and model
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto').to('cuda')
     # Load fast model inference pipeline
     pipe = pipeline(
         task="text-generation",
         model=model_name,
-        use_fast=True
     )
     path_to_params = hf_hub_download(
@@ -100,12 +98,12 @@ def launch_app():
         filename=interpreter_path,
         force_download=False,
     )
-    params = torch.load(path_to_params)
-    encoder = Encoder(embed_dim=params.shape[0], latent_dim=params.shape[1]).cuda()
     encoder.proj.weight.data = params.float()
     pv_model = pv.IntervenableModel({
         "component": interpreter_component,
-        "intervention": encoder}, model=model).cuda()
     # Load dictionary
     all_concepts = get_concepts_dictionary(dictionary_url)
@@ -132,7 +130,7 @@ def launch_app():
             outputs=output_html
         )
-    demo.launch()
 if __name__ == "__main__":
     launch_app()

 import gradio as gr
 import spaces
+from huggingface_hub import HfApi, hf_hub_download
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import pyvene as pv
 from utils import get_tokens, select_concepts, get_concepts_dictionary, get_response, plot_tokens_with_highlights
+import os
+hf_token = os.getenv("HF_TOKEN")
 @spaces.GPU
 def launch_app():
     @spaces.GPU
     def process_user_input(prompt, concept):
+        yield "Processing..."
         # Check if prompt or concept are empty
         if not prompt or not concept:
             return f"<h3>Please provide both a prompt and a concept</h3>"
         # Combine HTMLs
         output_html = highlighted_tokens_html + concepts_html + "<p>&nbsp;</p>" + response_html + "<p>&nbsp;</p>" + documentation_html
+        yield output_html
     # Set model, interpreter, dictionary choices
+    model_name = "google/gemma-3-270m-it" #"google/gemma-2-2b-it"
     interpreter_name = "pyvene/gemma-reft-r1-2b-it-res"
     interpreter_path = "l20/weight.pt"
     interpreter_component = "model.layers[20].output"
             return torch.relu(self.proj(base))
     # Load tokenizer and model
+    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
+    model = AutoModelForCausalLM.from_pretrained(model_name, device_map='auto', token=hf_token).to("cuda" if torch.cuda.is_available() else "cpu")
     # Load fast model inference pipeline
     pipe = pipeline(
         task="text-generation",
         model=model_name,
+        use_fast=True,
+        token=hf_token
     )
     path_to_params = hf_hub_download(
         filename=interpreter_path,
         force_download=False,
     )
+    params = torch.load(path_to_params, map_location="cuda" if torch.cuda.is_available() else "cpu")
+    encoder = Encoder(embed_dim=params.shape[0], latent_dim=params.shape[1]).to("cuda" if torch.cuda.is_available() else "cpu")
     encoder.proj.weight.data = params.float()
     pv_model = pv.IntervenableModel({
         "component": interpreter_component,
+        "intervention": encoder}, model=model).to("cuda" if torch.cuda.is_available() else "cpu")
     # Load dictionary
     all_concepts = get_concepts_dictionary(dictionary_url)
             outputs=output_html
         )
+    demo.launch(debug=True)
 if __name__ == "__main__":
     launch_app()

utils.py CHANGED Viewed

@@ -5,23 +5,25 @@ import requests
 import json
 import pandas as pd
 import torch
 # Function to get tokens given text
 def get_tokens(tokenizer, text):
-  token_ids = tokenizer.encode(text, return_tensors="pt", add_special_tokens=False).to("cuda")
   tokens = tokenizer.convert_ids_to_tokens(token_ids[0])
   return tokens, token_ids
 # Function to apply chat template to prompt
 def decorate_prompt(tokenizer, prompt):
   chat = [
     {"role": "user", "content": prompt},
     {"role": "assistant", "content": ""},
   ]
   text = tokenizer.apply_chat_template(chat, tokenize=False)
-  token_ids = tokenizer.encode(text, return_tensors="pt", add_special_tokens=False).to("cuda")
   return token_ids

 import json
 import pandas as pd
 import torch
+import spaces
 # Function to get tokens given text
+@spaces.GPU
 def get_tokens(tokenizer, text):
+  token_ids = tokenizer.encode(text, return_tensors="pt", add_special_tokens=False).to("cuda" if torch.cuda.is_available() else "cpu")
   tokens = tokenizer.convert_ids_to_tokens(token_ids[0])
   return tokens, token_ids
 # Function to apply chat template to prompt
+@spaces.GPU
 def decorate_prompt(tokenizer, prompt):
   chat = [
     {"role": "user", "content": prompt},
     {"role": "assistant", "content": ""},
   ]
   text = tokenizer.apply_chat_template(chat, tokenize=False)
+  token_ids = tokenizer.encode(text, return_tensors="pt", add_special_tokens=False).to("cuda" if torch.cuda.is_available() else "cpu")
   return token_ids