NewJapaneseLLM

Running on Zero

App Files Files Community

aixsatoshi committed on Jul 23, 2024

Commit

9e9c8af

verified ·

1 Parent(s): bcc0940

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -38

app.py CHANGED Viewed

@@ -1,26 +1,24 @@
-import torch
-from PIL import Image
-import gradio as gr
 import spaces
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-import os
 from threading import Thread
-HF_TOKEN = os.environ.get("HF_TOKEN", None)
-MODEL_ID = "aixsatoshi/Llama-3-Elyza-Youko-moe-2x8B"
-MODELS = os.environ.get("MODELS")
-MODEL_NAME = MODELS.split("/")[-1]
-TITLE = "<h1><center>Llama-3-Elyza-Youko-moe-2x8B Chat webui</center></h1>"
-DESCRIPTION = f"""
-<h3>MODEL: <a href="https://hf.co/{MODELS}">{MODEL_NAME}</a></h3>
 <center>
-<p>Llama-3-Elyza-JA-8B is the large language model built by Elyza.
-<p>Llama-3-youko-8B is the large language model built by rinna.
-<br>
-Feel free to test without log.
-</p>
 </center>
 """
@@ -42,24 +40,15 @@ h3 {
 }
 """
-model = AutoModelForCausalLM.from_pretrained(
-          MODEL_ID,
-          torch_dtype=torch.float16,
-          device_map="auto",
-        )
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-@spaces.GPU
 def stream_chat(message: str, history: list, temperature: float, max_new_tokens: int, top_p: float, top_k: int, penalty: float):
-    print(f'message is - {message}')
-    print(f'history is - {history}')
     conversation = []
     for prompt, answer in history:
         conversation.extend([{"role": "user", "content": prompt}, {"role": "assistant", "content": answer}])
     conversation.append({"role": "user", "content": message})
-    #print(f"Conversation is -\n{conversation}")
     input_ids = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
     inputs = tokenizer(input_ids, return_tensors="pt").to(0)
@@ -75,7 +64,7 @@ def stream_chat(message: str, history: list, temperature: float, max_new_tokens:
         max_new_tokens=max_new_tokens,
         do_sample=True,
         temperature=temperature,
-        eos_token_id = [128001, 128009],
     )
     thread = Thread(target=model.generate, kwargs=generate_kwargs)
@@ -86,8 +75,6 @@ def stream_chat(message: str, history: list, temperature: float, max_new_tokens:
         buffer += new_text
         yield buffer
 chatbot = gr.Chatbot(height=500)
 with gr.Blocks(css=CSS) as demo:
@@ -145,15 +132,14 @@ with gr.Blocks(css=CSS) as demo:
             ),
         ],
         examples=[
-            ["超能力を持つ主人公のSF物語のシナリオを考えてください。伏線の設定、テーマやログラインを理論的に使用してください"],
-            ["子供の夏休みの自由研究のための、5つのアイデアと、その手法を簡潔に教えてください。"],
-            ["パズルゲームのスクリプト作成のためにアドバイスお願いします"],
-            ["マークダウン記法にて、ブロック崩しのゲーム作成の教科書作成してください"],
         ],
         cache_examples=False,
     )
 if __name__ == "__main__":
     demo.launch()

 import spaces
+import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+import gradio as gr
 from threading import Thread
+model_id = "hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(
+  model_id,
+  torch_dtype=torch.float16,
+  low_cpu_mem_usage=True,
+  device_map="auto",
+)
+TITLE = "<h1><center>Meta-Llama-3.1-70B-Instruct-AWQ-INT4 Chat webui</center></h1>"
+DESCRIPTION = """
+<h3>MODEL: <a href="https://hf.co/hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4">Meta-Llama-3.1-70B-Instruct-AWQ-INT4</a></h3>
 <center>
+<p>This model is designed for conversational interactions.</p>
 </center>
 """
 }
 """
+@gr.GPU
 def stream_chat(message: str, history: list, temperature: float, max_new_tokens: int, top_p: float, top_k: int, penalty: float):
+    print(f'Message: {message}')
+    print(f'History: {history}')
     conversation = []
     for prompt, answer in history:
         conversation.extend([{"role": "user", "content": prompt}, {"role": "assistant", "content": answer}])
     conversation.append({"role": "user", "content": message})
     input_ids = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
     inputs = tokenizer(input_ids, return_tensors="pt").to(0)
         max_new_tokens=max_new_tokens,
         do_sample=True,
         temperature=temperature,
+        eos_token_id=[128001, 128009],
     )
     thread = Thread(target=model.generate, kwargs=generate_kwargs)
         buffer += new_text
         yield buffer
 chatbot = gr.Chatbot(height=500)
 with gr.Blocks(css=CSS) as demo:
             ),
         ],
         examples=[
+            ["Explain Deep Learning as a pirate."],
+            ["Give me five ideas for a child's summer science project."],
+            ["Provide advice for writing a script for a puzzle game."],
+            ["Create a tutorial for building a breakout game using markdown."],
         ],
         cache_examples=False,
     )
 if __name__ == "__main__":
     demo.launch()