Spaces:

sirochild
/

mari

Sleeping

App Files Community

sirochild committed on Jul 24

Commit

dc62d4a

verified ·

1 Parent(s): 7c8b666

Upload 4 files

Browse files

Files changed (3) hide show

app.py +42 -48
generate_dialogue_with_swallow.py +15 -22
requirements.txt +3 -8

app.py CHANGED Viewed

@@ -3,9 +3,10 @@ from groq import Groq
 import os
 import json
 from dotenv import load_dotenv
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
-import torch
 import re
 from generate_dialogue_with_swallow import generate_dialogue_with_swallow
 # --- 1. 初期設定とAPIクライアントの初期化 ---
@@ -18,47 +19,44 @@ if not GROQ_API_KEY:
 groq_client = Groq(api_key=GROQ_API_KEY)
-# Swallowモデルの初期化
 print("Swallowモデルをロード中...")
-MODEL_ID = "tokyotech-llm/Swallow-MX-8x7b-NVE-v0.1"
 try:
     # Hugging Face Spaceでの実行時はGPUメモリを節約するための設定
     if os.getenv("SPACE_ID"):
         print("Hugging Face Space環境を検出しました。メモリ効率の良い設定を使用します。")
-        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-        swallow_model = AutoModelForCausalLM.from_pretrained(
-            MODEL_ID,
-            torch_dtype=torch.float16,
-            device_map="auto",
-            load_in_8bit=True
         )
     else:
         # ローカル環境での実行時の設定
-        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-        swallow_model = AutoModelForCausalLM.from_pretrained(
-            MODEL_ID,
-            torch_dtype=torch.float16,
-            device_map="auto"
         )
     print("Swallowモデルのロード完了")
 except Exception as e:
     print(f"Swallowモデルのロードエラー: {e}")
-    # フォールバックとして小さいモデルを使用
-    try:
-        print("フォールバックモデルをロード中...")
-        tokenizer = AutoTokenizer.from_pretrained("elyza/ELYZA-japanese-Llama-2-7b-instruct")
-        swallow_model = AutoModelForCausalLM.from_pretrained(
-            "elyza/ELYZA-japanese-Llama-2-7b-instruct",
-            torch_dtype=torch.float16,
-            device_map="auto",
-            load_in_8bit=True
-        )
-        print("フォールバックモデルのロード完了")
-    except Exception as fallback_error:
-        print(f"フォールバックモデルのロードエラー: {fallback_error}")
-        swallow_model = None
-        tokenizer = None
 # 日本語感情分析モデルの初期化（グローバル変数として保持）
 print("日本語感情分析モデルを初期化中...")
@@ -118,26 +116,20 @@ def detect_scene_change(history, message):
 ---
 # 出力
 """
-    # Swallowモデルを使用してシーン検出
     try:
-        # トークナイズ
-        inputs = tokenizer(prompt, return_tensors="pt").to(swallow_model.device)
-        # 生成パラメータ - シーン検出には低い温度を使用
-        gen_kwargs = {
-            "max_new_tokens": 50,
-            "temperature": 0.1,
-            "top_p": 0.9,
-            "do_sample": False,
-            "pad_token_id": tokenizer.pad_token_id or tokenizer.eos_token_id,
-        }
-        # 生成
-        with torch.no_grad():
-            output = swallow_model.generate(**inputs, **gen_kwargs)
-        # デコード
-        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
         # プロンプトを除去して応答のみを取得
         response_text = generated_text[len(prompt):].strip().lower()
@@ -157,6 +149,8 @@ def detect_scene_change(history, message):
         return None
     except Exception as e:
         print(f"シーン検出LLMエラー: {e}")
         return None
 def generate_scene_instruction_with_groq(affection, stage_name, scene, previous_topic):

 import os
 import json
 from dotenv import load_dotenv
+from transformers import pipeline
 import re
+from llama_cpp import Llama
+from huggingface_hub import hf_hub_download
 from generate_dialogue_with_swallow import generate_dialogue_with_swallow
 # --- 1. 初期設定とAPIクライアントの初期化 ---
 groq_client = Groq(api_key=GROQ_API_KEY)
+# Swallowモデルの初期化（GGUF版）
 print("Swallowモデルをロード中...")
+MODEL_REPO = "mmnga/tokyotech-llm-Swallow-MX-8x7b-NVE-v0.1-gguf"
+MODEL_FILE = "tokyotech-llm-Swallow-MX-8x7b-NVE-v0.1-q4_K_M.gguf"
 try:
+    # モデルファイルをダウンロード
+    print(f"モデルファイル {MODEL_FILE} をダウンロード中...")
+    model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
+    print(f"モデルファイルのダウンロード完了: {model_path}")
     # Hugging Face Spaceでの実行時はGPUメモリを節約するための設定
     if os.getenv("SPACE_ID"):
         print("Hugging Face Space環境を検出しました。メモリ効率の良い設定を使用します。")
+        # GPUを使用し、低いレイヤー数でロード
+        swallow_model = Llama(
+            model_path=model_path,
+            n_ctx=2048,  # コンテキスト長
+            n_gpu_layers=-1,  # 可能な限りGPUを使用
+            n_threads=4,  # スレッド数を制限
+            verbose=False  # デバッグ出力を無効化
         )
     else:
         # ローカル環境での実行時の設定
+        swallow_model = Llama(
+            model_path=model_path,
+            n_ctx=4096,  # より長いコンテキスト長
+            n_gpu_layers=-1,  # 可能な限りGPUを使用
+            verbose=True  # デバッグ出力を有効化
         )
     print("Swallowモデルのロード完了")
+    tokenizer = None  # llama-cppではtokenizerは不要
 except Exception as e:
     print(f"Swallowモデルのロードエラー: {e}")
+    import traceback
+    traceback.print_exc()
+    swallow_model = None
+    tokenizer = None
 # 日本語感情分析モデルの初期化（グローバル変数として保持）
 print("日本語感情分析モデルを初期化中...")
 ---
 # 出力
 """
+    # Swallowモデル（GGUF版）を使用してシーン検出
     try:
+        # llama-cppを使用して生成
+        output = swallow_model(
+            prompt,
+            max_tokens=50,
+            temperature=0.1,
+            top_p=0.9,
+            stop=["#", "\n\n"],
+            echo=True  # 入力プロンプトも含めて返す
+        )
+        # 生成されたテキストを取得
+        generated_text = output["choices"][0]["text"]
         # プロンプトを除去して応答のみを取得
         response_text = generated_text[len(prompt):].strip().lower()
         return None
     except Exception as e:
         print(f"シーン検出LLMエラー: {e}")
+        import traceback
+        traceback.print_exc()
         return None
 def generate_scene_instruction_with_groq(affection, stage_name, scene, previous_topic):

generate_dialogue_with_swallow.py CHANGED Viewed

@@ -1,11 +1,10 @@
-import torch
 import traceback
 import datetime
 import random
 def generate_dialogue_with_swallow(history, message, affection, stage_name, scene_params, instruction=None, use_simple_prompt=False, swallow_model=None, tokenizer=None, SYSTEM_PROMPT_MARI=None):
     """
-    Swallowモデルを使用して対話応答を生成する関数
     Args:
         history: 会話履歴のリスト [(ユーザー発言, ボット応答), ...]
@@ -15,8 +14,8 @@ def generate_dialogue_with_swallow(history, message, affection, stage_name, scen
         scene_params: シーンパラメータの辞書
         instruction: 特別な指示（シーン遷移時など）
         use_simple_prompt: 簡潔なプロンプトを使用するかどうか
-        swallow_model: Swallowモデルのインスタンス
-        tokenizer: トークナイザーのインスタンス
         SYSTEM_PROMPT_MARI: システムプロンプト
     Returns:
@@ -27,7 +26,7 @@ def generate_dialogue_with_swallow(history, message, affection, stage_name, scen
     print(f"scene_params: {scene_params}")
     # モデルがロードされていない場合はフォールバック応答を返す
-    if swallow_model is None or tokenizer is None:
         print("モデルがロードされていないため、フォールバック応答を返します")
         return "（……システムエラーが発生しました）"
@@ -77,24 +76,18 @@ def generate_dialogue_with_swallow(history, message, affection, stage_name, scen
         print(f"システムプロンプト: {system_prompt[:100]}...（省略）")
         try:
-            # トークナイズ
-            inputs = tokenizer(system_prompt, return_tensors="pt").to(swallow_model.device)
-            # 生成パラメータ
-            gen_kwargs = {
-                "max_new_tokens": 200,
-                "temperature": 0.95,
-                "top_p": 0.9,
-                "do_sample": True,
-                "pad_token_id": tokenizer.pad_token_id or tokenizer.eos_token_id,
-            }
-            # 生成
-            with torch.no_grad():
-                output = swallow_model.generate(**inputs, **gen_kwargs)
-            # デコード
-            generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
             # プロンプトを除去して応答のみを取得
             response_text = generated_text[len(system_prompt):].strip()

 import traceback
 import datetime
 import random
 def generate_dialogue_with_swallow(history, message, affection, stage_name, scene_params, instruction=None, use_simple_prompt=False, swallow_model=None, tokenizer=None, SYSTEM_PROMPT_MARI=None):
     """
+    Swallowモデル（GGUF版）を使用して対話応答を生成する関数
     Args:
         history: 会話履歴のリスト [(ユーザー発言, ボット応答), ...]
         scene_params: シーンパラメータの辞書
         instruction: 特別な指示（シーン遷移時など）
         use_simple_prompt: 簡潔なプロンプトを使用するかどうか
+        swallow_model: Swallowモデル（llama-cpp）のインスタンス
+        tokenizer: 未使用（llama-cppでは不要）
         SYSTEM_PROMPT_MARI: システムプロンプト
     Returns:
     print(f"scene_params: {scene_params}")
     # モデルがロードされていない場合はフォールバック応答を返す
+    if swallow_model is None:
         print("モデルがロードされていないため、フォールバック応答を返します")
         return "（……システムエラーが発生しました）"
         print(f"システムプロンプト: {system_prompt[:100]}...（省略）")
         try:
+            # llama-cppを使用して生成
+            output = swallow_model(
+                system_prompt,
+                max_tokens=200,
+                temperature=0.95,
+                top_p=0.9,
+                stop=["ユーザー:", "\n\n"],
+                echo=True  # 入力プロンプトも含めて返す
+            )
+            # 生成されたテキストを取得
+            generated_text = output["choices"][0]["text"]
             # プロンプトを除去して応答のみを取得
             response_text = generated_text[len(system_prompt):].strip()

requirements.txt CHANGED Viewed

@@ -1,14 +1,9 @@
 gradio>=5.0.0
 groq
 python-dotenv
-transformers>=4.34.0
-torch>=2.0.0
-sentencepiece
 fugashi
 unidic_lite
-accelerate>=0.20.0
-bitsandbytes>=0.41.0
-einops>=0.6.0
-safetensors>=0.3.1
-huggingface_hub>=0.16.0
 protobuf>=3.20.0

 gradio>=5.0.0
 groq
 python-dotenv
+llama-cpp-python>=0.2.19
+huggingface_hub>=0.16.0
 fugashi
 unidic_lite
+transformers>=4.34.0
 protobuf>=3.20.0