Spaces:

Avinash109
/

qwen2.5

Sleeping

Avinash109 committed on Nov 12, 2024

Commit

fb19b6e

verified ·

1 Parent(s): 088f906

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import streamlit as st
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 import datetime
 # Page configuration
@@ -17,28 +17,34 @@ if 'messages' not in st.session_state:
 # Cache the model loading
 @st.cache_resource
 def load_model_and_tokenizer():
-    model_name = "Qwen/Qwen2.5-Coder-32B-Instruct"
-    # Configure quantization
-    bnb_config = BitsAndBytesConfig(
-        load_in_8bit=True,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_dtype=torch.float16,
-        bnb_4bit_use_double_quant=False,
-    )
-    # Load tokenizer and model
     tokenizer = AutoTokenizer.from_pretrained(
         model_name,
         trust_remote_code=True
     )
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        quantization_config=bnb_config,
-        torch_dtype=torch.float16,
-        device_map="auto",
-        trust_remote_code=True
-    )
     return tokenizer, model
@@ -52,7 +58,7 @@ with st.sidebar:
     max_length = st.slider(
         "Maximum Length",
         min_value=64,
-        max_value=4096,
         value=512,
         step=64,
         help="Maximum number of tokens to generate"

 import streamlit as st
 import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
 import datetime
 # Page configuration
 # Cache the model loading
 @st.cache_resource
 def load_model_and_tokenizer():
+    model_name = "Qwen/Qwen2.5-Coder-7B-Instruct"  # Using smaller 7B model
+    # Load tokenizer
     tokenizer = AutoTokenizer.from_pretrained(
         model_name,
         trust_remote_code=True
     )
+    # Determine device
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    st.info(f"Using device: {device}")
+    # Load model with appropriate settings for CPU/GPU
+    if device == "cuda":
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.float16,
+            device_map="auto",
+            trust_remote_code=True
+        )
+    else:
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.float32,
+            device_map={"": device},
+            trust_remote_code=True,
+            low_cpu_mem_usage=True
+        )
     return tokenizer, model
     max_length = st.slider(
         "Maximum Length",
         min_value=64,
+        max_value=2048,  # Reduced for CPU usage
         value=512,
         step=64,
         help="Maximum number of tokens to generate"