Update app & requirements
- app.py +36 -18
- app_quantized.py +45 -0
- requirements.txt +2 -1
app.py
CHANGED
@@ -1,10 +1,11 @@
 from fastapi import FastAPI
 from fastapi.responses import StreamingResponse
 from pydantic import BaseModel
-from transformers import AutoModelForCausalLM,
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
 import torch
 import os
 import uvicorn
+import threading

 app = FastAPI()

@@ -39,28 +40,45 @@ class ChatRequest(BaseModel):
 # Generator for streaming tokens
 def generate_stream(prompt, max_new_tokens=128):
     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-    streamer = tokenizer.as_target_tokenizer()
+    # streamer = tokenizer.as_target_tokenizer()
+
+    # # use incremental generate
+    # with torch.no_grad():
+    #     output_ids = model.generate(
+    #         **inputs,
+    #         max_new_tokens=max_new_tokens,
+    #         do_sample=True,
+    #         top_p=0.9,
+    #         temperature=0.7
+    #     )[0]
+
+    # # take the output without the input prompt
+    # generated_tokens = output_ids[inputs["input_ids"].shape[1]:]
+
+    # for tok in generated_tokens:
+    #     text = tokenizer.decode(tok, skip_special_tokens=True)
+    #     if text.strip():
+    #         yield text
+
+    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
+    generation_kwargs = dict(
+        **inputs,
+        max_new_tokens=max_new_tokens,
+        temperature=0.7,
+        streamer=streamer
+    )

-    # use incremental generate
-    with torch.no_grad():
-        output_ids = model.generate(
-            **inputs,
-            max_new_tokens=max_new_tokens,
-            do_sample=True,
-            top_p=0.9,
-            temperature=0.7
-        )[0]
+    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()

-    # take the output without the input prompt
-    generated_tokens = output_ids[inputs["input_ids"].shape[1]:]
+    def token_stream():
+        for token in streamer:
+            yield token

-    for tok in generated_tokens:
-        text = tokenizer.decode(tok, skip_special_tokens=True)
-        if text.strip():
-            yield text
+    return StreamingResponse(token_stream(), media_type="text/plain")


-@app.post("/
+@app.post("/stream")
 async def chat(req: ChatRequest):
     # Format the prompt according to the chat template
     text = tokenizer.apply_chat_template(
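The change above replaces the blocking generate-then-decode loop with Hugging Face's TextIteratorStreamer: model.generate now runs in a background thread, the streamer yields decoded text as it is produced, and FastAPI forwards it through StreamingResponse. Below is a minimal client sketch (not part of this commit) for consuming the new /stream endpoint; the host/port and the ChatRequest field names ("prompt", "max_new_tokens") are assumptions based on the schemas visible in this diff.

import requests

# Hypothetical client for the /stream endpoint added above.
# The URL and JSON field names are assumptions, not taken verbatim from the diff.
with requests.post(
    "http://localhost:7860/stream",
    json={"prompt": "Write a hello world in Python", "max_new_tokens": 128},
    stream=True,
) as resp:
    resp.raise_for_status()
    # Print each text chunk as the server yields it.
    for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
        print(chunk, end="", flush=True)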
app_quantized.py
ADDED
@@ -0,0 +1,45 @@
+from fastapi import FastAPI
+from pydantic import BaseModel
+from llama_cpp import Llama
+import os
+import uvicorn
+
+app = FastAPI()
+
+# --- Model Configuration ---
+# Make sure the GGUF model has already been downloaded from Hugging Face, e.g.:
+# https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF
+MODEL_PATH = "./Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf"  # change to match the local file
+
+llm = Llama(
+    model_path=MODEL_PATH,
+    n_ctx=2048,    # context window (tokens)
+    n_threads=4,   # adjust to the number of CPU cores
+    n_batch=512    # batch size
+)
+
+# --- Request Schema ---
+class ChatRequest(BaseModel):
+    prompt: str
+    max_new_tokens: int = 256
+
+# --- Chat Endpoint ---
+@app.post("/chat")
+def chat(req: ChatRequest):
+    output = llm(
+        req.prompt,
+        max_tokens=req.max_new_tokens,
+        stop=["</s>", "User:", "Assistant:"],
+        echo=False
+    )
+    response = output["choices"][0]["text"].strip()
+    return {"response": response}
+
+# --- Root Endpoint ---
+@app.get("/")
+def root():
+    return {"message": "Qwen GGUF FastAPI running 🚀"}
+
+if __name__ == "__main__":
+    port = int(os.environ.get("PORT", 7860))
+    uvicorn.run("app:app", host="0.0.0.0", port=port)
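app_quantized.py expects a local GGUF file at MODEL_PATH. One way to fetch it is sketched below with huggingface_hub's hf_hub_download; this is not part of the commit, and the exact filename inside the Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF repository is an assumption, so check the repo's file list and point MODEL_PATH at the file you actually download.

from huggingface_hub import hf_hub_download

# Download the quantized model referenced by the comments in app_quantized.py.
# The filename below is an assumed example; verify it against the repo's file list.
gguf_path = hf_hub_download(
    repo_id="Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF",
    filename="qwen2.5-coder-0.5b-instruct-q4_k_m.gguf",
    local_dir=".",
)
print(gguf_path)  # set MODEL_PATH to this path (or rename the file to match)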
requirements.txt
CHANGED
@@ -8,4 +8,5 @@ einops
 transformers_stream_generator
 scipy
 sentencepiece
-optimum
+optimum
+threading