aryo100 committed on
Commit
2b65d25
1 Parent(s): 5f3b222

update app & requirements

Files changed (3)
  1. app.py +36 -18
  2. app_quantized.py +45 -0
  3. requirements.txt +2 -1
app.py CHANGED
@@ -1,10 +1,11 @@
  from fastapi import FastAPI
  from fastapi.responses import StreamingResponse
  from pydantic import BaseModel
- from transformers import AutoModelForCausalLM, AutoTokenizer
+ from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
  import torch
  import os
  import uvicorn
+ import threading

  app = FastAPI()

@@ -39,28 +40,45 @@ class ChatRequest(BaseModel):
  # Generator for streaming tokens
  def generate_stream(prompt, max_new_tokens=128):
      inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-     streamer = tokenizer.as_target_tokenizer()
+     # streamer = tokenizer.as_target_tokenizer()
+
+     # # use incremental generate
+     # with torch.no_grad():
+     #     output_ids = model.generate(
+     #         **inputs,
+     #         max_new_tokens=max_new_tokens,
+     #         do_sample=True,
+     #         top_p=0.9,
+     #         temperature=0.7
+     #     )[0]
+
+     # # Take the result without the input prompt
+     # generated_tokens = output_ids[inputs["input_ids"].shape[1]:]
+
+     # for tok in generated_tokens:
+     #     text = tokenizer.decode(tok, skip_special_tokens=True)
+     #     if text.strip():
+     #         yield text
+
+     streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
+     generation_kwargs = dict(
+         **inputs,
+         max_new_tokens=max_new_tokens,
+         temperature=0.7,
+         streamer=streamer
+     )

-     # use incremental generate
-     with torch.no_grad():
-         output_ids = model.generate(
-             **inputs,
-             max_new_tokens=max_new_tokens,
-             do_sample=True,
-             top_p=0.9,
-             temperature=0.7
-         )[0]
+     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()

-     # Take the result without the input prompt
-     generated_tokens = output_ids[inputs["input_ids"].shape[1]:]
+     def token_stream():
+         for token in streamer:
+             yield token

-     for tok in generated_tokens:
-         text = tokenizer.decode(tok, skip_special_tokens=True)
-         if text.strip():
-             yield text
+     return StreamingResponse(token_stream(), media_type="text/plain")


- @app.post("/strean")
+ @app.post("/stream")
  async def chat(req: ChatRequest):
      # Format the prompt according to the chat template
      text = tokenizer.apply_chat_template(
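
An aside on using the new endpoint: the diff above moves app.py from a blocking generate() call to a TextIteratorStreamer fed by model.generate() running in a background thread, so the /stream route can return text chunks as they are produced. The sketch below is one way a client might consume it; it assumes the server is running locally on port 7860 and that ChatRequest exposes a prompt field (the full schema is outside these hunks), so both are assumptions rather than part of the commit.

# Minimal client sketch for the streaming endpoint (hypothetical usage, not part of the commit).
import requests

resp = requests.post(
    "http://localhost:7860/stream",
    json={"prompt": "Write a Python function that reverses a string."},
    stream=True,  # keep the HTTP connection open and read chunks as they arrive
)
resp.raise_for_status()
for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
    if chunk:
        print(chunk, end="", flush=True)
print()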
app_quantized.py ADDED
@@ -0,0 +1,45 @@
+ from fastapi import FastAPI
+ from pydantic import BaseModel
+ from llama_cpp import Llama
+ import os
+ import uvicorn
+
+ app = FastAPI()
+
+ # --- Model configuration ---
+ # Make sure the GGUF model has been downloaded from Hugging Face, for example:
+ # https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF
+ MODEL_PATH = "./Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf"  # change to match the local file
+
+ llm = Llama(
+     model_path=MODEL_PATH,
+     n_ctx=2048,    # token context window
+     n_threads=4,   # adjust to the number of CPU cores
+     n_batch=512    # batch size
+ )
+
+ # --- Request schema ---
+ class ChatRequest(BaseModel):
+     prompt: str
+     max_new_tokens: int = 256
+
+ # --- Chat endpoint ---
+ @app.post("/chat")
+ def chat(req: ChatRequest):
+     output = llm(
+         req.prompt,
+         max_tokens=req.max_new_tokens,
+         stop=["</s>", "User:", "Assistant:"],
+         echo=False
+     )
+     response = output["choices"][0]["text"].strip()
+     return {"response": response}
+
+ # --- Root endpoint ---
+ @app.get("/")
+ def root():
+     return {"message": "Qwen GGUF FastAPI running 🚀"}
+
+ if __name__ == "__main__":
+     port = int(os.environ.get("PORT", 7860))
+     uvicorn.run("app:app", host="0.0.0.0", port=port)
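
For the quantized variant, app_quantized.py expects a GGUF file on disk at MODEL_PATH. Below is a minimal sketch of one way to fetch the weights and smoke-test the /chat endpoint, assuming huggingface_hub is installed and the server was started with something like uvicorn app_quantized:app --port 7860; the exact GGUF filename in the Qwen repo is an assumption and should be checked against the repo's file list.

# Hypothetical helper: download the quantized weights next to app_quantized.py,
# then call the /chat endpoint once the server is running. Not part of the commit.
from huggingface_hub import hf_hub_download
import requests

gguf_path = hf_hub_download(
    repo_id="Qwen/Qwen2.5-Coder-0.5B-Instruct-GGUF",
    filename="qwen2.5-coder-0.5b-instruct-q4_k_m.gguf",  # assumed filename; verify on the Hub
    local_dir=".",
)
print("Downloaded to:", gguf_path)

# Simple request against the non-streaming /chat endpoint defined in app_quantized.py.
resp = requests.post(
    "http://localhost:7860/chat",
    json={"prompt": "Write a haiku about FastAPI.", "max_new_tokens": 64},
)
print(resp.json()["response"])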
requirements.txt CHANGED
@@ -8,4 +8,5 @@ einops
  transformers_stream_generator
  scipy
  sentencepiece
- optimum
+ optimum
+ threading