Spaces: Build error
```python
from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache,
    ExLlamaV2Tokenizer,
)
from exllamav2.generator import (
    ExLlamaV2StreamingGenerator,
    ExLlamaV2Sampler,
)

# Configure model
model_dir = "TheBloke_Wizard-Vicuna-13B-GPTQ"  # Path to downloaded model
config = ExLlamaV2Config()
config.model_dir = model_dir
config.prepare()

# Load model: the cache must be created with lazy=True so that
# load_autosplit() can allocate it while the weights are loaded
model = ExLlamaV2(config)
cache = ExLlamaV2Cache(model, lazy=True)
model.load_autosplit(cache)

# Load tokenizer: the exllamav2 generator expects an
# ExLlamaV2Tokenizer, not a Hugging Face AutoTokenizer
tokenizer = ExLlamaV2Tokenizer(config)

# Initialize the generator once at module level rather than
# rebuilding it on every request
generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
generator.set_stop_conditions([tokenizer.eos_token_id])

def generate_response(prompt, max_tokens=200, temperature=0.7):
    # Configure sampler
    settings = ExLlamaV2Sampler.Settings()
    settings.temperature = temperature
    settings.top_k = 50
    settings.top_p = 0.8

    # generate_simple() takes the raw prompt string, tokenizes it
    # internally, and returns the decoded output text
    return generator.generate_simple(prompt, settings, max_tokens, seed=42)
```
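As a quick sanity check before wiring up the API, the generator can be called directly once the model has loaded; the prompt below is just an illustration:

```python
# One-off smoke test of the model path, run once after loading
reply = generate_response("Briefly introduce yourself.", max_tokens=64)
print(reply)
```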
```python
##############################################
from fastapi import FastAPI
import uvicorn

app = FastAPI()

# Route decorators were missing; without them FastAPI never registers the endpoints
@app.get("/")
def greet_json():
    return {"Hello": "World!"}

# "/message" is an assumed route path; the original did not specify one
@app.post("/message")
async def message(input: str):
    return generate_response(input)

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
```
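Once the app is running, the endpoint can be exercised with a small client. This is only a sketch: it assumes the `/message` route chosen above and the default Spaces port 7860, and the URL is a placeholder. Because `message` declares `input: str` with no request-body model, FastAPI reads it as a query parameter:

```python
import requests

# Placeholder URL; substitute the actual Space URL
resp = requests.post(
    "http://localhost:7860/message",
    params={"input": "Hello, who are you?"},
)
print(resp.json())
```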