Spaces:
Running
Running
/// <reference lib="webworker" />

// Fetch and execute the transformers bundle before anything else runs; the
// rest of this worker reaches the library through a `transformers` global.
// @ts-ignore
importScripts("https://cdn.jsdelivr.net/npm/@xenova/transformers@2.13.0/dist/transformers.min.js");

/** Repository id handed to `from_pretrained` for both the processor and the model. */
const MODEL_ID = "onnx-community/FastVLM-0.5B-ONNX";

/** Set to `true` once both `processor` and `model` have been assigned. */
let isLoaded = false;

// Lazily populated library objects. They are typed as `any` because the
// library arrives via importScripts and ships no types into this file.
let model: any = null,
  processor: any = null;
/**
 * Load shared by every caller: `null` before the first call and again after a
 * failed attempt, otherwise the promise of the load in progress (or finished).
 */
let loadPromise: Promise<void> | null = null;

/**
 * Loads the processor and the model for `MODEL_ID` and stores them in the
 * module-level `processor` / `model` variables.
 *
 * The work is performed at most once: calls made while a load is already in
 * flight await that same load instead of starting a second one. If the load
 * fails, the cached promise is cleared so a later call can try again.
 *
 * @returns A promise that resolves once `processor` and `model` are assigned.
 * @throws If the `transformers` global is missing, or whatever error
 *   `from_pretrained` rejects with.
 */
async function loadModelAndProcessor(): Promise<void> {
  if (isLoaded) return;

  if (loadPromise === null) {
    loadPromise = (async () => {
      // A worker has no `window`; `self` is the global object in both worker
      // and page contexts, so read the library from there.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const transformers = (self as any).transformers;
      if (transformers == null) {
        throw new Error("The `transformers` global is not available in this worker.");
      }

      // NOTE(review): `AutoModelForImageTextToText`, `device: "webgpu"` and this
      // model look like they target a newer transformers release than the
      // 2.13.0 bundle imported at the top of the file — confirm the version.
      processor = await transformers.AutoProcessor.from_pretrained(MODEL_ID);
      model = await transformers.AutoModelForImageTextToText.from_pretrained(MODEL_ID, {
        // Per-component precision / quantization selection.
        dtype: {
          embed_tokens: "fp16",
          vision_encoder: "q4",
          decoder_model_merged: "q4",
        },
        device: "webgpu",
      });
      isLoaded = true;
    })().catch((err: unknown) => {
      // Forget the failed attempt so the next request can retry the load.
      loadPromise = null;
      throw err;
    });
  }

  await loadPromise;
}
/**
 * Handles one inference request from the thread that owns this worker.
 *
 * Expects `event.data` to carry `{ imageData, prompt }`. It makes sure the
 * model is loaded, builds a chat prompt around the image and the user's text,
 * runs generation, and posts back the trimmed text of the first decoded
 * sequence.
 *
 * Any failure along the way is reported as an error on the worker global
 * instead of being lost as an unhandled promise rejection, so the owner is
 * not left waiting for a reply that never arrives.
 */
self.onmessage = async (event) => {
  try {
    const { imageData, prompt } = event.data;
    await loadModelAndProcessor();

    // A worker has no `window`; read the library from `self` instead.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const { RawImage } = (self as any).transformers;

    // Wrap the raw pixel buffer. The trailing 4 is the channel count
    // (presumably RGBA, as in a canvas ImageData — confirm against the sender).
    const rawImg = new RawImage(imageData.data, imageData.width, imageData.height, 4);

    const messages = [
      {
        role: "system",
        content: `You are a helpful visual AI assistant. Respond concisely and accurately to the user's query in one sentence.`,
      },
      { role: "user", content: `<image>${prompt}` },
    ];

    const chatPrompt = processor.apply_chat_template(messages, { add_generation_prompt: true });
    const inputs = await processor(rawImg, chatPrompt, { add_special_tokens: false });

    const outputs = await model.generate({
      ...inputs,
      max_new_tokens: 512,
      do_sample: false,
      repetition_penalty: 1.2,
    });

    // Keep only what was generated: slice the second axis starting at the
    // length of the input ids, which drops the echoed prompt tokens.
    const promptLength = inputs.input_ids.dims.at(-1);
    const decoded = processor.batch_decode(outputs.slice(null, [promptLength, null]), {
      skip_special_tokens: true,
    });

    // Reply with the first (only) decoded sequence as plain text.
    self.postMessage(decoded[0].trim());
  } catch (err: unknown) {
    // Report the failure as an error event on the worker global; fall back to
    // rethrowing where `reportError` does not exist.
    if (typeof self.reportError === "function") {
      self.reportError(err);
    } else {
      throw err;
    }
  }
};
| export {}; |