Spaces:
Running
Running
| import init, { Model } from "./build/m.js" | |
/** Truncate a number to two decimal places (floor-based, so -1.234 -> -1.24). */
function fixTwo(x) {
  const scaled = Math.floor(x * 100)
  return scaled / 100
}
/**
 * Format a byte count as a human-readable decimal (SI, base-1000) size
 * string, e.g. 1500 -> "1.5kb". Values are truncated to two decimals
 * via fixTwo.
 */
function humanSize(size) {
  // [upper bound, divisor, unit] — first matching bound wins.
  const tiers = [
    [1e3, 1, 'b'],
    [1e6, 1e3, 'kb'],
    [1e9, 1e6, 'mb'],
    [1e12, 1e9, 'gb'],
  ]
  for (const [bound, divisor, unit] of tiers) {
    if (size < bound) return `${fixTwo(size / divisor)}${unit}`
  }
  return `${fixTwo(size / 1e12)}tb`
}
/**
 * Format a (possibly fractional, non-negative) number of seconds as a
 * human-readable duration, e.g. 90061 -> "1 day 1 hour 1 minute 1 second".
 * Fractional seconds are floored, matching the original's `% 1` trick.
 * Zero-valued units are omitted; returns "" for durations under 1 second.
 *
 * BUG FIX: the original's last term tested `seconds > 0` instead of the
 * computed `second > 0`, so e.g. humanTime(60) rendered "1 minute 0 seconds".
 * Also drops the trailing space the original always appended (it produced a
 * double space in "Estimated time remaining: X  (may be inaccurate)").
 */
function humanTime(seconds) {
  // [unit name, length in seconds], largest first.
  const units = [
    ['year', 31536e3],
    ['month', 2592e3],
    ['day', 864e2],
    ['hour', 36e2],
    ['minute', 60],
    ['second', 1],
  ]
  let remaining = Math.floor(seconds)
  const parts = []
  for (const [name, span] of units) {
    const count = Math.floor(remaining / span)
    remaining -= count * span
    if (count > 0) parts.push(`${count} ${name}${count == 1 ? '' : 's'}`)
  }
  return parts.join(' ')
}
// Timestamp (ms since epoch) of the last progress postMessage; used to
// throttle download progress updates to one per 250 ms.
let lastSend = 0
// Previous remaining-time estimate (seconds); Infinity until the first
// sample so the damping check in fetchArrayBuffer never triggers early.
let lastTime = Infinity
// Rolling window of the last four receivedLength samples (~250 ms apart),
// used to estimate download speed.
let times = [0, 0, 0, 0]
/**
 * Download `url` into a Uint8Array, backed by the Cache Storage API.
 * Cache hits return immediately; misses stream the body, posting throttled
 * progress messages (status "loading") back to the main thread, then store
 * the bytes in the cache for next time.
 *
 * Mutates module-level progress state: lastSend, lastTime, times.
 *
 * BUG FIXES vs. original:
 *  - `while (!res.body) { }` was a busy-wait that either exited at once or
 *    spun forever; replaced with a fail-fast throw.
 *  - Added a `res.ok` check so an HTTP error page is never cached as data.
 *  - `cache.put(...)` was a floating promise; now awaited, with cache
 *    failures (e.g. storage quota) treated as non-fatal.
 *
 * @param {string|URL} url resource to fetch
 * @returns {Promise<Uint8Array>} the full response body
 * @throws {Error} when the response is not OK or has no body
 */
async function fetchArrayBuffer(url) {
  const cacheName = "phi-mixformer-candle-cache"
  const cache = await caches.open(cacheName)

  // Fast path: previously downloaded and cached.
  const cachedResponse = await cache.match(url)
  if (cachedResponse) {
    const data = await cachedResponse.arrayBuffer()
    return new Uint8Array(data)
  }

  const res = await fetch(url, { cache: "force-cache" })
  if (!res.ok || !res.body) {
    throw new Error(`Download failed for ${url} (HTTP ${res.status})`)
  }

  const reader = res.body.getReader()
  const contentLength = +(res.headers.get('Content-Length') ?? 0)
  let receivedLength = 0
  const chunks = []
  while (true) {
    const { done, value } = await reader.read()
    if (done) {
      break
    }
    chunks.push(value)
    receivedLength += value.length
    // Throttle progress reports to at most one every 250 ms.
    if (Date.now() - lastSend > 250) {
      // Keep the last four receivedLength samples; the mean of the three
      // deltas approximates bytes per ~250 ms window, so x4 for bytes/sec.
      times.push(receivedLength)
      times = times.slice(1)
      const deltas = [times[3] - times[2], times[2] - times[1], times[1] - times[0]]
      const meanDelta = (deltas[0] + deltas[1] + deltas[2]) / 3
      const lengthPerSecond = meanDelta * 4
      const leftSize = contentLength - receivedLength
      let leftTime = Math.abs(leftSize / lengthPerSecond)
      // Damp sudden upward jumps in the estimate so the ETA stays stable.
      if (leftTime > lastTime * 1.5 && lastTime != 0) leftTime = lastTime * 1.2
      lastTime = leftTime
      const downloadMessage = `Downloading... ${fixTwo((receivedLength / contentLength) * 100)}% (${humanSize(Math.floor(receivedLength * 100) / 100)})
Estimated time remaining: ${humanTime(leftTime)} (may be inaccurate)
Total size: ${humanSize(fixTwo(contentLength))}
Download URL: ${url}`
      self.postMessage({ status: "loading", message: downloadMessage })
      lastSend = Date.now()
    }
  }

  // Reassemble the streamed chunks into one contiguous buffer.
  const chunksAll = new Uint8Array(receivedLength)
  let position = 0
  for (const chunk of chunks) {
    chunksAll.set(chunk, position)
    position += chunk.length
  }

  // Best-effort caching: the download already succeeded, so a cache write
  // failure (e.g. quota exceeded) must not fail the whole load.
  try {
    await cache.put(url, new Response(chunksAll))
  } catch (e) {
    // intentionally ignored — see above
  }
  return chunksAll
}
/**
 * Fetch every URL in parallel (via fetchArrayBuffer) and splice the
 * resulting byte arrays into one contiguous Uint8Array, preserving the
 * order of `urls`.
 */
async function concatenateArrayBuffers(urls) {
  const buffers = await Promise.all(urls.map((url) => fetchArrayBuffer(url)))
  const totalBytes = buffers.reduce((sum, buffer) => sum + buffer.byteLength, 0)
  const joined = new Uint8Array(totalBytes)
  let cursor = 0
  for (const buffer of buffers) {
    joined.set(new Uint8Array(buffer), cursor)
    cursor += buffer.byteLength
  }
  return joined
}
/**
 * Per-model lazy singleton wrapper around the wasm `Model`.
 * Each modelID is initialized at most once per worker; subsequent calls
 * return the cached instance.
 */
class Phi {
  // modelID -> Model instance cache.
  static instance = {}

  /**
   * Return the cached Model for `modelID`, building it on first use.
   * `weightsURL` may be a single URL or an array of shard URLs.
   */
  static async getInstance(
    weightsURL,
    modelID,
    tokenizerURL,
    configURL,
    quantized
  ) {
    const cached = this.instance[modelID]
    if (cached) {
      return cached
    }

    await init()
    self.postMessage({ status: "loading", message: "Loading Model" })

    // Sharded weights are downloaded and concatenated; everything is
    // fetched in parallel.
    const weightsTask = weightsURL instanceof Array
      ? concatenateArrayBuffers(weightsURL)
      : fetchArrayBuffer(weightsURL)
    const [weightsArrayU8, tokenizerArrayU8, configArrayU8] = await Promise.all([
      weightsTask,
      fetchArrayBuffer(tokenizerURL),
      fetchArrayBuffer(configURL),
    ])

    const model = new Model(
      weightsArrayU8,
      tokenizerArrayU8,
      configArrayU8,
      quantized
    )
    this.instance[modelID] = model
    return model
  }
}
// AbortController for the generation currently in flight; null until the
// first "start" command arrives.
let controller = null

// Command protocol from the main thread:
//   { command: "start", ...generation params } — begin generating
//   { command: "abort" }                       — cancel the current run
self.addEventListener("message", (event) => {
  if (event.data.command === "start") {
    controller = new AbortController()
    // Fire-and-forget: generate reports progress/errors via postMessage.
    generate(event.data)
  } else if (event.data.command === "abort") {
    // BUG FIX: guard against "abort" arriving before any "start", which
    // previously threw a TypeError on the null controller.
    controller?.abort()
  }
})
/**
 * Run one generation pass: load (or reuse) the model, feed the prompt,
 * then stream tokens back to the main thread until a terminator token,
 * the token budget, or an abort.
 *
 * Posts messages with status "loading" / "generating" / "complete" /
 * "aborted", or `{ error }` on failure.
 *
 * BUG FIXES vs. original:
 *  - The loop body ran inside `new Promise(async (resolve) => ...)`; a
 *    throw from `model.next_token()` became an unhandled rejection and
 *    skipped the catch below, and the abort/terminator branches returned
 *    without resolving, leaving the awaited promise hung forever. The loop
 *    is now a plain async loop, so errors are caught and returns are clean.
 *  - The terminator list is built once instead of every iteration.
 *  - Dropped the unused `stuff` destructuring.
 */
async function generate(data) {
  const {
    weightsURL,
    modelID,
    tokenizerURL,
    configURL,
    quantized,
    prompt,
    temp,
    top_p,
    repeatPenalty,
    seed,
    maxSeqLen,
  } = data
  try {
    self.postMessage({ status: "loading", message: "Starting Phi" })
    const model = await Phi.getInstance(
      weightsURL,
      modelID,
      tokenizerURL,
      configURL,
      quantized
    )
    self.postMessage({ status: "loading", message: "Initializing model" })
    const firstToken = model.init_with_prompt(
      prompt,
      temp,
      top_p,
      repeatPenalty,
      64, // repeat_last_n window
      BigInt(seed)
    )
    const seq_len = 2048
    // Special tokens that end generation.
    const terminates = ["<|endoftext|>", "<|user|>", "<|system|>", "<|assistant|>"]
    let sentence = firstToken
    // NOTE(review): prompt.length counts characters, not tokens — the
    // fallback budget is therefore only approximate.
    const maxTokens = maxSeqLen ? maxSeqLen : seq_len - prompt.length - 1
    const startTime = performance.now()
    let tokensCount = 0
    while (tokensCount < maxTokens) {
      if (controller && controller.signal.aborted) {
        self.postMessage({
          status: "aborted",
          message: "Aborted",
          output: prompt + sentence,
        })
        return
      }
      const token = await model.next_token()
      if (terminates.includes(token)) break
      const tokensSec =
        ((tokensCount + 1) / (performance.now() - startTime)) * 1000
      sentence += token
      self.postMessage({
        status: "generating",
        message: "Generating token",
        token: token,
        sentence: sentence,
        totalTime: performance.now() - startTime,
        tokensSec,
        prompt: prompt,
      })
      // Yield to the event loop so an "abort" message can be processed.
      await new Promise((resolve) => setTimeout(resolve, 0))
      tokensCount++
    }
    self.postMessage({
      status: "complete",
      message: "complete",
      output: prompt + sentence,
    })
  } catch (e) {
    self.postMessage({ error: e })
  }
}