Commit
·
2160235
1
Parent(s):
56c75a3
release: build 60371bd7e5a9f74f023fc8c57fca5cced4b0e47b
Browse files- .gitattributes +2 -0
- README.md +65 -5
- assets/index-DGmKQH7N.js +0 -0
- assets/index-cAxkOY9l.css +1 -0
- assets/play-worklet-CqUYQx_r.js +1 -0
- assets/vad-processor-0sEQXaXZ.js +1 -0
- assets/worker-yoCrhISy.ts +465 -0
- favicon-96x96.png +0 -0
- favicon.svg +8 -0
- index.html +22 -17
- style.css +0 -28
.gitattributes
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
|
| 2 |
+
# Default
|
| 3 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 4 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 5 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,10 +1,70 @@
|
|
| 1 |
---
|
| 2 |
-
|
| 3 |
-
emoji: ⚡
|
| 4 |
-
colorFrom: yellow
|
| 5 |
-
colorTo: green
|
| 6 |
sdk: static
|
| 7 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
emoji: "\U0001F4AC"
|
|
|
|
|
|
|
|
|
|
| 3 |
sdk: static
|
| 4 |
pinned: false
|
| 5 |
+
license: mit
|
| 6 |
+
title: Realtime Conversational WebGPU (Vue)
|
| 7 |
+
colorFrom: purple
|
| 8 |
+
colorTo: indigo
|
| 9 |
+
models:
|
| 10 |
+
- HuggingFaceTB/SmolLM2-1.7B-Instruct
|
| 11 |
+
- onnx-community/whisper-base
|
| 12 |
+
- onnx-community/silero-vad
|
| 13 |
+
short_description: Yet another Realtime Conversational WebGPU
|
| 14 |
---
|
| 15 |
|
| 16 |
+
<h1 align="center">Realtime Conversational WebGPU (Vue)</h1>
|
| 17 |
+
|
| 18 |
+
<p align="center">
|
| 19 |
+
[<a href="https://conversational-webgpu-vue.netlify.app/">Try it</a>]
|
| 20 |
+
</p>
|
| 21 |
+
|
| 22 |
+
> Heavily inspired by [WebGPU Video Object Detection - a Hugging Face Space by WebML Community](https://huggingface.co/spaces/webml-community/webgpu-video-object-detection)
|
| 23 |
+
|
| 24 |
+
# Realtime Conversational WebGPU
|
| 25 |
+
|
| 26 |
+
## Getting Started
|
| 27 |
+
|
| 28 |
+
Follow the steps below to set up and run the application.
|
| 29 |
+
|
| 30 |
+
### 1. Clone the Repository
|
| 31 |
+
|
| 32 |
+
Clone the examples repository from GitHub:
|
| 33 |
+
|
| 34 |
+
```sh
|
| 35 |
+
git clone https://github.com/proj-airi/webai-examples.git
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
### 2. Navigate to the Project Directory
|
| 39 |
+
|
| 40 |
+
Change your working directory to the `conversational-webgpu` folder:
|
| 41 |
+
|
| 42 |
+
```sh
|
| 43 |
+
cd apps/conversational-webgpu
|
| 44 |
+
```
|
| 45 |
+
|
| 46 |
+
### 3. Install Dependencies
|
| 47 |
+
|
| 48 |
+
Install the necessary dependencies using npm:
|
| 49 |
+
|
| 50 |
+
```sh
|
| 51 |
+
npm i
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
### 4. Run the Development Server
|
| 55 |
+
|
| 56 |
+
Start the development server:
|
| 57 |
+
|
| 58 |
+
```sh
|
| 59 |
+
npm run dev
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
The application should now be running locally. Open your browser and go to `http://localhost:5175` to see it in action.
|
| 63 |
+
|
| 64 |
+
## Acknowledgements
|
| 65 |
+
|
| 66 |
+
Many thanks to the WebML Community for what they have done.
|
| 67 |
+
|
| 68 |
+
> [Source code](https://huggingface.co/spaces/webml-community/conversational-webgpu)
|
| 69 |
+
|
| 70 |
+
> [UI inspiration](https://app.sesame.com/)
|
assets/index-DGmKQH7N.js
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
assets/index-cAxkOY9l.css
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
@keyframes ripple-314fbe9f{0%{transform:scale(1);opacity:.7}to{transform:scale(2);opacity:0}}.embla[data-v-314fbe9f]{position:relative;overflow:hidden}.embla[data-v-314fbe9f]:before,.embla[data-v-314fbe9f]:after{content:"";position:absolute;top:0;bottom:0;width:48px;z-index:1;pointer-events:none}.embla-edge-disabled.embla[data-v-314fbe9f]:before,.embla-edge-disabled.embla[data-v-314fbe9f]:after{display:none}.embla[data-v-314fbe9f]:before{left:-24px;background:linear-gradient(to right,#ffffff 32px,transparent)}.embla[data-v-314fbe9f]:after{right:-24px;background:linear-gradient(to left,#ffffff 32px,transparent)}.dark .embla[data-v-314fbe9f]:before{left:-24px;background:linear-gradient(to right,#121212 32px,transparent)}.dark .embla[data-v-314fbe9f]:after{right:-24px;background:linear-gradient(to left,#121212 32px,transparent)}.fade-enter-active[data-v-314fbe9f],.fade-leave-active[data-v-314fbe9f]{transition:opacity .5s ease}.fade-enter-from[data-v-314fbe9f],.fade-leave-to[data-v-314fbe9f]{opacity:0}.fade-enter-to[data-v-314fbe9f],.fade-leave-from[data-v-314fbe9f]{opacity:1}.fade-scale-enter-active[data-v-314fbe9f],.fade-scale-leave-active[data-v-314fbe9f]{transition:all .2s ease-in-out}.fade-scale-enter-from[data-v-314fbe9f],.fade-scale-leave-to[data-v-314fbe9f]{opacity:0;transform:scale(.8)}.fade-scale-enter-to[data-v-314fbe9f],.fade-scale-leave-from[data-v-314fbe9f]{opacity:1;transform:scale(1)}*,:before,:after{box-sizing:border-box;border-width:0;border-style:solid;border-color:var(--un-default-border-color, #e5e7eb)}:before,:after{--un-content: ""}html,:host{line-height:1.5;-webkit-text-size-adjust:100%;-moz-tab-size:4;tab-size:4;font-family:ui-sans-serif,system-ui,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol,"Noto Color 
Emoji";font-feature-settings:normal;font-variation-settings:normal;-webkit-tap-highlight-color:transparent}body{margin:0;line-height:inherit}hr{height:0;color:inherit;border-top-width:1px}abbr:where([title]){text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,samp,pre{font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace;font-feature-settings:normal;font-variation-settings:normal;font-size:1em}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{text-indent:0;border-color:inherit;border-collapse:collapse}button,input,optgroup,select,textarea{font-family:inherit;font-feature-settings:inherit;font-variation-settings:inherit;font-size:100%;font-weight:inherit;line-height:inherit;color:inherit;margin:0;padding:0}button,select{text-transform:none}button,[type=button],[type=reset],[type=submit]{-webkit-appearance:button;background-color:transparent;background-image:none}:-moz-focusring{outline:auto}:-moz-ui-invalid{box-shadow:none}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}blockquote,dl,dd,h1,h2,h3,h4,h5,h6,hr,figure,p,pre{margin:0}fieldset{margin:0;padding:0}legend{padding:0}ol,ul,menu{list-style:none;margin:0;padding:0}dialog{padding:0}textarea{resize:vertical}input::placeholder,textarea::placeholder{opacity:1;color:#9ca3af}button,[role=button]{cursor:pointer}:disabled{cursor:default}img,svg,video,canvas,audio,iframe,embed,object{display:block;vertical-align:middle}img,video{max-width:100%;height:auto}[hidden]:where(:not([hidden=until-found])){display:none}:root{--bg-co
lor-light: rgb(255 255 255);--bg-color-dark: rgb(18 18 18);--bg-color: var(--bg-color-light)}html,body,#app{height:100%;margin:0;padding:0;overscroll-behavior:none}html{background:var(--bg-color);transition:all .3s ease-in-out}html.dark{--bg-color: var(--bg-color-dark);color-scheme:dark}*,:before,:after{--un-rotate:0;--un-rotate-x:0;--un-rotate-y:0;--un-rotate-z:0;--un-scale-x:1;--un-scale-y:1;--un-scale-z:1;--un-skew-x:0;--un-skew-y:0;--un-translate-x:0;--un-translate-y:0;--un-translate-z:0;--un-pan-x: ;--un-pan-y: ;--un-pinch-zoom: ;--un-scroll-snap-strictness:proximity;--un-ordinal: ;--un-slashed-zero: ;--un-numeric-figure: ;--un-numeric-spacing: ;--un-numeric-fraction: ;--un-border-spacing-x:0;--un-border-spacing-y:0;--un-ring-offset-shadow:0 0 rgb(0 0 0 / 0);--un-ring-shadow:0 0 rgb(0 0 0 / 0);--un-shadow-inset: ;--un-shadow:0 0 rgb(0 0 0 / 0);--un-ring-inset: ;--un-ring-offset-width:0px;--un-ring-offset-color:#fff;--un-ring-width:0px;--un-ring-color:rgb(147 197 253 / .5);--un-blur: ;--un-brightness: ;--un-contrast: ;--un-drop-shadow: ;--un-grayscale: ;--un-hue-rotate: ;--un-invert: ;--un-saturate: ;--un-sepia: ;--un-backdrop-blur: ;--un-backdrop-brightness: ;--un-backdrop-contrast: ;--un-backdrop-grayscale: ;--un-backdrop-hue-rotate: ;--un-backdrop-invert: ;--un-backdrop-opacity: ;--un-backdrop-saturate: ;--un-backdrop-sepia: }::backdrop{--un-rotate:0;--un-rotate-x:0;--un-rotate-y:0;--un-rotate-z:0;--un-scale-x:1;--un-scale-y:1;--un-scale-z:1;--un-skew-x:0;--un-skew-y:0;--un-translate-x:0;--un-translate-y:0;--un-translate-z:0;--un-pan-x: ;--un-pan-y: ;--un-pinch-zoom: ;--un-scroll-snap-strictness:proximity;--un-ordinal: ;--un-slashed-zero: ;--un-numeric-figure: ;--un-numeric-spacing: ;--un-numeric-fraction: ;--un-border-spacing-x:0;--un-border-spacing-y:0;--un-ring-offset-shadow:0 0 rgb(0 0 0 / 0);--un-ring-shadow:0 0 rgb(0 0 0 / 0);--un-shadow-inset: ;--un-shadow:0 0 rgb(0 0 0 / 0);--un-ring-inset: 
;--un-ring-offset-width:0px;--un-ring-offset-color:#fff;--un-ring-width:0px;--un-ring-color:rgb(147 197 253 / .5);--un-blur: ;--un-brightness: ;--un-contrast: ;--un-drop-shadow: ;--un-grayscale: ;--un-hue-rotate: ;--un-invert: ;--un-saturate: ;--un-sepia: ;--un-backdrop-blur: ;--un-backdrop-brightness: ;--un-backdrop-contrast: ;--un-backdrop-grayscale: ;--un-backdrop-hue-rotate: ;--un-backdrop-invert: ;--un-backdrop-opacity: ;--un-backdrop-saturate: ;--un-backdrop-sepia: }@font-face{font-family:DM Mono;font-style:normal;font-weight:400;font-display:swap;src:url(https://fonts.gstatic.com/s/dmmono/v15/aFTU7PB1QTsUX8KYthSQBK6PYK3EXw.woff2) format("woff2");unicode-range:U+0100-02BA,U+02BD-02C5,U+02C7-02CC,U+02CE-02D7,U+02DD-02FF,U+0304,U+0308,U+0329,U+1D00-1DBF,U+1E00-1E9F,U+1EF2-1EFF,U+2020,U+20A0-20AB,U+20AD-20C0,U+2113,U+2C60-2C7F,U+A720-A7FF}@font-face{font-family:DM Mono;font-style:normal;font-weight:400;font-display:swap;src:url(https://fonts.gstatic.com/s/dmmono/v15/aFTU7PB1QTsUX8KYthqQBK6PYK0.woff2) format("woff2");unicode-range:U+0000-00FF,U+0131,U+0152-0153,U+02BB-02BC,U+02C6,U+02DA,U+02DC,U+0304,U+0308,U+0329,U+2000-206F,U+20AC,U+2122,U+2191,U+2193,U+2212,U+2215,U+FEFF,U+FFFD}@font-face{font-family:DM Sans;font-style:normal;font-weight:400;font-display:swap;src:url(https://fonts.gstatic.com/s/dmsans/v16/rP2tp2ywxg089UriI5-g4vlH9VoD8CmcqZG40F9JadbnoEwAopxRR232RmYJp8I5zzw.woff2) format("woff2");unicode-range:U+0100-02BA,U+02BD-02C5,U+02C7-02CC,U+02CE-02D7,U+02DD-02FF,U+0304,U+0308,U+0329,U+1D00-1DBF,U+1E00-1E9F,U+1EF2-1EFF,U+2020,U+20A0-20AB,U+20AD-20C0,U+2113,U+2C60-2C7F,U+A720-A7FF}@font-face{font-family:DM Sans;font-style:normal;font-weight:400;font-display:swap;src:url(https://fonts.gstatic.com/s/dmsans/v16/rP2tp2ywxg089UriI5-g4vlH9VoD8CmcqZG40F9JadbnoEwAopxRSW32RmYJp8I5.woff2) 
format("woff2");unicode-range:U+0000-00FF,U+0131,U+0152-0153,U+02BB-02BC,U+02C6,U+02DA,U+02DC,U+0304,U+0308,U+0329,U+2000-206F,U+20AC,U+2122,U+2191,U+2193,U+2212,U+2215,U+FEFF,U+FFFD}@font-face{font-family:"DM Serif Display";font-style:normal;font-weight:400;font-display:swap;src:url(https://fonts.gstatic.com/s/dmserifdisplay/v16/-nFnOHM81r4j6k0gjAW3mujVU2B2G_5x0vrx52jJ3Q.woff2) format("woff2");unicode-range:U+0100-02BA,U+02BD-02C5,U+02C7-02CC,U+02CE-02D7,U+02DD-02FF,U+0304,U+0308,U+0329,U+1D00-1DBF,U+1E00-1E9F,U+1EF2-1EFF,U+2020,U+20A0-20AB,U+20AD-20C0,U+2113,U+2C60-2C7F,U+A720-A7FF}@font-face{font-family:"DM Serif Display";font-style:normal;font-weight:400;font-display:swap;src:url(https://fonts.gstatic.com/s/dmserifdisplay/v16/-nFnOHM81r4j6k0gjAW3mujVU2B2G_Bx0vrx52g.woff2) format("woff2");unicode-range:U+0000-00FF,U+0131,U+0152-0153,U+02BB-02BC,U+02C6,U+02DA,U+02DC,U+0304,U+0308,U+0329,U+2000-206F,U+20AC,U+2122,U+2191,U+2193,U+2212,U+2215,U+FEFF,U+FFFD}.i-solar\:end-call-rounded-bold,[i-solar\:end-call-rounded-bold=""]{--un-icon:url("data:image/svg+xml;utf8,%3Csvg viewBox='0 0 24 24' width='1.2em' height='1.2em' xmlns='http://www.w3.org/2000/svg' %3E%3Cpath fill='currentColor' d='m5.607 16.897l1.34-.38C8.156 16.174 9 14.983 9 13.618c0 0 0-1.654 3-1.654s3 1.654 3 1.654c0 1.365.844 2.556 2.053 2.9l1.34.38C20.218 17.414 22 15.91 22 13.85c0-1.237-.277-2.477-1.083-3.347C19.56 9.04 16.807 7 12 7s-7.56 2.039-8.917 3.503C2.277 11.373 2 12.613 2 13.85c0 2.06 1.782 3.565 3.607 3.047'/%3E%3C/svg%3E");-webkit-mask:var(--un-icon) no-repeat;mask:var(--un-icon) no-repeat;-webkit-mask-size:100% 100%;mask-size:100% 100%;background-color:currentColor;color:inherit;width:1.2em;height:1.2em}.i-solar\:phone-bold,[i-solar\:phone-bold=""]{--un-icon:url("data:image/svg+xml;utf8,%3Csvg viewBox='0 0 24 24' width='1.2em' height='1.2em' xmlns='http://www.w3.org/2000/svg' %3E%3Cpath fill='currentColor' d='m16.556 12.906l-.455.453s-1.083 
1.076-4.038-1.862s-1.872-4.014-1.872-4.014l.286-.286c.707-.702.774-1.83.157-2.654L9.374 2.86C8.61 1.84 7.135 1.705 6.26 2.575l-1.57 1.56c-.433.432-.723.99-.688 1.61c.09 1.587.808 5 4.812 8.982c4.247 4.222 8.232 4.39 9.861 4.238c.516-.048.964-.31 1.325-.67l1.42-1.412c.96-.953.69-2.588-.538-3.255l-1.91-1.039c-.806-.437-1.787-.309-2.417.317'/%3E%3C/svg%3E");-webkit-mask:var(--un-icon) no-repeat;mask:var(--un-icon) no-repeat;-webkit-mask-size:100% 100%;mask-size:100% 100%;background-color:currentColor;color:inherit;width:1.2em;height:1.2em}.i-svg-spinners\:3-dots-bounce,[i-svg-spinners\:3-dots-bounce=""]{--un-icon:url("data:image/svg+xml;utf8,%3Csvg viewBox='0 0 24 24' width='1.2em' height='1.2em' xmlns='http://www.w3.org/2000/svg' %3E%3Ccircle cx='4' cy='12' r='3' fill='currentColor'%3E%3Canimate id='svgSpinners3DotsBounce0' attributeName='cy' begin='0;svgSpinners3DotsBounce1.end+0.25s' calcMode='spline' dur='0.6s' keySplines='.33,.66,.66,1;.33,0,.66,.33' values='12;6;12'/%3E%3C/circle%3E%3Ccircle cx='12' cy='12' r='3' fill='currentColor'%3E%3Canimate attributeName='cy' begin='svgSpinners3DotsBounce0.begin+0.1s' calcMode='spline' dur='0.6s' keySplines='.33,.66,.66,1;.33,0,.66,.33' values='12;6;12'/%3E%3C/circle%3E%3Ccircle cx='20' cy='12' r='3' fill='currentColor'%3E%3Canimate id='svgSpinners3DotsBounce1' attributeName='cy' begin='svgSpinners3DotsBounce0.begin+0.2s' calcMode='spline' dur='0.6s' keySplines='.33,.66,.66,1;.33,0,.66,.33' values='12;6;12'/%3E%3C/circle%3E%3C/svg%3E");-webkit-mask:var(--un-icon) no-repeat;mask:var(--un-icon) no-repeat;-webkit-mask-size:100% 100%;mask-size:100% 100%;background-color:currentColor;color:inherit;width:1.2em;height:1.2em}.prose :where(h1,h2,h3,h4,h5,h6):not(:where(.not-prose,.not-prose *)){color:var(--un-prose-headings);font-weight:600;line-height:1.25}.prose :where(a):not(:where(.not-prose,.not-prose *)){color:var(--un-prose-links);text-decoration:underline;font-weight:500}.prose :where(a code):not(:where(.not-prose,.not-prose 
*)){color:var(--un-prose-links)}.prose :where(p,ul,ol,pre):not(:where(.not-prose,.not-prose *)){margin:1em 0;line-height:1.75}.prose :where(blockquote):not(:where(.not-prose,.not-prose *)){margin:1em 0;padding-left:1em;font-style:italic;border-left:.25em solid var(--un-prose-borders)}.prose :where(h1):not(:where(.not-prose,.not-prose *)){margin:1rem 0;font-size:2.25em}.prose :where(h2):not(:where(.not-prose,.not-prose *)){margin:1.75em 0 .5em;font-size:1.75em}.prose :where(h3):not(:where(.not-prose,.not-prose *)){margin:1.5em 0 .5em;font-size:1.375em}.prose :where(h4):not(:where(.not-prose,.not-prose *)){margin:1em 0;font-size:1.125em}.prose :where(img,video):not(:where(.not-prose,.not-prose *)){max-width:100%}.prose :where(figure,picture):not(:where(.not-prose,.not-prose *)){margin:1em 0}.prose :where(figcaption):not(:where(.not-prose,.not-prose *)){color:var(--un-prose-captions);font-size:.875em}.prose :where(code):not(:where(.not-prose,.not-prose *)){color:var(--un-prose-code);font-size:.875em;font-weight:600;font-family:DM Mono,ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace}.prose :where(:not(pre)>code):not(:where(.not-prose,.not-prose *)):before,.prose :where(:not(pre)>code):not(:where(.not-prose,.not-prose *)):after{content:"`"}.prose :where(pre):not(:where(.not-prose,.not-prose *)){padding:1.25rem 1.5rem;overflow-x:auto;border-radius:.375rem}.prose :where(pre,code):not(:where(.not-prose,.not-prose *)){white-space:pre;word-spacing:normal;word-break:normal;word-wrap:normal;-moz-tab-size:4;-o-tab-size:4;tab-size:4;-webkit-hyphens:none;-moz-hyphens:none;hyphens:none;background:transparent}.prose :where(pre code):not(:where(.not-prose,.not-prose *)){font-weight:inherit}.prose :where(ol,ul):not(:where(.not-prose,.not-prose *)){padding-left:1.25em}.prose :where(ol):not(:where(.not-prose,.not-prose *)){list-style-type:decimal}.prose :where(ol[type=A]):not(:where(.not-prose,.not-prose *)){list-style-type:upper-alpha}.prose 
:where(ol[type=a]):not(:where(.not-prose,.not-prose *)){list-style-type:lower-alpha}.prose :where(ol[type=A s]):not(:where(.not-prose,.not-prose *)){list-style-type:upper-alpha}.prose :where(ol[type=a s]):not(:where(.not-prose,.not-prose *)){list-style-type:lower-alpha}.prose :where(ol[type=I]):not(:where(.not-prose,.not-prose *)){list-style-type:upper-roman}.prose :where(ol[type=i]):not(:where(.not-prose,.not-prose *)){list-style-type:lower-roman}.prose :where(ol[type=I s]):not(:where(.not-prose,.not-prose *)){list-style-type:upper-roman}.prose :where(ol[type=i s]):not(:where(.not-prose,.not-prose *)){list-style-type:lower-roman}.prose :where(ol[type="1"]):not(:where(.not-prose,.not-prose *)){list-style-type:decimal}.prose :where(ul):not(:where(.not-prose,.not-prose *)){list-style-type:disc}.prose :where(ol>li):not(:where(.not-prose,.not-prose *))::marker,.prose :where(ul>li):not(:where(.not-prose,.not-prose *))::marker,.prose :where(summary):not(:where(.not-prose,.not-prose *))::marker{color:var(--un-prose-lists)}.prose :where(hr):not(:where(.not-prose,.not-prose *)){margin:2em 0;border:1px solid var(--un-prose-hr)}.prose :where(table):not(:where(.not-prose,.not-prose *)){display:block;margin:1em 0;border-collapse:collapse;overflow-x:auto}.prose :where(tr):not(:where(.not-prose,.not-prose *)):nth-child(2n){background:var(--un-prose-bg-soft)}.prose :where(td,th):not(:where(.not-prose,.not-prose *)){border:1px solid var(--un-prose-borders);padding:.625em 1em}.prose :where(abbr):not(:where(.not-prose,.not-prose *)){cursor:help}.prose :where(kbd):not(:where(.not-prose,.not-prose *)){color:var(--un-prose-code);border:1px solid;padding:.25rem .5rem;font-size:.875em;border-radius:.25rem}.prose :where(details):not(:where(.not-prose,.not-prose *)){margin:1em 0;padding:1.25rem 1.5rem;background:var(--un-prose-bg-soft)}.prose :where(summary):not(:where(.not-prose,.not-prose 
*)){cursor:pointer;font-weight:600}.prose{color:var(--un-prose-body);max-width:65ch}.pointer-events-none{pointer-events:none}.absolute{position:absolute}.relative{position:relative}.inset-0{top:0;right:0;bottom:0;left:0}.z-10{z-index:10}.m-auto{margin:auto}.-mt-4{margin-top:-1rem}.mb-4{margin-bottom:1rem}.hidden{display:none}.aspect-square{aspect-ratio:1/1}.h-100dvh{height:100dvh}.h-32{height:8rem}.h-50{height:12.5rem}.h-80{height:20rem}.h-full,[h-full=""]{height:100%}.min-w-0{min-width:0}.w-100dvw{width:100dvw}.w-120{width:30rem}.w-140{width:35rem}.w-32{width:8rem}.w-fit,[w-fit=""]{width:fit-content}.w-full{width:100%}.flex,[flex=""]{display:flex}.flex-\[0_0_80\%\]{flex:0 0 80%}.flex-shrink-0,.shrink-0{flex-shrink:0}.grow-0{flex-grow:0}.basis-full{flex-basis:100%}.flex-col,[flex-col=""]{flex-direction:column}.transform{transform:translate(var(--un-translate-x)) translateY(var(--un-translate-y)) translateZ(var(--un-translate-z)) rotate(var(--un-rotate)) rotateX(var(--un-rotate-x)) rotateY(var(--un-rotate-y)) rotate(var(--un-rotate-z)) skew(var(--un-skew-x)) skewY(var(--un-skew-y)) scaleX(var(--un-scale-x)) scaleY(var(--un-scale-y)) scaleZ(var(--un-scale-z))}@keyframes ping{0%{transform:scale(1);opacity:1}75%,to{transform:scale(2);opacity:0}}.animate-ping{animation:ping 1s cubic-bezier(0,0,.2,1) infinite}.cursor-pointer{cursor:pointer}.items-center,[items-center=""]{align-items:center}.justify-center,[justify-center=""]{justify-content:center}.justify-between,[justify-between=""]{justify-content:space-between}.gap-2,[gap-2=""]{gap:.5rem}.gap-4{gap:1rem}.overflow-hidden{overflow:hidden}.border-2{border-width:2px}.border-cyan-200{--un-border-opacity:1;border-color:rgb(165 243 252 / var(--un-border-opacity))}.dark .dark\:border-cyan-500{--un-border-opacity:1;border-color:rgb(6 182 212 / 
var(--un-border-opacity))}.rounded-full{border-radius:9999px}.rounded-lg,[rounded-lg=""]{border-radius:.5rem}.rounded-xl,[rounded-xl=""]{border-radius:.75rem}.bg-cyan-200{--un-bg-opacity:1;background-color:rgb(165 243 252 / var(--un-bg-opacity))}.bg-cyan-300{--un-bg-opacity:1;background-color:rgb(103 232 249 / var(--un-bg-opacity))}.bg-red-200{--un-bg-opacity:1;background-color:rgb(254 202 202 / var(--un-bg-opacity))}.bg-red-300{--un-bg-opacity:1;background-color:rgb(252 165 165 / var(--un-bg-opacity))}.dark .dark\:bg-cyan-600{--un-bg-opacity:1;background-color:rgb(8 145 178 / var(--un-bg-opacity))}.dark .dark\:bg-cyan-800{--un-bg-opacity:1;background-color:rgb(21 94 117 / var(--un-bg-opacity))}.dark .dark\:bg-red-400{--un-bg-opacity:1;background-color:rgb(248 113 113 / var(--un-bg-opacity))}.dark [bg~="dark:cyan-950"]{--un-bg-opacity:1;background-color:rgb(8 51 68 / var(--un-bg-opacity))}[bg~=cyan-50]{--un-bg-opacity:1;background-color:rgb(236 254 255 / var(--un-bg-opacity))}.dark [bg~="dark:hover:cyan-900"]:hover{--un-bg-opacity:1;background-color:rgb(22 78 99 / var(--un-bg-opacity))}[bg~="hover:cyan-100"]:hover{--un-bg-opacity:1;background-color:rgb(207 250 254 / var(--un-bg-opacity))}.p-4,[p-4=""]{padding:1rem}.px-1,[px-1=""]{padding-left:.25rem;padding-right:.25rem}.px-16{padding-left:4rem;padding-right:4rem}.px-4,[px-4=""]{padding-left:1rem;padding-right:1rem}.px-8{padding-left:2rem;padding-right:2rem}.py-1,[py-1=""]{padding-top:.25rem;padding-bottom:.25rem}.py-2,[py-2=""]{padding-top:.5rem;padding-bottom:.5rem}.pl-0{padding-left:0}.pt-4{padding-top:1rem}.text-center{text-align:center}.text-left{text-align:left}.text-2xl,[text-2xl=""]{font-size:1.5rem;line-height:2rem}.text-lg,[text-lg=""]{font-size:1.125rem;line-height:1.75rem}.text-sm,[text-sm=""]{font-size:.875rem;line-height:1.25rem}.dark .dark\:text-white,.dark [text~="dark:white"]{--un-text-opacity:1;color:rgb(255 255 255 / var(--un-text-opacity))}.dark 
[text~="dark:cyan-500"]{--un-text-opacity:1;color:rgb(6 182 212 / var(--un-text-opacity))}.text-gray-700{--un-text-opacity:1;color:rgb(55 65 81 / var(--un-text-opacity))}.text-red-700{--un-text-opacity:1;color:rgb(185 28 28 / var(--un-text-opacity))}[text~=black]{--un-text-opacity:1;color:rgb(0 0 0 / var(--un-text-opacity))}[text~=cyan-400]{--un-text-opacity:1;color:rgb(34 211 238 / var(--un-text-opacity))}[text~=red-400]{--un-text-opacity:1;color:rgb(248 113 113 / var(--un-text-opacity))}[text~="hover:red-300"]:hover{--un-text-opacity:1;color:rgb(252 165 165 / var(--un-text-opacity))}[text~="active:red-400"]:active{--un-text-opacity:1;color:rgb(248 113 113 / var(--un-text-opacity))}.font-sans{font-family:DM Sans,ui-sans-serif,system-ui,-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Helvetica Neue,Arial,Noto Sans,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol,"Noto Color Emoji"}.opacity-0{opacity:0}.opacity-75{opacity:.75}.shadow-inner{--un-shadow:inset 0 2px 4px 0 var(--un-shadow-color, rgb(0 0 0 / .05));box-shadow:var(--un-ring-offset-shadow),var(--un-ring-shadow),var(--un-shadow)}.outline-none,[outline-none=""]{outline:2px solid transparent;outline-offset:2px}.transition{transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.transition-transform{transition-property:transform;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}[transition~=all]{transition-property:all;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.duration-300,[transition~=duration-300]{transition-duration:.3s}.duration-500,[transition~=duration-500]{transition-duration:.5s}.ease,.ease-in-out,[transition~=ease-in-out]{transition-timing-function:cubic-bezier(.4,0,.2,1)}.ease-out{transition-timing-function:cubic-bezier(0,0,.2,1)}
|
assets/play-worklet-CqUYQx_r.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
// AudioWorkletProcessor that plays back queued Float32Array audio chunks and
// notifies the main thread ("playback_ended") once every queued chunk has
// been consumed. Chunks arrive over the message port; the string "stop"
// flushes the queue.
class BufferedAudioWorkletProcessor extends AudioWorkletProcessor {
  constructor() {
    super();
    // FIFO of pending audio chunks, consumed sample-by-sample in process().
    this.bufferQueue = [];
    // Read offset into the chunk at the head of the queue.
    this.currentChunkOffset = 0;
    // Set once any audio has been enqueued; cleared when the queue drains so
    // "playback_ended" is emitted exactly once per playback.
    this.hadData = false;

    this.port.onmessage = (event) => {
      const payload = event.data;
      if (payload instanceof Float32Array) {
        this.hadData = true;
        this.bufferQueue.push(payload);
      } else if (payload === 'stop') {
        this.bufferQueue = [];
        this.currentChunkOffset = 0;
      }
    };
  }

  process(inputs, outputs) {
    const channel = outputs[0][0];
    if (!channel) return true;

    const frameCount = channel.length;
    let written = 0;

    // Queue just drained after having had data: tell the main thread once.
    if (this.hadData && this.bufferQueue.length === 0) {
      this.port.postMessage({ type: 'playback_ended' });
      this.hadData = false;
    }

    while (written < frameCount) {
      if (this.bufferQueue.length > 0) {
        const chunk = this.bufferQueue[0];
        const remainingInChunk = chunk.length - this.currentChunkOffset;
        const toCopy = Math.min(remainingInChunk, frameCount - written);
        channel.set(
          chunk.subarray(this.currentChunkOffset, this.currentChunkOffset + toCopy),
          written,
        );
        this.currentChunkOffset += toCopy;
        written += toCopy;
        // Head chunk fully consumed: advance to the next one.
        if (this.currentChunkOffset >= chunk.length) {
          this.bufferQueue.shift();
          this.currentChunkOffset = 0;
        }
      } else {
        // No queued audio: fill the rest of this render quantum with silence.
        channel.fill(0, written);
        written = frameCount;
      }
    }
    return true;
  }
}

registerProcessor('buffered-audio-worklet-processor', BufferedAudioWorkletProcessor);
|
assets/vad-processor-0sEQXaXZ.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
// Write position within the pending 512-sample frame.
let fillOffset = 0;
// Accumulator for one fixed-size frame of audio samples.
const frame = new Float32Array(512);

// AudioWorkletProcessor that batches incoming audio into 512-sample frames
// and posts each complete frame to the main thread over the port (the frames
// are consumed downstream; presumably by the VAD model — the name suggests
// so, though that consumer is not visible here).
class VadProcessor extends AudioWorkletProcessor {
  process(inputs, outputs, parameters) {
    const input = inputs[0][0];
    // No input channel: return false, matching the original build output.
    if (!input) return false;

    if (input.length > 512) {
      // Input already exceeds one frame: forward it as-is.
      this.port.postMessage({ buffer: input });
    } else {
      const remaining = 512 - fillOffset;
      if (input.length >= remaining) {
        // Complete the pending frame, post it, then start a fresh frame
        // seeded with whatever samples are left over.
        frame.set(input.subarray(0, remaining), fillOffset);
        this.port.postMessage({ buffer: frame });
        frame.fill(0);
        frame.set(input.subarray(remaining), 0);
        fillOffset = input.length - remaining;
      } else {
        // Not enough samples yet: keep accumulating.
        frame.set(input, fillOffset);
        fillOffset += input.length;
      }
    }
    return true;
  }
}

registerProcessor('vad-processor', VadProcessor);
|
assets/worker-yoCrhISy.ts
ADDED
|
@@ -0,0 +1,465 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import type {
|
| 2 |
+
AutomaticSpeechRecognitionPipeline,
|
| 3 |
+
CausalLMOutputWithPast,
|
| 4 |
+
GPT2Tokenizer,
|
| 5 |
+
LlamaForCausalLM,
|
| 6 |
+
PreTrainedModel,
|
| 7 |
+
StoppingCriteriaList,
|
| 8 |
+
} from '@huggingface/transformers'
|
| 9 |
+
import type { Device, DType } from '@xsai-transformers/shared/types'
|
| 10 |
+
import type { GenerateOptions } from 'kokoro-js'
|
| 11 |
+
import type {
|
| 12 |
+
WorkerMessageEventError,
|
| 13 |
+
WorkerMessageEventInfo,
|
| 14 |
+
WorkerMessageEventOutput,
|
| 15 |
+
WorkerMessageEventProgress,
|
| 16 |
+
WorkerMessageEventSetVoiceResponse,
|
| 17 |
+
WorkerMessageEventStatus,
|
| 18 |
+
} from '../types/worker'
|
| 19 |
+
|
| 20 |
+
import {
|
| 21 |
+
// VAD
|
| 22 |
+
AutoModel,
|
| 23 |
+
|
| 24 |
+
AutoModelForCausalLM,
|
| 25 |
+
// LLM
|
| 26 |
+
AutoTokenizer,
|
| 27 |
+
InterruptableStoppingCriteria,
|
| 28 |
+
pipeline,
|
| 29 |
+
|
| 30 |
+
// Speech recognition
|
| 31 |
+
Tensor,
|
| 32 |
+
TextStreamer,
|
| 33 |
+
} from '@huggingface/transformers'
|
| 34 |
+
import { isWebGPUSupported } from 'gpuu/webgpu'
|
| 35 |
+
import { KokoroTTS, TextSplitterStream } from 'kokoro-js'
|
| 36 |
+
|
| 37 |
+
import {
|
| 38 |
+
EXIT_THRESHOLD,
|
| 39 |
+
INPUT_SAMPLE_RATE,
|
| 40 |
+
MAX_BUFFER_DURATION,
|
| 41 |
+
MAX_NUM_PREV_BUFFERS,
|
| 42 |
+
MIN_SILENCE_DURATION_SAMPLES,
|
| 43 |
+
MIN_SPEECH_DURATION_SAMPLES,
|
| 44 |
+
SPEECH_PAD_SAMPLES,
|
| 45 |
+
SPEECH_THRESHOLD,
|
| 46 |
+
} from '../constants'
|
| 47 |
+
|
| 48 |
+
// ---------------------------------------------------------------------------
// Module-level state shared by the worker's handlers: model handles, the
// running conversation, and the audio/VAD recording buffers.
// ---------------------------------------------------------------------------

/** A single chat turn exchanged with the LLM. */
interface Message {
  role: 'system' | 'user' | 'assistant'
  content: string
}

type Voices = GenerateOptions['voice']
export type PretrainedConfig = NonNullable<Parameters<typeof AutoModel.from_pretrained>[1]>['config']

// Whisper dtypes per backend: WebGPU runs fully in fp32, while the WASM
// fallback uses a quantized (q8) merged decoder to keep CPU inference usable.
const whisperDtypeMap: Record<Device, DType> = {
  webgpu: {
    encoder_model: 'fp32',
    decoder_model_merged: 'fp32',
  },
  wasm: {
    encoder_model: 'fp32',
    decoder_model_merged: 'q8',
  },
}

// Kokoro TTS model identifier.
const model_id = 'onnx-community/Kokoro-82M-v1.0-ONNX'
// Currently selected TTS voice; undefined until the UI sends 'set_voice'.
let voice: Voices | undefined
let silero_vad: PreTrainedModel
let transcriber: AutomaticSpeechRecognitionPipeline
let tts: KokoroTTS

const SYSTEM_MESSAGE: Message = {
  role: 'system',
  content:
    'You\'re a helpful and conversational voice assistant. Keep your responses short, clear, and casual.',
}
// Conversation history; reset to just the system prompt on 'end_call'.
let messages: Message[] = [SYSTEM_MESSAGE]
// KV cache carried across LLM turns to avoid re-encoding the whole history.
let past_key_values_cache: any = null
// Allows the 'interrupt' message to stop an in-flight generation.
let stopping_criteria: InterruptableStoppingCriteria | null = null

// Global audio buffer to store incoming audio
const BUFFER = new Float32Array(MAX_BUFFER_DURATION * INPUT_SAMPLE_RATE)
// Write position into BUFFER (in samples).
let bufferPointer = 0

// Initial state for VAD
const sr = new Tensor('int64', [INPUT_SAMPLE_RATE], [])
// Recurrent Silero VAD state, threaded through successive vad() calls.
let state = new Tensor('float32', new Float32Array(2 * 1 * 128), [2, 1, 128])

// Whether we are in the process of adding audio to the buffer
let isRecording = false
let isPlaying = false // new flag

let tokenizer: GPT2Tokenizer
let llm: LlamaForCausalLM

// FIFO of recent non-speech chunks, prepended to a segment so the
// transcriber hears the audio from just before the VAD triggered.
const prevBuffers: Float32Array[] = []
|
| 98 |
+
|
| 99 |
+
/**
 * Load every model the worker needs (TTS, VAD, ASR, LLM), warm them up, and
 * report progress/errors to the main thread via postMessage. Finishes by
 * posting a 'ready' status message that carries the available TTS voices.
 * Each load failure is reported as an 'error' message and then rethrown.
 */
export async function loadModels() {
  // Kokoro text-to-speech (full precision, WebGPU).
  tts = await KokoroTTS.from_pretrained(model_id, {
    dtype: 'fp32',
    device: 'webgpu',
  })

  const device = 'webgpu'
  globalThis.postMessage({ type: 'info', data: { message: `Using device: "${device}"` } } satisfies WorkerMessageEventInfo)
  globalThis.postMessage({ type: 'info', data: { message: 'Loading models...', duration: 'until_next' } } satisfies WorkerMessageEventInfo)

  // Load models
  // Silero voice-activity-detection model (custom ONNX architecture, so the
  // model_type must be supplied explicitly).
  silero_vad = await AutoModel.from_pretrained(
    'onnx-community/silero-vad',
    {
      config: { model_type: 'custom' } as PretrainedConfig,
      dtype: 'fp32', // Full-precision
      progress_callback: progress => globalThis.postMessage({ type: 'progress', data: { message: progress } } satisfies WorkerMessageEventProgress),
    },
  ).catch((error: Error) => {
    globalThis.postMessage({ type: 'error', data: { error, message: error.message } } satisfies WorkerMessageEventError<Error>)
    throw error
  })

  // Whisper speech recognition pipeline.
  transcriber = await pipeline(
    'automatic-speech-recognition',
    'onnx-community/whisper-base', // or "onnx-community/moonshine-base-ONNX",
    {
      device,
      dtype: whisperDtypeMap[device as keyof typeof whisperDtypeMap],
      progress_callback: progress => globalThis.postMessage({ type: 'progress', data: { message: progress } } satisfies WorkerMessageEventProgress),
    },
  ).catch((error: Error) => {
    globalThis.postMessage({ type: 'error', data: { error, message: error.message } } satisfies WorkerMessageEventError<Error>)
    throw error
  })

  // Warm-up run on one second of silence.
  await transcriber(new Float32Array(INPUT_SAMPLE_RATE)) // Compile shaders

  // LLM: quantization and device depend on WebGPU availability.
  llm = await AutoModelForCausalLM.from_pretrained(
    'HuggingFaceTB/SmolLM2-1.7B-Instruct',
    {
      dtype: await isWebGPUSupported() ? 'q4f16' : 'int8',
      device: await isWebGPUSupported() ? 'webgpu' : 'wasm',
      progress_callback: progress => globalThis.postMessage({ type: 'progress', data: { message: progress } } satisfies WorkerMessageEventProgress),
    },
  ).catch((error: Error) => {
    globalThis.postMessage({ type: 'error', data: { error, message: error.message } } satisfies WorkerMessageEventError<Error>)
    throw error
  })

  tokenizer = await AutoTokenizer.from_pretrained(
    'HuggingFaceTB/SmolLM2-1.7B-Instruct',
  ).catch((error: Error) => {
    globalThis.postMessage({ type: 'error', data: { error, message: error.message } } satisfies WorkerMessageEventError<Error>)
    throw error
  })

  // Warm-up generation of a single token.
  await llm.generate({ ...tokenizer('x'), max_new_tokens: 1 }) // Compile shaders

  globalThis.postMessage({
    type: 'status',
    data: {
      status: 'ready',
      message: 'Ready!',
      voices: tts.voices,
    },
  } as WorkerMessageEventStatus)
}
|
| 167 |
+
|
| 168 |
+
// Start loading models as soon as the worker boots.
// NOTE(review): this floating promise's rejection is surfaced via the
// 'error' postMessage inside loadModels(), then rethrown unhandled.
loadModels()
|
| 169 |
+
|
| 170 |
+
/**
|
| 171 |
+
* Perform Voice Activity Detection (VAD)
|
| 172 |
+
* @param buffer The new audio buffer
|
| 173 |
+
* @returns `true` if the buffer is speech, `false` otherwise.
|
| 174 |
+
*/
|
| 175 |
+
async function vad(buffer?: Float32Array): Promise<boolean> {
|
| 176 |
+
if (!buffer) {
|
| 177 |
+
// Possibly closed or interrupted
|
| 178 |
+
return false
|
| 179 |
+
}
|
| 180 |
+
|
| 181 |
+
const input = new Tensor('float32', buffer, [1, buffer.length])
|
| 182 |
+
|
| 183 |
+
const { stateN, output } = await silero_vad({ input, sr, state })
|
| 184 |
+
state = stateN // Update state
|
| 185 |
+
|
| 186 |
+
const isSpeech = output.data[0]
|
| 187 |
+
|
| 188 |
+
// Use heuristics to determine if the buffer is speech or not
|
| 189 |
+
return (
|
| 190 |
+
// Case 1: We are above the threshold (definitely speech)
|
| 191 |
+
isSpeech > SPEECH_THRESHOLD
|
| 192 |
+
// Case 2: We are in the process of recording, and the probability is above the negative (exit) threshold
|
| 193 |
+
|| (isRecording && isSpeech >= EXIT_THRESHOLD)
|
| 194 |
+
)
|
| 195 |
+
}
|
| 196 |
+
|
| 197 |
+
/** Wall-clock timing metadata (ms epoch) for a detected speech segment. */
interface SpeechData {
  start: number
  end: number
  duration: number
}

/** Shape of a single tokenizer output field accepted by model inputs. */
type BatchEncodingItem = number[] | number[][] | Tensor
/**
 * Holds the output of the tokenizer's call function.
 */
interface BatchEncoding {
  /**
   * List of token ids to be fed to a model.
   */
  input_ids: BatchEncodingItem
  /**
   * List of indices specifying which tokens should be attended to by the model.
   */
  attention_mask: BatchEncodingItem
  /**
   * List of token type ids to be fed to a model.
   */
  token_type_ids?: BatchEncodingItem
}
|
| 221 |
+
|
| 222 |
+
/**
|
| 223 |
+
* Transcribe the audio buffer
|
| 224 |
+
* @param buffer The audio buffer
|
| 225 |
+
* @param _data Additional data
|
| 226 |
+
*/
|
| 227 |
+
async function speechToSpeech(buffer: Float32Array, _data: SpeechData): Promise<void> {
|
| 228 |
+
isPlaying = true
|
| 229 |
+
|
| 230 |
+
// 1. Transcribe the audio from the user
|
| 231 |
+
const result = await transcriber(buffer)
|
| 232 |
+
const text = (result as { text: string }).text.trim()
|
| 233 |
+
|
| 234 |
+
if (['', '[BLANK_AUDIO]'].includes(text)) {
|
| 235 |
+
// If the transcription is empty or a blank audio, we skip the rest of the processing
|
| 236 |
+
return
|
| 237 |
+
}
|
| 238 |
+
|
| 239 |
+
messages.push({ role: 'user', content: text })
|
| 240 |
+
|
| 241 |
+
// Set up text-to-speech streaming
|
| 242 |
+
const splitter = new TextSplitterStream()
|
| 243 |
+
const stream = tts!.stream(splitter, { voice });
|
| 244 |
+
(async () => {
|
| 245 |
+
for await (const { text, audio } of stream) {
|
| 246 |
+
globalThis.postMessage({ type: 'output', data: { text, result: audio } } satisfies WorkerMessageEventOutput)
|
| 247 |
+
}
|
| 248 |
+
})()
|
| 249 |
+
|
| 250 |
+
// 2. Generate a response using the LLM
|
| 251 |
+
const inputs = tokenizer.apply_chat_template(messages, {
|
| 252 |
+
add_generation_prompt: true,
|
| 253 |
+
return_dict: true,
|
| 254 |
+
}) as BatchEncoding
|
| 255 |
+
|
| 256 |
+
const streamer = new TextStreamer(tokenizer, {
|
| 257 |
+
skip_prompt: true,
|
| 258 |
+
skip_special_tokens: true,
|
| 259 |
+
callback_function: (text: string) => {
|
| 260 |
+
splitter.push(text)
|
| 261 |
+
},
|
| 262 |
+
token_callback_function: () => {},
|
| 263 |
+
})
|
| 264 |
+
|
| 265 |
+
stopping_criteria = new InterruptableStoppingCriteria()
|
| 266 |
+
type GenerationFunctionParameters = Parameters<typeof llm.generate>[0] & Record<string, any>
|
| 267 |
+
|
| 268 |
+
const generatedRes = await llm.generate({
|
| 269 |
+
...inputs,
|
| 270 |
+
past_key_values: past_key_values_cache,
|
| 271 |
+
do_sample: false, // TODO: do_sample: true is bugged (invalid data location on top-k sample)
|
| 272 |
+
max_new_tokens: 1024,
|
| 273 |
+
streamer,
|
| 274 |
+
stopping_criteria: stopping_criteria as unknown as StoppingCriteriaList,
|
| 275 |
+
return_dict_in_generate: true,
|
| 276 |
+
} as GenerationFunctionParameters)
|
| 277 |
+
|
| 278 |
+
const { past_key_values, sequences } = generatedRes as CausalLMOutputWithPast & { sequences: Tensor }
|
| 279 |
+
past_key_values_cache = past_key_values
|
| 280 |
+
|
| 281 |
+
// Finally, close the stream to signal that no more text will be added.
|
| 282 |
+
splitter.close()
|
| 283 |
+
|
| 284 |
+
const decoded = tokenizer.batch_decode(
|
| 285 |
+
// TODO: fix null as any
|
| 286 |
+
sequences.slice(null, [(inputs.input_ids as Tensor).dims[1], null as any]),
|
| 287 |
+
{ skip_special_tokens: true },
|
| 288 |
+
)
|
| 289 |
+
|
| 290 |
+
messages.push({ role: 'assistant', content: decoded[0] })
|
| 291 |
+
}
|
| 292 |
+
|
| 293 |
+
// Track the number of samples after the last speech chunk
|
| 294 |
+
let postSpeechSamples = 0
|
| 295 |
+
function resetAfterRecording(offset = 0): void {
|
| 296 |
+
globalThis.postMessage({
|
| 297 |
+
type: 'status',
|
| 298 |
+
data: {
|
| 299 |
+
status: 'recording_end',
|
| 300 |
+
message: 'Transcribing...',
|
| 301 |
+
duration: 'until_next',
|
| 302 |
+
},
|
| 303 |
+
} satisfies WorkerMessageEventStatus)
|
| 304 |
+
|
| 305 |
+
BUFFER.fill(0, offset)
|
| 306 |
+
bufferPointer = offset
|
| 307 |
+
isRecording = false
|
| 308 |
+
postSpeechSamples = 0
|
| 309 |
+
}
|
| 310 |
+
|
| 311 |
+
function dispatchForTranscriptionAndResetAudioBuffer(overflow?: Float32Array): void {
|
| 312 |
+
// Get start and end time of the speech segment, minus the padding
|
| 313 |
+
const now = Date.now()
|
| 314 |
+
const end
|
| 315 |
+
= now - ((postSpeechSamples + SPEECH_PAD_SAMPLES) / INPUT_SAMPLE_RATE) * 1000
|
| 316 |
+
const start = end - (bufferPointer / INPUT_SAMPLE_RATE) * 1000
|
| 317 |
+
const duration = end - start
|
| 318 |
+
const overflowLength = overflow?.length ?? 0
|
| 319 |
+
|
| 320 |
+
// Send the audio buffer to the worker
|
| 321 |
+
const buffer = BUFFER.slice(0, bufferPointer + SPEECH_PAD_SAMPLES)
|
| 322 |
+
|
| 323 |
+
const prevLength = prevBuffers.reduce((acc, b) => acc + b.length, 0)
|
| 324 |
+
const paddedBuffer = new Float32Array(prevLength + buffer.length)
|
| 325 |
+
let offset = 0
|
| 326 |
+
for (const prev of prevBuffers) {
|
| 327 |
+
paddedBuffer.set(prev, offset)
|
| 328 |
+
offset += prev.length
|
| 329 |
+
}
|
| 330 |
+
paddedBuffer.set(buffer, offset)
|
| 331 |
+
speechToSpeech(paddedBuffer, { start, end, duration })
|
| 332 |
+
|
| 333 |
+
// Set overflow (if present) and reset the rest of the audio buffer
|
| 334 |
+
if (overflow) {
|
| 335 |
+
BUFFER.set(overflow, 0)
|
| 336 |
+
}
|
| 337 |
+
resetAfterRecording(overflowLength)
|
| 338 |
+
}
|
| 339 |
+
|
| 340 |
+
/**
 * Main worker message handler. Control messages (call lifecycle, voice
 * selection, interruption, playback notifications) are handled in the
 * switch; anything that falls through is treated as a streaming microphone
 * chunk and fed into the VAD/recording state machine below.
 */
globalThis.onmessage = async (event: MessageEvent) => {
  const { type, buffer } = event.data

  // refuse new audio while playing back
  if (type === 'audio' && isPlaying)
    return

  switch (type) {
    case 'start_call': {
      // Greet the user with the currently selected voice (default: Heart).
      const name = tts!.voices[voice ?? 'af_heart']?.name ?? 'Heart'
      greet(`Hey there, my name is ${name}! How can I help you today?`)
      return
    }
    case 'end_call':
      // Reset the conversation and KV cache.
      // NOTE(review): this `break` falls through to the audio path with an
      // undefined `buffer`; vad() returns false for it, but the chunk may
      // then be pushed into prevBuffers — confirm this is intentional.
      messages = [SYSTEM_MESSAGE]
      past_key_values_cache = null
      break
    case 'interrupt':
      // Stop an in-flight LLM generation, if any.
      stopping_criteria?.interrupt()
      return
    case 'set_voice':
      voice = event.data.voice

      globalThis.postMessage({
        type: 'set_voice_response',
        data: {
          ok: true,
        },
      } satisfies WorkerMessageEventSetVoiceResponse)

      return
    case 'playback_ended':
      // Main thread finished playing TTS audio; accept microphone input again.
      isPlaying = false
      return
  }

  const wasRecording = isRecording // Save current state
  const isSpeech = await vad(buffer)

  if (!wasRecording && !isSpeech) {
    // We are not recording, and the buffer is not speech,
    // so we will probably discard the buffer. So, we insert
    // into a FIFO queue with maximum size of PREV_BUFFER_SIZE
    if (prevBuffers.length >= MAX_NUM_PREV_BUFFERS) {
      // If the queue is full, we discard the oldest buffer
      prevBuffers.shift()
    }
    prevBuffers.push(buffer)
    return
  }

  const remaining = BUFFER.length - bufferPointer
  if (buffer.length >= remaining) {
    // The buffer is larger than (or equal to) the remaining space in the global buffer,
    // so we perform transcription and copy the overflow to the global buffer
    BUFFER.set(buffer.subarray(0, remaining), bufferPointer)
    bufferPointer += remaining

    // Dispatch the audio buffer
    const overflow = buffer.subarray(remaining)
    dispatchForTranscriptionAndResetAudioBuffer(overflow)
    return
  }
  else {
    // The buffer is smaller than the remaining space in the global buffer,
    // so we copy it to the global buffer
    BUFFER.set(buffer, bufferPointer)
    bufferPointer += buffer.length
  }

  if (isSpeech) {
    if (!isRecording) {
      // Indicate start of recording
      globalThis.postMessage({
        type: 'status',
        data: {
          status: 'recording_start',
          message: 'Listening...',
          duration: 'until_next',
        },
      } satisfies WorkerMessageEventStatus)
    }

    // Start or continue recording
    isRecording = true
    postSpeechSamples = 0 // Reset the post-speech samples

    return
  }

  postSpeechSamples += buffer.length

  // At this point we're confident that we were recording (wasRecording === true), but the latest buffer is not speech.
  // So, we check whether we have reached the end of the current audio chunk.
  if (postSpeechSamples < MIN_SILENCE_DURATION_SAMPLES) {
    // There was a short pause, but not long enough to consider the end of a speech chunk
    // (e.g., the speaker took a breath), so we continue recording
    return
  }

  if (bufferPointer < MIN_SPEECH_DURATION_SAMPLES) {
    // The entire buffer (including the new chunk) is smaller than the minimum
    // duration of a speech chunk, so we can safely discard the buffer.
    resetAfterRecording()
    return
  }

  // Silence lasted long enough and the segment is long enough: transcribe it.
  dispatchForTranscriptionAndResetAudioBuffer()
}
|
| 449 |
+
|
| 450 |
+
function greet(text: string): void {
|
| 451 |
+
isPlaying = true
|
| 452 |
+
|
| 453 |
+
const splitter = new TextSplitterStream()
|
| 454 |
+
const stream = tts!.stream(splitter, { voice });
|
| 455 |
+
|
| 456 |
+
(async () => {
|
| 457 |
+
for await (const { text: chunkText, audio } of stream) {
|
| 458 |
+
globalThis.postMessage({ type: 'output', data: { text: chunkText, result: audio } } satisfies WorkerMessageEventOutput)
|
| 459 |
+
}
|
| 460 |
+
})()
|
| 461 |
+
|
| 462 |
+
splitter.push(text)
|
| 463 |
+
splitter.close()
|
| 464 |
+
messages.push({ role: 'assistant', content: text })
|
| 465 |
+
}
|
favicon-96x96.png
ADDED
|
|
favicon.svg
ADDED
|
|
index.html
CHANGED
|
@@ -1,19 +1,24 @@
|
|
| 1 |
<!doctype html>
|
| 2 |
-
<html>
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
</html>
|
|
|
|
| 1 |
<!doctype html>
<html lang="en">
<head>
  <meta charset="UTF-8" />
  <title>Realtime Conversational WebGPU (Vue)</title>
  <meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=0" />
  <link rel="icon" type="image/png" href="/favicon-96x96.png" sizes="96x96" />
  <link rel="icon" type="image/svg+xml" href="/favicon.svg" />
  <!-- Apply the saved or preferred color scheme before first paint to avoid a flash of the wrong theme -->
  <script>
    ;(function () {
      const prefersDark = window.matchMedia && window.matchMedia('(prefers-color-scheme: dark)').matches
      const setting = localStorage.getItem('vueuse-color-scheme') || 'auto'
      if (setting === 'dark' || (prefersDark && setting !== 'light'))
        document.documentElement.classList.toggle('dark', true)
    })()
  </script>
  <script type="module" crossorigin src="/assets/index-DGmKQH7N.js"></script>
  <link rel="stylesheet" crossorigin href="/assets/index-cAxkOY9l.css">
</head>
<body class="font-sans">
  <div id="app"></div>
  <noscript> This website requires JavaScript to function properly. Please enable JavaScript to continue. </noscript>
</body>
</html>
|
style.css
DELETED
|
@@ -1,28 +0,0 @@
|
|
| 1 |
-
body {
|
| 2 |
-
padding: 2rem;
|
| 3 |
-
font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
|
| 4 |
-
}
|
| 5 |
-
|
| 6 |
-
h1 {
|
| 7 |
-
font-size: 16px;
|
| 8 |
-
margin-top: 0;
|
| 9 |
-
}
|
| 10 |
-
|
| 11 |
-
p {
|
| 12 |
-
color: rgb(107, 114, 128);
|
| 13 |
-
font-size: 15px;
|
| 14 |
-
margin-bottom: 10px;
|
| 15 |
-
margin-top: 5px;
|
| 16 |
-
}
|
| 17 |
-
|
| 18 |
-
.card {
|
| 19 |
-
max-width: 620px;
|
| 20 |
-
margin: 0 auto;
|
| 21 |
-
padding: 16px;
|
| 22 |
-
border: 1px solid lightgray;
|
| 23 |
-
border-radius: 16px;
|
| 24 |
-
}
|
| 25 |
-
|
| 26 |
-
.card p:last-child {
|
| 27 |
-
margin-bottom: 0;
|
| 28 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|