diff --git a/packages/moonshine-web/.dockerignore b/packages/moonshine-web/.dockerignore
new file mode 100644
index 0000000..f06235c
--- /dev/null
+++ b/packages/moonshine-web/.dockerignore
@@ -0,0 +1,2 @@
+node_modules
+dist
diff --git a/packages/moonshine-web/Dockerfile b/packages/moonshine-web/Dockerfile
new file mode 100644
index 0000000..3e3a684
--- /dev/null
+++ b/packages/moonshine-web/Dockerfile
@@ -0,0 +1,18 @@
+FROM node:20-alpine AS build-stage
+
+WORKDIR /app
+RUN corepack enable
+
+COPY .npmrc package.json pnpm-lock.yaml ./
+RUN --mount=type=cache,id=pnpm-store,target=/root/.pnpm-store \
+    pnpm install --frozen-lockfile
+
+COPY . .
+RUN pnpm build
+
+FROM nginx:stable-alpine AS production-stage
+
+COPY --from=build-stage /app/dist /usr/share/nginx/html
+EXPOSE 80
+
+CMD ["nginx", "-g", "daemon off;"]
diff --git a/packages/moonshine-web/README.md b/packages/moonshine-web/README.md
new file mode 100644
index 0000000..f55d218
--- /dev/null
+++ b/packages/moonshine-web/README.md
@@ -0,0 +1,71 @@
+---
+title: Moonshine Web (Vue)
+emoji: 🌙
+colorFrom: blue
+colorTo: pink
+sdk: static
+pinned: false
+license: apache-2.0
+models:
+  - onnx-community/moonshine-base-ONNX
+short_description: Yet another real-time in-browser speech recognition, re-implemented with Vue
+thumbnail: https://raw.githubusercontent.com/moeru-ai/airi/refs/heads/main/packages/whisper-webgpu/public/banner.png
+---
+
+Moonshine Web (Vue)
+
+[Try it]
+
+> Heavily inspired by [Realtime in-browser speech recognition](https://huggingface.co/spaces/webml-community/moonshine-web)
+
+# Moonshine Web
+
+A simple Vue + Vite application for running [Moonshine Base](https://huggingface.co/onnx-community/moonshine-base-ONNX), a powerful speech-to-text model optimized for fast and accurate automatic speech recognition (ASR) on resource-constrained devices. It runs locally in the browser using Transformers.js with WebGPU acceleration (or WASM as a fallback).
+
+## Getting Started
+
+Follow the steps below to set up and run the application.
+
+### 1. Clone the Repository
+
+Clone the repository from GitHub:
+
+```sh
+git clone https://github.com/moeru-ai/airi.git
+```
+
+### 2. Navigate to the Project Directory
+
+Change your working directory to the `moonshine-web` folder:
+
+```sh
+cd airi/packages/moonshine-web
+```
+
+### 3. Install Dependencies
+
+Install the necessary dependencies using pnpm (the repository is a pnpm workspace):
+
+```sh
+pnpm install
+```
+
+### 4. Run the Development Server
+
+Start the development server:
+
+```sh
+pnpm run dev
+```
+
+The application should now be running locally. Open your browser and go to `http://localhost:5175` to see it in action.
+
+## Acknowledgements
+
+The audio visualizer was adapted from Wael Yasmina's [amazing tutorial](https://waelyasmina.net/articles/how-to-create-a-3d-audio-visualizer-using-three-js/).
+
+Great thanks to Xenova for their work.
+
+> [Source code](https://github.com/huggingface/transformers.js-examples/tree/38a883dd465d70d7368b86b95aa0678895ca4e83/moonshine-web)
diff --git a/packages/moonshine-web/index.html b/packages/moonshine-web/index.html
new file mode 100644
index 0000000..8c9a995
--- /dev/null
+++ b/packages/moonshine-web/index.html
@@ -0,0 +1,136 @@
+<!-- markup stripped during extraction; the page is titled "Moonshine Web (Vue)" and mounts the Vue app at #app (see src/main.ts) -->
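Not part of the patch: a quick way to exercise the Dockerfile above (the image tag and host port are arbitrary, and this assumes the package directory is the build context, as the package-local `.dockerignore` suggests — in a pnpm workspace the lockfile may instead live at the repo root):

```sh
# Build the static site and bake it into the nginx production stage
docker build -t moonshine-web packages/moonshine-web
# Serve it; nginx listens on port 80 inside the container
docker run --rm -p 8080:80 moonshine-web
# then open http://localhost:8080
```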
diff --git a/packages/moonshine-web/netlify.toml b/packages/moonshine-web/netlify.toml
new file mode 100755
index 0000000..12b9de3
--- /dev/null
+++ b/packages/moonshine-web/netlify.toml
@@ -0,0 +1,18 @@
+[build]
+publish = "packages/moonshine-web/dist"
+command = "pnpm run packages:stub && pnpm run build"
+
+[build.environment]
+NODE_VERSION = "22"
+
+[[redirects]]
+from = "/assets/*"
+to = "/assets/:splat"
+status = 200
+force = true
+
+[[redirects]]
+from = "/*"
+to = "/index.html"
+status = 200
+force = false
diff --git a/packages/moonshine-web/package.json b/packages/moonshine-web/package.json
new file mode 100644
index 0000000..d6cd22a
--- /dev/null
+++ b/packages/moonshine-web/package.json
@@ -0,0 +1,37 @@
+{
+  "name": "@proj-airi/moonshine-web",
+  "type": "module",
+  "private": true,
+  "packageManager": "pnpm@9.15.1",
+  "description": "Yet another WebGPU-based STT + VAD, re-implemented with the Moonshine model",
+  "author": {
+    "name": "Neko Ayaka",
+    "email": "neko@ayaka.moe",
+    "url": "https://github.com/nekomeowww"
+  },
+  "license": "MIT",
+  "scripts": {
+    "build": "vite build",
+    "dev": "vite --port 5175",
+    "lint": "eslint .",
+    "preview": "vite preview",
+    "typecheck": "vue-tsc --noEmit"
+  },
+  "dependencies": {
+    "@tresjs/core": "^4.3.1",
+    "@unocss/reset": "^0.65.2",
+    "@vueuse/core": "^12.1.0",
+    "@vueuse/motion": "^2.2.6",
+    "ofetch": "^1.4.1",
+    "three": "^0.171.0",
+    "vue": "^3.5.13"
+  },
+  "devDependencies": {
+    "@huggingface/transformers": "^3.2.1",
+    "@types/audioworklet": "^0.0.65",
+    "@types/three": "^0.171.0",
+    "@vitejs/plugin-vue": "^5.2.1",
+    "@webgpu/types": "^0.1.52",
+    "vue-tsc": "^2.1.10"
+  }
+}
diff --git a/packages/moonshine-web/public/banner.png b/packages/moonshine-web/public/banner.png
new file mode 100644
index 0000000..37f81a3
Binary files /dev/null and b/packages/moonshine-web/public/banner.png differ
diff --git a/packages/moonshine-web/public/logo.png b/packages/moonshine-web/public/logo.png
new file mode 100644
index 0000000..084b9f6
Binary files /dev/null and b/packages/moonshine-web/public/logo.png differ
diff --git a/packages/moonshine-web/src/App.vue b/packages/moonshine-web/src/App.vue
new file mode 100644
index 0000000..4012715
--- /dev/null
+++ b/packages/moonshine-web/src/App.vue
@@ -0,0 +1,263 @@
+<!-- SFC source (263 lines) stripped during extraction -->
diff --git a/packages/moonshine-web/src/components/AnimatedMesh.vue b/packages/moonshine-web/src/components/AnimatedMesh.vue
new file mode 100644
index 0000000..27cf171
--- /dev/null
+++ b/packages/moonshine-web/src/components/AnimatedMesh.vue
@@ -0,0 +1,61 @@
+<!-- SFC source (61 lines) stripped during extraction -->
diff --git a/packages/moonshine-web/src/components/BloomScene.vue b/packages/moonshine-web/src/components/BloomScene.vue
new file mode 100644
index 0000000..91bbb2d
--- /dev/null
+++ b/packages/moonshine-web/src/components/BloomScene.vue
@@ -0,0 +1,49 @@
+<!-- SFC source (49 lines) stripped during extraction -->
diff --git a/packages/moonshine-web/src/constants/index.ts b/packages/moonshine-web/src/constants/index.ts
new file mode 100644
index 0000000..4d67fe4
--- /dev/null
+++ b/packages/moonshine-web/src/constants/index.ts
@@ -0,0 +1,53 @@
+/**
+ * Sample rate of the audio.
+ * Coincidentally, this is the same for both models (Moonshine and Silero VAD)
+ */
+export const SAMPLE_RATE = 16000
+export const SAMPLE_RATE_MS = SAMPLE_RATE / 1000
+
+/**
+ * Probabilities ABOVE this value are considered as SPEECH
+ */
+export const SPEECH_THRESHOLD = 0.3
+
+/**
+ * If current state is SPEECH, and the probability of the next state
+ * is below this value, it is considered as NON-SPEECH.
+ */
+export const EXIT_THRESHOLD = 0.1
+
+/**
+ * After each speech chunk, wait for at least this amount of silence
+ * before considering the next chunk as a new speech chunk
+ */
+export const MIN_SILENCE_DURATION_MS = 400
+export const MIN_SILENCE_DURATION_SAMPLES
+  = MIN_SILENCE_DURATION_MS * SAMPLE_RATE_MS
+
+/**
+ * Pad the speech chunk with this amount on each side
+ */
+export const SPEECH_PAD_MS = 80
+export const SPEECH_PAD_SAMPLES = SPEECH_PAD_MS * SAMPLE_RATE_MS
+
+/**
+ * Final speech chunks below this duration are discarded
+ */
+export const MIN_SPEECH_DURATION_SAMPLES = 250 * SAMPLE_RATE_MS // 250 ms
+
+/**
+ * Maximum duration of audio (in seconds) that can be handled by Moonshine
+ */
+export const MAX_BUFFER_DURATION = 30
+
+/**
+ * Size of the incoming buffers
+ */
+export const NEW_BUFFER_SIZE = 512
+
+/**
+ * The number of previous buffers to keep, to ensure the audio is padded correctly
+ */
+export const MAX_NUM_PREV_BUFFERS = Math.ceil(
+  SPEECH_PAD_SAMPLES / NEW_BUFFER_SIZE,
+)
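Not part of the patch: a quick arithmetic check of the derived constants above, since the worklet below emits 512-sample chunks:

```ts
// SAMPLE_RATE_MS = 16000 / 1000 = 16 samples per millisecond
const SAMPLE_RATE_MS = 16
// 400 ms of required trailing silence → 6,400 samples
const MIN_SILENCE_DURATION_SAMPLES = 400 * SAMPLE_RATE_MS // 6400
// 80 ms of padding on each side → 1,280 samples
const SPEECH_PAD_SAMPLES = 80 * SAMPLE_RATE_MS // 1280
// With 512-sample worklet chunks, three previous buffers cover the left padding
const MAX_NUM_PREV_BUFFERS = Math.ceil(SPEECH_PAD_SAMPLES / 512) // ceil(2.5) = 3
```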
diff --git a/packages/moonshine-web/src/libs/processor.ts b/packages/moonshine-web/src/libs/processor.ts
new file mode 100644
index 0000000..cbd1eed
--- /dev/null
+++ b/packages/moonshine-web/src/libs/processor.ts
@@ -0,0 +1,40 @@
+const MIN_CHUNK_SIZE = 512
+let globalPointer = 0
+const globalBuffer = new Float32Array(MIN_CHUNK_SIZE)
+
+class VADProcessor extends AudioWorkletProcessor {
+  process(inputs: Float32Array[][], _outputs: Float32Array[][], _parameters: Record<string, Float32Array>): boolean {
+    const buffer = inputs[0][0]
+    if (!buffer)
+      return false // buffer is null when the stream ends
+
+    if (buffer.length > MIN_CHUNK_SIZE) {
+      // If the buffer is larger than the minimum chunk size, send the entire buffer
+      this.port.postMessage({ buffer })
+    }
+    else {
+      const remaining = MIN_CHUNK_SIZE - globalPointer
+      if (buffer.length >= remaining) {
+        // If the buffer is larger than (or equal to) the remaining space in the global buffer, fill the remaining space
+        globalBuffer.set(buffer.subarray(0, remaining), globalPointer)
+
+        // Send the global buffer
+        this.port.postMessage({ buffer: globalBuffer })
+
+        // Reset the global buffer and set the remaining buffer
+        globalBuffer.fill(0)
+        globalBuffer.set(buffer.subarray(remaining), 0)
+        globalPointer = buffer.length - remaining
+      }
+      else {
+        // If the buffer is smaller than the remaining space in the global buffer, copy the buffer to the global buffer
+        globalBuffer.set(buffer, globalPointer)
+        globalPointer += buffer.length
+      }
+    }
+
+    return true // Keep the processor alive
+  }
+}
+
+registerProcessor('vad-processor', VADProcessor)
diff --git a/packages/moonshine-web/src/libs/types.ts b/packages/moonshine-web/src/libs/types.ts
new file mode 100644
index 0000000..a3f8135
--- /dev/null
+++ b/packages/moonshine-web/src/libs/types.ts
@@ -0,0 +1,42 @@
+export enum MessageType {
+  Status = 'status',
+  Output = 'output',
+  Info = 'info',
+}
+
+export enum MessageStatus {
+  RecordingStart = 'recording_start',
+  RecordingEnd = 'recording_end',
+}
+
+export enum Duration {
+  UntilNext = 'until_next',
+}
+
+export interface MessageEventStatus {
+  type: MessageType.Status
+  status: MessageStatus
+  message: string
+  duration: Duration
+}
+
+export interface MessageEventOutput {
+  type: MessageType.Output
+  buffer: Float32Array
+  message: string
+  start: number
+  end: number
+  duration: number
+}
+
+export interface MessageEventInfo {
+  type: MessageType.Info
+  message: string
+  duration?: Duration.UntilNext
+}
+
+export interface MessageEventError {
+  error: unknown
+}
+
+export type MessageEvent = MessageEventError | MessageEventStatus | MessageEventOutput | MessageEventInfo
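App.vue's script section did not survive extraction in this diff, so for context, here is a minimal sketch of how a main-thread consumer might wire the transcription worker against the message contract above. All names on the consumer side are hypothetical; the real handling lives in the stripped App.vue:

```ts
import type { MessageEvent as WorkerMessage } from './libs/types'
import { MessageType } from './libs/types'

// Module worker, matching `worker: { format: 'es' }` in vite.config.ts
const worker = new Worker(new URL('./libs/worker.ts', import.meta.url), { type: 'module' })

worker.onmessage = ({ data }: { data: WorkerMessage }) => {
  // MessageEventError carries no `type` field, so narrow on `error` first
  if ('error' in data) {
    console.error(data.error)
    return
  }

  switch (data.type) {
    case MessageType.Status: // recording_start / recording_end
    case MessageType.Info: // device + model-loading notices
      console.log(data.message)
      break
    case MessageType.Output: // one transcribed speech chunk
      console.log(`[${data.start}–${data.end}]`, data.message)
      break
  }
}
```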
diff --git a/packages/moonshine-web/src/libs/worker.ts b/packages/moonshine-web/src/libs/worker.ts
new file mode 100644
index 0000000..361028e
--- /dev/null
+++ b/packages/moonshine-web/src/libs/worker.ts
@@ -0,0 +1,238 @@
+/* eslint-disable antfu/no-top-level-await */
+/* eslint-disable no-restricted-globals */
+import type { MessageEventError, MessageEventInfo, MessageEventOutput, MessageEventStatus } from './types'
+import { AutoModel, pipeline, Tensor } from '@huggingface/transformers'
+
+import {
+  EXIT_THRESHOLD,
+  MAX_BUFFER_DURATION,
+  MAX_NUM_PREV_BUFFERS,
+  MIN_SILENCE_DURATION_SAMPLES,
+  MIN_SPEECH_DURATION_SAMPLES,
+  SAMPLE_RATE,
+  SPEECH_PAD_SAMPLES,
+  SPEECH_THRESHOLD,
+} from '../constants'
+import { supportsWebGPU } from '../utils'
+import { Duration, MessageStatus, MessageType } from './types'
+
+const device = (await supportsWebGPU()) ? 'webgpu' : 'wasm'
+self.postMessage({ type: MessageType.Info, message: `Using device: "${device}"` } satisfies MessageEventInfo)
+self.postMessage({
+  type: MessageType.Info,
+  message: 'Loading models...',
+  duration: Duration.UntilNext,
+} satisfies MessageEventInfo)
+
+// Load models
+const silero_vad = await AutoModel.from_pretrained(
+  'onnx-community/silero-vad',
+  {
+    config: { model_type: 'custom' },
+    dtype: 'fp32', // Full-precision
+  },
+).catch((error) => {
+  self.postMessage({ error } satisfies MessageEventError)
+  throw error
+})
+
+const DEVICE_DTYPE_CONFIGS = {
+  webgpu: {
+    encoder_model: 'fp32',
+    decoder_model_merged: 'q4',
+  },
+  wasm: {
+    encoder_model: 'fp32',
+    decoder_model_merged: 'q8',
+  },
+}
+
+const transcriber = await pipeline(
+  'automatic-speech-recognition',
+  'onnx-community/moonshine-base-ONNX', // or "onnx-community/whisper-tiny.en"
+  {
+    device,
+    dtype: DEVICE_DTYPE_CONFIGS[device],
+  },
+).catch((error) => {
+  self.postMessage({ error } satisfies MessageEventError)
+  throw error
+})
+
+await transcriber(new Float32Array(SAMPLE_RATE)) // Compile shaders
+self.postMessage({ type: 'status', status: 'ready', message: 'Ready!' })
+
+// Transformers.js currently doesn't support simultaneous inference,
+// so we need to chain the inference promises.
+let inferenceChain = Promise.resolve()
+
+// Global audio buffer to store incoming audio
+const BUFFER = new Float32Array(MAX_BUFFER_DURATION * SAMPLE_RATE)
+let bufferPointer = 0
+
+// Initial state for VAD
+const sr = new Tensor('int64', [SAMPLE_RATE], [])
+let state = new Tensor('float32', new Float32Array(2 * 1 * 128), [2, 1, 128])
+
+// Whether we are in the process of adding audio to the buffer
+let isRecording = false
+
+/**
+ * Perform Voice Activity Detection (VAD)
+ * @param {Float32Array} buffer The new audio buffer
+ * @returns {Promise<boolean>} `true` if the buffer is speech, `false` otherwise.
+ */
+async function vad(buffer: Float32Array) {
+  const input = new Tensor('float32', buffer, [1, buffer.length])
+
+  const { stateN, output } = await (inferenceChain = inferenceChain.then(_ =>
+    silero_vad({ input, sr, state }),
+  ))
+  state = stateN // Update state
+
+  const isSpeech = output.data[0]
+
+  // Use heuristics to determine if the buffer is speech or not
+  return (
+    // Case 1: We are above the threshold (definitely speech)
+    isSpeech > SPEECH_THRESHOLD
+    // Case 2: We are in the process of recording, and the probability is above the negative (exit) threshold
+    || (isRecording && isSpeech >= EXIT_THRESHOLD)
+  )
+}
+
+/**
+ * Transcribe the audio buffer
+ * @param {Float32Array} buffer The audio buffer
+ * @param {object} data Additional data
+ * @param {number} data.start The start time of the speech segment
+ * @param {number} data.end The end time of the speech segment
+ * @param {number} data.duration The duration of the speech segment
+ */
+async function transcribe(buffer: Float32Array, data: { start: number, end: number, duration: number }) {
+  const { text } = await (inferenceChain = inferenceChain.then(_ =>
+    transcriber(buffer),
+  ))
+  self.postMessage({ type: MessageType.Output, buffer, message: text, ...data } satisfies MessageEventOutput)
+}
+
+// Track the number of samples after the last speech chunk
+let postSpeechSamples = 0
+function reset(offset = 0) {
+  self.postMessage({
+    type: MessageType.Status,
+    status: MessageStatus.RecordingEnd,
+    message: 'Transcribing...',
+    duration: Duration.UntilNext,
+  } satisfies MessageEventStatus)
+  BUFFER.fill(0, offset)
+  bufferPointer = offset
+  isRecording = false
+  postSpeechSamples = 0
+}
+
+const prevBuffers: Array<Float32Array> = []
+
+function dispatchForTranscriptionAndResetAudioBuffer(overflow?: Float32Array) {
+  // Get start and end time of the speech segment, minus the padding
+  const now = Date.now()
+  const end
+    = now - ((postSpeechSamples + SPEECH_PAD_SAMPLES) / SAMPLE_RATE) * 1000
+  const start = end - (bufferPointer / SAMPLE_RATE) * 1000
+  const duration = end - start
+  const overflowLength = overflow?.length ?? 0
+
+  // Send the audio buffer to the worker
+  const buffer = BUFFER.slice(0, bufferPointer + SPEECH_PAD_SAMPLES)
+
+  const prevLength = prevBuffers.reduce((acc, b) => acc + b.length, 0)
+  const paddedBuffer = new Float32Array(prevLength + buffer.length)
+  let offset = 0
+  for (const prev of prevBuffers) {
+    paddedBuffer.set(prev, offset)
+    offset += prev.length
+  }
+  paddedBuffer.set(buffer, offset)
+  transcribe(paddedBuffer, { start, end, duration })
+
+  // Set overflow (if present) and reset the rest of the audio buffer
+  if (overflow) {
+    BUFFER.set(overflow, 0)
+  }
+
+  reset(overflowLength)
+}
+
+self.onmessage = async (event) => {
+  const { buffer } = event.data as { buffer: Float32Array }
+
+  const wasRecording = isRecording // Save current state
+  const isSpeech = await vad(buffer)
+
+  if (!wasRecording && !isSpeech) {
+    // We are not recording, and the buffer is not speech,
+    // so we will probably discard the buffer. So, we insert
+    // it into a FIFO queue with a maximum size of MAX_NUM_PREV_BUFFERS
+    if (prevBuffers.length >= MAX_NUM_PREV_BUFFERS) {
+      // If the queue is full, we discard the oldest buffer
+      prevBuffers.shift()
+    }
+
+    prevBuffers.push(buffer)
+    return
+  }
+
+  const remaining = BUFFER.length - bufferPointer
+  if (buffer.length >= remaining) {
+    // The buffer is larger than (or equal to) the remaining space in the global buffer,
+    // so we perform transcription and copy the overflow to the global buffer
+    BUFFER.set(buffer.subarray(0, remaining), bufferPointer)
+    bufferPointer += remaining
+
+    // Dispatch the audio buffer
+    const overflow = buffer.subarray(remaining)
+    dispatchForTranscriptionAndResetAudioBuffer(overflow)
+    return
+  }
+  else {
+    // The buffer is smaller than the remaining space in the global buffer,
+    // so we copy it to the global buffer
+    BUFFER.set(buffer, bufferPointer)
+    bufferPointer += buffer.length
+  }
+
+  if (isSpeech) {
+    if (!isRecording) {
+      // Indicate start of recording
+      self.postMessage({
+        type: MessageType.Status,
+        status: MessageStatus.RecordingStart,
+        message: 'Listening...',
+        duration: Duration.UntilNext,
+      } satisfies MessageEventStatus)
+    }
+    // Start or continue recording
+    isRecording = true
+    postSpeechSamples = 0 // Reset the post-speech samples
+    return
+  }
+
+  postSpeechSamples += buffer.length
+
+  // At this point we're confident that we were recording (wasRecording === true), but the latest buffer is not speech.
+  // So, we check whether we have reached the end of the current audio chunk.
+  if (postSpeechSamples < MIN_SILENCE_DURATION_SAMPLES) {
+    // There was a short pause, but not long enough to consider the end of a speech chunk
+    // (e.g., the speaker took a breath), so we continue recording
+    return
+  }
+
+  if (bufferPointer < MIN_SPEECH_DURATION_SAMPLES) {
+    // The entire buffer (including the new chunk) is smaller than the minimum
+    // duration of a speech chunk, so we can safely discard the buffer.
+    reset()
+    return
+  }
+
+  dispatchForTranscriptionAndResetAudioBuffer()
+}
diff --git a/packages/moonshine-web/src/main.ts b/packages/moonshine-web/src/main.ts
new file mode 100644
index 0000000..af0b83f
--- /dev/null
+++ b/packages/moonshine-web/src/main.ts
@@ -0,0 +1,14 @@
+import Tres from '@tresjs/core'
+import { MotionPlugin } from '@vueuse/motion'
+import { createApp } from 'vue'
+
+import App from './App.vue'
+
+import '@unocss/reset/tailwind.css'
+import './styles/main.css'
+import 'uno.css'
+
+createApp(App)
+  .use(MotionPlugin)
+  .use(Tres)
+  .mount('#app')
diff --git a/packages/moonshine-web/src/styles/main.css b/packages/moonshine-web/src/styles/main.css
new file mode 100644
index 0000000..47e1087
--- /dev/null
+++ b/packages/moonshine-web/src/styles/main.css
@@ -0,0 +1,18 @@
+@import url('https://fonts.googleapis.com/css2?family=Poppins:wght@100;200;300;400;500;600;700;800;900&display=swap');
+
+* {
+  font-family: 'Poppins', sans-serif;
+}
+
+html {
+  overflow: hidden;
+}
+
+html,
+body,
+#app {
+  margin: 0;
+  padding: 0;
+  height: 100%;
+  width: 100%;
+}
diff --git a/packages/moonshine-web/src/utils/index.ts b/packages/moonshine-web/src/utils/index.ts
new file mode 100644
index 0000000..ef9bd2e
--- /dev/null
+++ b/packages/moonshine-web/src/utils/index.ts
@@ -0,0 +1,26 @@
+export function formatDate(timestamp: number) {
+  return new Date(timestamp).toLocaleString('zh', {
+    hour12: false,
+    year: 'numeric',
+    month: 'numeric',
+    day: 'numeric',
+    hour: 'numeric',
+    minute: 'numeric',
+    second: 'numeric',
+    fractionalSecondDigits: 3,
+  })
+}
+
+export async function supportsWebGPU() {
+  try {
+    if (!('gpu' in navigator) || !navigator.gpu)
+      return false
+
+    // requestAdapter() resolves to null (rather than throwing) when no suitable GPU is available
+    const adapter = await navigator.gpu.requestAdapter()
+    return adapter !== null
+  }
+  catch (e) {
+    console.error(e)
+    return false
+  }
+}
diff --git a/packages/moonshine-web/tsconfig.json b/packages/moonshine-web/tsconfig.json
new file mode 100644
index 0000000..e3b932b
--- /dev/null
+++ b/packages/moonshine-web/tsconfig.json
@@ -0,0 +1,38 @@
+{
+  "compilerOptions": {
+    "target": "ESNext",
+    "jsx": "preserve",
+    "lib": [
+      "DOM",
+      "ESNext",
+      "WebWorker"
+    ],
+    "baseUrl": ".",
+    "module": "ESNext",
+    "moduleResolution": "Bundler",
+    "paths": {
+      "~/*": ["src/*"]
+    },
+    "resolveJsonModule": true,
+    "types": [
+      "vitest",
+      "vite/client",
+      // Currently the AudioWorkletProcessor type is missing, so we need to add it manually through @types/audioworklet
+      // https://github.com/microsoft/TypeScript/issues/28308#issuecomment-1512509870
+      "@types/audioworklet",
+      // @webgpu/types
+      // https://www.npmjs.com/package/@webgpu/types
+      "@webgpu/types"
+    ],
+    "allowJs": true,
+    "strict": true,
+    "strictNullChecks": true,
+    "noUnusedLocals": true,
+    "noEmit": true,
+    "esModuleInterop": true,
+    "forceConsistentCasingInFileNames": true,
+    "isolatedModules": true,
+    "skipLibCheck": true
+  },
+  "exclude": ["dist", "node_modules", "cypress"]
+}
diff --git a/packages/moonshine-web/uno.config.ts b/packages/moonshine-web/uno.config.ts
new file mode 100644
index 0000000..0202cc5
--- /dev/null
+++ b/packages/moonshine-web/uno.config.ts
@@ -0,0 +1,17 @@
+import { defineConfig, mergeConfigs, presetWebFonts } from 'unocss'
+import UnoCSSConfig from '../../uno.config'
+
+export default defineConfig(mergeConfigs([
+  UnoCSSConfig,
+  {
+    presets: [
+      presetWebFonts({
+        fonts: {
+          sans: 'DM Sans',
+          serif: 'DM Serif Display',
+          mono: 'DM Mono',
+        },
+      }),
+    ],
+  },
+]))
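One more connective sketch, since App.vue was stripped above: the `worker: { format: 'es' }` setting in vite.config.ts below is what lets both the transcription worker and the VAD worklet load as module scripts. A plausible audio-graph wiring that feeds processor.ts output into the worker might look like this (variable names and the exact module-loading pattern are assumptions):

```ts
import { SAMPLE_RATE } from './constants'

// Hypothetical wiring; the real logic lives in the stripped App.vue
async function startMicrophone(worker: Worker) {
  const stream = await navigator.mediaDevices.getUserMedia({
    audio: { channelCount: 1, sampleRate: SAMPLE_RATE, echoCancellation: true, noiseSuppression: true },
  })

  const audioContext = new AudioContext({ sampleRate: SAMPLE_RATE })
  await audioContext.audioWorklet.addModule(new URL('./libs/processor.ts', import.meta.url))

  const source = audioContext.createMediaStreamSource(stream)
  const vadNode = new AudioWorkletNode(audioContext, 'vad-processor')

  // processor.ts posts { buffer } in ≥512-sample chunks; forward them to the VAD/transcription worker
  vadNode.port.onmessage = (event: MessageEvent<{ buffer: Float32Array }>) => {
    worker.postMessage(event.data)
  }

  source.connect(vadNode)
  // (some browsers may require connecting vadNode to audioContext.destination for the graph to run)
}
```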
diff --git a/packages/moonshine-web/vite.config.ts b/packages/moonshine-web/vite.config.ts
new file mode 100644
index 0000000..abb1ece
--- /dev/null
+++ b/packages/moonshine-web/vite.config.ts
@@ -0,0 +1,17 @@
+import { templateCompilerOptions } from '@tresjs/core'
+import Vue from '@vitejs/plugin-vue'
+import Unocss from 'unocss/vite'
+import { defineConfig } from 'vite'
+
+export default defineConfig({
+  plugins: [
+    Vue({
+      // Other config
+      ...templateCompilerOptions,
+    }),
+    // https://github.com/antfu/unocss
+    // see uno.config.ts for config
+    Unocss(),
+  ],
+  worker: { format: 'es' },
+})
diff --git a/packages/whisper-webgpu/index.html b/packages/whisper-webgpu/index.html
index 666cfef..1ce8f2a 100644
--- a/packages/whisper-webgpu/index.html
+++ b/packages/whisper-webgpu/index.html
@@ -4,11 +4,7 @@
     <title>Whisper Realtime (WebGPU)</title>
-<!-- five removed lines: markup stripped during extraction -->
+<!-- one added line: markup stripped during extraction -->
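Finally, to close the loop on the `netlify.toml` earlier in this patch, a sketch of reproducing the deploy build locally from the monorepo root (assumes corepack-managed pnpm; `packages:stub` is the repo-level script the config references):

```sh
corepack enable
pnpm install
pnpm run packages:stub && pnpm run build
ls packages/moonshine-web/dist   # the directory Netlify publishes
```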