feat: MicVAD integrated

moeru-ai · Dec 10, 2024 · e4a0cc7 · e4a0cc7
1 parent 50190a3
commit e4a0cc7
Show file tree

Hide file tree

Showing 2 changed files with 87 additions and 1 deletion.
diff --git a/packages/stage/src/components/MainStage.vue b/packages/stage/src/components/MainStage.vue
@@ -1,6 +1,7 @@
 <script setup lang="ts">
 import type { AssistantMessage, Message, SystemMessage } from '@xsai/shared-chat-completion'
 import type { Emotion } from '../constants/emotions'
+import { MicVAD } from '@ricky0123/vad-web'
 import { useLocalStorage } from '@vueuse/core'
 import { storeToRefs } from 'pinia'
 
@@ -51,6 +52,9 @@ const audioAnalyser = ref<AnalyserNode>()
 const mouthOpenSize = ref(0)
 const nowSpeaking = ref(false)
 const lipSyncStarted = ref(false)
+const micVad = ref<MicVAD>() // MicVAD instance created by handleMicVADActivation; undefined until a device has been activated
+const { audioInputs } = useDevicesList({ constraints: { audio: true }, requestPermissions: true }) // audio input devices offered in the audio input <select>
+const selectedAudioDevice = ref<MediaDeviceInfo>() // device chosen in the audio input <select>; written by handleAudioInputChange
 
 const nowSpeakingAvatarBorderOpacity = computed<number>(() => {
   if (!nowSpeaking.value)
@@ -71,6 +75,67 @@ function handleModelChange(event: Event) {
   openAIModel.value = found
 }
 
+// Microphone stream feeding the currently active MicVAD. The stream is opened
+// here and handed to MicVAD, so its tracks are stopped here explicitly rather
+// than relying on MicVAD.destroy() to do it (stopping an ended track is a no-op).
+let activeMicStream: MediaStream | undefined
+
+/**
+ * (Re)starts voice activity detection on the given audio input device.
+ *
+ * Destroys any existing MicVAD, releases the microphone stream it was using,
+ * opens a new stream for `deviceId`, creates a MicVAD on it and starts it.
+ *
+ * @param deviceId - `MediaDeviceInfo.deviceId` of the audio input to listen on.
+ * @throws Rejects with whatever `getUserMedia` or `MicVAD.new` rejects with;
+ *         in the latter case the freshly opened stream is released first.
+ */
+async function handleMicVADActivation(deviceId: string) {
+  if (micVad.value) {
+    micVad.value.destroy()
+    micVad.value = undefined
+    console.warn('existing MicVAD destroyed')
+  }
+
+  // Release the previous microphone capture so the old device is not kept open
+  // after switching inputs.
+  activeMicStream?.getTracks().forEach(track => track.stop())
+  activeMicStream = undefined
+
+  const media = await navigator.mediaDevices.getUserMedia({ audio: { deviceId } })
+
+  try {
+    // Use of MicVAD is inspired by Open-LLM-VTuber
+    // Source code reference: https://github.com/t41372/Open-LLM-VTuber/blob/92cbf4349b84a68b0035bc825bc3d1d61fd0f063/static/index.html#L119
+    micVad.value = await MicVAD.new({
+      stream: media,
+      model: 'v5',
+      positiveSpeechThreshold: 0.2, // default is 0.5
+      negativeSpeechThreshold: 0.08, // default is 0.5 - 0.15
+      minSpeechFrames: 60, // default is 9
+      onSpeechStart: () => {
+        // TODO: interrupt the playback
+        // TODO: interrupt any of the ongoing TTS
+        // TODO: interrupt any of the ongoing LLM requests
+        // TODO: interrupt any of the ongoing animation of Live2D or VRM
+        // TODO: once interrupted, we should somehow switch to listen or thinking
+        //       emotion / expression?
+        listening.value = true
+      },
+      // A VAD misfire means the end of speech was detected, but the segment
+      // contains too few frames to count as a speech segment. The required
+      // number of frames is controlled by the `minSpeechFrames` parameter.
+      onVADMisfire: () => {
+        // TODO: do audio buffer send to whisper
+        listening.value = false
+      },
+      onSpeechEnd: () => {
+        // TODO: do audio buffer send to whisper
+        listening.value = false
+      },
+      // WORKAROUND: temporary workaround for onnxruntime-web. @ricky0123/vad-web
+      // uses a hardcoded version of onnxruntime-web to fetch an
+      // ort-wasm-simd-threaded.mjs file (and its WASM binary) that no longer
+      // exists, so we force onnxruntime-web to load the latest published version
+      // from jsdelivr to get the correct ort-wasm-simd-threaded.wasm binary.
+      onnxWASMBasePath: 'https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/',
+    })
+  }
+  catch (err) {
+    // MicVAD could not be created: do not leave the microphone captured.
+    media.getTracks().forEach(track => track.stop())
+    throw err
+  }
+
+  activeMicStream = media
+  micVad.value.start()
+}
+
+/**
+ * `change` handler of the audio input `<select>`.
+ *
+ * Looks up the chosen device in `audioInputs` by the select's value. When no
+ * device matches, the selection is cleared; otherwise the device is recorded
+ * in `selectedAudioDevice` and MicVAD is (re)started on it.
+ *
+ * @param event - DOM change event whose target is the `<select>` element.
+ */
+async function handleAudioInputChange(event: Event) {
+  const target = event.target as HTMLSelectElement
+  const found = audioInputs.value.find(d => d.deviceId === target.value)
+  if (!found) {
+    selectedAudioDevice.value = undefined
+    return
+  }
+
+  selectedAudioDevice.value = found
+  try {
+    await handleMicVADActivation(found.deviceId)
+  }
+  catch (err) {
+    // Activation can fail (e.g. the microphone could not be opened). Without
+    // this handler the rejection would go unhandled and the state would keep
+    // claiming a device that is not actually being listened to.
+    selectedAudioDevice.value = undefined
+    console.error('failed to activate MicVAD for the selected audio input', err)
+  }
+}
+
 const audioQueue = useQueue<{ audioBuffer: AudioBuffer, text: string }>({
   handlers: [
     (ctx) => {
@@ -250,6 +315,11 @@ watch([openAiApiBaseURL, openAiApiKey], async ([baseUrl, apiKey]) => {
   supportedModels.value = await models(baseUrl, apiKey)
 })
 
+// Tear down the voice activity detector, if one was ever created, when the component unmounts.
+onUnmounted(() => {
+  micVad.value?.destroy()
+})
+
 onMounted(async () => {
   if (!openAiApiBaseURL.value || !openAiApiKey.value)
     return
@@ -354,6 +424,22 @@ onUnmounted(() => {
     </div>
     <div flex="~ row" my="2" space-x="2" w-full self-end>
       <div flex="~ col" w-full space-y="2">
+        <!-- Microphone picker: choosing a device (re)starts MicVAD via handleAudioInputChange -->
+        <select
+          p="2"
+          bg="zinc-100 dark:zinc-700" w-full rounded-lg
+          outline-none
+          @change="handleAudioInputChange"
+        >
+          <!--
+            The placeholder is explicitly selected: otherwise the browser shows the first
+            real device as chosen without firing `change`, so MicVAD would never start for it.
+          -->
+          <option disabled selected value="">
+            Select an Audio Input
+          </option>
+          <!-- Only shown when the chosen device is missing from the list, to avoid a duplicate entry -->
+          <option
+            v-if="selectedAudioDevice && !audioInputs.some(d => d.deviceId === selectedAudioDevice?.deviceId)"
+            :value="selectedAudioDevice.deviceId"
+          >
+            {{ selectedAudioDevice.label }}
+          </option>
+          <option v-for="m in audioInputs" :key="m.deviceId" :value="m.deviceId">
+            {{ m.label }}
+          </option>
+        </select>
         <select
           p="2"
           bg="zinc-100 dark:zinc-700" w-full rounded-lg

diff --git a/packages/stt-realtime-webgpu/package.json b/packages/stt-realtime-webgpu/package.json
@@ -12,7 +12,7 @@
   "license": "MIT",
   "scripts": {
     "build": "vite build",
-    "dev": "vite",
+    "dev": "vite  --port 5174",
     "lint": "eslint .",
     "preview": "vite preview",
     "typecheck": "vue-tsc --noEmit"