feat: to composable
nekomeowww committed Dec 11, 2024
1 parent e4a0cc7 commit b3d8bf9
Showing 7 changed files with 113 additions and 57 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -20,3 +20,4 @@ pnpm dev

- [pixiv/ChatVRM](https://github.com/pixiv/ChatVRM)
- [josephrocca/ChatVRM-js: A JS conversion/adaptation of parts of the ChatVRM (TypeScript) code for standalone use in OpenCharacters and elsewhere](https://github.com/josephrocca/ChatVRM-js)
- The UI design and style were inspired by [Cookard](https://store.steampowered.com/app/2919650/Cookard/), [UNBEATABLE](https://store.steampowered.com/app/2240620/UNBEATABLE/), and [Sensei! I like you so much!](https://store.steampowered.com/app/2957700/_/), as well as [Ayame by Mercedes Bazan](https://dribbble.com/shots/22157656-Ayame) and [Wish by Mercedes Bazan](https://dribbble.com/shots/24501019-Wish)
2 changes: 2 additions & 0 deletions cspell.config.yaml
@@ -23,6 +23,7 @@ words:
- cubismmotionqueuemanager
- cubismusermodel
- cubismviewmatrix
- defu
- demi
- elevenlabs
- gltf
@@ -43,6 +44,7 @@ words:
- nuxt
- nuxtjs
- ofetch
- onnx
- onnxruntime
- openai
- pinia
2 changes: 2 additions & 0 deletions packages/stage/package.json
@@ -42,9 +42,11 @@
"@unocss/reset": "^0.65.1",
"@vueuse/core": "^12.0.0",
"@vueuse/head": "^2.0.0",
"@vueuse/shared": "^12.0.0",
"@xsai/model": "^0.0.19",
"@xsai/shared-chat-completion": "^0.0.19",
"@xsai/stream-text": "^0.0.19",
"defu": "^6.1.4",
"nprogress": "^0.2.0",
"ofetch": "^1.4.1",
"onnxruntime-web": "^1.20.1",
2 changes: 2 additions & 0 deletions packages/stage/src/auto-imports.d.ts
@@ -206,6 +206,7 @@ declare global {
const useMemoize: typeof import('@vueuse/core')['useMemoize']
const useMemory: typeof import('@vueuse/core')['useMemory']
const useMessageContentQueue: typeof import('./composables/queues')['useMessageContentQueue']
const useMicVAD: typeof import('./composables/micvad')['useMicVAD']
const useModel: typeof import('vue')['useModel']
const useMounted: typeof import('@vueuse/core')['useMounted']
const useMouse: typeof import('@vueuse/core')['useMouse']
@@ -514,6 +515,7 @@ declare module 'vue' {
readonly useMemoize: UnwrapRef<typeof import('@vueuse/core')['useMemoize']>
readonly useMemory: UnwrapRef<typeof import('@vueuse/core')['useMemory']>
readonly useMessageContentQueue: UnwrapRef<typeof import('./composables/queues')['useMessageContentQueue']>
readonly useMicVAD: UnwrapRef<typeof import('./composables/micvad')['useMicVAD']>
readonly useModel: UnwrapRef<typeof import('vue')['useModel']>
readonly useMounted: UnwrapRef<typeof import('@vueuse/core')['useMounted']>
readonly useMouse: UnwrapRef<typeof import('@vueuse/core')['useMouse']>
83 changes: 26 additions & 57 deletions packages/stage/src/components/MainStage.vue
@@ -1,14 +1,14 @@
<script setup lang="ts">
import type { AssistantMessage, Message, SystemMessage } from '@xsai/shared-chat-completion'
import type { Emotion } from '../constants/emotions'
import { MicVAD } from '@ricky0123/vad-web'
import { useLocalStorage } from '@vueuse/core'
import { storeToRefs } from 'pinia'
import { computed, onMounted, ref, watch } from 'vue'
import Avatar from '../assets/live2d/models/hiyori_free_zh/avatar.png'
import { useMarkdown } from '../composables/markdown'
import { useMicVAD } from '../composables/micvad'
import { useQueue } from '../composables/queue'
import { useDelayMessageQueue, useEmotionsMessageQueue, useMessageContentQueue } from '../composables/queues'
import { llmInferenceEndToken } from '../constants'
@@ -52,9 +52,9 @@ const audioAnalyser = ref<AnalyserNode>()
const mouthOpenSize = ref(0)
const nowSpeaking = ref(false)
const lipSyncStarted = ref(false)
const micVad = ref<MicVAD>()
const { audioInputs } = useDevicesList({ constraints: { audio: true }, requestPermissions: true })
const selectedAudioDevice = ref<MediaDeviceInfo>()
const selectedAudioDeviceId = computed(() => selectedAudioDevice.value?.deviceId)
const nowSpeakingAvatarBorderOpacity = computed<number>(() => {
if (!nowSpeaking.value)
@@ -64,6 +64,30 @@ const nowSpeakingAvatarBorderOpacity = computed<number>(() => {
+ (nowSpeakingAvatarBorderOpacityMax - nowSpeakingAvatarBorderOpacityMin) * mouthOpenSize.value) / 100)
})
useMicVAD(selectedAudioDeviceId, {
  onSpeechStart: () => {
    // TODO: interrupt the playback
    // TODO: interrupt any of the ongoing TTS
    // TODO: interrupt any of the ongoing LLM requests
    // TODO: interrupt any of the ongoing animation of Live2D or VRM
    // TODO: once interrupted, we should somehow switch to a listening or thinking
    //       emotion / expression?
    listening.value = true
  },
  // A VAD misfire means a speech end was detected, but the audio segment
  // contains too few frames to count as speech (controlled by the
  // `minSpeechFrames` parameter)
  onVADMisfire: () => {
    // TODO: send the audio buffer to Whisper
    listening.value = false
  },
  onSpeechEnd: () => {
    // TODO: send the audio buffer to Whisper
    listening.value = false
  },
})
function handleModelChange(event: Event) {
const target = event.target as HTMLSelectElement
const found = supportedModels.value.find(m => m.id === target.value)
@@ -75,55 +75,6 @@ function handleModelChange(event: Event) {
openAIModel.value = found
}
async function handleMicVADActivation(deviceId: string) {
if (micVad.value) {
micVad.value.destroy()
micVad.value = undefined
console.warn('existing MicVAD destroyed')
}
const media = await navigator.mediaDevices.getUserMedia({ audio: { deviceId } })
// Use of MicVAD is inspired by Open-LLM-VTuber
// Source code reference: https://github.com/t41372/Open-LLM-VTuber/blob/92cbf4349b84a68b0035bc825bc3d1d61fd0f063/static/index.html#L119
micVad.value = await MicVAD.new({
stream: media,
model: 'v5',
positiveSpeechThreshold: 0.2, // default is 0.5
negativeSpeechThreshold: 0.08, // default is 0.5 - 0.15
minSpeechFrames: 60, // default is 9
onSpeechStart: () => {
// TODO: interrupt the playback
// TODO: interrupt any of the ongoing TTS
// TODO: interrupt any of the ongoing LLM requests
// TODO: interrupt any of the ongoing animation of Live2D or VRM
// TODO: once interrupted, we should somehow switch to listen or thinking
// emotion / expression?
listening.value = true
},
// A VAD misfire means a speech end was detected, but the audio segment
// contains too few frames to count as speech (controlled by the
// `minSpeechFrames` parameter)
onVADMisfire: () => {
// TODO: send the audio buffer to Whisper
listening.value = false
},
onSpeechEnd: () => {
// TODO: send the audio buffer to Whisper
listening.value = false
},
// WORKAROUND (temporary): @ricky0123/vad-web pins a hardcoded onnxruntime-web
// version whose ort-wasm-simd-threaded.mjs file and WASM binary no longer exist,
// so we force onnxruntime-web to load the latest build from jsDelivr, which
// serves the correct ort-wasm-simd-threaded.wasm binary
})
micVad.value.start()
}
async function handleAudioInputChange(event: Event) {
const target = event.target as HTMLSelectElement
const found = audioInputs.value.find(d => d.deviceId === target.value)
@@ -133,7 +108,6 @@ async function handleAudioInputChange(event: Event) {
}
selectedAudioDevice.value = found
await handleMicVADActivation(found.deviceId)
}
const audioQueue = useQueue<{ audioBuffer: AudioBuffer, text: string }>({
@@ -315,11 +289,6 @@ watch([openAiApiBaseURL, openAiApiKey], async ([baseUrl, apiKey]) => {
supportedModels.value = await models(baseUrl, apiKey)
})
onUnmounted(() => {
if (micVad.value)
micVad.value.destroy()
})
onMounted(async () => {
if (!openAiApiBaseURL.value || !openAiApiKey.value)
return
74 changes: 74 additions & 0 deletions packages/stage/src/composables/micvad.ts
@@ -0,0 +1,74 @@
import type { RealTimeVADOptions } from '@ricky0123/vad-web'
import { getDefaultRealTimeVADOptions, MicVAD } from '@ricky0123/vad-web'
import { usePermission } from '@vueuse/core'
import { tryOnMounted } from '@vueuse/shared'
import { defu } from 'defu'

export function useMicVAD(deviceId: MaybeRef<ConstrainDOMString | undefined>, options?: Partial<RealTimeVADOptions> & { auto?: boolean }) {
  const opts = defu<Partial<RealTimeVADOptions> & { auto?: boolean }, Array<Omit<RealTimeVADOptions, 'stream'> & { auto?: boolean }>>(options ?? {}, {
    ...getDefaultRealTimeVADOptions('v5'),
    positiveSpeechThreshold: 0.2, // default is 0.5
    negativeSpeechThreshold: 0.08, // default is 0.5 - 0.15
    minSpeechFrames: 60, // default is 9
    // WORKAROUND (temporary): @ricky0123/vad-web pins a hardcoded onnxruntime-web
    // version whose ort-wasm-simd-threaded.mjs file and WASM binary no longer exist,
    // so we force onnxruntime-web to load the latest build from jsDelivr, which
    // serves the correct ort-wasm-simd-threaded.wasm binary
    onnxWASMBasePath: 'https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/',
    auto: true,
  })

  const micVad = ref<MicVAD>()
  const microphoneAccess = usePermission('microphone')

  async function update() {
    if (micVad.value) {
      micVad.value.destroy()
      micVad.value = undefined
      console.warn('existing MicVAD destroyed')
    }
    if (!microphoneAccess.value)
      return

    const id = unref(deviceId)
    if (!id)
      return

    const media = await navigator.mediaDevices.getUserMedia({ audio: { deviceId: id } })

    // Use of MicVAD is inspired by Open-LLM-VTuber
    // Source code reference: https://github.com/t41372/Open-LLM-VTuber/blob/92cbf4349b84a68b0035bc825bc3d1d61fd0f063/static/index.html#L119
    micVad.value = await MicVAD.new({
      ...opts,
      stream: media,
    })

    if (opts.auto)
      micVad.value.start()
  }

  watch(microphoneAccess, update, { immediate: true })
  watch(toRef(deviceId), update, { immediate: true })
  tryOnMounted(update)
  onUnmounted(() => {
    if (micVad.value) {
      micVad.value.destroy()
      micVad.value = undefined
    }
  })

  return {
    destroy: () => {
      if (micVad.value) {
        micVad.value.destroy()
        micVad.value = undefined
      }
    },
    start: () => {
      if (micVad.value) {
        micVad.value.start()
      }
    },
  }
}
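
For reference, a minimal sketch of how the new composable is meant to be consumed, mirroring the MainStage.vue change above. The component and ref names here are illustrative, not part of this commit; option merging follows defu semantics, so caller-supplied options take precedence over the defaults shown in the composable.

```ts
// Hypothetical consumer component (e.g. inside a <script setup> block).
import { ref } from 'vue'
import { useMicVAD } from './composables/micvad'

// Device id fed into the composable; it re-creates the underlying MicVAD
// whenever this ref or the microphone permission changes.
const selectedAudioDeviceId = ref<string | undefined>(undefined)

const vad = useMicVAD(selectedAudioDeviceId, {
  auto: true, // start listening as soon as permission and a device are available
  onSpeechStart: () => console.log('speech started'),
  onSpeechEnd: () => console.log('speech ended'),
})

// Manual control is still available via the returned handles:
// vad.start()
// vad.destroy()
```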
6 changes: 6 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default.
