Skip to content

Commit

Permalink
feat: MicVAD integrated
Browse files Browse the repository at this point in the history
  • Loading branch information
nekomeowww committed Dec 10, 2024
1 parent 50190a3 commit e4a0cc7
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 1 deletion.
86 changes: 86 additions & 0 deletions packages/stage/src/components/MainStage.vue
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
<script setup lang="ts">
import type { AssistantMessage, Message, SystemMessage } from '@xsai/shared-chat-completion'
import type { Emotion } from '../constants/emotions'
import { MicVAD } from '@ricky0123/vad-web'
import { useLocalStorage } from '@vueuse/core'
import { storeToRefs } from 'pinia'
Expand Down Expand Up @@ -51,6 +52,9 @@ const audioAnalyser = ref<AnalyserNode>()
// Numeric mouth-open value, initialised to 0.
const mouthOpenSize = ref(0)
// Flag read by `nowSpeakingAvatarBorderOpacity` to decide the avatar border opacity.
const nowSpeaking = ref(false)
// Boolean flag, initially false. NOTE(review): presumably marks that lip sync has begun — confirm at its write site.
const lipSyncStarted = ref(false)
// The active MicVAD (voice activity detection) instance, if any; created and
// replaced by `handleMicVADActivation`, destroyed on unmount.
const micVad = ref<MicVAD>()
// List of available audio input devices; permission is requested up front
// (`requestPermissions: true`) with an audio-only constraint.
const { audioInputs } = useDevicesList({ constraints: { audio: true }, requestPermissions: true })
// The device picked in the audio input <select>; undefined until the user
// chooses one (or when the chosen id is not found in `audioInputs`).
const selectedAudioDevice = ref<MediaDeviceInfo>()
const nowSpeakingAvatarBorderOpacity = computed<number>(() => {
if (!nowSpeaking.value)
Expand All @@ -71,6 +75,67 @@ function handleModelChange(event: Event) {
openAIModel.value = found
}
/**
 * (Re)starts voice activity detection on the given audio input device.
 *
 * Any MicVAD instance currently held in `micVad` is destroyed first. A
 * microphone stream is then requested for `deviceId`, handed to a new MicVAD
 * instance, and the instance is started immediately. The `listening` flag is
 * toggled from the speech start / speech end / misfire callbacks.
 *
 * @param deviceId - `deviceId` of the audio input to capture from.
 * @throws Rethrows any error raised by `getUserMedia` or `MicVAD.new`. When
 *   `MicVAD.new` fails, the freshly acquired stream's tracks are stopped
 *   before rethrowing so the microphone is not left captured.
 */
async function handleMicVADActivation(deviceId: string) {
  if (micVad.value) {
    // NOTE(review): the stream passed to the previous instance was created
    // by us; verify whether `destroy()` also stops caller-provided streams,
    // otherwise the old microphone tracks stay live after switching devices.
    micVad.value.destroy()
    micVad.value = undefined
    console.warn('existing MicVAD destroyed')
  }

  const media = await navigator.mediaDevices.getUserMedia({ audio: { deviceId } })

  let vad: MicVAD
  try {
    // Use of MicVAD is inspired by Open-LLM-VTuber
    // Source code reference: https://github.com/t41372/Open-LLM-VTuber/blob/92cbf4349b84a68b0035bc825bc3d1d61fd0f063/static/index.html#L119
    vad = await MicVAD.new({
      stream: media,
      model: 'v5',
      positiveSpeechThreshold: 0.2, // default is 0.5
      negativeSpeechThreshold: 0.08, // default is 0.5 - 0.15
      minSpeechFrames: 60, // default is 9
      onSpeechStart: () => {
        // TODO: interrupt the playback
        // TODO: interrupt any ongoing TTS
        // TODO: interrupt any ongoing LLM requests
        // TODO: interrupt any ongoing Live2D or VRM animation
        // TODO: once interrupted, we should somehow switch to a listening or
        // thinking emotion / expression?
        listening.value = true
      },
      // A VAD misfire means the end of speech was detected, but the audio
      // segment contains too few frames to count as a speech segment; the
      // required frame count is controlled by the `minSpeechFrames` option.
      onVADMisfire: () => {
        // TODO: do audio buffer send to whisper
        listening.value = false
      },
      onSpeechEnd: () => {
        // TODO: do audio buffer send to whisper
        listening.value = false
      },
      // WORKAROUND: @ricky0123/vad-web pins a hardcoded onnxruntime-web
      // version whose ort-wasm-simd-threaded.mjs file and WASM binary no
      // longer exist, so point onnxruntime-web at the latest release on
      // jsdelivr to fetch the correct ort-wasm-simd-threaded.wasm binary.
      onnxWASMBasePath: 'https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/',
    })
  }
  catch (err) {
    // Nothing owns the stream if MicVAD could not be constructed (e.g. the
    // model or WASM download failed); release the microphone before failing.
    for (const track of media.getTracks())
      track.stop()
    throw err
  }

  micVad.value = vad
  micVad.value.start()
}
/**
 * `change` handler for the audio input <select>.
 *
 * Looks up the device whose `deviceId` equals the select's current value in
 * `audioInputs`, stores the result (or `undefined` when there is no match) in
 * `selectedAudioDevice`, and, when a device was found, activates MicVAD on it.
 *
 * @param event - The DOM change event; its target is treated as the <select>.
 */
async function handleAudioInputChange(event: Event) {
  const select = event.target as HTMLSelectElement
  const device = audioInputs.value.find(input => input.deviceId === select.value)

  // Record the lookup result either way: a miss clears the selection.
  selectedAudioDevice.value = device

  if (device)
    await handleMicVADActivation(device.deviceId)
}
const audioQueue = useQueue<{ audioBuffer: AudioBuffer, text: string }>({
handlers: [
(ctx) => {
Expand Down Expand Up @@ -250,6 +315,11 @@ watch([openAiApiBaseURL, openAiApiKey], async ([baseUrl, apiKey]) => {
supportedModels.value = await models(baseUrl, apiKey)
})
// Tear down the active MicVAD instance, if one exists, when the component unmounts.
onUnmounted(() => {
  micVad.value?.destroy()
})
onMounted(async () => {
if (!openAiApiBaseURL.value || !openAiApiKey.value)
return
Expand Down Expand Up @@ -354,6 +424,22 @@ onUnmounted(() => {
</div>
<div flex="~ row" my="2" space-x="2" w-full self-end>
<div flex="~ col" w-full space-y="2">
<!--
  Audio input picker. Picking a device fires `handleAudioInputChange`.
  The placeholder is explicitly `selected`: without it the browser
  pre-displays the first enabled option (the first device), and choosing that
  device would never fire a `change` event.
-->
<select
  p="2"
  bg="zinc-100 dark:zinc-700" w-full rounded-lg
  outline-none
  @change="handleAudioInputChange"
>
  <option disabled selected value="">
    Select an Audio Input
  </option>
  <!-- Fallback entry: only shown when the chosen device is no longer listed, so it is never rendered twice. -->
  <option
    v-if="selectedAudioDevice && !audioInputs.some(d => d.deviceId === selectedAudioDevice?.deviceId)"
    :value="selectedAudioDevice.deviceId"
  >
    {{ selectedAudioDevice.label }}
  </option>
  <option v-for="m in audioInputs" :key="m.deviceId" :value="m.deviceId">
    {{ m.label }}
  </option>
</select>
<select
p="2"
bg="zinc-100 dark:zinc-700" w-full rounded-lg
Expand Down
2 changes: 1 addition & 1 deletion packages/stt-realtime-webgpu/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"license": "MIT",
"scripts": {
"build": "vite build",
"dev": "vite",
"dev": "vite --port 5174",
"lint": "eslint .",
"preview": "vite preview",
"typecheck": "vue-tsc --noEmit"
Expand Down

0 comments on commit e4a0cc7

Please sign in to comment.