Skip to content

Commit

Permalink
feat: use vad audio result to whisper to transcribe
Browse files Browse the repository at this point in the history
  • Loading branch information
nekomeowww committed Dec 11, 2024
1 parent 19487ac commit 01dbaeb
Show file tree
Hide file tree
Showing 10 changed files with 388 additions and 7 deletions.
1 change: 1 addition & 0 deletions cspell.config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ words:
- live2dcubismcore
- live2dcubismframework
- Llmmarker
- micvad
- Myriam
- Neko
- nekomeowww
Expand Down
1 change: 1 addition & 0 deletions packages/stage/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
},
"dependencies": {
"@11labs/client": "^0.0.4",
"@huggingface/transformers": "^3.1.2",
"@pixi/app": "^6.5.10",
"@pixi/constants": "6",
"@pixi/core": "6",
Expand Down
21 changes: 17 additions & 4 deletions packages/stage/src/components/MainStage.vue
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,25 @@
import type { AssistantMessage, Message, SystemMessage } from '@xsai/shared-chat-completion'
import type { Emotion } from '../constants/emotions'
import { useLocalStorage } from '@vueuse/core'
import { storeToRefs } from 'pinia'
import { storeToRefs } from 'pinia'
import { computed, onMounted, ref, watch } from 'vue'
import Avatar from '../assets/live2d/models/hiyori_free_zh/avatar.png'
import { useWhisper } from '~/composables/whisper'
import Avatar from '../assets/live2d/models/hiyori_free_zh/avatar.png'
import { useMarkdown } from '../composables/markdown'
import { useMicVAD } from '../composables/micvad'
import { useQueue } from '../composables/queue'
import { useDelayMessageQueue, useEmotionsMessageQueue, useMessageContentQueue } from '../composables/queues'
import { llmInferenceEndToken } from '../constants'
import { EMOTION_EmotionMotionName_value, EMOTION_VRMExpressionName_value, EmotionThinkMotionName } from '../constants/emotions'
import SystemPromptV2 from '../constants/prompts/system-v2'
import WhisperWorker from '../libs/workers/worker?worker&url'
import { useLLM } from '../stores/llm'
import { useSettings } from '../stores/settings'
import { encodeWAVToBase64 } from '../utils/binary'
import { asyncIteratorFromReadableStream } from '../utils/iterator'
import BasicTextarea from './BasicTextarea.vue'
import Live2DViewer from './Live2DViewer.vue'
import Settings from './Settings.vue'
Expand All @@ -38,6 +41,7 @@ const { streamSpeech, stream, models } = useLLM()
const { audioContext, calculateVolume } = useAudioContext()
const { process } = useMarkdown()
const { audioInputs } = useDevicesList({ constraints: { audio: true }, requestPermissions: true })
const { transcribe: generate } = useWhisper(WhisperWorker)
const listening = ref(false)
const live2DViewerRef = ref<{ setMotion: (motionName: string) => Promise<void> }>()
Expand All @@ -61,6 +65,14 @@ const nowSpeakingAvatarBorderOpacity = computed<number>(() => {
+ (nowSpeakingAvatarBorderOpacityMax - nowSpeakingAvatarBorderOpacityMin) * mouthOpenSize.value) / 100)
})
/**
 * Encodes a speech buffer as a base64 WAV string and hands it to the Whisper
 * worker (via `generate`, i.e. `useWhisper().transcribe`) as a `'generate'`
 * request with the language fixed to `'en'`.
 *
 * @param buffer - Raw audio samples to transcribe.
 */
async function handleTranscription(buffer: Float32Array) {
  await audioContext.resume()

  // NOTE(review): the WAV is encoded using the audio context's sample rate;
  // confirm this matches the sample rate of the buffers the VAD delivers.
  const wavBase64 = await encodeWAVToBase64(buffer, audioContext.sampleRate)
  const payload = { audio: wavBase64, language: 'en' }

  generate({ type: 'generate', data: payload })
}
useMicVAD(selectedAudioDeviceId, {
onSpeechStart: () => {
// TODO: interrupt the playback
Expand All @@ -79,9 +91,10 @@ useMicVAD(selectedAudioDeviceId, {
// TODO: do audio buffer send to whisper
listening.value = false
},
onSpeechEnd: () => {
onSpeechEnd: (buffer) => {
// TODO: do audio buffer send to whisper
listening.value = false
handleTranscription(buffer)
},
})
Expand Down
2 changes: 1 addition & 1 deletion packages/stage/src/composables/micvad.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ export function useMicVAD(deviceId: MaybeRef<ConstrainDOMString | undefined>, op
...getDefaultRealTimeVADOptions('v5'),
positiveSpeechThreshold: 0.2, // default is 0.5
negativeSpeechThreshold: 0.08, // default is 0.5 - 0.15
minSpeechFrames: 60, // default is 9
minSpeechFrames: 5, // default is 9
// WORKAROUND: temporary workaround for onnxruntime-web, since @ricky0123/vad-web
// uses hardcoded version of onnxruntime-web@1.14.0 to fetch the already non-existing
// ort-wasm-simd-threaded.mjs file and its WASM binary, we are going to force
Expand Down
71 changes: 71 additions & 0 deletions packages/stage/src/composables/whisper.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import type { MessageEvents, MessageGenerate, ProgressMessageEvents } from '../libs/workers/types'

/**
 * Composable that drives a Whisper web worker.
 *
 * Spawns a module worker from `url` through `useWebWorker`, mirrors the
 * messages it sends back into reactive state, and terminates the worker when
 * the owning component is unmounted.
 *
 * @param url - URL of the worker script.
 * @returns `transcribe` (posts a generate request to the worker) together with
 * the reactive `status`, `loadingMessage`, `loadingProgress`, `transcribing`,
 * `tps` and `result` refs.
 */
export function useWhisper(url: string) {
  const { post, data: incoming, terminate } = useWebWorker<MessageEvents>(url, { type: 'module' })

  const status = ref<'loading' | 'ready' | null>(null)
  const loadingMessage = ref('')
  const loadingProgress = ref<ProgressMessageEvents[]>([])
  const transcribing = ref(false)
  const tps = ref<number>(0)
  const result = ref('')

  // Applies a single worker message to the reactive state above.
  // Statuses without a branch (e.g. 'download') are intentionally left untouched.
  function applyWorkerMessage(message: MessageEvents) {
    if (message.status === 'loading') {
      status.value = 'loading'
      loadingMessage.value = message.data
    }
    else if (message.status === 'initiate') {
      // Start tracking a new file
      loadingProgress.value.push(message)
    }
    else if (message.status === 'progress') {
      // Merge the update into the tracked entry for the same file
      const update = message
      loadingProgress.value = loadingProgress.value.map(entry =>
        entry.file === update.file ? { ...entry, ...update } : entry,
      )
    }
    else if (message.status === 'done') {
      // Stop tracking the finished file
      const finished = message
      loadingProgress.value = loadingProgress.value.filter(entry => entry.file !== finished.file)
    }
    else if (message.status === 'ready') {
      status.value = 'ready'
    }
    else if (message.status === 'start') {
      transcribing.value = true
    }
    else if (message.status === 'update') {
      tps.value = message.tps
    }
    else if (message.status === 'complete') {
      transcribing.value = false
      // Only the first output entry is kept; fall back to an empty string
      result.value = message.output[0] || ''
      // eslint-disable-next-line no-console
      console.debug('Whisper result:', result.value)
    }
  }

  watch(incoming, message => applyWorkerMessage(message))

  onUnmounted(() => {
    terminate()
  })

  function transcribe(message: MessageGenerate) {
    return post(message)
  }

  return {
    transcribe,
    status,
    loadingMessage,
    loadingProgress,
    transcribing,
    tps,
    result,
  }
}
74 changes: 74 additions & 0 deletions packages/stage/src/libs/workers/types.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/**
 * Worker message with status `'loading'`.
 *
 * `useWhisper` reacts to it by setting its status to `'loading'` and storing
 * `data` as the loading message.
 */
export interface EventLoading {
status: 'loading'
// Text payload accompanying the loading status
data: string
}

/**
 * Worker message with status `'initiate'` for a named file.
 *
 * `useWhisper` appends it to its `loadingProgress` list, where later
 * `'progress'` / `'done'` messages are matched against it by `file`.
 */
export interface EventInitiate {
status: 'initiate'
// NOTE(review): presumably the model/repository name — confirm against the worker
name: string
// File identifier; used as the matching key for progress tracking
file: string
// Not used
progress?: number
loaded?: number
total?: number
}

/**
 * Worker message with status `'download'` for a named file.
 *
 * It is a member of `MessageEvents` but not of `ProgressMessageEvents`.
 */
export interface EventDownload {
status: 'download'
// NOTE(review): presumably the model/repository name — confirm against the worker
name: string
// File identifier
file: string
// Not used
progress?: number
loaded?: number
total?: number
}

/**
 * Worker message with status `'progress'` for a named file.
 *
 * Unlike the other file events, `progress`, `loaded` and `total` are required.
 * `useWhisper` merges it into the tracked entry that has the same `file`.
 */
export interface EventProgress {
status: 'progress'
// NOTE(review): presumably the model/repository name — confirm against the worker
name: string
// File identifier; used as the matching key for progress tracking
file: string
// NOTE(review): units are not visible here (likely a percentage and byte counts) — confirm
progress: number
loaded: number
total: number
}

/**
 * Worker message with status `'done'` for a named file.
 *
 * `useWhisper` removes the tracked entries whose `file` matches.
 */
export interface EventDone {
status: 'done'
// NOTE(review): presumably the model/repository name — confirm against the worker
name: string
// File identifier; used as the matching key for progress tracking
file: string
// Not used
progress?: number
loaded?: number
total?: number
}

/**
 * Worker message with status `'ready'`; `useWhisper` sets its status to `'ready'`.
 */
export interface EventReady {
status: 'ready'
}

/**
 * Worker message with status `'start'`; `useWhisper` sets `transcribing` to `true`.
 */
export interface EventStart {
status: 'start'
}

/**
 * Worker message with status `'update'`.
 *
 * `useWhisper` only reads `tps` from it.
 */
export interface EventUpdate {
status: 'update'
// NOTE(review): presumably tokens per second — confirm against the worker
tps: number
// NOTE(review): presumably the partial transcription so far — confirm against the worker
output: string
// NOTE(review): presumably the number of tokens generated so far — confirm against the worker
numTokens: number
}

/**
 * Worker message with status `'complete'`.
 *
 * `useWhisper` sets `transcribing` to `false` and keeps `output[0]`
 * (or an empty string) as the result.
 */
export interface EventComplete {
status: 'complete'
// Output strings; only the first entry is consumed by `useWhisper`
output: string[]
}

/** Every message shape the worker data can take, discriminated by `status`. */
export type MessageEvents = EventLoading | EventInitiate | EventDownload | EventProgress | EventDone | EventReady | EventStart | EventUpdate | EventComplete
/** The file events that may be stored in a progress list (`'initiate'`, `'progress'`, `'done'`). */
export type ProgressMessageEvents = EventInitiate | EventProgress | EventDone

/**
 * Request message of type `'generate'`; `useWhisper().transcribe` posts it to the worker.
 */
export interface MessageGenerate {
type: 'generate'
data: {
// Audio to transcribe as a string (callers pass a base64-encoded WAV)
audio: string
// Language of the audio (callers pass e.g. 'en')
language: string
}
}
Loading

0 comments on commit 01dbaeb

Please sign in to comment.