From c18d044c93e37748b3d1250b862d7da2951b2158 Mon Sep 17 00:00:00 2001 From: Neko Ayaka Date: Mon, 6 Jan 2025 17:18:32 +0800 Subject: [PATCH] refactor: drop ffmpeg --- pnpm-lock.yaml | 47 ++---- services/discord-voice-bot/.env | 9 + services/discord-voice-bot/README.md | 5 + services/discord-voice-bot/package.json | 3 - .../src/bots/discord/commands/summon.ts | 158 +++++++----------- .../discord-voice-bot/src/pipelines/tts.ts | 49 ++++++ services/discord-voice-bot/src/utils/audio.ts | 51 ++++++ 7 files changed, 184 insertions(+), 138 deletions(-) create mode 100644 services/discord-voice-bot/.env create mode 100644 services/discord-voice-bot/README.md create mode 100644 services/discord-voice-bot/src/utils/audio.ts diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index b41cb6c..5d3a58c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -500,18 +500,9 @@ importers: '@xsai/shared-chat': specifier: ^0.0.23 version: 0.0.23 - date-fns: - specifier: ^4.1.0 - version: 4.1.0 discord.js: specifier: ^14.17.2 version: 14.17.2 - ffmpeg-static: - specifier: ^5.2.0 - version: 5.2.0 - fluent-ffmpeg: - specifier: ^2.1.3 - version: 2.1.3 libsodium-wrappers: specifier: ^0.7.15 version: 0.7.15 @@ -3886,9 +3877,6 @@ packages: resolution: {integrity: sha512-NW2cX8m1Q7KPA7a5M2ULQeZ2wR5qI5PAbw5L0UOMxdioVk9PMZ0h1TmyZEkPYrCvYjDlFICusOu1dlEKAAeXBw==} engines: {node: '>=0.12.0'} - async@0.2.10: - resolution: {integrity: sha512-eAkdoKxU6/LkKDBzLpT+t6Ff5EtfSF4wx1WfJiPEEV7WNLnDaRXk0oVysiEPm262roaachGexwUv94WhSgN5TQ==} - async@2.6.4: resolution: {integrity: sha512-mzo5dfJYwAn29PeiJ0zvwTo04zj8HDJj0Mn8TD7sno7q12prdbnasKJHhkm2c1LgrhlJ0teaea8860oxi51mGA==} @@ -5120,10 +5108,6 @@ packages: flatted@3.3.1: resolution: {integrity: sha512-X8cqMLLie7KsNUDSdzeN8FYK9rEt4Dt67OsG/DNGnYTSDBG4uFAJFBnUeiV+zCVAvwFy56IjM9sH51jVaEhNxw==} - fluent-ffmpeg@2.1.3: - resolution: {integrity: sha512-Be3narBNt2s6bsaqP6Jzq91heDgOEaDCJAXcE3qcma/EJBSy5FB4cvO31XBInuAuKBx8Kptf8dkhjK0IOru39Q==} - engines: {node: '>=18'} - follow-redirects@1.15.2: resolution: {integrity: sha512-VQLG33o04KaQ8uYi2tVNbdrWp1QWxNNea+nmIB4EVM28v0hmP17z7aG1+wAkNzVq4KeXTq3221ye5qTJP91JwA==} engines: {node: '>=4.0'} @@ -8374,10 +8358,6 @@ packages: resolution: {integrity: sha512-qe9UWWpkeG5yzZ0tNYxDmd7vo58HDBc39mZ0xWWpolAGADdFOzkfamWLDxkOWcvHQKVmdTyQdLD4NOfjLWTKew==} engines: {node: '>= 0.4'} - which@1.3.1: - resolution: {integrity: sha512-HxJdYWq1MTIQbJ3nw0cqssHoTNU267KlrDuGZ1WYlxDStUtKUhOaJmh112/TZmHxxUfuJqPXSOm7tDyas0OSIQ==} - hasBin: true - which@2.0.2: resolution: {integrity: sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==} engines: {node: '>= 8'} @@ -9367,6 +9347,7 @@ snapshots: concat-stream: 2.0.0 http-response-object: 3.0.2 parse-cache-control: 1.0.1 + optional: true '@develar/schema-utils@2.6.5': dependencies: @@ -11107,7 +11088,8 @@ snapshots: '@types/ms@0.7.34': {} - '@types/node@10.17.60': {} + '@types/node@10.17.60': + optional: true '@types/node@20.17.11': dependencies: @@ -12439,8 +12421,6 @@ snapshots: async-exit-hook@2.0.1: {} - async@0.2.10: {} - async@2.6.4: dependencies: lodash: 4.17.21 @@ -12691,7 +12671,8 @@ snapshots: caniuse-lite@1.0.30001680: {} - caseless@0.12.0: {} + caseless@0.12.0: + optional: true ccount@2.0.1: {} @@ -12864,6 +12845,7 @@ snapshots: inherits: 2.0.4 readable-stream: 3.6.2 typedarray: 0.0.6 + optional: true confbox@0.1.8: {} @@ -13992,6 +13974,7 @@ snapshots: progress: 2.0.3 transitivePeerDependencies: - supports-color + optional: true figures@6.1.0: dependencies: @@ -14062,11 +14045,6 @@ 
snapshots:
 
   flatted@3.3.1: {}
 
-  fluent-ffmpeg@2.1.3:
-    dependencies:
-      async: 0.2.10
-      which: 1.3.1
-
   follow-redirects@1.15.2: {}
 
   for-each@0.3.3:
@@ -14455,6 +14433,7 @@ snapshots:
   http-response-object@3.0.2:
     dependencies:
       '@types/node': 10.17.60
+    optional: true
 
   http2-wrapper@1.0.3:
     dependencies:
@@ -15716,7 +15695,8 @@ snapshots:
     dependencies:
       callsites: 3.1.0
 
-  parse-cache-control@1.0.1: {}
+  parse-cache-control@1.0.1:
+    optional: true
 
   parse-gitignore@2.0.0: {}
 
@@ -17132,7 +17112,8 @@ snapshots:
     for-each: 0.3.3
     is-typed-array: 1.1.12
 
-  typedarray@0.0.6: {}
+  typedarray@0.0.6:
+    optional: true
 
   typescript@5.7.2: {}
 
@@ -17936,10 +17917,6 @@ snapshots:
     gopd: 1.0.1
     has-tostringtag: 1.0.0
 
-  which@1.3.1:
-    dependencies:
-      isexe: 2.0.0
-
   which@2.0.2:
     dependencies:
       isexe: 2.0.0
diff --git a/services/discord-voice-bot/.env b/services/discord-voice-bot/.env
new file mode 100644
index 0000000..b859022
--- /dev/null
+++ b/services/discord-voice-bot/.env
@@ -0,0 +1,9 @@
+DISCORD_TOKEN=''
+DISCORD_BOT_CLIENT_ID=''
+
+OPENAI_MODEL=''
+OPENAI_API_KEY=''
+OPENAI_API_BASE_URL=''
+
+ELEVENLABS_API_KEY=''
+ELEVENLABS_API_BASE_URL=''
diff --git a/services/discord-voice-bot/README.md b/services/discord-voice-bot/README.md
new file mode 100644
index 0000000..5fb3a4c
--- /dev/null
+++ b/services/discord-voice-bot/README.md
@@ -0,0 +1,5 @@
+## Acknowledgements
+
+- Audio handling and processing implementation: https://github.com/TheTrueSCP/CharacterAIVoice/blob/54d6a41b4e0eba9ad996c5f9ddcc6230277af2f8/src/VoiceHandler.js
+- Usage example: https://github.com/discordjs/voice-examples/blob/da0c3b419107d41053501a4dddf3826ad53c03f7/radio-bot/src/bot.ts
+- The excellent library: https://github.com/discordjs/discord.js
diff --git a/services/discord-voice-bot/package.json b/services/discord-voice-bot/package.json
index 0167cc8..52685c2 100644
--- a/services/discord-voice-bot/package.json
+++ b/services/discord-voice-bot/package.json
@@ -40,10 +40,7 @@
     "@xsai/generate-text": "^0.0.23",
     "@xsai/providers": "^0.0.23",
     "@xsai/shared-chat": "^0.0.23",
-    "date-fns": "^4.1.0",
     "discord.js": "^14.17.2",
-    "ffmpeg-static": "^5.2.0",
-    "fluent-ffmpeg": "^2.1.3",
     "libsodium-wrappers": "^0.7.15",
     "opusscript": "^0.1.1",
     "tsx": "^4.19.2",
diff --git a/services/discord-voice-bot/src/bots/discord/commands/summon.ts b/services/discord-voice-bot/src/bots/discord/commands/summon.ts
index 375a941..f39974d 100644
--- a/services/discord-voice-bot/src/bots/discord/commands/summon.ts
+++ b/services/discord-voice-bot/src/bots/discord/commands/summon.ts
@@ -2,26 +2,65 @@ import type { AudioReceiveStream } from '@discordjs/voice'
 import type { useLogg } from '@guiiai/logg'
 import type { CacheType, ChatInputCommandInteraction, GuildMember } from 'discord.js'
 
 import { Buffer } from 'node:buffer'
-import { createWriteStream } from 'node:fs'
-import { mkdir, readFile } from 'node:fs/promises'
 import { env } from 'node:process'
-import { Readable } from 'node:stream'
+import { Readable, Writable } from 'node:stream'
 import { createAudioPlayer, createAudioResource, EndBehaviorType, entersState, joinVoiceChannel, NoSubscriberBehavior, VoiceConnectionStatus } from '@discordjs/voice'
 import { generateSpeech } from '@xsai/generate-speech'
 import { generateText } from '@xsai/generate-text'
 import { createOpenAI, createUnElevenLabs } from '@xsai/providers'
 import { message } from '@xsai/shared-chat'
-import { formatDate } from 'date-fns'
-import ffmpeg from 'fluent-ffmpeg'
 import OpusScript from 'opusscript'
-import wavefile from 'wavefile'
-import { WhisperLargeV3Pipeline } from '../../../pipelines/tts'
+import { transcribe } from '../../../pipelines/tts'
 import { systemPrompt } from '../../../prompts/system-v1'
-import { exists } from '../../../utils/fs'
 
 const decoder = new OpusScript(48000, 2)
 
+async function transcribeTextFromAudioReceiveStream(stream: AudioReceiveStream) {
+  return new Promise<string>((resolve, reject) => {
+    try {
+      let pcmBuffer = Buffer.alloc(0)
+      const pcmStream = new Writable({
+        write(chunk, _encoding, callback) {
+          pcmBuffer = Buffer.concat([pcmBuffer, chunk])
+          callback()
+        },
+      })
+
+      stream.on('error', (err) => {
+        reject(err)
+      })
+
+      // Decode each incoming Opus packet to raw PCM and buffer it
+      stream.on('data', async (chunk) => {
+        try {
+          const pcm = decoder.decode(chunk)
+          pcmStream.write(pcm)
+        }
+        catch (err) {
+          reject(err)
+        }
+      })
+
+      // When the user stops talking, end the PCM stream and transcribe the buffered audio
+      stream.on('end', async () => {
+        try {
+          pcmStream.end()
+
+          const result = await transcribe(pcmBuffer)
+          resolve(result)
+        }
+        catch (err) {
+          reject(err)
+        }
+      })
+    }
+    catch (err) {
+      reject(err)
+    }
+  })
+}
+
 export async function handleSummon(log: ReturnType<typeof useLogg>, interaction: ChatInputCommandInteraction) {
   const currVoiceChannel = (interaction.member as GuildMember).voice.channel
   if (!currVoiceChannel) {
@@ -85,7 +124,8 @@ export async function handleSummon(log: ReturnType<typeof useLogg>, interaction:
     },
   })
 
-  const result = await transcribeAudioStream(log, listenStream, userId)
+  const result = await transcribeTextFromAudioReceiveStream(listenStream)
+
   const openai = createOpenAI({
     apiKey: env.OPENAI_API_KEY,
     baseURL: env.OPENAI_API_BASE_URL,
@@ -115,11 +155,18 @@ export async function handleSummon(log: ReturnType<typeof useLogg>, interaction:
   })
 
   const speechRes = await generateSpeech({
-    ...elevenlabs.speech({ model: 'elevenlabs/eleven_multilingual_v2', voice: 'lNxY9WuCBCZCISASyJ55' }),
+    ...elevenlabs.speech({
+      model: 'eleven_multilingual_v2',
+      voice: 'lNxY9WuCBCZCISASyJ55',
+      voiceSettings: {
+        stability: 0.4,
+        similarityBoost: 0.5,
+      },
+    }),
     input: res.text,
   })
 
-  log.withField('length', speechRes.byteLength).withField('text', Buffer.from(speechRes).toString('utf-8')).log('Generated speech')
+  log.withField('length', speechRes.byteLength).log('Generated speech')
 
   const audioResource = createAudioResource(Readable.from(Buffer.from(speechRes)))
   player.play(audioResource)
@@ -138,92 +185,3 @@ export async function handleSummon(log: ReturnType<typeof useLogg>, interaction:
     await interaction.reply('Could not join voice channel.')
   }
 }
-
-async function transcribeAudioStream(log: ReturnType<typeof useLogg>, stream: AudioReceiveStream, userId: string) {
-  async function createDirIfNotExists(path: string) {
-    if (!(await exists(path))) {
-      await mkdir(path, { recursive: true })
-    }
-  }
-
-  return new Promise((resolve, reject) => {
-    createDirIfNotExists(`temp/audios/${userId}`).then(() => {
-      try {
-        const fileBasename = formatDate(new Date(), 'yyyy-MM-dd HH:mm:ss')
-
-        // Generate a uid for the audio file.
-        // Create a stream that writes a new pcm file with the generated uid
-        const writeStream = createWriteStream(`temp/audios/${userId}/${fileBasename}.pcm`, { flags: 'a' })
-
-        stream.on('error', (err) => {
-          reject(err)
-        })
-
-        // Create the pipeline
-        stream.on('data', async (chunk) => {
-          try {
-            const pcm = decoder.decode(chunk)
-            writeStream.write(pcm)
-          }
-          catch (err) {
-            log.withError(err).log('Error decoding audio')
-          }
-        })
-
-        // When user stops talking, stop the stream and generate an mp3 file.
-        stream.on('end', async () => {
-          writeStream.end()
-
-          ffmpeg()
-            .input(`temp/audios/${userId}/${fileBasename}.pcm`)
-            .inputFormat('s32le')
-            .audioFrequency(60000)
-            .audioChannels(2)
-            .output(`temp/audios/${userId}/${fileBasename}.wav`)
-            .outputFormat('wav')
-            .on('error', (err) => {
-              reject(err)
-            })
-            .on('end', async () => {
-              log.log('Audio file generated')
-
-              // Read .wav file and convert it to required format
-              const wav = new wavefile.WaveFile(await readFile(`temp/audios/${userId}/${fileBasename}.wav`))
-              wav.toBitDepth('32f') // Pipeline expects input as a Float32Array
-              wav.toSampleRate(16000) // Whisper expects audio with a sampling rate of 16000
-              const audioData = wav.getSamples()
-
-              const transcriber = await WhisperLargeV3Pipeline.getInstance()
-              log.log('Transcribing audio')
-
-              const result = await transcriber(audioData)
-              if (Array.isArray(result)) {
-                const arrayResult = result as { text: string }[]
-                if (arrayResult.length === 0) {
-                  log.log('No transcription result')
-                  return resolve('')
-                }
-
-                log.withField('result', result[0].text).log('Transcription result')
-                resolve(result[0].text)
-              }
-              else {
-                if ('text' in result) {
-                  log.withField('result', result.text).log('Transcription result')
-                  return resolve(result.text)
-                }
-                else {
-                  log.withField('result', result).log('No transcription result')
-                  return resolve('')
-                }
-              }
-            })
-            .run()
-        })
-      }
-      catch (err) {
-        reject(err)
-      }
-    })
-  })
-}
diff --git a/services/discord-voice-bot/src/pipelines/tts.ts b/services/discord-voice-bot/src/pipelines/tts.ts
index 84e9514..b545f8f 100644
--- a/services/discord-voice-bot/src/pipelines/tts.ts
+++ b/services/discord-voice-bot/src/pipelines/tts.ts
@@ -1,5 +1,10 @@
+import type { Buffer } from 'node:buffer'
+import { useLogg } from '@guiiai/logg'
 import { pipeline, type PipelineType } from '@huggingface/transformers'
+import wavefile from 'wavefile'
+
+import { pcmToWav } from '../utils/audio'
 
 export class WhisperLargeV3Pipeline {
   static task: PipelineType = 'automatic-speech-recognition'
   static model = 'Xenova/whisper-tiny.en'
@@ -16,3 +21,47 @@
     return this.instance
   }
 }
+
+export function textFromResult(result: Array<{ text: string }> | { text: string }) {
+  if (Array.isArray(result)) {
+    const arrayResult = result as { text: string }[]
+    if (arrayResult.length === 0) {
+      return ''
+    }
+
+    return result[0].text
+  }
+  else {
+    if ('text' in result) {
+      return result.text
+    }
+    else {
+      return ''
+    }
+  }
+}
+
+export async function transcribe(pcmBuffer: Buffer) {
+  const log = useLogg('Transcribe').useGlobalConfig()
+
+  const pcmConvertedWav = pcmToWav(pcmBuffer, 48000, 2)
+  log.withFields({ from: pcmBuffer.byteLength, to: pcmConvertedWav.byteLength }).log('Audio data received')
+
+  const transcriber = await WhisperLargeV3Pipeline.getInstance() as (audio: Float32Array | Float64Array) => Promise<Array<{ text: string }> | { text: string }>
+  log.log('Transcribing audio')
+
+  const wav = new wavefile.WaveFile(pcmConvertedWav)
+  wav.toBitDepth('32f') // Pipeline expects input as a Float32Array
+  wav.toSampleRate(16000) // Whisper expects audio with a sampling rate of 16000
+  const audioData = wav.getSamples()
+
+  const result = await transcriber(audioData)
+  const text = textFromResult(result)
+  if (!text) {
+    log.log('No transcription result')
+    return ''
+  }
+
+  log.withField('result', text).log('Transcription result')
+  return text
+}
diff --git a/services/discord-voice-bot/src/utils/audio.ts b/services/discord-voice-bot/src/utils/audio.ts
new file mode 100644
index 0000000..5303e2e
--- /dev/null
+++ b/services/discord-voice-bot/src/utils/audio.ts
@@ -0,0 +1,58 @@
+import type { Buffer } from 'node:buffer'
+
+export function pcmToWav(pcmBuffer: Buffer, sampleRate: number, numChannels: number): Uint8Array {
+  const byteRate = sampleRate * numChannels * 2 // Assuming 16-bit PCM (2 bytes per sample)
+  const blockAlign = numChannels * 2 // Block align for 16-bit PCM
+
+  // Create WAV header
+  const header = new ArrayBuffer(44)
+  const view = new DataView(header)
+
+  // Write RIFF identifier
+  writeString(view, 0, 'RIFF')
+  // Write RIFF chunk size (remaining 36 header bytes + PCM data)
+  view.setUint32(4, 36 + pcmBuffer.byteLength, true)
+  // Write WAVE identifier
+  writeString(view, 8, 'WAVE')
+  // Write format chunk identifier
+  writeString(view, 12, 'fmt ')
+  // Write format chunk length (16 for PCM)
+  view.setUint32(16, 16, true)
+  // Write audio format (1 for PCM)
+  view.setUint16(20, 1, true)
+  // Write number of channels
+  view.setUint16(22, numChannels, true)
+  // Write sample rate
+  view.setUint32(24, sampleRate, true)
+  // Write byte rate
+  view.setUint32(28, byteRate, true)
+  // Write block align
+  view.setUint16(32, blockAlign, true)
+  // Write bits per sample (16)
+  view.setUint16(34, 16, true)
+  // Write data chunk identifier
+  writeString(view, 36, 'data')
+  // Write data chunk length (size of PCM data)
+  view.setUint32(40, pcmBuffer.byteLength, true)
+
+  // Combine header and PCM data into one buffer
+  const wavBuffer = new Uint8Array(header.byteLength + pcmBuffer.byteLength)
+
+  wavBuffer.set(new Uint8Array(header), 0)
+  wavBuffer.set(new Uint8Array(pcmBuffer), header.byteLength)
+
+  return wavBuffer
+}
+
+function writeString(view: DataView, offset: number, string: string) {
+  for (let i = 0; i < string.length; i++) {
+    view.setUint8(offset + i, string.charCodeAt(i))
+  }
+}
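+
+// Usage sketch (illustrative only, not part of the runtime path): `pcmBuffer`
+// here stands for the 48 kHz stereo 16-bit PCM produced by the Opus decoder in
+// summon.ts, and `writeFile` would come from 'node:fs/promises'.
+//
+//   const wavBytes = pcmToWav(pcmBuffer, 48000, 2)
+//   await writeFile('debug.wav', wavBytes)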