From c18d044c93e37748b3d1250b862d7da2951b2158 Mon Sep 17 00:00:00 2001 From: Neko Ayaka Date: Mon, 6 Jan 2025 17:18:32 +0800 Subject: [PATCH] refactor: drop ffmpeg --- pnpm-lock.yaml | 47 ++---- services/discord-voice-bot/.env | 9 + services/discord-voice-bot/README.md | 5 + services/discord-voice-bot/package.json | 3 - .../src/bots/discord/commands/summon.ts | 158 +++++++----------- .../discord-voice-bot/src/pipelines/tts.ts | 49 ++++++ services/discord-voice-bot/src/utils/audio.ts | 51 ++++++ 7 files changed, 184 insertions(+), 138 deletions(-) create mode 100644 services/discord-voice-bot/.env create mode 100644 services/discord-voice-bot/README.md create mode 100644 services/discord-voice-bot/src/utils/audio.ts diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index b41cb6c..5d3a58c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -500,18 +500,9 @@ importers: '@xsai/shared-chat': specifier: ^0.0.23 version: 0.0.23 - date-fns: - specifier: ^4.1.0 - version: 4.1.0 discord.js: specifier: ^14.17.2 version: 14.17.2 - ffmpeg-static: - specifier: ^5.2.0 - version: 5.2.0 - fluent-ffmpeg: - specifier: ^2.1.3 - version: 2.1.3 libsodium-wrappers: specifier: ^0.7.15 version: 0.7.15 @@ -3886,9 +3877,6 @@ packages: resolution: {integrity: sha512-NW2cX8m1Q7KPA7a5M2ULQeZ2wR5qI5PAbw5L0UOMxdioVk9PMZ0h1TmyZEkPYrCvYjDlFICusOu1dlEKAAeXBw==} engines: {node: '>=0.12.0'} - async@0.2.10: - resolution: {integrity: sha512-eAkdoKxU6/LkKDBzLpT+t6Ff5EtfSF4wx1WfJiPEEV7WNLnDaRXk0oVysiEPm262roaachGexwUv94WhSgN5TQ==} - async@2.6.4: resolution: {integrity: sha512-mzo5dfJYwAn29PeiJ0zvwTo04zj8HDJj0Mn8TD7sno7q12prdbnasKJHhkm2c1LgrhlJ0teaea8860oxi51mGA==} @@ -5120,10 +5108,6 @@ packages: flatted@3.3.1: resolution: {integrity: sha512-X8cqMLLie7KsNUDSdzeN8FYK9rEt4Dt67OsG/DNGnYTSDBG4uFAJFBnUeiV+zCVAvwFy56IjM9sH51jVaEhNxw==} - fluent-ffmpeg@2.1.3: - resolution: {integrity: sha512-Be3narBNt2s6bsaqP6Jzq91heDgOEaDCJAXcE3qcma/EJBSy5FB4cvO31XBInuAuKBx8Kptf8dkhjK0IOru39Q==} - engines: {node: '>=18'} - follow-redirects@1.15.2: resolution: {integrity: sha512-VQLG33o04KaQ8uYi2tVNbdrWp1QWxNNea+nmIB4EVM28v0hmP17z7aG1+wAkNzVq4KeXTq3221ye5qTJP91JwA==} engines: {node: '>=4.0'} @@ -8374,10 +8358,6 @@ packages: resolution: {integrity: sha512-qe9UWWpkeG5yzZ0tNYxDmd7vo58HDBc39mZ0xWWpolAGADdFOzkfamWLDxkOWcvHQKVmdTyQdLD4NOfjLWTKew==} engines: {node: '>= 0.4'} - which@1.3.1: - resolution: {integrity: sha512-HxJdYWq1MTIQbJ3nw0cqssHoTNU267KlrDuGZ1WYlxDStUtKUhOaJmh112/TZmHxxUfuJqPXSOm7tDyas0OSIQ==} - hasBin: true - which@2.0.2: resolution: {integrity: sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==} engines: {node: '>= 8'} @@ -9367,6 +9347,7 @@ snapshots: concat-stream: 2.0.0 http-response-object: 3.0.2 parse-cache-control: 1.0.1 + optional: true '@develar/schema-utils@2.6.5': dependencies: @@ -11107,7 +11088,8 @@ snapshots: '@types/ms@0.7.34': {} - '@types/node@10.17.60': {} + '@types/node@10.17.60': + optional: true '@types/node@20.17.11': dependencies: @@ -12439,8 +12421,6 @@ snapshots: async-exit-hook@2.0.1: {} - async@0.2.10: {} - async@2.6.4: dependencies: lodash: 4.17.21 @@ -12691,7 +12671,8 @@ snapshots: caniuse-lite@1.0.30001680: {} - caseless@0.12.0: {} + caseless@0.12.0: + optional: true ccount@2.0.1: {} @@ -12864,6 +12845,7 @@ snapshots: inherits: 2.0.4 readable-stream: 3.6.2 typedarray: 0.0.6 + optional: true confbox@0.1.8: {} @@ -13992,6 +13974,7 @@ snapshots: progress: 2.0.3 transitivePeerDependencies: - supports-color + optional: true figures@6.1.0: dependencies: @@ -14062,11 +14045,6 @@ 
snapshots:
 
   flatted@3.3.1: {}
 
-  fluent-ffmpeg@2.1.3:
-    dependencies:
-      async: 0.2.10
-      which: 1.3.1
-
   follow-redirects@1.15.2: {}
 
   for-each@0.3.3:
@@ -14455,6 +14433,7 @@ snapshots:
   http-response-object@3.0.2:
     dependencies:
       '@types/node': 10.17.60
+    optional: true
 
   http2-wrapper@1.0.3:
     dependencies:
@@ -15716,7 +15695,8 @@ snapshots:
     dependencies:
       callsites: 3.1.0
 
-  parse-cache-control@1.0.1: {}
+  parse-cache-control@1.0.1:
+    optional: true
 
   parse-gitignore@2.0.0: {}
 
@@ -17132,7 +17112,8 @@ snapshots:
     for-each: 0.3.3
     is-typed-array: 1.1.12
 
-  typedarray@0.0.6: {}
+  typedarray@0.0.6:
+    optional: true
 
   typescript@5.7.2: {}
 
@@ -17936,10 +17917,6 @@ snapshots:
     gopd: 1.0.1
     has-tostringtag: 1.0.0
 
-  which@1.3.1:
-    dependencies:
-      isexe: 2.0.0
-
   which@2.0.2:
     dependencies:
       isexe: 2.0.0
diff --git a/services/discord-voice-bot/.env b/services/discord-voice-bot/.env
new file mode 100644
index 0000000..b859022
--- /dev/null
+++ b/services/discord-voice-bot/.env
@@ -0,0 +1,9 @@
+DISCORD_TOKEN=''
+DISCORD_BOT_CLIENT_ID=''
+
+OPENAI_MODEL=''
+OPENAI_API_KEY=''
+OPENAI_API_BASE_URL=''
+
+ELEVENLABS_API_KEY=''
+ELEVENLABS_API_BASE_URL=''
diff --git a/services/discord-voice-bot/README.md b/services/discord-voice-bot/README.md
new file mode 100644
index 0000000..5fb3a4c
--- /dev/null
+++ b/services/discord-voice-bot/README.md
@@ -0,0 +1,5 @@
+## Acknowledgements
+
+- Audio handling and processing implementation: https://github.com/TheTrueSCP/CharacterAIVoice/blob/54d6a41b4e0eba9ad996c5f9ddcc6230277af2f8/src/VoiceHandler.js
+- Usage example: https://github.com/discordjs/voice-examples/blob/da0c3b419107d41053501a4dddf3826ad53c03f7/radio-bot/src/bot.ts
+- The excellent library: https://github.com/discordjs/discord.js
diff --git a/services/discord-voice-bot/package.json b/services/discord-voice-bot/package.json
index 0167cc8..52685c2 100644
--- a/services/discord-voice-bot/package.json
+++ b/services/discord-voice-bot/package.json
@@ -40,10 +40,7 @@
     "@xsai/generate-text": "^0.0.23",
     "@xsai/providers": "^0.0.23",
     "@xsai/shared-chat": "^0.0.23",
-    "date-fns": "^4.1.0",
     "discord.js": "^14.17.2",
-    "ffmpeg-static": "^5.2.0",
-    "fluent-ffmpeg": "^2.1.3",
     "libsodium-wrappers": "^0.7.15",
     "opusscript": "^0.1.1",
     "tsx": "^4.19.2",
diff --git a/services/discord-voice-bot/src/bots/discord/commands/summon.ts b/services/discord-voice-bot/src/bots/discord/commands/summon.ts
index 375a941..f39974d 100644
--- a/services/discord-voice-bot/src/bots/discord/commands/summon.ts
+++ b/services/discord-voice-bot/src/bots/discord/commands/summon.ts
@@ -2,26 +2,65 @@ import type { AudioReceiveStream } from '@discordjs/voice'
 import type { useLogg } from '@guiiai/logg'
 import type { CacheType, ChatInputCommandInteraction, GuildMember } from 'discord.js'
 
 import { Buffer } from 'node:buffer'
-import { createWriteStream } from 'node:fs'
-import { mkdir, readFile } from 'node:fs/promises'
 import { env } from 'node:process'
-import { Readable } from 'node:stream'
+import { Readable, Writable } from 'node:stream'
 import { createAudioPlayer, createAudioResource, EndBehaviorType, entersState, joinVoiceChannel, NoSubscriberBehavior, VoiceConnectionStatus } from '@discordjs/voice'
 import { generateSpeech } from '@xsai/generate-speech'
 import { generateText } from '@xsai/generate-text'
 import { createOpenAI, createUnElevenLabs } from '@xsai/providers'
 import { message } from '@xsai/shared-chat'
-import { formatDate } from 'date-fns'
-import ffmpeg from 'fluent-ffmpeg'
 import OpusScript from 'opusscript'
-import wavefile from 'wavefile'
-import { WhisperLargeV3Pipeline } from '../../../pipelines/tts'
+import { transcribe } from '../../../pipelines/tts'
 import { systemPrompt } from '../../../prompts/system-v1'
-import { exists } from '../../../utils/fs'
 
 const decoder = new OpusScript(48000, 2)
 
+async function transcribeTextFromAudioReceiveStream(stream: AudioReceiveStream) {
+  return new Promise<string>((resolve, reject) => {
+    try {
+      let pcmBuffer = Buffer.alloc(0)
+      const pcmStream = new Writable({
+        write(chunk, _encoding, callback) {
+          pcmBuffer = Buffer.concat([pcmBuffer, chunk])
+          callback()
+        },
+      })
+
+      stream.on('error', (err) => {
+        reject(err)
+      })
+
+      // Decode each incoming Opus packet to raw PCM and buffer it
+      stream.on('data', async (chunk) => {
+        try {
+          const pcm = decoder.decode(chunk)
+          pcmStream.write(pcm)
+        }
+        catch (err) {
+          reject(err)
+        }
+      })
+
+      // When the user stops talking, end the PCM stream and transcribe the buffered audio
+      stream.on('end', async () => {
+        try {
+          pcmStream.end()
+
+          const result = await transcribe(pcmBuffer)
+          resolve(result)
+        }
+        catch (err) {
+          reject(err)
+        }
+      })
+    }
+    catch (err) {
+      reject(err)
+    }
+  })
+}
+
 export async function handleSummon(log: ReturnType<typeof useLogg>, interaction: ChatInputCommandInteraction) {
   const currVoiceChannel = (interaction.member as GuildMember).voice.channel
   if (!currVoiceChannel) {
@@ -85,7 +124,8 @@ export async function handleSummon(log: ReturnType<typeof useLogg>, interaction:
     },
   })
 
-  const result = await transcribeAudioStream(log, listenStream, userId)
+  const result = await transcribeTextFromAudioReceiveStream(listenStream)
+
   const openai = createOpenAI({
     apiKey: env.OPENAI_API_KEY,
     baseURL: env.OPENAI_API_BASE_URL,
@@ -115,11 +155,18 @@ export async function handleSummon(log: ReturnType<typeof useLogg>, interaction:
   })
 
   const speechRes = await generateSpeech({
-    ...elevenlabs.speech({ model: 'elevenlabs/eleven_multilingual_v2', voice: 'lNxY9WuCBCZCISASyJ55' }),
+    ...elevenlabs.speech({
+      model: 'eleven_multilingual_v2',
+      voice: 'lNxY9WuCBCZCISASyJ55',
+      voiceSettings: {
+        stability: 0.4,
+        similarityBoost: 0.5,
+      },
+    }),
     input: res.text,
   })
 
-  log.withField('length', speechRes.byteLength).withField('text', Buffer.from(speechRes).toString('utf-8')).log('Generated speech')
+  log.withField('length', speechRes.byteLength).log('Generated speech')
 
   const audioResource = createAudioResource(Readable.from(Buffer.from(speechRes)))
   player.play(audioResource)
@@ -138,92 +185,3 @@ export async function handleSummon(log: ReturnType<typeof useLogg>, interaction:
     await interaction.reply('Could not join voice channel.')
   }
 }
-
-async function transcribeAudioStream(log: ReturnType<typeof useLogg>, stream: AudioReceiveStream, userId: string) {
-  async function createDirIfNotExists(path: string) {
-    if (!(await exists(path))) {
-      await mkdir(path, { recursive: true })
-    }
-  }
-
-  return new Promise((resolve, reject) => {
-    createDirIfNotExists(`temp/audios/${userId}`).then(() => {
-      try {
-        const fileBasename = formatDate(new Date(), 'yyyy-MM-dd HH:mm:ss')
-
-        // Generate a uid for the audio file.
-        // Create a stream that writes a new pcm file with the generated uid
-        const writeStream = createWriteStream(`temp/audios/${userId}/${fileBasename}.pcm`, { flags: 'a' })
-
-        stream.on('error', (err) => {
-          reject(err)
-        })
-
-        // Create the pipeline
-        stream.on('data', async (chunk) => {
-          try {
-            const pcm = decoder.decode(chunk)
-            writeStream.write(pcm)
-          }
-          catch (err) {
-            log.withError(err).log('Error decoding audio')
-          }
-        })
-
-        // When user stops talking, stop the stream and generate an mp3 file.
-        stream.on('end', async () => {
-          writeStream.end()
-
-          ffmpeg()
-            .input(`temp/audios/${userId}/${fileBasename}.pcm`)
-            .inputFormat('s32le')
-            .audioFrequency(60000)
-            .audioChannels(2)
-            .output(`temp/audios/${userId}/${fileBasename}.wav`)
-            .outputFormat('wav')
-            .on('error', (err) => {
-              reject(err)
-            })
-            .on('end', async () => {
-              log.log('Audio file generated')
-
-              // Read .wav file and convert it to required format
-              const wav = new wavefile.WaveFile(await readFile(`temp/audios/${userId}/${fileBasename}.wav`))
-              wav.toBitDepth('32f') // Pipeline expects input as a Float32Array
-              wav.toSampleRate(16000) // Whisper expects audio with a sampling rate of 16000
-              const audioData = wav.getSamples()
-
-              const transcriber = await WhisperLargeV3Pipeline.getInstance()
-              log.log('Transcribing audio')
-
-              const result = await transcriber(audioData)
-              if (Array.isArray(result)) {
-                const arrayResult = result as { text: string }[]
-                if (arrayResult.length === 0) {
-                  log.log('No transcription result')
-                  return resolve('')
-                }
-
-                log.withField('result', result[0].text).log('Transcription result')
-                resolve(result[0].text)
-              }
-              else {
-                if ('text' in result) {
-                  log.withField('result', result.text).log('Transcription result')
-                  return resolve(result.text)
-                }
-                else {
-                  log.withField('result', result).log('No transcription result')
-                  return resolve('')
-                }
-              }
-            })
-            .run()
-        })
-      }
-      catch (err) {
-        reject(err)
-      }
-    })
-  })
-}
diff --git a/services/discord-voice-bot/src/pipelines/tts.ts b/services/discord-voice-bot/src/pipelines/tts.ts
index 84e9514..b545f8f 100644
--- a/services/discord-voice-bot/src/pipelines/tts.ts
+++ b/services/discord-voice-bot/src/pipelines/tts.ts
@@ -1,5 +1,10 @@
+import type { Buffer } from 'node:buffer'
+import { useLogg } from '@guiiai/logg'
 import { pipeline, type PipelineType } from '@huggingface/transformers'
+import wavefile from 'wavefile'
+
+import { pcmToWav } from '../utils/audio'
 
 export class WhisperLargeV3Pipeline {
   static task: PipelineType = 'automatic-speech-recognition'
   static model = 'Xenova/whisper-tiny.en'
@@ -16,3 +21,47 @@
     return this.instance
   }
 }
+
+export function textFromResult(result: Array<{ text: string }> | { text: string }) {
+  if (Array.isArray(result)) {
+    const arrayResult = result as { text: string }[]
+    if (arrayResult.length === 0) {
+      return ''
+    }
+
+    return result[0].text
+  }
+  else {
+    if ('text' in result) {
+      return result.text
+    }
+    else {
+      return ''
+    }
+  }
+}
+
+export async function transcribe(pcmBuffer: Buffer) {
+  const log = useLogg('Transcribe').useGlobalConfig()
+
+  const pcmConvertedWav = pcmToWav(pcmBuffer, 48000, 2)
+  log.withFields({ from: pcmBuffer.byteLength, to: pcmConvertedWav.byteLength }).log('Audio data received')
+
+  const transcriber = await WhisperLargeV3Pipeline.getInstance() as (audio: Float32Array | Float64Array) => Promise<Array<{ text: string }> | { text: string }>
+  log.log('Transcribing audio')
+
+  const wav = new wavefile.WaveFile(pcmConvertedWav)
+  wav.toBitDepth('32f') // Pipeline expects input as a Float32Array
+  wav.toSampleRate(16000) // Whisper expects audio with a sampling rate of 16000
+  const audioData = wav.getSamples()
+
+  const result = await transcriber(audioData)
+  const text = textFromResult(result)
+  if (!text) {
+    log.log('No transcription result')
+    return ''
+  }
+
+  log.withField('result', text).log('Transcription result')
+  return text
+}
diff --git a/services/discord-voice-bot/src/utils/audio.ts b/services/discord-voice-bot/src/utils/audio.ts
new file mode 100644
index 0000000..5303e2e
--- /dev/null
+++ b/services/discord-voice-bot/src/utils/audio.ts
@@ -0,0 +1,58 @@
+import type { Buffer } from 'node:buffer'
+
+export function pcmToWav(pcmBuffer: Buffer, sampleRate: number, numChannels: number): Uint8Array {
+  const byteRate = sampleRate * numChannels * 2 // Assuming 16-bit PCM (2 bytes per sample)
+  const blockAlign = numChannels * 2 // Block align for 16-bit PCM
+
+  // Create WAV header
+  const header = new ArrayBuffer(44)
+  const view = new DataView(header)
+
+  // Write RIFF identifier
+  writeString(view, 0, 'RIFF')
+  // Write RIFF chunk size (remaining 36 header bytes + PCM data)
+  view.setUint32(4, 36 + pcmBuffer.byteLength, true)
+  // Write WAVE identifier
+  writeString(view, 8, 'WAVE')
+  // Write format chunk identifier
+  writeString(view, 12, 'fmt ')
+  // Write format chunk length (16 for PCM)
+  view.setUint32(16, 16, true)
+  // Write audio format (1 for PCM)
+  view.setUint16(20, 1, true)
+  // Write number of channels
+  view.setUint16(22, numChannels, true)
+  // Write sample rate
+  view.setUint32(24, sampleRate, true)
+  // Write byte rate
+  view.setUint32(28, byteRate, true)
+  // Write block align
+  view.setUint16(32, blockAlign, true)
+  // Write bits per sample (16)
+  view.setUint16(34, 16, true)
+  // Write data chunk identifier
+  writeString(view, 36, 'data')
+  // Write data chunk length (size of PCM data)
+  view.setUint32(40, pcmBuffer.byteLength, true)
+
+  // Combine header and PCM data into one buffer
+  const wavBuffer = new Uint8Array(header.byteLength + pcmBuffer.byteLength)
+
+  wavBuffer.set(new Uint8Array(header), 0)
+  wavBuffer.set(new Uint8Array(pcmBuffer), header.byteLength)
+
+  return wavBuffer
+}
+
+function writeString(view: DataView, offset: number, string: string) {
+  for (let i = 0; i < string.length; i++) {
+    view.setUint8(offset + i, string.charCodeAt(i))
+  }
+}
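+
+// Usage sketch (illustrative only, not part of the runtime path): `pcmBuffer`
+// here stands for the 48 kHz stereo 16-bit PCM produced by the Opus decoder in
+// summon.ts, and `writeFile` would come from 'node:fs/promises'.
+//
+//   const wavBytes = pcmToWav(pcmBuffer, 48000, 2)
+//   await writeFile('debug.wav', wavBytes)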