refactor: drop ffmpeg

nekomeowww committed Jan 6, 2025
1 parent 3385b8b, commit c18d044

Showing 7 changed files with 184 additions and 138 deletions.
47 changes: 12 additions & 35 deletions pnpm-lock.yaml

Some generated files are not rendered by default.

9 changes: 9 additions & 0 deletions services/discord-voice-bot/.env
@@ -0,0 +1,9 @@
+DISCORD_TOKEN=''
+DISCORD_BOT_CLIENT_ID=''
+
+OPENAI_MODEL=''
+OPENAI_API_KEY=''
+OPENAI_API_BASE_URL=''
+
+ELEVENLABS_API_KEY=''
+ELEVENLABS_API_BASE_URL=''
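These variables configure the Discord, OpenAI, and ElevenLabs clients used below; `summon.ts` reads them via `env` from `node:process`. A minimal sketch of fail-fast loading — the `requireEnv` helper is hypothetical and not part of this commit:

```ts
import { env } from 'node:process'

// Hypothetical helper: throw early when a required variable is missing or empty
function requireEnv(name: string): string {
  const value = env[name]
  if (!value)
    throw new Error(`Missing required environment variable: ${name}`)
  return value
}

const discordToken = requireEnv('DISCORD_TOKEN')
const elevenLabsApiKey = requireEnv('ELEVENLABS_API_KEY')
```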
5 changes: 5 additions & 0 deletions services/discord-voice-bot/README.md
@@ -0,0 +1,5 @@
+## Acknowledgements
+
+- Implementation of audio handling and processing: https://github.com/TheTrueSCP/CharacterAIVoice/blob/54d6a41b4e0eba9ad996c5f9ddcc6230277af2f8/src/VoiceHandler.js
+- Usage example: https://github.com/discordjs/voice-examples/blob/da0c3b419107d41053501a4dddf3826ad53c03f7/radio-bot/src/bot.ts
+- The excellent https://github.com/discordjs/discord.js library
3 changes: 0 additions & 3 deletions services/discord-voice-bot/package.json
@@ -40,10 +40,7 @@
"@xsai/generate-text": "^0.0.23",
"@xsai/providers": "^0.0.23",
"@xsai/shared-chat": "^0.0.23",
"date-fns": "^4.1.0",
"discord.js": "^14.17.2",
"ffmpeg-static": "^5.2.0",
"fluent-ffmpeg": "^2.1.3",
"libsodium-wrappers": "^0.7.15",
"opusscript": "^0.1.1",
"tsx": "^4.19.2",
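Dropping `ffmpeg-static` and `fluent-ffmpeg` leaves Opus decoding to the retained pure-JS `opusscript` package, so the bot no longer shells out to a native binary. A minimal sketch of the decode step — the 48 kHz stereo settings mirror the `new OpusScript(48000, 2)` call in `summon.ts` below:

```ts
import { Buffer } from 'node:buffer'
import OpusScript from 'opusscript'

// Discord voice delivers 48 kHz stereo Opus packets
const decoder = new OpusScript(48000, 2)

// decode() returns a Buffer of signed 16-bit little-endian PCM samples
function decodeOpusPacket(packet: Buffer): Buffer {
  return decoder.decode(packet)
}
```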
158 changes: 58 additions & 100 deletions services/discord-voice-bot/src/bots/discord/commands/summon.ts
@@ -2,26 +2,65 @@ import type { AudioReceiveStream } from '@discordjs/voice'
 import type { useLogg } from '@guiiai/logg'
 import type { CacheType, ChatInputCommandInteraction, GuildMember } from 'discord.js'
 import { Buffer } from 'node:buffer'
-import { createWriteStream } from 'node:fs'
-import { mkdir, readFile } from 'node:fs/promises'
 import { env } from 'node:process'
-import { Readable } from 'node:stream'
+import { Readable, Writable } from 'node:stream'
 import { createAudioPlayer, createAudioResource, EndBehaviorType, entersState, joinVoiceChannel, NoSubscriberBehavior, VoiceConnectionStatus } from '@discordjs/voice'
 import { generateSpeech } from '@xsai/generate-speech'
 import { generateText } from '@xsai/generate-text'
 import { createOpenAI, createUnElevenLabs } from '@xsai/providers'
 import { message } from '@xsai/shared-chat'
-import { formatDate } from 'date-fns'
-import ffmpeg from 'fluent-ffmpeg'
 import OpusScript from 'opusscript'
-import wavefile from 'wavefile'

-import { WhisperLargeV3Pipeline } from '../../../pipelines/tts'
+import { transcribe } from '../../../pipelines/tts'
 import { systemPrompt } from '../../../prompts/system-v1'
-import { exists } from '../../../utils/fs'

 const decoder = new OpusScript(48000, 2)

+async function transcribeTextFromAudioReceiveStream(stream: AudioReceiveStream) {
+  return new Promise<string>((resolve, reject) => {
+    try {
+      let pcmBuffer = Buffer.alloc(0)
+      const pcmStream = new Writable({
+        write(chunk, _encoding, callback) {
+          pcmBuffer = Buffer.concat([pcmBuffer, chunk])
+          callback()
+        },
+      })
+
+      stream.on('error', (err) => {
+        reject(err)
+      })
+
+      // Decode each incoming Opus packet and append the resulting PCM to the in-memory buffer
+      stream.on('data', async (chunk) => {
+        try {
+          const pcm = decoder.decode(chunk)
+          pcmStream.write(pcm)
+        }
+        catch (err) {
+          reject(err)
+        }
+      })
+
+      // When the user stops talking, end the PCM stream and transcribe the buffered audio
+      stream.on('end', async () => {
+        try {
+          pcmStream.end()
+
+          const result = await transcribe(pcmBuffer)
+          resolve(result)
+        }
+        catch (err) {
+          reject(err)
+        }
+      })
+    }
+    catch (err) {
+      reject(err)
+    }
+  })
+}
+
 export async function handleSummon(log: ReturnType<typeof useLogg>, interaction: ChatInputCommandInteraction<CacheType>) {
   const currVoiceChannel = (interaction.member as GuildMember).voice.channel
   if (!currVoiceChannel) {
@@ -85,7 +124,8 @@ export async function handleSummon(log: ReturnType<typeof useLogg>, interaction:
         },
       })

-      const result = await transcribeAudioStream(log, listenStream, userId)
+      const result = await transcribeTextFromAudioReceiveStream(listenStream)
+
       const openai = createOpenAI({
         apiKey: env.OPENAI_API_KEY,
         baseURL: env.OPENAI_API_BASE_URL,
@@ -115,11 +155,18 @@ export async function handleSummon(log: ReturnType<typeof useLogg>, interaction:
       })

       const speechRes = await generateSpeech({
-        ...elevenlabs.speech({ model: 'elevenlabs/eleven_multilingual_v2', voice: 'lNxY9WuCBCZCISASyJ55' }),
+        ...elevenlabs.speech({
+          model: 'eleven_multilingual_v2',
+          voice: 'lNxY9WuCBCZCISASyJ55',
+          voiceSettings: {
+            stability: 0.4,
+            similarityBoost: 0.5,
+          },
+        }),
         input: res.text,
       })

-      log.withField('length', speechRes.byteLength).withField('text', Buffer.from(speechRes).toString('utf-8')).log('Generated speech')
+      log.withField('length', speechRes.byteLength).log('Generated speech')

       const audioResource = createAudioResource(Readable.from(Buffer.from(speechRes)))
       player.play(audioResource)
@@ -138,92 +185,3 @@ export async function handleSummon(log: ReturnType<typeof useLogg>, interaction:
     await interaction.reply('Could not join voice channel.')
   }
 }
-
-async function transcribeAudioStream(log: ReturnType<typeof useLogg>, stream: AudioReceiveStream, userId: string) {
-  async function createDirIfNotExists(path: string) {
-    if (!(await exists(path))) {
-      await mkdir(path, { recursive: true })
-    }
-  }
-
-  return new Promise<string>((resolve, reject) => {
-    createDirIfNotExists(`temp/audios/${userId}`).then(() => {
-      try {
-        const fileBasename = formatDate(new Date(), 'yyyy-MM-dd HH:mm:ss')
-
-        // Generate a uid for the audio file.
-        // Create a stream that writes a new pcm file with the generated uid
-        const writeStream = createWriteStream(`temp/audios/${userId}/${fileBasename}.pcm`, { flags: 'a' })
-
-        stream.on('error', (err) => {
-          reject(err)
-        })
-
-        // Create the pipeline
-        stream.on('data', async (chunk) => {
-          try {
-            const pcm = decoder.decode(chunk)
-            writeStream.write(pcm)
-          }
-          catch (err) {
-            log.withError(err).log('Error decoding audio')
-          }
-        })
-
-        // When user stops talking, stop the stream and generate an mp3 file.
-        stream.on('end', async () => {
-          writeStream.end()
-
-          ffmpeg()
-            .input(`temp/audios/${userId}/${fileBasename}.pcm`)
-            .inputFormat('s32le')
-            .audioFrequency(60000)
-            .audioChannels(2)
-            .output(`temp/audios/${userId}/${fileBasename}.wav`)
-            .outputFormat('wav')
-            .on('error', (err) => {
-              reject(err)
-            })
-            .on('end', async () => {
-              log.log('Audio file generated')
-
-              // Read .wav file and convert it to required format
-              const wav = new wavefile.WaveFile(await readFile(`temp/audios/${userId}/${fileBasename}.wav`))
-              wav.toBitDepth('32f') // Pipeline expects input as a Float32Array
-              wav.toSampleRate(16000) // Whisper expects audio with a sampling rate of 16000
-              const audioData = wav.getSamples()
-
-              const transcriber = await WhisperLargeV3Pipeline.getInstance()
-              log.log('Transcribing audio')
-
-              const result = await transcriber(audioData)
-              if (Array.isArray(result)) {
-                const arrayResult = result as { text: string }[]
-                if (arrayResult.length === 0) {
-                  log.log('No transcription result')
-                  return resolve('')
-                }
-
-                log.withField('result', result[0].text).log('Transcription result')
-                resolve(result[0].text)
-              }
-              else {
-                if ('text' in result) {
-                  log.withField('result', result.text).log('Transcription result')
-                  return resolve(result.text)
-                }
-                else {
-                  log.withField('result', result).log('No transcription result')
-                  return resolve('')
-                }
-              }
-            })
-            .run()
-        })
-      }
-      catch (err) {
-        reject(err)
-      }
-    })
-  })
-}
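The refactor moves everything downstream of Opus decoding into a `transcribe(pcmBuffer)` helper in `pipelines/tts`, whose body is not part of this diff. Judging from the removed ffmpeg path, it presumably does the same work in memory: downmix the 48 kHz stereo PCM to mono, resample to the 16 kHz Float32Array Whisper expects, and run the Whisper pipeline. A hypothetical sketch under those assumptions, not the commit's actual implementation:

```ts
import { Buffer } from 'node:buffer'
// Assumed: the Whisper pipeline the removed code used still lives in this module
import { WhisperLargeV3Pipeline } from './pipelines/tts'

export async function transcribe(pcm: Buffer): Promise<string> {
  // Interpret the buffer as interleaved 16-bit stereo samples at 48 kHz
  const samples = new Int16Array(pcm.buffer, pcm.byteOffset, pcm.byteLength / 2)

  // Downmix to mono and normalize to [-1, 1]
  const mono = new Float32Array(samples.length / 2)
  for (let i = 0; i < mono.length; i++)
    mono[i] = (samples[2 * i] + samples[2 * i + 1]) / 2 / 32768

  // Naive 48 kHz -> 16 kHz resample: keep every third sample
  const resampled = new Float32Array(Math.floor(mono.length / 3))
  for (let i = 0; i < resampled.length; i++)
    resampled[i] = mono[i * 3]

  const transcriber = await WhisperLargeV3Pipeline.getInstance()
  const result = await transcriber(resampled)

  // Mirror the result handling of the removed transcribeAudioStream
  if (Array.isArray(result))
    return (result as { text: string }[])[0]?.text ?? ''
  return 'text' in result ? (result as { text: string }).text : ''
}
```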