Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(tokenize): add basic tokenizer implementations #109

Merged
merged 7 commits
Oct 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/funny-adults-brake.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@livekit/agents": minor
---

add basic tokenizer implementations
4 changes: 2 additions & 2 deletions agents/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import * as cli from './cli.js';
import * as llm from './llm/index.js';
import * as multimodal from './multimodal/index.js';
import * as stt from './stt/index.js';
import * as tokenize from './tokenize/index.js';
import * as tts from './tts/index.js';

export * from './vad.js';
Expand All @@ -23,8 +24,7 @@ export * from './worker.js';
export * from './utils.js';
export * from './log.js';
export * from './generator.js';
export * from './tokenize.js';
export * from './audio.js';
export * from './transcription.js';

export { cli, stt, tts, llm, multimodal };
export { cli, stt, tts, llm, multimodal, tokenize };
22 changes: 0 additions & 22 deletions agents/src/tokenize.ts

This file was deleted.

73 changes: 73 additions & 0 deletions agents/src/tokenize/basic/basic.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0
import * as tokenizer from '../index.js';
import { BufferedSentenceStream } from '../token_stream.js';
import { hyphenator } from './hyphenator.js';
import { splitParagraphs } from './paragraph.js';
import { splitSentences } from './sentence.js';
import { splitWords } from './word.js';

// Internal configuration captured by SentenceTokenizer's constructor.
interface TokenizerOptions {
  // Language tag (default 'en-US'); not consulted by the rule-based
  // splitters visible in this file — presumably reserved for future use.
  language: string;
  // Sentences shorter than this many characters are merged with the next.
  minSentenceLength: number;
  // Passed as the context-length argument to BufferedSentenceStream —
  // presumably the amount of trailing context the stream retains; confirm
  // against token_stream.ts.
  streamContextLength: number;
}

/**
 * Rule-based sentence tokenizer built on {@link splitSentences}.
 */
export class SentenceTokenizer extends tokenizer.SentenceTokenizer {
  /** Options captured at construction time. */
  #opts: TokenizerOptions;

  constructor(language = 'en-US', minSentenceLength = 20, streamContextLength = 10) {
    super();
    this.#opts = { language, minSentenceLength, streamContextLength };
  }

  /** Split `text` into sentence strings (positions are discarded). */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  tokenize(text: string, language?: string): string[] {
    const sentences = splitSentences(text, this.#opts.minSentenceLength);
    return sentences.map(([sentence]) => sentence);
  }

  /** Create a stream that buffers text and emits sentences incrementally. */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  stream(language?: string): tokenizer.SentenceStream {
    const split = (text: string) => splitSentences(text, this.#opts.minSentenceLength);
    return new BufferedSentenceStream(
      split,
      this.#opts.minSentenceLength,
      this.#opts.streamContextLength,
    );
  }
}

/**
 * Rule-based word tokenizer built on {@link splitWords}.
 *
 * NOTE(review): this class extends `tokenizer.SentenceTokenizer` and its
 * `stream()` returns a `SentenceStream` — that looks like a copy-paste from
 * SentenceTokenizer above. Confirm whether `tokenizer.WordTokenizer` and a
 * dedicated word stream were intended (no buffered word stream exists in
 * this changeset, so the sentence stream is reused with lengths of 1).
 */
export class WordTokenizer extends tokenizer.SentenceTokenizer {
  /** When true, punctuation characters are stripped from each word. */
  #ignorePunctuation: boolean;

  constructor(ignorePunctuation = true) {
    super();
    this.#ignorePunctuation = ignorePunctuation;
  }

  /** Split `text` into word strings (positions are discarded). */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  tokenize(text: string, language?: string): string[] {
    const words = splitWords(text, this.#ignorePunctuation);
    return words.map(([word]) => word);
  }

  /** Create a stream that emits each word as soon as it is buffered. */
  // eslint-disable-next-line @typescript-eslint/no-unused-vars
  stream(language?: string): tokenizer.SentenceStream {
    const split = (text: string) => splitWords(text, this.#ignorePunctuation);
    // Minimum token length and context length of 1: flush word-by-word.
    return new BufferedSentenceStream(split, 1, 1);
  }
}

/** Break a single word into its hyphenation parts via the shared hyphenator. */
export const hyphenateWord = (word: string): string[] => hyphenator.hyphenateWord(word);

/** Split `text` into paragraph strings, discarding position information. */
export const tokenizeParagraphs = (text: string): string[] =>
  splitParagraphs(text).map(([paragraph]) => paragraph);
436 changes: 436 additions & 0 deletions agents/src/tokenize/basic/hyphenator.ts

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions agents/src/tokenize/basic/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0

// Public surface of the "basic" tokenizers: rule-based sentence/word
// splitting, paragraph tokenization, and a hyphenation helper.
export { SentenceTokenizer, WordTokenizer, tokenizeParagraphs, hyphenateWord } from './basic.js';
43 changes: 43 additions & 0 deletions agents/src/tokenize/basic/paragraph.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0

/**
* Split the text into paragraphs.
*/
/**
 * Split the text into paragraphs.
 *
 * Paragraphs are separated by blank lines (`\n\s*\n`). Each result is a
 * tuple of (trimmed paragraph, start index, end index) where the indices
 * locate the trimmed text within the original input.
 */
export const splitParagraphs = (text: string): [string, number, number][] => {
  const separator = /\n\s*\n/g;
  const matches = [...text.matchAll(separator)];

  // No separator at all: the whole input is (at most) one paragraph.
  if (matches.length === 0) {
    const trimmed = text.trim();
    if (!trimmed) return [];
    const offset = text.indexOf(trimmed);
    return [[trimmed, offset, offset + trimmed.length]];
  }

  // Collect the raw (start, end) span of every segment between separators,
  // including the trailing segment after the final separator.
  const spans: [number, number][] = [];
  let cursor = 0;
  for (const m of matches) {
    spans.push([cursor, m.index!]);
    cursor = m.index! + m[0].length;
  }
  spans.push([cursor, text.length]);

  // Trim each segment and translate its position back to input coordinates.
  const paragraphs: [string, number, number][] = [];
  for (const [from, to] of spans) {
    const raw = text.slice(from, to);
    const trimmed = raw.trim();
    if (!trimmed) continue;
    const begin = from + raw.indexOf(trimmed);
    paragraphs.push([trimmed, begin, begin + trimmed.length]);
  }

  return paragraphs;
};
69 changes: 69 additions & 0 deletions agents/src/tokenize/basic/sentence.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0

/**
* Split the text into sentences.
*/
/**
 * Split the text into sentences.
 *
 * Rule-based splitter: common abbreviations, acronyms, decimals, website
 * TLDs and quoted punctuation are protected with temporary `<prd>` markers
 * before `.`/`?`/`!` are turned into `<stop>` boundaries. Sentences shorter
 * than `minLength` are merged with the following one.
 *
 * @returns tuples of (sentence, start, end); indices are tracked on the
 *   transformed text, so treat them as approximate.
 */
export const splitSentences = (text: string, minLength = 20): [string, number, number][] => {
  // These fragments are plain strings because they are composed with
  // template literals below. Interpolating a RegExp object stringifies it
  // as "/pattern/flags", which previously produced patterns that never
  // matched (making most of the rules silent no-ops).
  const alphabets = '([A-Za-z])';
  const prefixes = /(Mr|St|Mrs|Ms|Dr)[.]/g;
  const suffixes = '(Inc|Ltd|Jr|Sr|Co)';
  const starters =
    '(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\\s|She\\s|It\\s|They\\s|Their\\s|Our\\s|We\\s|But\\s|However\\s|That\\s|This\\s|Wherever)';
  const acronyms = '([A-Z][.][A-Z][.](?:[A-Z][.])?)';
  const websites = /[.](com|net|org|io|gov|edu|me)/g;
  const digits = '([0-9])';
  const dots = /\.{2,}/g;

  text = text.replaceAll('\n', ' ');
  text = text.replaceAll(prefixes, '$1<prd>');
  // `websites` has a single capture group, so the TLD is $1. (The previous
  // '$2' replacement inserted a literal "$2" into the text.)
  text = text.replaceAll(websites, '<prd>$1');
  // Protect decimal points: "2.5" -> "2<prd>5".
  text = text.replaceAll(new RegExp(`${digits}[.]${digits}`, 'g'), '$1<prd>$2');
  // Each dot of an ellipsis becomes its own <prd> marker.
  text = text.replaceAll(dots, (match) => '<prd>'.repeat(match.length));
  text = text.replaceAll('Ph.D.', 'Ph<prd>D<prd>');
  // '\\s' must be double-escaped: inside a template literal '\s' is just 's'.
  // NOTE(review): these replacements drop the matched leading whitespace;
  // the reference Python splitter preserves it — confirm intent.
  text = text.replaceAll(new RegExp(`\\s${alphabets}[.]`, 'g'), '$1<prd>');
  text = text.replaceAll(new RegExp(`${acronyms} ${starters}`, 'g'), '$1<stop> $2');
  text = text.replaceAll(
    new RegExp(`${alphabets}[.]${alphabets}[.]${alphabets}[.]`, 'g'),
    '$1<prd>$2<prd>$3<prd>',
  );
  text = text.replaceAll(new RegExp(`${alphabets}[.]${alphabets}[.]`, 'g'), '$1<prd>$2<prd>');
  text = text.replaceAll(new RegExp(` ${suffixes}[.] ${starters}`, 'g'), '$1<stop> $2');
  text = text.replaceAll(new RegExp(` ${suffixes}[.]`, 'g'), '$1<prd>');
  text = text.replaceAll(new RegExp(` ${alphabets}[.]`, 'g'), '$1<prd>');
  // Move sentence-final punctuation outside closing quotes so the quote
  // stays attached to its sentence.
  text = text.replaceAll('.”', '”.');
  text = text.replaceAll('."', '".');
  text = text.replaceAll('!"', '"!');
  text = text.replaceAll('?"', '"?');
  text = text.replaceAll('.', '.<stop>');
  text = text.replaceAll('?', '?<stop>');
  text = text.replaceAll('!', '!<stop>');
  text = text.replaceAll('<prd>', '.');

  const split = text.split('<stop>');
  text = text.replaceAll('<stop>', '');

  const sentences: [string, number, number][] = [];
  let buf = '';
  let start = 0;
  let end = 0;
  for (const match of split) {
    const sentence = match.trim();
    if (!sentence) continue;

    // Accumulate short fragments until the buffer exceeds minLength.
    buf += ' ' + sentence;
    end += match.length;
    if (buf.length > minLength) {
      sentences.push([buf.slice(1), start, end]);
      start = end;
      buf = '';
    }
  }

  // Flush whatever remains as the final sentence.
  if (buf) {
    sentences.push([buf.slice(1), start, text.length - 1]);
  }

  return sentences;
};
27 changes: 27 additions & 0 deletions agents/src/tokenize/basic/word.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0
import { PUNCTUATIONS } from '../tokenizer.js';

/**
* Split the text into words.
*/
export const splitWords = (text: string, ignorePunctuation = true): [string, number, number][] => {
const re = /\S+/g;
const words: [string, number, number][] = [];

let arr;
while ((arr = re.exec(text)) !== null) {
let word = arr[0];
const start = arr.index;
const end = start + word.length;

if (ignorePunctuation) {
word = word.replace(new RegExp(`[${PUNCTUATIONS.join('')}]`, 'g'), '');
}

words.push([word, start, end]);
}

return words;
};
16 changes: 16 additions & 0 deletions agents/src/tokenize/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0
import * as basic from './basic/index.js';

// Abstract tokenizer interfaces and the shared token data type.
export {
  type TokenData,
  SentenceTokenizer,
  SentenceStream,
  WordTokenizer,
  WordStream,
} from './tokenizer.js';

// Streaming helpers that buffer incoming text and emit tokens incrementally.
export { BufferedSentenceStream, BufferedTokenStream } from './token_stream.js';

// Rule-based reference implementations.
export { basic };
Loading