Functions now async and take getter callbacks instead of data
1 parent 4de7282 · commit 7614f68

Showing 5 changed files with 216 additions and 99 deletions.
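At call sites, the change means `encode`, `decode`, and `tokenLength` are now awaited and receive lookup callbacks instead of raw data objects. A hedged sketch of the new call shapes, inferred from the diffs below — these are not the commit's actual type declarations, and the getters may also be permitted to return synchronously:

```ts
// Inferred call shapes only — see the diffs below for the real usage.
type MaybePromise<T> = T | Promise<T>

type GetToken = (word: string) => MaybePromise<number> // word → token id
type GetWord = (token: number) => MaybePromise<string> // token id → word
type GetBpeRank = (pair: string) => MaybePromise<number | undefined> // merge pair → rank

declare function encode(
    text: string,
    opts: { getToken: GetToken; getBpeRank: GetBpeRank; cache?: Map<string, string> },
): Promise<number[]>
declare function decode(tokens: number[], opts: { getWord: GetWord }): Promise<string>
declare function tokenLength(
    text: string,
    opts: { getBpeRank: GetBpeRank; cache?: Map<string, string> },
): Promise<number>
```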
README.md

````diff
@@ -1,4 +1,4 @@
-# GPT-3-Encoder Deno
+# GPT Encoder Deno
 
 Deno-optimized fork of https://github.com/latitudegames/GPT-3-Encoder, a JavaScript BPE Encoder Decoder for GPT-2/GPT-3.
 
@@ -8,18 +8,40 @@ GPT-2 and GPT-3 use byte pair encoding to turn text into a series of integers to
 
 ## Usage
 
-For convenience, you can add `gpt-encoder-deno` to your `import_map.json` and set it to the current version (e.g. `https://esm.sh/gh/clearlylocal/gpt-encoder-deno@$VERSION`).
+Replace `$VERSION` with the version you wish to use, or import via an import map.
 
 ```ts
+import {
+    encode,
+    decode,
+    tokenLength,
+    getBpeRankFrom,
+    getTokenFrom,
+    getWordFrom,
+} from 'https://esm.sh/gh/clearlylocal/gpt-encoder-deno@$VERSION/mod.ts'
+import tokenMapping from 'https://raw.githubusercontent.com/clearlylocal/gpt-encoder-deno/$VERSION/token-mapping-gpt3.json' assert { type: 'json' }
 import { assertEquals } from 'https://deno.land/std@$STD_VERSION/testing/asserts.ts'
-import { encode, decode } from 'gpt-encoder-deno/mod.ts'
-import tokenMapping from 'gpt-encoder-deno/token-mapping-gpt3.json' assert { type: 'json' }
-const bpe = await (await fetch(import.meta.resolve('gpt-encoder-deno/vocab-gpt3.bpe'))).text()
 
+const getToken = getTokenFrom(tokenMapping)
+const getWord = getWordFrom(tokenMapping)
+const getBpeRank = getBpeRankFrom(
+    await (
+        await fetch(
+            import.meta.resolve(
+                'https://raw.githubusercontent.com/clearlylocal/gpt-encoder-deno/$VERSION/vocab-gpt3.bpe',
+            ),
+        )
+    ).text(),
+)
+
 const str = 'my example string 🦄'
-const encoded = encode(str, { tokenMapping, bpe })
+const encoded = await encode(str, { getToken, getBpeRank })
+const len = await tokenLength(str, { getBpeRank })
+
+const expectedTokens = [1820, 1672, 4731, 12520, 99, 226]
 
-assertEquals(encoded, [1820, 1672, 4731, 12520, 99, 226])
+assertEquals(encoded, expectedTokens)
+assertEquals(len, expectedTokens.length)
 
 for (const [idx, data] of [
     { token: 1820, string: 'my' },
@@ -30,9 +52,9 @@ for (const [idx, data] of [
     { token: 226, string: '�' },
 ].entries()) {
     const token = encoded[idx]
-    assertEquals(data, { token, string: decode([token], { tokenMapping }) })
+    assertEquals(data, { token, string: await decode([token], { getWord }) })
 }
 
-const decoded = decode(encoded, { tokenMapping })
+const decoded = await decode(encoded, { getWord })
 assertEquals(decoded, str)
 ```
````
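The `getTokenFrom`/`getWordFrom`/`getBpeRankFrom` helpers used above are factories: each closes over a data source and hands back a lookup callback. A minimal sketch of that pattern under assumed data shapes — not the library's actual implementation:

```ts
// Assumed shapes: tokenMapping as word → token id, and the getters as plain
// synchronous closures. The real factories may differ in details.
function getTokenFrom(mapping: Record<string, number>) {
    return (word: string) => mapping[word]
}

function getWordFrom(mapping: Record<string, number>) {
    // Invert word → token once so decoding is an O(1) Map lookup per token.
    const inverse = new Map(Object.entries(mapping).map(([word, token]) => [token, word]))
    return (token: number) => inverse.get(token)
}
```

Because callers only ever hand `encode`/`decode` a callback, the same API also admits asynchronous sources — a fetch or key-value lookup per token — without changing the call sites.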
Tests
````diff
@@ -1,7 +1,10 @@
 import tokenMapping from './token-mapping-gpt3.json' assert { type: 'json' }
-import { encode, decode } from './codec.ts'
+import { encode, decode, tokenLength, getBpeRankFrom, getTokenFrom, getWordFrom } from './mod.ts'
 import { assertEquals } from 'https://deno.land/std@$STD_VERSION/testing/asserts.ts'
-const bpe = await (await fetch(import.meta.resolve('./vocab-gpt3.bpe'))).text()
+
+const getToken = getTokenFrom(tokenMapping)
+const getWord = getWordFrom(tokenMapping)
+const getBpeRank = getBpeRankFrom(await (await fetch(import.meta.resolve('./vocab-gpt3.bpe'))).text())
 
 Deno.test('docs', async (t) => {
     const docFiles = ['./README.md']
@@ -22,46 +25,119 @@ Deno.test('docs', async (t) => {
     }
 })
 
-Deno.test('empty string', () => {
+Deno.test('empty string', async () => {
     const str = ''
-    assertEquals(encode(str, { tokenMapping, bpe }), [])
-    assertEquals(decode(encode(str, { tokenMapping, bpe }), { tokenMapping }), str)
+
+    const expectedTokens: number[] = []
+    const encoded = await encode(str, { getToken, getBpeRank })
+    const decoded = await decode(encoded, { getWord })
+    const len = await tokenLength(str, { getBpeRank })
+
+    assertEquals(encoded, expectedTokens)
+    assertEquals(decoded, str)
+    assertEquals(len, expectedTokens.length)
 })
 
-Deno.test('space', () => {
+Deno.test('space', async () => {
     const str = ' '
-    assertEquals(encode(str, { tokenMapping, bpe }), [220])
-    assertEquals(decode(encode(str, { tokenMapping, bpe }), { tokenMapping }), str)
+
+    const expectedTokens = [220]
+    const encoded = await encode(str, { getToken, getBpeRank })
+    const decoded = await decode(encoded, { getWord })
+    const len = await tokenLength(str, { getBpeRank })
+
+    assertEquals(encoded, expectedTokens)
+    assertEquals(decoded, str)
+    assertEquals(len, expectedTokens.length)
 })
 
-Deno.test('tab', () => {
+Deno.test('tab', async () => {
     const str = '\t'
-    assertEquals(encode(str, { tokenMapping, bpe }), [197])
-    assertEquals(decode(encode(str, { tokenMapping, bpe }), { tokenMapping }), str)
+
+    const expectedTokens = [197]
+    const encoded = await encode(str, { getToken, getBpeRank })
+    const decoded = await decode(encoded, { getWord })
+    const len = await tokenLength(str, { getBpeRank })
+
+    assertEquals(encoded, expectedTokens)
+    assertEquals(decoded, str)
+    assertEquals(len, expectedTokens.length)
 })
 
-Deno.test('simple text', () => {
+Deno.test('simple text', async () => {
     const str = 'This is some text'
-    assertEquals(encode(str, { tokenMapping, bpe }), [1212, 318, 617, 2420])
-    assertEquals(decode(encode(str, { tokenMapping, bpe }), { tokenMapping }), str)
+
+    const expectedTokens = [1212, 318, 617, 2420]
+    const encoded = await encode(str, { getToken, getBpeRank })
+    const decoded = await decode(encoded, { getWord })
+    const len = await tokenLength(str, { getBpeRank })
+
+    assertEquals(encoded, expectedTokens)
+    assertEquals(decoded, str)
+    assertEquals(len, expectedTokens.length)
 })
 
-Deno.test('multi-token word', () => {
+Deno.test('multi-token word', async () => {
     const str = 'indivisible'
-    assertEquals(encode(str, { tokenMapping, bpe }), [521, 452, 12843])
-    assertEquals(decode(encode(str, { tokenMapping, bpe }), { tokenMapping }), str)
+
+    const expectedTokens = [521, 452, 12843]
+    const encoded = await encode(str, { getToken, getBpeRank })
+    const decoded = await decode(encoded, { getWord })
+    const len = await tokenLength(str, { getBpeRank })
+
+    assertEquals(encoded, expectedTokens)
+    assertEquals(decoded, str)
+    assertEquals(len, expectedTokens.length)
 })
 
-Deno.test('emojis', () => {
+Deno.test('repetition (no cache)', async () => {
+    const str = 'This is some text, This is some text'
+
+    const expectedTokens = [1212, 318, 617, 2420, 11, 770, 318, 617, 2420]
+    const encoded = await encode(str, { getToken, getBpeRank })
+    const decoded = await decode(encoded, { getWord })
+    const len = await tokenLength(str, { getBpeRank })
+
+    assertEquals(encoded, expectedTokens)
+    assertEquals(decoded, str)
+    assertEquals(len, expectedTokens.length)
+})
+
+Deno.test('repetition (with cache)', async () => {
+    const str = 'This is some text, This is some text'
+
+    const expectedTokens = [1212, 318, 617, 2420, 11, 770, 318, 617, 2420]
+    const encoded = await encode(str, { getToken, getBpeRank, cache: new Map<string, string>() })
+    const decoded = await decode(encoded, { getWord })
+    const len = await tokenLength(str, { getBpeRank, cache: new Map<string, string>() })
+
+    assertEquals(encoded, expectedTokens)
+    assertEquals(decoded, str)
+    assertEquals(len, expectedTokens.length)
+})
+
+Deno.test('emojis', async () => {
     const str = 'hello 👋 world 🌍'
 
-    assertEquals(encode(str, { tokenMapping, bpe }), [31373, 50169, 233, 995, 12520, 234, 235])
-    assertEquals(decode(encode(str, { tokenMapping, bpe }), { tokenMapping }), str)
+    const expectedTokens = [31373, 50169, 233, 995, 12520, 234, 235]
+    const encoded = await encode(str, { getToken, getBpeRank })
+    const decoded = await decode(encoded, { getWord })
+    const len = await tokenLength(str, { getBpeRank })
+
+    assertEquals(encoded, expectedTokens)
+    assertEquals(decoded, str)
+    assertEquals(len, expectedTokens.length)
 })
 
-Deno.test('properties of Object', () => {
+Deno.test('properties of Object', async () => {
     const str = 'toString constructor hasOwnProperty valueOf'
 
-    assertEquals(encode(str, { tokenMapping, bpe }), [1462, 10100, 23772, 468, 23858, 21746, 1988, 5189])
-    assertEquals(decode(encode(str, { tokenMapping, bpe }), { tokenMapping }), str)
+    const expectedTokens = [1462, 10100, 23772, 468, 23858, 21746, 1988, 5189]
+    const encoded = await encode(str, { getToken, getBpeRank })
+    const decoded = await decode(encoded, { getWord })
+    const len = await tokenLength(str, { getBpeRank })
+
+    assertEquals(encoded, expectedTokens)
+    assertEquals(decoded, str)
+    assertEquals(len, expectedTokens.length)
 })
````
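The two new repetition tests differ only in the `cache` option. Assuming the `Map<string, string>` memoizes word → merged-BPE-string results (which its type suggests, though the diff doesn't show the implementation), a cache shared across calls lets repeated words skip the merge loop. A usage sketch reusing the test file's `getToken`/`getBpeRank`:

```ts
// Sketch only: one cache shared across several encode calls. Assumes the
// cache memoizes per-word BPE merge results, as Map<string, string> suggests.
const sharedCache = new Map<string, string>()

for (const line of ['This is some text', 'This is some text, again']) {
    await encode(line, { getToken, getBpeRank, cache: sharedCache })
}
```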