Skip to content

Commit

Permalink
Functions now async and take getter callbacks instead of data
Browse files Browse the repository at this point in the history
  • Loading branch information
lionel-rowe committed Apr 28, 2023
1 parent 4de7282 commit 7614f68
Show file tree
Hide file tree
Showing 5 changed files with 216 additions and 99 deletions.
40 changes: 31 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# GPT-3-Encoder Deno
# GPT Encoder Deno

Deno-optimized fork of https://github.com/latitudegames/GPT-3-Encoder, a JavaScript BPE Encoder Decoder for GPT-2/GPT-3.

Expand All @@ -8,18 +8,40 @@ GPT-2 and GPT-3 use byte pair encoding to turn text into a series of integers to

## Usage

For convenience, you can add `gpt-encoder-deno` to your `import_map.json` and set it to the current version (e.g. `https://esm.sh/gh/clearlylocal/gpt-encoder-deno@$VERSION`).
Replace `$VERSION` with the version you wish to use, or import via an import map.

```ts
import {
encode,
decode,
tokenLength,
getBpeRankFrom,
getTokenFrom,
getWordFrom,
} from 'https://esm.sh/gh/clearlylocal/gpt-encoder-deno@$VERSION/mod.ts'
import tokenMapping from 'https://raw.githubusercontent.com/clearlylocal/gpt-encoder-deno/$VERSION/token-mapping-gpt3.json' assert { type: 'json' }
import { assertEquals } from 'https://deno.land/std@$STD_VERSION/testing/asserts.ts'
import { encode, decode } from 'gpt-encoder-deno/mod.ts'
import tokenMapping from 'gpt-encoder-deno/token-mapping-gpt3.json' assert { type: 'json' }
const bpe = await (await fetch(import.meta.resolve('gpt-encoder-deno/vocab-gpt3.bpe'))).text()

const getToken = getTokenFrom(tokenMapping)
const getWord = getWordFrom(tokenMapping)
const getBpeRank = getBpeRankFrom(
await (
await fetch(
import.meta.resolve(
'https://raw.githubusercontent.com/clearlylocal/gpt-encoder-deno/$VERSION/vocab-gpt3.bpe',
),
)
).text(),
)

const str = 'my example string 🦄'
const encoded = encode(str, { tokenMapping, bpe })
const encoded = await encode(str, { getToken, getBpeRank })
const len = await tokenLength(str, { getBpeRank })

const expectedTokens = [1820, 1672, 4731, 12520, 99, 226]

assertEquals(encoded, [1820, 1672, 4731, 12520, 99, 226])
assertEquals(encoded, expectedTokens)
assertEquals(len, expectedTokens.length)

for (const [idx, data] of [
{ token: 1820, string: 'my' },
Expand All @@ -30,9 +52,9 @@ for (const [idx, data] of [
{ token: 226, string: '' },
].entries()) {
const token = encoded[idx]
assertEquals(data, { token, string: decode([token], { tokenMapping }) })
assertEquals(data, { token, string: await decode([token], { getWord }) })
}

const decoded = decode(encoded, { tokenMapping })
const decoded = await decode(encoded, { getWord })
assertEquals(decoded, str)
```
122 changes: 99 additions & 23 deletions codec.test.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import tokenMapping from './token-mapping-gpt3.json' assert { type: 'json' }
import { encode, decode } from './codec.ts'
import { encode, decode, tokenLength, getBpeRankFrom, getTokenFrom, getWordFrom } from './mod.ts'
import { assertEquals } from 'https://deno.land/std@$STD_VERSION/testing/asserts.ts'
const bpe = await (await fetch(import.meta.resolve('./vocab-gpt3.bpe'))).text()

const getToken = getTokenFrom(tokenMapping)
const getWord = getWordFrom(tokenMapping)
const getBpeRank = getBpeRankFrom(await (await fetch(import.meta.resolve('./vocab-gpt3.bpe'))).text())

Deno.test('docs', async (t) => {
const docFiles = ['./README.md']
Expand All @@ -22,46 +25,119 @@ Deno.test('docs', async (t) => {
}
})

Deno.test('empty string', () => {
Deno.test('empty string', async () => {
const str = ''
assertEquals(encode(str, { tokenMapping, bpe }), [])
assertEquals(decode(encode(str, { tokenMapping, bpe }), { tokenMapping }), str)

const expectedTokens: number[] = []
const encoded = await encode(str, { getToken, getBpeRank })
const decoded = await decode(encoded, { getWord })
const len = await tokenLength(str, { getBpeRank })

assertEquals(encoded, expectedTokens)
assertEquals(decoded, str)
assertEquals(len, expectedTokens.length)
})

Deno.test('space', () => {
Deno.test('space', async () => {
const str = ' '
assertEquals(encode(str, { tokenMapping, bpe }), [220])
assertEquals(decode(encode(str, { tokenMapping, bpe }), { tokenMapping }), str)

const expectedTokens = [220]
const encoded = await encode(str, { getToken, getBpeRank })
const decoded = await decode(encoded, { getWord })
const len = await tokenLength(str, { getBpeRank })

assertEquals(encoded, expectedTokens)
assertEquals(decoded, str)
assertEquals(len, expectedTokens.length)
})

Deno.test('tab', () => {
Deno.test('tab', async () => {
const str = '\t'
assertEquals(encode(str, { tokenMapping, bpe }), [197])
assertEquals(decode(encode(str, { tokenMapping, bpe }), { tokenMapping }), str)

const expectedTokens = [197]
const encoded = await encode(str, { getToken, getBpeRank })
const decoded = await decode(encoded, { getWord })
const len = await tokenLength(str, { getBpeRank })

assertEquals(encoded, expectedTokens)
assertEquals(decoded, str)
assertEquals(len, expectedTokens.length)
})

Deno.test('simple text', () => {
Deno.test('simple text', async () => {
const str = 'This is some text'
assertEquals(encode(str, { tokenMapping, bpe }), [1212, 318, 617, 2420])
assertEquals(decode(encode(str, { tokenMapping, bpe }), { tokenMapping }), str)

const expectedTokens = [1212, 318, 617, 2420]
const encoded = await encode(str, { getToken, getBpeRank })
const decoded = await decode(encoded, { getWord })
const len = await tokenLength(str, { getBpeRank })

assertEquals(encoded, expectedTokens)
assertEquals(decoded, str)
assertEquals(len, expectedTokens.length)
})

Deno.test('multi-token word', () => {
Deno.test('multi-token word', async () => {
const str = 'indivisible'
assertEquals(encode(str, { tokenMapping, bpe }), [521, 452, 12843])
assertEquals(decode(encode(str, { tokenMapping, bpe }), { tokenMapping }), str)

const expectedTokens = [521, 452, 12843]
const encoded = await encode(str, { getToken, getBpeRank })
const decoded = await decode(encoded, { getWord })
const len = await tokenLength(str, { getBpeRank })

assertEquals(encoded, expectedTokens)
assertEquals(decoded, str)
assertEquals(len, expectedTokens.length)
})

Deno.test('emojis', () => {
Deno.test('repetition (no cache)', async () => {
const str = 'This is some text, This is some text'

const expectedTokens = [1212, 318, 617, 2420, 11, 770, 318, 617, 2420]
const encoded = await encode(str, { getToken, getBpeRank })
const decoded = await decode(encoded, { getWord })
const len = await tokenLength(str, { getBpeRank })

assertEquals(encoded, expectedTokens)
assertEquals(decoded, str)
assertEquals(len, expectedTokens.length)
})

Deno.test('repetition (with cache)', async () => {
const str = 'This is some text, This is some text'

const expectedTokens = [1212, 318, 617, 2420, 11, 770, 318, 617, 2420]
const encoded = await encode(str, { getToken, getBpeRank, cache: new Map<string, string>() })
const decoded = await decode(encoded, { getWord })
const len = await tokenLength(str, { getBpeRank, cache: new Map<string, string>() })

assertEquals(encoded, expectedTokens)
assertEquals(decoded, str)
assertEquals(len, expectedTokens.length)
})

Deno.test('emojis', async () => {
const str = 'hello 👋 world 🌍'

assertEquals(encode(str, { tokenMapping, bpe }), [31373, 50169, 233, 995, 12520, 234, 235])
assertEquals(decode(encode(str, { tokenMapping, bpe }), { tokenMapping }), str)
const expectedTokens = [31373, 50169, 233, 995, 12520, 234, 235]
const encoded = await encode(str, { getToken, getBpeRank })
const decoded = await decode(encoded, { getWord })
const len = await tokenLength(str, { getBpeRank })

assertEquals(encoded, expectedTokens)
assertEquals(decoded, str)
assertEquals(len, expectedTokens.length)
})

Deno.test('properties of Object', () => {
Deno.test('properties of Object', async () => {
const str = 'toString constructor hasOwnProperty valueOf'

assertEquals(encode(str, { tokenMapping, bpe }), [1462, 10100, 23772, 468, 23858, 21746, 1988, 5189])
assertEquals(decode(encode(str, { tokenMapping, bpe }), { tokenMapping }), str)
const expectedTokens = [1462, 10100, 23772, 468, 23858, 21746, 1988, 5189]
const encoded = await encode(str, { getToken, getBpeRank })
const decoded = await decode(encoded, { getWord })
const len = await tokenLength(str, { getBpeRank })

assertEquals(encoded, expectedTokens)
assertEquals(decoded, str)
assertEquals(len, expectedTokens.length)
})
Loading

0 comments on commit 7614f68

Please sign in to comment.