Skip to content

Commit

Permalink
Optimize for Deno
Browse files Browse the repository at this point in the history
  • Loading branch information
lionel-rowe committed Mar 8, 2023
1 parent 9df47fc commit 29f8660
Show file tree
Hide file tree
Showing 20 changed files with 275 additions and 4,950 deletions.
30 changes: 0 additions & 30 deletions .github/workflows/node.js.yml

This file was deleted.

3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
node_modules
.npmrc
.env
2 changes: 2 additions & 0 deletions .prettierignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
vocab.bpe
encoder.json
18 changes: 18 additions & 0 deletions .prettierrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"arrowParens": "always",
"bracketSpacing": true,
"htmlWhitespaceSensitivity": "strict",
"insertPragma": false,
"bracketSameLine": false,
"jsxSingleQuote": true,
"proseWrap": "preserve",
"quoteProps": "as-needed",
"requirePragma": false,
"semi": false,
"singleQuote": true,
"tabWidth": 4,
"useTabs": true,
"trailingComma": "all",
"vueIndentScriptAndStyle": false,
"printWidth": 120
}
4 changes: 4 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"deno.enable": true,
"deno.unstable": true
}
276 changes: 144 additions & 132 deletions Encoder.js
Original file line number Diff line number Diff line change
@@ -1,178 +1,190 @@
// This file includes code which was modified from https://github.com/openai/gpt-2
// Data files are loaded the Deno/ESM way: encoder.json as a JSON module,
// vocab.bpe fetched relative to this module via import.meta.resolve.
import encoder from './encoder.json' assert { type: 'json' }
const bpe_file = await (await fetch(import.meta.resolve('./vocab.bpe'))).text()

// Returns the integers of the half-open interval [x, y) as an array
// (equivalent of Python's range(x, y)).
const range = (x, y) => {
	return Array.from(Array(y).keys()).slice(x)
}

// Returns the UTF-16 code unit of the first character of x (Python's ord()).
const ord = (x) => {
	return x.charCodeAt(0)
}

// Returns the character for code unit x (Python's chr()).
const chr = (x) => {
	return String.fromCharCode(x)
}

// TextEncoder always encodes UTF-8; its constructor takes no arguments.
const textEncoder = new TextEncoder()
// UTF-8-encodes str and returns its byte values as an array of decimal strings
// (string keys, so they can index the byte_encoder lookup table).
const encodeStr = (str) => {
	return Array.from(textEncoder.encode(str)).map((x) => x.toString())
}

const textDecoder = new TextDecoder('utf-8')
// Decodes an array of byte values back into a UTF-8 string.
const decodeStr = (arr) => {
	return textDecoder.decode(new Uint8Array(arr))
}

// Builds an object mapping each element of x to the same-index element of y
// (Python's dict(zip(x, y))). Non-string keys are stringified by the object.
const dictZip = (x, y) => {
	const result = {}
	// forEach, not map: we only want the side effect of filling `result`.
	x.forEach((_, i) => {
		result[x[i]] = y[i]
	})
	return result
}

// Returns a mapping from byte values (0-255) to printable unicode characters,
// mirroring gpt-2's bytes_to_unicode(): printable ASCII / Latin-1 bytes map to
// themselves, and the remaining bytes are remapped to code points >= 256 so
// every byte has a visible, reversible character.
function bytes_to_unicode() {
	const bs = range(ord('!'), ord('~') + 1).concat(range(ord('¡'), ord('¬') + 1), range(ord('®'), ord('ÿ') + 1))

	let cs = bs.slice()
	let n = 0
	for (let b = 0; b < 2 ** 8; b++) {
		if (!bs.includes(b)) {
			bs.push(b)
			cs.push(2 ** 8 + n)
			n = n + 1
		}
	}

	cs = cs.map((x) => chr(x))

	const result = {}
	// forEach, not map: we only want the side effect of filling `result`.
	bs.forEach((_, i) => {
		result[bs[i]] = cs[i]
	})
	return result
}

// Returns the set of adjacent symbol pairs ([prev, next] arrays) occurring in
// word (an array of symbols). Pairs are arrays, so Set membership is by
// reference: repeated bigrams appear once per occurrence, which is harmless
// for the rank lookup done by the caller.
function get_pairs(word) {
	const pairs = new Set()
	let prev_char = word[0]
	for (let i = 1; i < word.length; i++) {
		const char = word[i]
		pairs.add([prev_char, char])
		prev_char = char
	}
	return pairs
}

// GPT-2's tokenization pattern: contractions, letter runs, number runs,
// other-symbol runs, and whitespace (with lookahead to keep trailing spaces
// attached to the following token).
const pat = /'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/gu

// Inverse of the encoder table: token id (stringified) -> token text.
const decoder = {}
Object.keys(encoder).forEach((x) => {
	decoder[encoder[x]] = x
})

const lines = bpe_file.split('\n')

// bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]]
const bpe_merges = lines.slice(1, lines.length - 1).map((x) => {
	return x.split(/(\s+)/).filter((e) => e.trim().length > 0)
})

const byte_encoder = bytes_to_unicode()
// Inverse of byte_encoder: unicode character -> original byte value (string).
const byte_decoder = {}
Object.keys(byte_encoder).forEach((x) => {
	byte_decoder[byte_encoder[x]] = x
})

// Merge pair -> priority rank (lower rank merges first).
const bpe_ranks = dictZip(bpe_merges, range(0, bpe_merges.length))
// Memoization cache for bpe(); tokens repeat heavily in real text.
const cache = new Map()

// Applies byte-pair-encoding merges to a single pre-tokenized token (a string
// of unicode-mapped bytes) and returns the merged symbols joined by spaces.
// Results are memoized in `cache`.
function bpe(token) {
	if (cache.has(token)) {
		return cache.get(token)
	}

	let word = token.split('')

	let pairs = get_pairs(word)

	// NOTE(review): get_pairs always returns a Set (possibly empty), so this
	// guard never fires; kept to match the upstream gpt-2 port.
	if (!pairs) {
		return token
	}

	while (true) {
		// Find the pair with the lowest merge rank; pairs absent from
		// bpe_ranks get a sentinel rank of 10e10 so they are never chosen
		// while a known pair remains.
		const minPairs = {}
		Array.from(pairs).forEach((pair) => {
			const rank = bpe_ranks[pair]
			minPairs[isNaN(rank) ? 10e10 : rank] = pair
		})

		const bigram = minPairs[Math.min(...Object.keys(minPairs).map((x) => parseInt(x, 10)))]

		// Stop once the best remaining pair is not a known merge.
		if (!(bigram in bpe_ranks)) {
			break
		}

		// Rebuild `word`, replacing every (first, second) occurrence with the
		// merged symbol.
		const first = bigram[0]
		const second = bigram[1]
		let new_word = []
		let i = 0

		while (i < word.length) {
			const j = word.indexOf(first, i)
			if (j === -1) {
				new_word = new_word.concat(word.slice(i))
				break
			}
			new_word = new_word.concat(word.slice(i, j))
			i = j

			if (word[i] === first && i < word.length - 1 && word[i + 1] === second) {
				new_word.push(first + second)
				i = i + 2
			} else {
				new_word.push(word[i])
				i = i + 1
			}
		}

		word = new_word
		if (word.length === 1) {
			break
		} else {
			pairs = get_pairs(word)
		}
	}

	word = word.join(' ')
	cache.set(token, word)

	return word
}

// Encodes text into an array of GPT-2 BPE token ids.
function encode(text) {
	let bpe_tokens = []
	// Pre-tokenize with the GPT-2 regex, then BPE-merge each piece.
	const matches = Array.from(text.matchAll(pat)).map((x) => x[0])
	for (let token of matches) {
		// Map the token's UTF-8 bytes to their printable-unicode stand-ins.
		token = encodeStr(token)
			.map((x) => byte_encoder[x])
			.join('')

		// bpe() returns space-joined merged symbols; look up each one's id.
		const new_tokens = bpe(token)
			.split(' ')
			.map((x) => encoder[x])
		bpe_tokens = bpe_tokens.concat(new_tokens)
	}
	return bpe_tokens
}

// Decodes an array of GPT-2 BPE token ids back into the original string:
// ids -> token texts -> per-character byte values -> UTF-8 decode.
function decode(tokens) {
	let text = tokens.map((x) => decoder[x]).join('')
	text = decodeStr(text.split('').map((x) => byte_decoder[x]))
	return text
}

module.exports = {
encode,
decode
};
export { encode, decode }
Loading

0 comments on commit 29f8660

Please sign in to comment.