Commit 29f8660 (parent 9df47fc)
Showing 20 changed files with 275 additions and 4,950 deletions.
One of the changed files was deleted; its contents are not shown here.
What is presumably the root `.gitignore` drops the npm-era entries and ignores a local `.env` file instead:

```diff
@@ -1,2 +1 @@
-node_modules
-.npmrc
+.env
```
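Ignoring `.env` fits the move to Deno later in this commit. As a side note, Deno does not read `.env` automatically; a minimal sketch of consuming such a file (hypothetical, not part of this commit), assuming the standard library's dotenv auto-loader and a placeholder variable name:

```js
// Hypothetical example: auto-load .env via std/dotenv, then read a value.
// Run with: deno run --allow-read --allow-env example.js
import 'https://deno.land/std@0.177.0/dotenv/load.ts'

const apiKey = Deno.env.get('API_KEY') // placeholder name, not defined by this repo
if (!apiKey) console.warn('API_KEY is not set')
```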
What appears to be a new ignore file for the downloaded vocabulary data:

```diff
@@ -0,0 +1,2 @@
+vocab.bpe
+encoder.json
```
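`vocab.bpe` (the BPE merge list) and `encoder.json` (the token-to-id table) are GPT-2's vocabulary files, so ignoring them implies they are downloaded at setup time rather than committed. A sketch of fetching them with Deno; the blob-storage URL is the one commonly used by GPT-2 ports, but it is an assumption here, so verify it before relying on this:

```js
// fetch_vocab.js (hypothetical helper, not part of this commit).
// Run with: deno run --allow-net --allow-write fetch_vocab.js
const base = 'https://openaipublic.blob.core.windows.net/gpt-2/models/117M'

for (const name of ['vocab.bpe', 'encoder.json']) {
	const res = await fetch(`${base}/${name}`)
	if (!res.ok) throw new Error(`fetching ${name} failed: ${res.status}`)
	await Deno.writeFile(name, new Uint8Array(await res.arrayBuffer()))
}
```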
A new Prettier configuration (presumably `.prettierrc`):

```diff
@@ -0,0 +1,18 @@
+{
+    "arrowParens": "always",
+    "bracketSpacing": true,
+    "htmlWhitespaceSensitivity": "strict",
+    "insertPragma": false,
+    "bracketSameLine": false,
+    "jsxSingleQuote": true,
+    "proseWrap": "preserve",
+    "quoteProps": "as-needed",
+    "requirePragma": false,
+    "semi": false,
+    "singleQuote": true,
+    "tabWidth": 4,
+    "useTabs": true,
+    "trailingComma": "all",
+    "vueIndentScriptAndStyle": false,
+    "printWidth": 120
+}
```
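This configuration matches the reformatting applied to the tokenizer below: no semicolons, single quotes, trailing commas everywhere, tab indentation rendered at width 4, and a 120-column print width. Running Prettier over the repository (for example `npx prettier --write .`, or via editor integration) should reproduce the new style; the exact invocation is not recorded in this commit.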
New editor settings (presumably `.vscode/settings.json`):

```diff
@@ -0,0 +1,4 @@
+{
+    "deno.enable": true,
+    "deno.unstable": true
+}
```
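These are workspace settings for the VS Code Deno extension: `deno.enable` points the workspace at the Deno language server instead of the built-in TypeScript tooling, and `deno.unstable` permits unstable runtime APIs. Outside the editor, the module below could be loaded with something like `deno run --allow-read Encoder.js` (file name assumed), since fetching the local `vocab.bpe` over a `file:` URL needs read permission.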
Finally, the tokenizer itself (presumably `Encoder.js`) moves from CommonJS to an ES module and is re-printed with the Prettier settings above. Indentation below is reconstructed, since the scraped listing lost its leading whitespace; the new side uses tabs in the actual file, shown here as four spaces. Note the stray empty template literal after the cache check in `bpe()`: Prettier keeps it, re-printing it on its own line behind a defensive leading semicolon, and it remains dead code on both sides.

```diff
@@ -1,178 +1,190 @@
 // This file includes code which was modified from https://github.com/openai/gpt-2
-const fs = require('fs')
-const path = require('path');
-
-const encoder = JSON.parse(fs.readFileSync(path.join(__dirname, './encoder.json')));
-const bpe_file = fs.readFileSync(path.join(__dirname, './vocab.bpe'), 'utf-8');
+import encoder from './encoder.json' assert { type: 'json' }
+const bpe_file = await (await fetch(import.meta.resolve('./vocab.bpe'))).text()
 
 const range = (x, y) => {
-  const res = Array.from(Array(y).keys()).slice(x)
-  return res
+    const res = Array.from(Array(y).keys()).slice(x)
+    return res
 }
 
-const ord = x => {
-  return x.charCodeAt(0)
+const ord = (x) => {
+    return x.charCodeAt(0)
 }
 
-const chr = x => {
-  return String.fromCharCode(x)
+const chr = (x) => {
+    return String.fromCharCode(x)
 }
 
-const textEncoder = new TextEncoder("utf-8")
-const encodeStr = str => {
-  return Array.from(textEncoder.encode(str)).map(x => x.toString())
+const textEncoder = new TextEncoder('utf-8')
+const encodeStr = (str) => {
+    return Array.from(textEncoder.encode(str)).map((x) => x.toString())
 }
 
-const textDecoder = new TextDecoder("utf-8")
-const decodeStr = arr => {
-  return textDecoder.decode(new Uint8Array(arr));
+const textDecoder = new TextDecoder('utf-8')
+const decodeStr = (arr) => {
+    return textDecoder.decode(new Uint8Array(arr))
 }
 
 const dictZip = (x, y) => {
-  const result = {}
-  x.map((_, i) => { result[x[i]] = y[i] })
-  return result
+    const result = {}
+    x.map((_, i) => {
+        result[x[i]] = y[i]
+    })
+    return result
 }
 
 function bytes_to_unicode() {
-  const bs = range(ord('!'), ord('~') + 1).concat(range(ord('¡'), ord('¬') + 1), range(ord('®'), ord('ÿ') + 1))
-
-  let cs = bs.slice()
-  let n = 0
-  for (let b = 0; b < 2 ** 8; b++) {
-    if (!bs.includes(b)) {
-      bs.push(b)
-      cs.push(2 ** 8 + n)
-      n = n + 1
-    }
-  }
-
-  cs = cs.map(x => chr(x))
-
-  const result = {}
-  bs.map((_, i) => { result[bs[i]] = cs[i] })
-  return result
+    const bs = range(ord('!'), ord('~') + 1).concat(range(ord('¡'), ord('¬') + 1), range(ord('®'), ord('ÿ') + 1))
+
+    let cs = bs.slice()
+    let n = 0
+    for (let b = 0; b < 2 ** 8; b++) {
+        if (!bs.includes(b)) {
+            bs.push(b)
+            cs.push(2 ** 8 + n)
+            n = n + 1
+        }
+    }
+
+    cs = cs.map((x) => chr(x))
+
+    const result = {}
+    bs.map((_, i) => {
+        result[bs[i]] = cs[i]
+    })
+    return result
 }
 
 function get_pairs(word) {
-  const pairs = new Set()
-  let prev_char = word[0]
-  for (let i = 1; i < word.length; i++) {
-    const char = word[i]
-    pairs.add([prev_char, char])
-    prev_char = char
-  }
-  return pairs
+    const pairs = new Set()
+    let prev_char = word[0]
+    for (let i = 1; i < word.length; i++) {
+        const char = word[i]
+        pairs.add([prev_char, char])
+        prev_char = char
+    }
+    return pairs
 }
 
 const pat = /'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+/gu
 
 const decoder = {}
-Object.keys(encoder).map(x => { decoder[encoder[x]] = x })
+Object.keys(encoder).map((x) => {
+    decoder[encoder[x]] = x
+})
 
 const lines = bpe_file.split('\n')
 
 // bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]]
-const bpe_merges = lines.slice(1, lines.length - 1).map(x => {
-  return x.split(/(\s+)/).filter(function(e) { return e.trim().length > 0 })
+const bpe_merges = lines.slice(1, lines.length - 1).map((x) => {
+    return x.split(/(\s+)/).filter(function (e) {
+        return e.trim().length > 0
+    })
 })
 
 const byte_encoder = bytes_to_unicode()
 const byte_decoder = {}
-Object.keys(byte_encoder).map(x => { byte_decoder[byte_encoder[x]] = x })
+Object.keys(byte_encoder).map((x) => {
+    byte_decoder[byte_encoder[x]] = x
+})
 
 const bpe_ranks = dictZip(bpe_merges, range(0, bpe_merges.length))
-const cache = new Map;
+const cache = new Map()
 
 function bpe(token) {
-  if (cache.has(token)) {
-    return cache.get(token)
-  }``
-
-  let word = token.split('')
-
-  let pairs = get_pairs(word)
-
-  if (!pairs) {
-    return token
-  }
-
-  while (true) {
-    const minPairs = {}
-    Array.from(pairs).map(pair => {
-      const rank = bpe_ranks[pair]
-      minPairs[(isNaN(rank) ? 10e10 : rank)] = pair
-    })
-
-    const bigram = minPairs[Math.min(...Object.keys(minPairs).map(x => {
-      return parseInt(x)
-    }
-    ))]
-
-    if (!(bigram in bpe_ranks)) {
-      break
-    }
-
-    const first = bigram[0]
-    const second = bigram[1]
-    let new_word = []
-    let i = 0
-
-    while (i < word.length) {
-      const j = word.indexOf(first, i)
-      if (j === -1) {
-        new_word = new_word.concat(word.slice(i))
-        break
-      }
-      new_word = new_word.concat(word.slice(i, j))
-      i = j
-
-      if (word[i] === first && i < word.length - 1 && word[i + 1] === second) {
-        new_word.push(first + second)
-        i = i + 2
-      } else {
-        new_word.push(word[i])
-        i = i + 1
-      }
-    }
-
-    word = new_word
-    if (word.length === 1) {
-      break
-    } else {
-      pairs = get_pairs(word)
-    }
-  }
-
-  word = word.join(' ')
-  cache.set(token, word)
-
-  return word
+    if (cache.has(token)) {
+        return cache.get(token)
+    }
+    ;``
+
+    let word = token.split('')
+
+    let pairs = get_pairs(word)
+
+    if (!pairs) {
+        return token
+    }
+
+    while (true) {
+        const minPairs = {}
+        Array.from(pairs).map((pair) => {
+            const rank = bpe_ranks[pair]
+            minPairs[isNaN(rank) ? 10e10 : rank] = pair
+        })
+
+        const bigram =
+            minPairs[
+                Math.min(
+                    ...Object.keys(minPairs).map((x) => {
+                        return parseInt(x)
+                    }),
+                )
+            ]
+
+        if (!(bigram in bpe_ranks)) {
+            break
+        }
+
+        const first = bigram[0]
+        const second = bigram[1]
+        let new_word = []
+        let i = 0
+
+        while (i < word.length) {
+            const j = word.indexOf(first, i)
+            if (j === -1) {
+                new_word = new_word.concat(word.slice(i))
+                break
+            }
+            new_word = new_word.concat(word.slice(i, j))
+            i = j
+
+            if (word[i] === first && i < word.length - 1 && word[i + 1] === second) {
+                new_word.push(first + second)
+                i = i + 2
+            } else {
+                new_word.push(word[i])
+                i = i + 1
+            }
+        }
+
+        word = new_word
+        if (word.length === 1) {
+            break
+        } else {
+            pairs = get_pairs(word)
+        }
+    }
+
+    word = word.join(' ')
+    cache.set(token, word)
+
+    return word
 }
 
 function encode(text) {
-  let bpe_tokens = []
-  const matches = Array.from(text.matchAll(pat)).map(x => x[0])
-  for (let token of matches) {
-    token = encodeStr(token).map(x => {
-      return byte_encoder[x]
-    }).join('')
-
-    const new_tokens = bpe(token).split(' ').map(x => encoder[x])
-    bpe_tokens = bpe_tokens.concat(new_tokens)
-  }
-  return bpe_tokens
+    let bpe_tokens = []
+    const matches = Array.from(text.matchAll(pat)).map((x) => x[0])
+    for (let token of matches) {
+        token = encodeStr(token)
+            .map((x) => {
+                return byte_encoder[x]
+            })
+            .join('')
+
+        const new_tokens = bpe(token)
+            .split(' ')
+            .map((x) => encoder[x])
+        bpe_tokens = bpe_tokens.concat(new_tokens)
+    }
+    return bpe_tokens
 }
 
 function decode(tokens) {
-  let text = tokens.map(x => decoder[x]).join('')
-  text = decodeStr(text.split('').map(x => byte_decoder[x]))
-  return text
+    let text = tokens.map((x) => decoder[x]).join('')
+    text = decodeStr(text.split('').map((x) => byte_decoder[x]))
+    return text
 }
 
-module.exports = {
-  encode,
-  decode
-};
+export { encode, decode }
```
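With the CommonJS export replaced by a named ES module export, and module loading done via an import assertion plus a top-level-awaited fetch, consuming the tokenizer becomes a plain import. A minimal round-trip sketch, assuming the module file is `Encoder.js` and running under Deno with read permission for the local `vocab.bpe` fetch:

```js
// demo.js (hypothetical): round-trip a string through the tokenizer.
// Run with: deno run --allow-read demo.js
import { encode, decode } from './Encoder.js'

const tokens = encode('hello world')
console.log(tokens) // an array of GPT-2 token ids
console.log(decode(tokens)) // should print: hello world
```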