-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: truncate to 10kb for mardown sections (#1859)
Co-authored-by: Andrew Jiang <[email protected]>
- Loading branch information
1 parent
8786652
commit e97952e
Showing
17 changed files
with
65,501 additions
and
5,132 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,45 +1,33 @@ | ||
function _truncate(getLength: (str: string) => number, string: string, byteLength: number) { | ||
if (typeof string !== "string") { | ||
throw new Error("Input must be string"); | ||
} | ||
|
||
let curByteLength = 0; | ||
let codePoint; | ||
let segment; | ||
|
||
for (let i = 0; i < string.length; i += 1) { | ||
codePoint = string.charCodeAt(i); | ||
// eslint-disable-next-line @typescript-eslint/no-non-null-assertion | ||
segment = string[i]!; | ||
|
||
if (isHighSurrogate(codePoint) && isLowSurrogate(string.charCodeAt(i + 1))) { | ||
i += 1; | ||
segment += string[i]; | ||
} | ||
/** | ||
* Chunks a string into an array of strings, each of the specified byte size. | ||
* @param str - The string to chunk. | ||
 * @param byteSize - The byte size of each chunk, e.g. 10KB = 10 * 1000 | ||
* @returns An array of strings, each of the specified byte size. | ||
*/ | ||
export function chunkToBytes(str: string, byteSize: number): string[] { | ||
const encoder = new TextEncoder(); | ||
|
||
curByteLength += getLength(segment); | ||
// TODO: what if the string isn't utf8? | ||
const utf8Bytes = encoder.encode(str); | ||
const numChunks = Math.ceil(utf8Bytes.length / byteSize); | ||
const chunks = new Array(numChunks); | ||
|
||
if (curByteLength === byteLength) { | ||
return string.slice(0, i + 1); | ||
} else if (curByteLength > byteLength) { | ||
return string.slice(0, i - segment.length + 1); | ||
} | ||
for (let i = 0, o = 0; i < numChunks; ++i, o += byteSize) { | ||
chunks[i] = new TextDecoder().decode(utf8Bytes.slice(o, o + byteSize)); | ||
} | ||
|
||
return string; | ||
} | ||
|
||
function isHighSurrogate(codePoint: number) { | ||
return codePoint >= 0xd800 && codePoint <= 0xdbff; | ||
} | ||
|
||
function isLowSurrogate(codePoint: number) { | ||
return codePoint >= 0xdc00 && codePoint <= 0xdfff; | ||
return chunks; | ||
} | ||
|
||
/** | ||
* Truncates a string to the specified byte length. | ||
* | ||
* @see https://github.com/parshap/truncate-utf8-bytes | ||
* @param str - The string to truncate. | ||
 * @param byteSize - The maximum byte size of the truncated string, e.g. 10KB = 10 * 1000 | ||
* @returns The truncated string. | ||
*/ | ||
export const truncateToBytes = _truncate.bind(null, Buffer.byteLength.bind(Buffer)); | ||
/**
 * Truncates a string so that its UTF-8 encoding occupies at most `byteSize` bytes.
 *
 * The cut is always made on a character boundary: a multi-byte UTF-8 sequence
 * that does not fit entirely within the limit is dropped rather than split.
 * Splitting it would make the decoder emit U+FFFD (3 bytes in UTF-8), which
 * both corrupts the text and can push the result over the requested size.
 *
 * @param str - The string to truncate.
 * @param byteSize - Maximum size of the result in UTF-8 bytes. Negative or NaN
 *   values are treated as 0; fractional values are rounded down.
 * @returns The longest prefix of `str` whose UTF-8 encoding fits in `byteSize` bytes.
 */
export function truncateToBytes(str: string, byteSize: number): string {
  const utf8Bytes = new TextEncoder().encode(str);

  // Normalise the limit so that slicing never counts from the end of the buffer.
  const limit = Number.isNaN(byteSize) ? 0 : Math.max(0, Math.floor(byteSize));
  let end = Math.min(limit, utf8Bytes.length);

  if (end < utf8Bytes.length) {
    // If the first excluded byte is a continuation byte (0b10xxxxxx), the cut
    // falls inside a multi-byte sequence; back up to the start of that sequence.
    while (end > 0 && ((utf8Bytes[end] ?? 0) & 0xc0) === 0x80) {
      end -= 1;
    }
  }

  return new TextDecoder().decode(utf8Bytes.subarray(0, end));
}
Oops, something went wrong.