Skip to content

Commit

Permalink
fix: truncate to 10kb for markdown sections (#1859)
Browse files Browse the repository at this point in the history
Co-authored-by: Andrew Jiang <[email protected]>
  • Loading branch information
RohinBhargava and abvthecity authored Dec 4, 2024
1 parent 8786652 commit e97952e
Show file tree
Hide file tree
Showing 17 changed files with 65,501 additions and 5,132 deletions.
60 changes: 24 additions & 36 deletions packages/commons/core-utils/src/bytes.ts
Original file line number Diff line number Diff line change
@@ -1,45 +1,33 @@
/**
 * Truncates `string` so that its total measured length does not exceed `byteLength`.
 *
 * The string is walked one UTF-16 code unit at a time; a high surrogate that is
 * immediately followed by a low surrogate is measured as a single two-unit
 * segment so that a surrogate pair is never cut in half.
 *
 * @param getLength - Returns the measured length of a segment (one code unit or one surrogate pair).
 * @param string - The string to truncate.
 * @param byteLength - The maximum accumulated length allowed.
 * @returns The longest prefix whose accumulated length fits, or `string` itself if it never exceeds the limit.
 * @throws Error if `string` is not a string at runtime.
 */
function _truncate(getLength: (str: string) => number, string: string, byteLength: number) {
  if (typeof string !== "string") {
    throw new Error("Input must be string");
  }

  let curByteLength = 0;
  let codePoint;
  let segment;

  for (let i = 0; i < string.length; i += 1) {
    codePoint = string.charCodeAt(i);
    // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
    segment = string[i]!;

    // Keep a surrogate pair together: consume the low surrogate as part of this segment.
    if (isHighSurrogate(codePoint) && isLowSurrogate(string.charCodeAt(i + 1))) {
      i += 1;
      segment += string[i];
    }

    curByteLength += getLength(segment);

    if (curByteLength === byteLength) {
      // Exactly at the limit: include the current segment.
      return string.slice(0, i + 1);
    } else if (curByteLength > byteLength) {
      // Over the limit: drop the current segment (1 or 2 code units).
      return string.slice(0, i - segment.length + 1);
    }
  }

  return string;
}

/** Returns true when the UTF-16 code unit is in the high-surrogate range (0xD800-0xDBFF). */
function isHighSurrogate(codePoint: number) {
  return codePoint >= 0xd800 && codePoint <= 0xdbff;
}

/** Returns true when the UTF-16 code unit is in the low-surrogate range (0xDC00-0xDFFF). */
function isLowSurrogate(codePoint: number) {
  return codePoint >= 0xdc00 && codePoint <= 0xdfff;
}

/**
 * Chunks a string into an array of strings, each at most `byteSize` bytes when
 * encoded as UTF-8.
 *
 * Chunk boundaries are always placed between characters, never inside a
 * multi-byte UTF-8 sequence, so joining the returned chunks reproduces the
 * input. If a single character is wider than `byteSize`, it is emitted as its
 * own (oversized) chunk rather than being split.
 *
 * @param str - The string to chunk.
 * @param byteSize - The maximum byte size of each chunk, e.g. 10KB = 10 * 1000. Fractions are floored.
 * @returns The chunks in order; an empty array for an empty string.
 * @throws RangeError if `byteSize` is NaN or less than 1.
 */
export function chunkToBytes(str: string, byteSize: number): string[] {
  const maxBytes = Math.floor(byteSize);
  // `!(x >= 1)` also rejects NaN, which would otherwise produce an invalid chunk count.
  if (!(maxBytes >= 1)) {
    throw new RangeError("byteSize must be a number greater than or equal to 1");
  }

  // TextEncoder always emits UTF-8, regardless of how the string was produced.
  const utf8Bytes = new TextEncoder().encode(str);
  // ignoreBOM keeps a U+FEFF that happens to start a chunk instead of silently dropping it.
  const decoder = new TextDecoder("utf-8", { ignoreBOM: true });
  const isContinuationByte = (byte: number | undefined): boolean => ((byte ?? 0) & 0xc0) === 0x80;

  const chunks: string[] = [];
  let start = 0;
  while (start < utf8Bytes.length) {
    let end = Math.min(start + maxBytes, utf8Bytes.length);

    // If the cut lands on a continuation byte, back up to the start of that character.
    while (end > start && end < utf8Bytes.length && isContinuationByte(utf8Bytes[end])) {
      end -= 1;
    }

    if (end === start) {
      // The next character alone is wider than maxBytes: emit it whole to guarantee progress.
      end = start + 1;
      while (end < utf8Bytes.length && isContinuationByte(utf8Bytes[end])) {
        end += 1;
      }
    }

    // subarray is a view, so no bytes are copied before decoding.
    chunks.push(decoder.decode(utf8Bytes.subarray(start, end)));
    start = end;
  }

  return chunks;
}

/**
* Truncates a string to the specified byte length.
*
* @see https://github.com/parshap/truncate-utf8-bytes
* @param str - The string to truncate.
* @param byteSize - The maximum byte size of the truncated string, e.g. 10KB = 10 * 1000
* @returns The truncated string.
*/
export const truncateToBytes = _truncate.bind(null, Buffer.byteLength.bind(Buffer));
/**
 * Truncates a string so that its UTF-8 encoding is at most `byteSize` bytes.
 *
 * The cut is always made on a character boundary: if `byteSize` falls in the
 * middle of a multi-byte UTF-8 sequence, the whole character is dropped instead
 * of being decoded into a U+FFFD replacement character.
 *
 * @param str - The string to truncate.
 * @param byteSize - The maximum byte size of the result, e.g. 10KB = 10 * 1000.
 *   Fractions are floored; NaN, zero and negative values yield an empty string.
 * @returns `str` unchanged if it already fits, otherwise its longest prefix that fits.
 */
export function truncateToBytes(str: string, byteSize: number): string {
  // `byteSize > 0` is false for NaN, so NaN is treated like 0.
  const limit = byteSize > 0 ? Math.floor(byteSize) : 0;

  const utf8Bytes = new TextEncoder().encode(str);
  if (utf8Bytes.length <= limit) {
    // Already fits: return the input untouched rather than round-tripping it.
    return str;
  }

  // Here limit < utf8Bytes.length, so utf8Bytes[end] is always a real byte.
  // Continuation bytes look like 0b10xxxxxx; back up until the cut precedes a lead byte.
  let end = limit;
  while (end > 0 && ((utf8Bytes[end] ?? 0) & 0xc0) === 0x80) {
    end -= 1;
  }

  // ignoreBOM keeps a leading U+FEFF in the output instead of silently stripping it.
  return new TextDecoder("utf-8", { ignoreBOM: true }).decode(utf8Bytes.subarray(0, end));
}
Loading

0 comments on commit e97952e

Please sign in to comment.