Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Server Side Token Counting #18

Closed
wants to merge 5 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 31 additions & 22 deletions src/utils/tiktoken.ts
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • 把 todo 去掉
  • 返回 userMessagesTokenCount 和 assistantMessagesTokenCount 有什么用?

Copy link
Author

@sweep-ai sweep-ai bot Nov 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚀 Wrote Changes

Done.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

返回 userMessagesTokenCount 和 assistantMessagesTokenCount 有什么用?

Copy link
Author

@sweep-ai sweep-ai bot Nov 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚀 Wrote Changes

Done.

Copy link
Author

@sweep-ai sweep-ai bot Nov 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

❌ Failed to make changes

Please join our Discord to report this issue.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

去掉 userMessagesTokenCount 和 assistantMessagesTokenCount

Copy link
Author

@sweep-ai sweep-ai bot Nov 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚀 Wrote Changes

Done.

Copy link
Author

@sweep-ai sweep-ai bot Nov 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚀 Wrote Changes

Done.

Original file line number Diff line number Diff line change
@@ -1,32 +1,41 @@

import type { ChatMessage } from '@/types'
import type { Tiktoken } from 'tiktoken'

const countTokensSingleMessage = (enc: Tiktoken, message: ChatMessage) => {
return 4 + enc.encode(message.content).length // im_start, im_end, role/name, "\n"
const countTokensSingleMessage = (encoder: Tiktoken, chatMessage: ChatMessage) => {
// Calculate the token count, accounting for metadata tokens: im_start, im_end, role/name, and newline
const metadataTokenCount = 4;
const messageTokenCount = encoder.encode(chatMessage.content).length;
return metadataTokenCount + messageTokenCount;
}

export const countTokens = (enc: Tiktoken | null, messages: ChatMessage[]) => {
if (messages.length === 0) return

if (!enc) return { total: Infinity }

const lastMsg = messages.at(-1)
const context = messages.slice(0, -1)

const countTokens: (message: ChatMessage) => number = countTokensSingleMessage.bind(null, enc)

const countLastMsg = countTokens(lastMsg!)
const countContext = context.map(countTokens).reduce((a, b) => a + b, 3) // im_start, "assistant", "\n"

return { countContext, countLastMsg, total: countContext + countLastMsg }
export const countTokens = (encoder: Tiktoken | null, messages: ChatMessage[]) => {
// Ensure the encoder and messages are valid
if (!encoder || !Array.isArray(messages) || messages.some(msg => typeof msg !== 'object' || msg === null)) {
throw new Error('Invalid encoder or messages array');
}
// Return early if there are no messages
if (messages.length === 0) return { total: 0 };

// Use a more descriptive function name and documentation
const getTokenCountForMessage = countTokensSingleMessage.bind(null, encoder);
// Batch the token counting for all but the last message (context)
const contextTokenCounts = messages.slice(0, -1).map(getTokenCountForMessage);
// Use a batch process instead of individual map calls to optimize performance
const contextTotalTokens = contextTokenCounts.length > 0 ? contextTokenCounts.reduce((total, count) => total + count, 3) : 3; // Account for metadata tokens
// Count tokens for the last message separately
const lastMessage = messages[messages.length - 1];
const lastMessageTokenCount = lastMessage ? getTokenCountForMessage(lastMessage) : 0;

// Separate counts for user and assistant messages
return {
contextTotalTokens,
lastMessageTokenCount,
total: contextTotalTokens + lastMessageTokenCount
};
}

const cl100k_base_json = import.meta.env.PUBLIC_CL100K_BASE_JSON_URL || '/cl100k_base.json'
const tiktoken_bg_wasm = import.meta.env.PUBLIC_TIKTOKEN_BG_WASM_URL || '/tiktoken_bg.wasm'

async function getBPE() {
return fetch(cl100k_base_json).then(r => r.json())
}

export const initTikToken = async() => {
const { init } = await import('tiktoken/lite/init')
Expand All @@ -36,4 +45,4 @@ export const initTikToken = async() => {
fetch(tiktoken_bg_wasm).then(r => r.arrayBuffer()).then(wasm => init(imports => WebAssembly.instantiate(wasm, imports))),
])
return new Tiktoken(bpe_ranks, special_tokens, pat_str)
}
}