From 88fe9ec11e5a490d5ebe381ef585a4268887830d Mon Sep 17 00:00:00 2001 From: aoife cassidy Date: Thu, 31 Oct 2024 05:46:04 +0200 Subject: [PATCH] feat(llm): add ChatContext incomplete implementation, notably missing tool call hooks (see comment for more info). --- agents/src/llm/chat_context.ts | 105 +++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 agents/src/llm/chat_context.ts diff --git a/agents/src/llm/chat_context.ts b/agents/src/llm/chat_context.ts new file mode 100644 index 00000000..9b58ced1 --- /dev/null +++ b/agents/src/llm/chat_context.ts @@ -0,0 +1,105 @@ +// SPDX-FileCopyrightText: 2024 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import type { AudioFrame } from '@livekit/rtc-node'; + +export enum ChatRole { + SYSTEM, + USER, + ASSISTANT, + TOOL, +} + +export interface ChatImage { + image: string | AudioFrame; + inferenceWidth?: number; + inferenceHeight?: number; + /** Used by LLM implementations to store a processed version of the image for later use. */ + cache: { [id: string | number | symbol]: any }; +} + +export interface ChatAudio { + frame: AudioFrame | AudioFrame[]; +} + +export type ChatContent = string | ChatImage | ChatAudio; + +export class ChatMessage { + readonly role: ChatRole; + readonly id?: string; + readonly name?: string; + readonly content?: ChatContent | ChatContent[]; + readonly toolCallId?: string; + readonly toolException?: Error; + + /** @internal */ + constructor({ + role, + id, + name, + content, + toolCallId, + toolException, + }: { + role: ChatRole; + id?: string; + name?: string; + content?: ChatContent | ChatContent[]; + toolCallId?: string; + toolException?: Error; + }) { + this.role = role; + this.id = id; + this.name = name; + this.content = content; + this.toolCallId = toolCallId; + this.toolException = toolException; + } + + // TODO(nbsp): tool call functions. + // the system defined in function_context.ts is fundamentally different (and much, much simpler) + // than the one in Python Agents. + // pair with theo to figure out what to do here (and later in MultimodalAgent/RealtimeModel) + + static create({ + text = '', + images = [], + role = ChatRole.SYSTEM, + }: { + text?: string; + images: ChatImage[]; + role: ChatRole; + }): ChatMessage { + if (!images.length) { + return new ChatMessage({ + role: ChatRole.ASSISTANT, + content: text, + }); + } else { + return new ChatMessage({ + role, + content: [...(text ? [text] : []), ...images], + }); + } + } + + /** Returns a structured clone of this message. */ + copy(): ChatMessage { + return structuredClone(this); + } +} + +export class ChatContext { + messages: ChatMessage[] = []; + metadata: { [id: string]: any } = {}; + + append(msg: { text?: string; images: ChatImage[]; role: ChatRole }): ChatContext { + this.messages.push(ChatMessage.create(msg)); + return this; + } + + /** Returns a structured clone of this context. */ + copy(): ChatContext { + return structuredClone(this); + } +}