diff --git a/config/chroma.ts b/config/chroma.ts
new file mode 100644
index 000000000..289d20ea2
--- /dev/null
+++ b/config/chroma.ts
@@ -0,0 +1,7 @@
+/**
+ * Change the namespace to the namespace on Chroma you'd like to store your embeddings.
+ */
+
+const CHROMA_NAME_SPACE = 'pdf-test'; //namespace is optional for your vectors
+
+export { CHROMA_NAME_SPACE };
diff --git a/package.json b/package.json
index 82579df5b..c9cfa2cbb 100644
--- a/package.json
+++ b/package.json
@@ -12,12 +12,13 @@
     "type-check": "tsc --noEmit",
     "lint": "eslint --ignore-path .gitignore \"**/*.+(ts|js|tsx)\"",
     "format": "prettier --ignore-path .gitignore \"**/*.+(ts|js|tsx)\" --write",
-    "ingest": "tsx -r dotenv/config scripts/ingest-data.ts"
+    "ingest": "tsx -r dotenv/config scripts/ingest-data-chroma.ts"
   },
   "dependencies": {
     "@microsoft/fetch-event-source": "^2.0.1",
     "@pinecone-database/pinecone": "0.0.12",
     "@radix-ui/react-accordion": "^1.1.1",
+    "chromadb": "^1.3.1",
     "clsx": "^1.2.1",
     "dotenv": "^16.0.3",
     "langchain": "0.0.55",
@@ -58,4 +59,4 @@
     "pdf",
     "openai"
   ]
-}
+}
\ No newline at end of file
diff --git a/pages/api/chat.ts b/pages/api/chat.ts
index b9f41f54d..0aa539662 100644
--- a/pages/api/chat.ts
+++ b/pages/api/chat.ts
@@ -1,9 +1,6 @@
 import type { NextApiRequest, NextApiResponse } from 'next';
-import { OpenAIEmbeddings } from 'langchain/embeddings/openai';
-import { PineconeStore } from 'langchain/vectorstores/pinecone';
+import { loadVectorStore } from '@/utils/loadVectorStore';
 import { makeChain } from '@/utils/makechain';
-import { pinecone } from '@/utils/pinecone-client';
-import { PINECONE_INDEX_NAME, PINECONE_NAME_SPACE } from '@/config/pinecone';
 
 export default async function handler(
   req: NextApiRequest,
@@ -26,17 +23,8 @@ export default async function handler(
   const sanitizedQuestion = question.trim().replaceAll('\n', ' ');
 
   try {
-    const index = pinecone.Index(PINECONE_INDEX_NAME);
-
-    /* create vectorstore*/
-    const vectorStore = await PineconeStore.fromExistingIndex(
-      new OpenAIEmbeddings({}),
-      {
-        pineconeIndex: index,
-        textKey: 'text',
-        namespace: PINECONE_NAME_SPACE, //namespace comes from your config folder
-      },
-    );
+    //Load Vector Store
+    const vectorStore = await loadVectorStore('chroma');
 
     //create chain
     const chain = makeChain(vectorStore);
diff --git a/pages/index.tsx b/pages/index.tsx
index c80830751..11c7b3605 100644
--- a/pages/index.tsx
+++ b/pages/index.tsx
@@ -25,7 +25,7 @@ export default function Home() {
   }>({
     messages: [
       {
-        message: 'Hi, what would you like to learn about this legal case?',
+        message: 'Hi, what would you like to learn about the ingested data?',
         type: 'apiMessage',
       },
     ],
@@ -125,7 +125,7 @@ export default function Home() {
 
-            Chat With Your Legal Docs
+            Chat With Your Docs
 
@@ -224,7 +224,7 @@ export default function Home() {
                    placeholder={
                      loading
                        ? 'Waiting for response...'
-                       : 'What is this legal case about?'
+                       : 'What is this data about?'
                    }
                    value={query}
                    onChange={(e) => setQuery(e.target.value)}
diff --git a/scripts/ingest-data-chroma.ts b/scripts/ingest-data-chroma.ts
new file mode 100644
index 000000000..85becdb02
--- /dev/null
+++ b/scripts/ingest-data-chroma.ts
@@ -0,0 +1,47 @@
+import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
+import { OpenAIEmbeddings } from 'langchain/embeddings/openai';
+import { Chroma } from 'langchain/vectorstores';
+import { DirectoryLoader } from 'langchain/document_loaders/fs/directory';
+import { CustomPDFLoader } from '@/utils/customPDFLoader';
+import { CHROMA_NAME_SPACE } from '@/config/chroma';
+
+/* Name of directory to retrieve your files from */
+const filePath = 'docs';
+
+export const run = async () => {
+  try {
+    /*load raw docs from the all files in the directory */
+    const directoryLoader = new DirectoryLoader(filePath, {
+      '.pdf': (path) => new CustomPDFLoader(path),
+    });
+
+    // const loader = new PDFLoader(filePath);
+    const rawDocs = await directoryLoader.load();
+
+    /* Split text into chunks */
+    const textSplitter = new RecursiveCharacterTextSplitter({
+      chunkSize: 1000,
+      chunkOverlap: 200,
+    });
+
+    const docs = await textSplitter.splitDocuments(rawDocs);
+    console.log('split docs', docs);
+
+    console.log('creating vector store...');
+    /*create and store the embeddings in the vectorStore*/
+    const embeddings = new OpenAIEmbeddings();
+
+    //embed the PDF documents
+    const vectorStore = await Chroma.fromDocuments(docs, embeddings, {
+      collectionName: CHROMA_NAME_SPACE,
+    });
+  } catch (error) {
+    console.log('error', error);
+    throw new Error('Failed to ingest your data');
+  }
+};
+
+(async () => {
+  await run();
+  console.log('ingestion complete');
+})();
diff --git a/utils/loadVectorStore.ts b/utils/loadVectorStore.ts
new file mode 100644
index 000000000..51e58dfbc
--- /dev/null
+++ b/utils/loadVectorStore.ts
@@ -0,0 +1,10 @@
+import { OpenAIEmbeddings } from "langchain/embeddings/openai";
+import { loadChroma } from "./loaders/loadChroma";
+import { loadPinecone } from "./loaders/loadPinecone";
+
+export async function loadVectorStore(vectorStoreName: string) {
+  // throw new Error(`Store ${vectorStoreName} not found`);
+  const loader = vectorStoreName === 'chroma' ? loadChroma : loadPinecone;
+  const embeddings = new OpenAIEmbeddings();
+  return loader(embeddings);
+};
diff --git a/utils/loaders/loadChroma.ts b/utils/loaders/loadChroma.ts
new file mode 100644
index 000000000..490bfb527
--- /dev/null
+++ b/utils/loaders/loadChroma.ts
@@ -0,0 +1,11 @@
+import { Chroma } from 'langchain/vectorstores';
+import { CHROMA_NAME_SPACE } from '@/config/chroma';
+
+export const loadChroma = async (embeddings) => {
+  return await Chroma.fromExistingCollection(
+    embeddings,
+    {
+      collectionName: CHROMA_NAME_SPACE,
+    }
+  );
+}
\ No newline at end of file
diff --git a/utils/loaders/loadPinecone.ts b/utils/loaders/loadPinecone.ts
new file mode 100644
index 000000000..f4ff8935f
--- /dev/null
+++ b/utils/loaders/loadPinecone.ts
@@ -0,0 +1,15 @@
+import { PineconeStore } from 'langchain/vectorstores';
+import { PINECONE_INDEX_NAME, PINECONE_NAME_SPACE } from '@/config/pinecone';
+import { pinecone } from '../pinecone-client';
+
+export const loadPinecone = async (embeddings) => {
+  const index = pinecone.Index(PINECONE_INDEX_NAME);
+  return await PineconeStore.fromExistingIndex(
+    embeddings,
+    {
+      pineconeIndex: index,
+      textKey: 'text',
+      namespace: PINECONE_NAME_SPACE, //namespace comes from your config folder
+    },
+  );
+};
\ No newline at end of file
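
A quick way to confirm that ingestion worked is to open the collection through the loadVectorStore helper added above and run a similarity search against it. The sketch below is not part of this diff: the file name is hypothetical, and it assumes the same environment the ingest script uses (OPENAI_API_KEY loaded via dotenv, and a Chroma server reachable at the chromadb client's default http://localhost:8000).

// scripts/verify-chroma.ts (hypothetical helper, not part of this change)
import { loadVectorStore } from '@/utils/loadVectorStore';

(async () => {
  // Open the existing 'pdf-test' collection with OpenAI embeddings,
  // the same way pages/api/chat.ts now does.
  const vectorStore = await loadVectorStore('chroma');

  // similaritySearch(query, k) embeds the query and returns the k nearest chunks.
  const results = await vectorStore.similaritySearch('What is this document about?', 3);

  for (const doc of results) {
    console.log(doc.metadata, '\n', doc.pageContent.slice(0, 120));
  }
})();

It can be run the same way as the ingest script, for example: npx tsx -r dotenv/config scripts/verify-chroma.ts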
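
The commented-out throw in utils/loadVectorStore.ts suggests unknown store names were meant to be rejected rather than silently falling through to Pinecone. One possible shape for that dispatch, sketched here purely as an illustration and assuming chroma and pinecone remain the only backends, is a lookup map that restores the error path:

// Hypothetical alternative for utils/loadVectorStore.ts; a sketch, not the committed version.
import { OpenAIEmbeddings } from 'langchain/embeddings/openai';
import { loadChroma } from './loaders/loadChroma';
import { loadPinecone } from './loaders/loadPinecone';

const loaders = {
  chroma: loadChroma,
  pinecone: loadPinecone,
};

export async function loadVectorStore(vectorStoreName: string) {
  const loader = loaders[vectorStoreName as keyof typeof loaders];
  if (!loader) {
    // Fail loudly on an unrecognized name instead of silently using the wrong store.
    throw new Error(`Store ${vectorStoreName} not found`);
  }
  return loader(new OpenAIEmbeddings());
}

With the loaders in a map, adding a third backend becomes a one-entry change instead of a longer ternary chain.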