Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace Pinecone with Chroma DB as Default Vector Database #178

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions config/chroma.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
/**
 * Name of the Chroma collection that holds your embeddings.
 * This value is passed as `collectionName` both when the ingest script writes
 * vectors and when the Chroma loader opens the collection again, so change it
 * here to point the app at a different collection.
 */

const CHROMA_NAME_SPACE = 'pdf-test'; // passed to Chroma as the collection name

export { CHROMA_NAME_SPACE };
5 changes: 3 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@
"type-check": "tsc --noEmit",
"lint": "eslint --ignore-path .gitignore \"**/*.+(ts|js|tsx)\"",
"format": "prettier --ignore-path .gitignore \"**/*.+(ts|js|tsx)\" --write",
"ingest": "tsx -r dotenv/config scripts/ingest-data.ts"
"ingest": "tsx -r dotenv/config scripts/ingest-data-chroma.ts"
},
"dependencies": {
"@microsoft/fetch-event-source": "^2.0.1",
"@pinecone-database/pinecone": "0.0.12",
"@radix-ui/react-accordion": "^1.1.1",
"chromadb": "^1.3.1",
"clsx": "^1.2.1",
"dotenv": "^16.0.3",
"langchain": "0.0.55",
Expand Down Expand Up @@ -58,4 +59,4 @@
"pdf",
"openai"
]
}
}
18 changes: 3 additions & 15 deletions pages/api/chat.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
import type { NextApiRequest, NextApiResponse } from 'next';
import { OpenAIEmbeddings } from 'langchain/embeddings/openai';
import { PineconeStore } from 'langchain/vectorstores/pinecone';
import { loadVectorStore } from '@/utils/loadVectorStore';
import { makeChain } from '@/utils/makechain';
import { pinecone } from '@/utils/pinecone-client';
import { PINECONE_INDEX_NAME, PINECONE_NAME_SPACE } from '@/config/pinecone';

export default async function handler(
req: NextApiRequest,
Expand All @@ -26,17 +23,8 @@ export default async function handler(
const sanitizedQuestion = question.trim().replaceAll('\n', ' ');

try {
const index = pinecone.Index(PINECONE_INDEX_NAME);

/* create vectorstore*/
const vectorStore = await PineconeStore.fromExistingIndex(
new OpenAIEmbeddings({}),
{
pineconeIndex: index,
textKey: 'text',
namespace: PINECONE_NAME_SPACE, //namespace comes from your config folder
},
);
//Load Vector Store
const vectorStore = await loadVectorStore('chroma');

//create chain
const chain = makeChain(vectorStore);
Expand Down
6 changes: 3 additions & 3 deletions pages/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ export default function Home() {
}>({
messages: [
{
message: 'Hi, what would you like to learn about this legal case?',
message: 'Hi, what would you like to learn about the ingested data?',
type: 'apiMessage',
},
],
Expand Down Expand Up @@ -125,7 +125,7 @@ export default function Home() {
<Layout>
<div className="mx-auto flex flex-col gap-4">
<h1 className="text-2xl font-bold leading-[1.1] tracking-tighter text-center">
Chat With Your Legal Docs
Chat With Your Docs
</h1>
<main className={styles.main}>
<div className={styles.cloud}>
Expand Down Expand Up @@ -224,7 +224,7 @@ export default function Home() {
placeholder={
loading
? 'Waiting for response...'
: 'What is this legal case about?'
: 'What is this data about?'
}
value={query}
onChange={(e) => setQuery(e.target.value)}
Expand Down
47 changes: 47 additions & 0 deletions scripts/ingest-data-chroma.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { OpenAIEmbeddings } from 'langchain/embeddings/openai';
import { Chroma } from 'langchain/vectorstores';
import { DirectoryLoader } from 'langchain/document_loaders/fs/directory';
import { CustomPDFLoader } from '@/utils/customPDFLoader';
import { CHROMA_NAME_SPACE } from '@/config/chroma';

/* Directory handed to DirectoryLoader; the files found there are the ingestion input */
const filePath = 'docs';

/**
 * Ingestion pipeline: reads the PDFs under `filePath`, splits their text into
 * overlapping chunks, embeds the chunks with OpenAI and stores them in the
 * Chroma collection named by `CHROMA_NAME_SPACE`.
 *
 * @throws Error with the message 'Failed to ingest your data' when any step
 *   fails; the underlying error is logged before re-throwing.
 */
export const run = async () => {
  try {
    // PDFs are read through the project's CustomPDFLoader.
    const loader = new DirectoryLoader(filePath, {
      '.pdf': (pdfPath) => new CustomPDFLoader(pdfPath),
    });
    const loadedDocs = await loader.load();

    // 1000-character chunks with a 200-character overlap between neighbours.
    const splitter = new RecursiveCharacterTextSplitter({
      chunkSize: 1000,
      chunkOverlap: 200,
    });
    const chunks = await splitter.splitDocuments(loadedDocs);
    console.log('split docs', chunks);

    console.log('creating vector store...');
    // Embed the chunks and write them to the collection; the returned store
    // handle is not needed by this script.
    await Chroma.fromDocuments(chunks, new OpenAIEmbeddings(), {
      collectionName: CHROMA_NAME_SPACE,
    });
  } catch (error) {
    console.log('error', error);
    throw new Error('Failed to ingest your data');
  }
};

// Script entry point: run the ingestion once, then report completion.
(async function main() {
  await run();
  console.log('ingestion complete');
})();
10 changes: 10 additions & 0 deletions utils/loadVectorStore.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import { OpenAIEmbeddings } from "langchain/embeddings/openai";
import { loadChroma } from "./loaders/loadChroma";
import { loadPinecone } from "./loaders/loadPinecone";

/**
 * Opens the vector store selected by name, using OpenAI embeddings.
 *
 * Supported names are `'chroma'` and `'pinecone'`. Any other value is rejected
 * instead of silently falling back to Pinecone, so a typo in the store name
 * surfaces immediately rather than querying the wrong database.
 *
 * @param vectorStoreName - `'chroma'` or `'pinecone'`.
 * @returns The store produced by the matching loader.
 * @throws Error when `vectorStoreName` is not a supported store name.
 */
export async function loadVectorStore(vectorStoreName: string) {
  // Validate the name before constructing embeddings so an unknown store
  // fails fast without touching any client.
  switch (vectorStoreName) {
    case 'chroma':
      return loadChroma(new OpenAIEmbeddings());
    case 'pinecone':
      return loadPinecone(new OpenAIEmbeddings());
    default:
      throw new Error(`Store ${vectorStoreName} not found`);
  }
}
11 changes: 11 additions & 0 deletions utils/loaders/loadChroma.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import { Chroma } from 'langchain/vectorstores';
import { CHROMA_NAME_SPACE } from '@/config/chroma';

/**
 * Opens the existing Chroma collection named by `CHROMA_NAME_SPACE`.
 *
 * @param embeddings - Embeddings implementation forwarded to
 *   `Chroma.fromExistingCollection`. The type is derived from that method's
 *   signature so the parameter is no longer implicitly `any`.
 * @returns The store resolved by `Chroma.fromExistingCollection`.
 */
export const loadChroma = async (
  embeddings: Parameters<typeof Chroma.fromExistingCollection>[0],
) => {
  return Chroma.fromExistingCollection(embeddings, {
    collectionName: CHROMA_NAME_SPACE,
  });
};
15 changes: 15 additions & 0 deletions utils/loaders/loadPinecone.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import { PineconeStore } from 'langchain/vectorstores';
import { PINECONE_INDEX_NAME, PINECONE_NAME_SPACE } from '@/config/pinecone';
import { pinecone } from '../pinecone-client';

/**
 * Opens the existing Pinecone index `PINECONE_INDEX_NAME`, scoped to the
 * `PINECONE_NAME_SPACE` namespace from the config folder.
 *
 * @param embeddings - Embeddings implementation forwarded to
 *   `PineconeStore.fromExistingIndex`. The type is derived from that method's
 *   signature so the parameter is no longer implicitly `any`.
 * @returns The store resolved by `PineconeStore.fromExistingIndex`.
 */
export const loadPinecone = async (
  embeddings: Parameters<typeof PineconeStore.fromExistingIndex>[0],
) => {
  const pineconeIndex = pinecone.Index(PINECONE_INDEX_NAME);
  return PineconeStore.fromExistingIndex(embeddings, {
    pineconeIndex,
    textKey: 'text',
    namespace: PINECONE_NAME_SPACE, // namespace comes from your config folder
  });
};