Pgvector RAG guide (GoogleCloudPlatform#1177)

* add yamls and tf code * fix service type * fix the code * add region tags * add ci step * update ci * ci quickfix
epam · Mar 18, 2024 · d3b0f9a · d3b0f9a
1 parent f313ed0
commit d3b0f9a
Show file tree

Hide file tree

Showing 20 changed files with 863 additions and 0 deletions.
diff --git a/.github/workflows/databases-pgvector-ci.yml b/.github/workflows/databases-pgvector-ci.yml
@@ -0,0 +1,45 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# CI for the pgvector sample: validates one Terraform module and builds both container images.
+name: databases-pgvector-ci.yml
+on:
+  # Pushes to main that touch this workflow or the sample directory.
+  push:
+    branches:
+      - main
+    paths:
+      - '.github/workflows/databases-pgvector-ci.yml'
+      - 'databases/postgres-pgvector/**'
+  # Pull requests that touch the same paths.
+  pull_request:
+    paths:
+      - '.github/workflows/databases-pgvector-ci.yml'
+      - 'databases/postgres-pgvector/**'
+jobs:
+  job:
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v4
+      # Only the cloud-storage Terraform module is initialized and validated here.
+      - name: Validate Cloud Storage module TF for Pgvector
+        run: |
+          cd databases/postgres-pgvector/terraform/cloud-storage
+          terraform init
+          terraform validate
+      # The image builds below act as smoke tests; the images are tagged locally and not pushed.
+      - name: Build chatbot app container
+        run: |
+          cd databases/postgres-pgvector/docker/chatbot
+          docker build --tag chatbot:1.0 .
+      - name: Build docs embedder container
+        run: |
+          cd databases/postgres-pgvector/docker/embed-docs
+          docker build --tag embed-docs:1.0 .
+
diff --git a/databases/postgres-pgvector/docker/chatbot/Dockerfile b/databases/postgres-pgvector/docker/chatbot/Dockerfile
@@ -0,0 +1,16 @@
+FROM python:3.12-slim-bookworm
+
+# Default connection settings read by chat.py; override them at deploy time if needed.
+ENV POSTGRES_HOST=gke-pg-cluster-rw.pg-ns
+ENV DATABASE_NAME=app
+ENV COLLECTION_NAME=training-docs
+# Compiler toolchain, presumably for pip packages that build native extensions.
+# apt-get is used instead of apt because apt's CLI is not stable for scripted use.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends gcc libc6-dev && \
+    rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+# Install dependencies before copying the sources so this layer is reused when only code changes.
+COPY requirements.txt requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . .
+
+# Effective default command: streamlit run /app/chat.py
+CMD ["run","/app/chat.py"]
+ENTRYPOINT ["streamlit"]
+
diff --git a/databases/postgres-pgvector/docker/chatbot/chat.py b/databases/postgres-pgvector/docker/chatbot/chat.py
@@ -0,0 +1,112 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from langchain_google_vertexai import ChatVertexAI
+from langchain.prompts import ChatPromptTemplate
+from langchain_google_vertexai import VertexAIEmbeddings
+from langchain.memory import ConversationBufferWindowMemory
+from langchain_community.vectorstores.pgvector import PGVector
+import streamlit as st
+import os
+
+# [START gke_databases_postgres_pgvector_docker_chat_model]
+vertexAI = ChatVertexAI(model_name="gemini-pro", streaming=True, convert_system_message_to_human=True)
+prompt_template = ChatPromptTemplate.from_messages(
+    [
+        ("system", "You are a helpful assistant who helps in finding answers to questions using the provided context."),
+        ("human", """
+        The answer should be based on the text context given in "text_context" and the conversation history given in "conversation_history" along with its Caption: \n
+        Base your response on the provided text context and the current conversation history to answer the query.
+        Select the most relevant information from the context.
+        Generate a draft response using the selected information. Remove duplicate content from the draft response.
+        Generate your final response after adjusting it to increase accuracy and relevance.
+        Now only show your final response!
+        If you do not know the answer or context is not relevant, response with "I don't know".
+
+        text_context:
+        {context}
+
+        conversation_history:
+        {history}
+
+        query:
+        {query}
+        """),
+    ]
+)
+
+embedding_model = VertexAIEmbeddings("textembedding-gecko@001")
+# [END gke_databases_postgres_pgvector_docker_chat_model]
+
+# [START gke_databases_postgres_pgvector_docker_chat_client]
+CONNECTION_STRING = PGVector.connection_string_from_db_params(
+    driver="psycopg2",
+    host=os.environ.get("POSTGRES_HOST"),
+    port=5432,
+    database=os.environ.get("DATABASE_NAME"),
+    user=os.environ.get("USERNAME"),
+    password=os.environ.get("PASSWORD"),
+)
+COLLECTION_NAME = os.environ.get("COLLECTION_NAME"),
+
+postgres_vector_search = PGVector(
+    collection_name=COLLECTION_NAME,
+    connection_string=CONNECTION_STRING,
+    embedding_function=embedding_model,
+)
+# [END gke_databases_postgres_pgvector_docker_chat_client]
+
+def format_docs(docs):
+    return "\n\n".join([d.page_content for d in docs])
+
+st.title("🤖 Chatbot")
+# Seed the visible transcript with a greeting the first time "messages" is absent from the session.
+if "messages" not in st.session_state:
+    st.session_state["messages"] = [{"role": "ai", "content": "How can I help you?"}]
+
+# [START gke_databases_postgres_pgvector_docker_chat_session]
+# Create the conversation memory once per session; its content is exposed under the "history" key.
+# NOTE(review): k=3 presumably limits the memory to the last three exchanges — confirm.
+if "memory" not in st.session_state:
+    st.session_state["memory"] = ConversationBufferWindowMemory(
+        memory_key="history",
+        ai_prefix="Bot",
+        human_prefix="User",
+        k=3,
+    )
+# [END gke_databases_postgres_pgvector_docker_chat_session]
+
+# [START gke_databases_postgres_pgvector_docker_chat_history]
+# Re-draw every stored message under its role.
+for message in st.session_state.messages:
+    with st.chat_message(message["role"]):
+        st.write(message["content"])
+# [END gke_databases_postgres_pgvector_docker_chat_history]
+
+if chat_input := st.chat_input():
+    with st.chat_message("human"):
+        st.write(chat_input)
+        st.session_state.messages.append({"role": "human", "content": chat_input})
+
+    found_docs = postgres_vector_search.similarity_search(chat_input)
+    context = format_docs(found_docs)
+
+    prompt_value = prompt_template.format_messages(name="Bot", query=chat_input, context=context, history=st.session_state.memory.load_memory_variables({}))
+    with st.chat_message("ai"):
+        with st.spinner("Typing..."):
+            content = ""
+            with st.empty():
+                for chunk in vertexAI.stream(prompt_value):
+                    content += chunk.content
+                    st.write(content)
+            st.session_state.messages.append({"role": "ai", "content": content})
+
+    st.session_state.memory.save_context({"input": chat_input}, {"output": content})
+
diff --git a/databases/postgres-pgvector/docker/chatbot/requirements.txt b/databases/postgres-pgvector/docker/chatbot/requirements.txt
@@ -0,0 +1,9 @@
+streamlit==1.31.1
+google-cloud-aiplatform==1.41.0
+langchain==0.1.7
+langchain-community==0.0.20
+langchain-google-vertexai==0.0.5
+pgvector==0.2.5
+psycopg2-binary==2.9.9
+arxiv==2.1.0
+pymupdf==1.23.21
diff --git a/databases/postgres-pgvector/docker/embed-docs/Dockerfile b/databases/postgres-pgvector/docker/embed-docs/Dockerfile
@@ -0,0 +1,18 @@
+FROM python:3.12-slim-bookworm
+
+# Default connection settings read by the Python scripts; override them at deploy time if needed.
+ENV POSTGRES_HOST=gke-pg-cluster-rw.pg-ns
+ENV DATABASE_NAME=app
+ENV COLLECTION_NAME=training-docs
+# Compiler toolchain, presumably for pip packages that build native extensions.
+# apt-get is used instead of apt because apt's CLI is not stable for scripted use.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends gcc libc6-dev && \
+    rm -rf /var/lib/apt/lists/*
+# Download target used by embedding-job.py.
+RUN mkdir -p /documents
+WORKDIR /app
+# Install dependencies before copying the sources so this layer is reused when only code changes.
+COPY requirements.txt requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . .
+RUN chmod 765 endpoint.py
+# Port that endpoint.py listens on.
+EXPOSE 5001
+
+# Effective default command: python /app/embedding-job.py
+CMD ["/app/embedding-job.py"]
+ENTRYPOINT ["python"]
diff --git a/databases/postgres-pgvector/docker/embed-docs/embedding-job.py b/databases/postgres-pgvector/docker/embed-docs/embedding-job.py
@@ -0,0 +1,62 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from langchain_google_vertexai import VertexAIEmbeddings
+from langchain_community.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores.pgvector import PGVector
+from google.cloud import storage
+import os
+# [START gke_databases_postgres_pgvector_docker_embed_docs_retrieval]
+bucketname = os.getenv("BUCKET_NAME")
+filename = os.getenv("FILE_NAME")
+
+storage_client = storage.Client()
+bucket = storage_client.bucket(bucketname)
+blob = bucket.blob(filename)
+blob.download_to_filename("/documents/" + filename)
+# [END gke_databases_postgres_pgvector_docker_embed_docs_retrieval]
+
+# [START gke_databases_postgres_pgvector_docker_embed_docs_split]
+loader = PyPDFLoader("/documents/" + filename)
+text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+documents = loader.load_and_split(text_splitter)
+# [END gke_databases_postgres_pgvector_docker_embed_docs_split]
+
+# [START gke_databases_postgres_pgvector_docker_embed_docs_embed]
+embeddings = VertexAIEmbeddings("textembedding-gecko@001")
+# [END gke_databases_postgres_pgvector_docker_embed_docs_embed]
+
+# [START gke_databases_postgres_pgvector_docker_embed_docs_storage]
+CONNECTION_STRING = PGVector.connection_string_from_db_params(
+    driver="psycopg2",
+    host=os.environ.get("POSTGRES_HOST"),
+    port=5432,
+    database=os.environ.get("DATABASE_NAME"),
+    user=os.environ.get("USERNAME"),
+    password=os.environ.get("PASSWORD"),
+)
+COLLECTION_NAME = os.environ.get("COLLECTION_NAME")
+
+db = PGVector.from_documents(
+    embedding=embeddings,
+    documents=documents,
+    collection_name=COLLECTION_NAME,
+    connection_string=CONNECTION_STRING,
+)
+# [END gke_databases_postgres_pgvector_docker_embed_docs_storage]
+
+print(filename + " was successfully embedded") 
+print(f"# of vectors = {len(documents)}")
+
diff --git a/databases/postgres-pgvector/docker/embed-docs/endpoint.py b/databases/postgres-pgvector/docker/embed-docs/endpoint.py
@@ -0,0 +1,89 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from flask import Flask, jsonify
+from flask import request
+import logging
+import sys,os, time
+from kubernetes import client, config, utils
+import kubernetes.client
+from kubernetes.client.rest import ApiException
+
+
+app = Flask(__name__)
+@app.route('/check')
+def message():
+    return jsonify({"Message": "Hi there"})
+
+
+@app.route('/', methods=['POST'])
+def bucket():
+    request_data = request.get_json()
+    print(request_data)
+    bckt = request_data['bucket']
+    f_name = request_data['name']
+    id = request_data['generation'] 
+    kube_create_job(bckt, f_name, id)
+    return "ok"
+
+# Send log records at INFO level and above to stdout.
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+# Load the Kubernetes client configuration from inside the cluster. This runs at import time.
+# NOTE(review): presumably this raises when the module is imported outside a pod — confirm.
+config.load_incluster_config()
+# [START gke_databases_postgres_pgvector_docker_embed_endpoint_job]
+def kube_create_job_object(name, container_image, bucket_name, f_name, namespace="pg-ns", container_name="jobcontainer", env_vars={}):
+
+    body = client.V1Job(api_version="batch/v1", kind="Job")
+    body.metadata = client.V1ObjectMeta(namespace=namespace, name=name)
+    body.status = client.V1JobStatus()
+
+    template = client.V1PodTemplate()
+    template.template = client.V1PodTemplateSpec()
+    env_list = [
+        client.V1EnvVar(name="POSTGRES_HOST", value=os.getenv("POSTGRES_HOST")),
+        client.V1EnvVar(name="DATABASE_NAME", value="app"), 
+        client.V1EnvVar(name="COLLECTION_NAME", value="training-docs"), 
+        client.V1EnvVar(name="FILE_NAME", value=f_name), 
+        client.V1EnvVar(name="BUCKET_NAME", value=bucket_name),
+        client.V1EnvVar(name="PASSWORD", value_from=client.V1EnvVarSource(secret_key_ref=client.V1SecretKeySelector(key="password", name="gke-pg-cluster-app"))), 
+        client.V1EnvVar(name="USERNAME", value_from=client.V1EnvVarSource(secret_key_ref=client.V1SecretKeySelector(key="username", name="gke-pg-cluster-app"))), 
+    ]
+
+    container = client.V1Container(name=container_name, image=container_image, image_pull_policy='Always', env=env_list)
+    template.template.spec = client.V1PodSpec(containers=[container], restart_policy='Never', service_account='embed-docs-sa')
+
+    body.spec = client.V1JobSpec(backoff_limit=3, ttl_seconds_after_finished=60, template=template.template)
+    return body
+# [END gke_databases_postgres_pgvector_docker_embed_endpoint_job]
+def kube_test_credentials():
+    try: 
+        api_response = api_instance.get_api_resources()
+        logging.info(api_response)
+    except ApiException as e:
+        print("Exception when calling API: %s\n" % e)
+
+def kube_create_job(bckt, f_name, id):
+    container_image = os.getenv("JOB_IMAGE")
+    name = "docs-embedder" + id
+    body = kube_create_job_object(name, container_image, bckt, f_name)
+    v1=client.BatchV1Api()
+    try: 
+        v1.create_namespaced_job("pg-ns", body, pretty=True)
+    except ApiException as e:
+        print("Exception when calling BatchV1Api->create_namespaced_job: %s\n" % e)
+    return
+
+if __name__ == '__main__':
+    app.run('0.0.0.0', port=5001, debug=True)
diff --git a/databases/postgres-pgvector/docker/embed-docs/requirements.txt b/databases/postgres-pgvector/docker/embed-docs/requirements.txt
@@ -0,0 +1,15 @@
+google-cloud-storage==2.14.0
+google-cloud-aiplatform==1.41.0
+langchain==0.1.7
+langchain-community==0.0.20
+langchain-google-vertexai==0.0.5
+pgvector==0.2.5
+psycopg2-binary==2.9.9
+pypdf==3.17.4
+click==8.1.7
+Flask==2.3.3
+itsdangerous==2.1.2
+Jinja2==3.1.3
+MarkupSafe==2.1.5
+Werkzeug==2.3.8
+kubernetes==28.1.0
diff --git a/databases/postgres-pgvector/documents/carbon-free-energy.pdf b/databases/postgres-pgvector/documents/carbon-free-energy.pdf