From 7ced8f06a65644b94503c3ad501ea50902c40ad1 Mon Sep 17 00:00:00 2001
From: 0ptim
Date: Sun, 15 Oct 2023 17:42:08 +0200
Subject: [PATCH] Finalize defichain python integration

---
 .../defichain_python_scraping_production.yml   | 27 ++++++++++++++
 .../defichain_python_scraping_staging.yml      | 30 ++++++++++++++++
 .github/workflows/wiki_scraping_production.yml |  2 +-
 .github/workflows/wiki_scraping_staging.yml    |  7 ----
 backend/tools/defichainpython_qa.py            |  7 ++--
 backend/tools/wiki_qa.py                       | 11 +++---
 ...150642_add_defichain_python_embeddings.sql  | 35 +++++++++++++++++++
 job/defichainpython_embedding.py               |  8 ++---
 8 files changed, 105 insertions(+), 22 deletions(-)
 create mode 100644 .github/workflows/defichain_python_scraping_production.yml
 create mode 100644 .github/workflows/defichain_python_scraping_staging.yml
 create mode 100644 data/supabase/migrations/20231015150642_add_defichain_python_embeddings.sql

diff --git a/.github/workflows/defichain_python_scraping_production.yml b/.github/workflows/defichain_python_scraping_production.yml
new file mode 100644
index 0000000..b129f0a
--- /dev/null
+++ b/.github/workflows/defichain_python_scraping_production.yml
@@ -0,0 +1,27 @@
+name: Defichain Python scraping Production
+
+on:
+  workflow_dispatch:
+
+jobs:
+  run_script:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.8
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+        working-directory: ./job
+
+      - name: Run the script
+        run: python ./job/defichainpython_embedding.py
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          SUPABASE_URL: ${{ vars.PRODUCTION_SUPABASE_API_URL }}
+          SUPABASE_KEY: ${{ secrets.PRODUCTION_SUPABASE_API_ANON_KEY }}
diff --git a/.github/workflows/defichain_python_scraping_staging.yml b/.github/workflows/defichain_python_scraping_staging.yml
new file mode 100644
index 0000000..9920664
--- /dev/null
+++ b/.github/workflows/defichain_python_scraping_staging.yml
@@ -0,0 +1,30 @@
+name: Defichain Python scraping Staging
+
+on:
+  pull_request:
+    branches:
+      - main
+  workflow_dispatch:
+
+jobs:
+  run_script:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.8
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+        working-directory: ./job
+
+      - name: Embeddings for DefichainPython
+        run: python ./job/defichainpython_embedding.py
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          SUPABASE_URL: ${{ vars.STAGING_SUPABASE_API_URL }}
+          SUPABASE_KEY: ${{ secrets.STAGING_SUPABASE_API_ANON_KEY }}
diff --git a/.github/workflows/wiki_scraping_production.yml b/.github/workflows/wiki_scraping_production.yml
index 3c13db0..b642572 100644
--- a/.github/workflows/wiki_scraping_production.yml
+++ b/.github/workflows/wiki_scraping_production.yml
@@ -53,7 +53,7 @@ jobs:
         working-directory: ./job
 
       - name: Run the script
-        run: python ./job/app.py
+        run: python ./job/wiki_embedding.py
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           SUPABASE_URL: ${{ vars.PRODUCTION_SUPABASE_API_URL }}
diff --git a/.github/workflows/wiki_scraping_staging.yml b/.github/workflows/wiki_scraping_staging.yml
index 8dc4bf1..fa28b3c 100644
--- a/.github/workflows/wiki_scraping_staging.yml
+++ b/.github/workflows/wiki_scraping_staging.yml
@@ -28,10 +28,3 @@ jobs:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          SUPABASE_URL: ${{ vars.STAGING_SUPABASE_API_URL }}
          SUPABASE_KEY: ${{ secrets.STAGING_SUPABASE_API_ANON_KEY }}
-
-      - name: Embeddings for DefichainPython
-        run: python ./job/defichainpython_embedding.py
-        env:
-          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-          SUPABASE_URL: ${{ vars.STAGING_SUPABASE_API_URL }}
-          SUPABASE_KEY: ${{ secrets.STAGING_SUPABASE_API_ANON_KEY }}
diff --git a/backend/tools/defichainpython_qa.py b/backend/tools/defichainpython_qa.py
index 7bfc619..4e15e7f 100644
--- a/backend/tools/defichainpython_qa.py
+++ b/backend/tools/defichainpython_qa.py
@@ -20,8 +20,8 @@
 # Set debug to True to see A LOT of details of langchain's inner workings
 # langchain.debug = True
 
-# The name of the table in Supabase, where the vectors are stored
-vectorTableName = "embeddings"
+# The name of the function in Supabase which is used to match the embeddings
+matchVectorFunctionName = "match_embeddings_defichain_python"
 
 # Create the supabase client
 SUPABASE_URL = os.getenv("SUPABASE_URL")
@@ -61,7 +61,7 @@ def get_answer(question: str) -> str:
     try:
         vectors = OpenAIEmbeddings().embed_documents([question])
         embeddings = supabase.rpc(
-            "match_embeddings", dict(query_embedding=vectors[0], match_count=7)
+            matchVectorFunctionName, dict(query_embedding=vectors[0], match_count=7)
         ).execute()
 
         print(f"⚡ Retrieved {len(embeddings.data)} vectors from Supabase:")
@@ -93,6 +93,7 @@ def get_answer(question: str) -> str:
     args_schema=ToolInputSchema,
 )
 
+
 if __name__ == "__main__":
     while True:
         question = input(
diff --git a/backend/tools/wiki_qa.py b/backend/tools/wiki_qa.py
index 607fe11..7188574 100644
--- a/backend/tools/wiki_qa.py
+++ b/backend/tools/wiki_qa.py
@@ -15,14 +15,13 @@
 )
 import langchain
 
-
 load_dotenv()
 
 # Set debug to True to see A LOT of details of langchain's inner workings
 # langchain.debug = True
 
-# The name of the table in Supabase, where the vectors are stored
-vectorTableName = "embeddings"
+# The name of the function in Supabase which is used to match the embeddings
+matchVectorFunctionName = "match_embeddings"
 
 # Create the supabase client
 SUPABASE_URL = os.getenv("SUPABASE_URL")
@@ -42,12 +41,12 @@ class KnowledgeAnswer(BaseModel):
     )
 
 
-llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.7)
+llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.3)
 
 prompt_msgs = [
     SystemMessagePromptTemplate.from_template(
-        """You're an elite algorithm, answering queries based solely on given context. If the context lacks the answer, state ignorance.
-
+        """You're an elite algorithm, answering queries based solely on given context. If the context lacks the answer, state ignorance. If you are not 100% sure, tell the user.
+
         Context:
         {context}"""
     ),
@@ -62,7 +61,7 @@ def get_answer(question: str) -> str:
     try:
         vectors = OpenAIEmbeddings().embed_documents([question])
         embeddings = supabase.rpc(
-            "match_embeddings", dict(query_embedding=vectors[0], match_count=7)
+            matchVectorFunctionName, dict(query_embedding=vectors[0], match_count=7)
         ).execute()
 
         print(f"⚡ Retrieved {len(embeddings.data)} vectors from Supabase:")
diff --git a/data/supabase/migrations/20231015150642_add_defichain_python_embeddings.sql b/data/supabase/migrations/20231015150642_add_defichain_python_embeddings.sql
new file mode 100644
index 0000000..433923a
--- /dev/null
+++ b/data/supabase/migrations/20231015150642_add_defichain_python_embeddings.sql
@@ -0,0 +1,35 @@
+-- Create a table to store embeddings
+create table embeddings_defichain_python (
+  id UUID primary key,
+  content text, -- corresponds to Document.pageContent
+  metadata jsonb, -- corresponds to Document.metadata
+  embedding vector(1536) -- 1536 works for OpenAI embeddings, change if needed
+);
+
+-- Create a function to search for embeddings
+create function match_embeddings_defichain_python (
+  query_embedding vector(1536),
+  match_count int default null,
+  filter jsonb DEFAULT '{}'
+) returns table (
+  id uuid,
+  content text,
+  metadata jsonb,
+  similarity float
+)
+language plpgsql
+as $$
+#variable_conflict use_column
+begin
+  return query
+  select
+    id,
+    content,
+    metadata,
+    1 - (embeddings_defichain_python.embedding <=> query_embedding) as similarity
+  from embeddings_defichain_python
+  where metadata @> filter
+  order by embeddings_defichain_python.embedding <=> query_embedding
+  limit match_count;
+end;
+$$;
diff --git a/job/defichainpython_embedding.py b/job/defichainpython_embedding.py
index dac507c..e50249f 100644
--- a/job/defichainpython_embedding.py
+++ b/job/defichainpython_embedding.py
@@ -11,7 +11,7 @@
 
 load_dotenv()
 
-vectorTableName = "embeddings"
+vectorTableName = "embeddings_defichain_python"
 
 scrapeUrls = ["https://docs.defichain-python.de/build/html/sitemap.xml"]
 embedding_model = "text-embedding-ada-002"
@@ -29,9 +29,7 @@
 print("🔎 Found %s unique pages" % len(urls))
 
 # Remove urls
-remove_urls = (
-    "https://docs.defichain-python.de/build/html/search.html"
-)
+remove_urls = ["https://docs.defichain-python.de/build/html/search.html"]
 
 urls = [url for url in urls if url not in remove_urls]
 
@@ -60,7 +58,7 @@
 
 # Split the documents in chunks for upload (Did time out when too large).
 docs_chunks = [
-    docs[x : x + upload_chunk_size] for x in range(0, len(docs), upload_chunk_size)
+    docs[x : x + upload_chunk_size] for x in range(0, len(docs), upload_chunk_size)
]
 
 # Iterate over each chunk and upload separately.
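
A quick way to sanity-check the migration after applying this patch is to call the new match function directly in SQL. The snippet below is a minimal sketch, not part of the patch itself: it assumes the pgvector extension is already enabled in the target database and that the embedding job has populated embeddings_defichain_python. The constant dummy vector stands in for a real 1536-dimensional query embedding (the output size of text-embedding-ada-002).

-- Minimal smoke test (a sketch, not part of the patch): assumes pgvector
-- is enabled and the embedding job has populated the table. Replace the
-- constant dummy vector with a real 1536-dimensional query embedding.
select id, content, similarity
from match_embeddings_defichain_python(
    query_embedding := array_fill(0.1::real, array[1536])::vector(1536),
    match_count     := 3
);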