Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add DefichainPython knowledge to JellyChat #113

Merged
merged 11 commits into main
Oct 15, 2023
27 changes: 27 additions & 0 deletions .github/workflows/defichain_python_scraping_production.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Manually-triggered workflow: scrape the DefichainPython documentation and
# write the embeddings into the PRODUCTION Supabase instance.
name: Defichain Python scraping Production

on:
  # Production embeddings are only refreshed on demand, never automatically.
  workflow_dispatch:

jobs:
  run_script:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: 3.8

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
        working-directory: ./job

      - name: Run the script
        run: python ./job/defichainpython_embedding.py
        env:
          # OpenAI key for computing the embeddings.
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          # Production Supabase instance that stores the vectors.
          SUPABASE_URL: ${{ vars.PRODUCTION_SUPABASE_API_URL }}
          SUPABASE_KEY: ${{ secrets.PRODUCTION_SUPABASE_API_ANON_KEY }}
30 changes: 30 additions & 0 deletions .github/workflows/defichain_python_scraping_staging.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Scrape the DefichainPython documentation and write the embeddings into the
# STAGING Supabase instance. Runs on every PR against main (to validate the
# pipeline) and can also be triggered manually.
name: Defichain Python scraping Staging

on:
  pull_request:
    branches:
      - main
  workflow_dispatch:

jobs:
  run_script:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: 3.8

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
        working-directory: ./job

      - name: Embeddings for DefichainPython
        run: python ./job/defichainpython_embedding.py
        env:
          # OpenAI key for computing the embeddings.
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          # Staging Supabase instance that stores the vectors.
          SUPABASE_URL: ${{ vars.STAGING_SUPABASE_API_URL }}
          SUPABASE_KEY: ${{ secrets.STAGING_SUPABASE_API_ANON_KEY }}
2 changes: 1 addition & 1 deletion .github/workflows/wiki_scraping_production.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ jobs:
working-directory: ./job

- name: Run the script
run: python ./job/app.py
run: python ./job/wiki_embedding.py
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
SUPABASE_URL: ${{ vars.PRODUCTION_SUPABASE_API_URL }}
Expand Down
7 changes: 5 additions & 2 deletions .github/workflows/wiki_scraping_staging.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
name: Wiki scraping Staging

on:
pull_request:
branches:
- main
workflow_dispatch:

jobs:
Expand All @@ -19,8 +22,8 @@ jobs:
pip install -r requirements.txt
working-directory: ./job

- name: Run the script
run: python ./job/app.py
- name: Embeddings for DefichainWiki
run: python ./job/wiki_embedding.py
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
SUPABASE_URL: ${{ vars.STAGING_SUPABASE_API_URL }}
Expand Down
3 changes: 2 additions & 1 deletion backend/agent/main_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import langchain

from tools.wiki_qa import wikiTool
from tools.defichainpython_qa import defichainPythonTool
from tools.ocean import oceanTools

from agent.prompt import PROMPT
Expand Down Expand Up @@ -38,7 +39,7 @@ def create_agent(memory, final_output_handler=None):
temperature=0,
)

tools = [wikiTool] + load_tools(["llm-math"], llm=llm_for_math) + oceanTools
tools = [wikiTool, defichainPythonTool] + load_tools(["llm-math"], llm=llm_for_math) + oceanTools

system_message = SystemMessage(content=PROMPT)

Expand Down
102 changes: 102 additions & 0 deletions backend/tools/defichainpython_qa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import os
import json
from typing import List
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from supabase.client import Client, create_client
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.tools import StructuredTool
from langchain.chains.openai_functions import create_structured_output_chain
from langchain.prompts import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
import langchain

# Pull SUPABASE_URL / SUPABASE_KEY (and the OpenAI key used implicitly by
# langchain) from a local .env file when present.
load_dotenv()

# Set debug to True to see A LOT of details of langchain's inner workings
# langchain.debug = True

# The name of the function in Supabase which is used to match the embeddings
matchVectorFunctionName = "match_embeddings_defichain_python"

# Create the supabase client
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)


class ToolInputSchema(BaseModel):
    """Input schema for the tool: a single, self-contained question."""

    question: str = Field(..., description="A fully formed question.")


class KnowledgeAnswer(BaseModel):
    """Structured output the extraction chain forces the LLM to produce."""

    answer: str = Field(..., description="The answer to the question.")
    sources: List[str] = Field(
        ...,
        description="The sources which contributed to the answer.",
    )


# Low temperature keeps the extraction mostly deterministic.
llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.3)

# System message instructs the model to answer strictly from the retrieved
# context; the human message carries the user's question.
prompt_msgs = [
    SystemMessagePromptTemplate.from_template(
        """You're an elite algorithm, answering queries based solely on given context. If the context lacks the answer, state ignorance. If you are not 100% sure tell the user.

Context:
{context}"""
    ),
    HumanMessagePromptTemplate.from_template("{question}"),
]
prompt = ChatPromptTemplate.from_messages(prompt_msgs)

# Chain whose output is coerced into the KnowledgeAnswer schema.
chain = create_structured_output_chain(KnowledgeAnswer, llm, prompt)


def get_answer(question: str) -> str:
    """Answer *question* from the DefichainPython docs knowledge base.

    The question is embedded, the 7 most similar document chunks are fetched
    from Supabase via the match function, and the extraction chain distils a
    structured answer (with sources) from them.

    Returns a formatted "Answer/Sources" string, or a fallback message if
    any step fails (the failure is printed, not raised).
    """
    try:
        # Embed the question so it can be compared against the stored vectors.
        vectors = OpenAIEmbeddings().embed_documents([question])
        embeddings = supabase.rpc(
            matchVectorFunctionName, dict(query_embedding=vectors[0], match_count=7)
        ).execute()

        print(f"⚡ Retrieved {len(embeddings.data)} vectors from Supabase:")
        for entry in embeddings.data:
            print("🔖 Title:", entry["metadata"]["title"])
            print("🌐 Source:", entry["metadata"]["source"])
            print("📊 Similarity:", entry["similarity"])
            print("📄 Content:", entry["content"].replace("\n", " ")[:100] + "...")
            print("-" * 50)

        # Hand the raw matches to the LLM chain as JSON context.
        result = chain.run(context=json.dumps(embeddings.data), question=question)
        print("📝 Result of knowledge extraction chain:", result)

        return f"""Answer: {result.answer}
Sources: {json.dumps(result.sources)}
"""

    except Exception as e:
        print(e)
        # Fallback must point at the DefichainPython docs this tool serves —
        # the previous message was copy-pasted from wiki_qa.py and sent users
        # to the wiki instead.
        return "The DefichainPython knowledgebase is currently not available. We are working on it. Tell the user to use the documentation directly. https://docs.defichain-python.de/"


# Tool description shown to the agent's router; the keywords steer python /
# coding questions to this tool. (Typo fix: "reguarding" -> "regarding".)
description = """Use this if you need to answer any question regarding python and coding in general. Keywords: python, script, coding, connection to a defichain node, connection to ocean API, creating a wallet, create custom transactions. Make sure to include the source of the answer in your response."""

# Structured tool the main agent plugs into its tool list.
defichainPythonTool = StructuredTool(
    name="defichain_python_knowledge",
    description=description,
    func=get_answer,
    args_schema=ToolInputSchema,
)


if __name__ == "__main__":
    # Manual smoke-test loop: repeatedly query the knowledge base from the
    # terminal. (Prompt fixed: this tool answers from the DefichainPython
    # docs, not from DeFiChainWiki — the old text was copy-pasted.)
    while True:
        question = input(
            "Ask something, that can be answered using information from the DefichainPython docs: "
        )
        print("✅", get_answer(question))
11 changes: 5 additions & 6 deletions backend/tools/wiki_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,13 @@
)
import langchain


load_dotenv()

# Set debug to True to see A LOT of details of langchain's inner workings
# langchain.debug = True

# The name of the table in Supabase, where the vectors are stored
vectorTableName = "embeddings"
matchVectorFunctionName = "match_embeddings"

# Create the supabase client
SUPABASE_URL = os.getenv("SUPABASE_URL")
Expand All @@ -42,12 +41,12 @@ class KnowledgeAnswer(BaseModel):
)


llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.7)
llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.3)

prompt_msgs = [
SystemMessagePromptTemplate.from_template(
"""You're an elite algorithm, answering queries based solely on given context. If the context lacks the answer, state ignorance.
"""You're an elite algorithm, answering queries based solely on given context. If the context lacks the answer, state ignorance. If you are not 100% sure tell the user.
Context:
{context}"""
),
Expand All @@ -62,7 +61,7 @@ def get_answer(question: str) -> str:
try:
vectors = OpenAIEmbeddings().embed_documents([question])
embeddings = supabase.rpc(
"match_embeddings", dict(query_embedding=vectors[0], match_count=7)
matchVectorFunctionName, dict(query_embedding=vectors[0], match_count=7)
).execute()

print(f"⚡ Retrieved {len(embeddings.data)} vectors from Supabase:")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
-- Create a table to store embeddings
-- One row per scraped chunk of the DefichainPython documentation.
create table embeddings_defichain_python (
  id UUID primary key,
  content text, -- corresponds to Document.pageContent
  metadata jsonb, -- corresponds to Document.metadata
  embedding vector(1536) -- 1536 works for OpenAI embeddings, change if needed
);

-- Create a function to search for embeddings
-- Returns the match_count rows whose embedding is closest (smallest cosine
-- distance, pgvector's <=> operator) to query_embedding, optionally
-- restricted to rows whose metadata contains the given filter object.
create function match_embeddings_defichain_python (
  query_embedding vector(1536),
  match_count int default null,
  filter jsonb DEFAULT '{}'
) returns table (
  id uuid,
  content text,
  metadata jsonb,
  similarity float
)
language plpgsql
as $$
#variable_conflict use_column
begin
  return query
  select
    id,
    content,
    metadata,
    -- convert cosine distance into a similarity score in [0, 1]
    1 - (embeddings_defichain_python.embedding <=> query_embedding) as similarity
  from embeddings_defichain_python
  where metadata @> filter
  order by embeddings_defichain_python.embedding <=> query_embedding
  limit match_count;
end;
$$;
72 changes: 72 additions & 0 deletions job/defichainpython_embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import os
import re
from langchain.vectorstores import SupabaseVectorStore
from langchain.embeddings.openai import OpenAIEmbeddings
from supabase.client import Client, create_client
from dotenv import load_dotenv
import uuid

from defichainpython_loader import DefichainPythonLoader
from sitemap_parser import get_urls

# Job script: scrape the DefichainPython docs, wipe the old vectors and
# upload fresh embeddings to Supabase.
load_dotenv()

# Supabase table that stores the document embeddings.
vectorTableName = "embeddings_defichain_python"
# Sitemaps listing every documentation page to scrape.
scrapeUrls = ["https://docs.defichain-python.de/build/html/sitemap.xml"]
embedding_model = "text-embedding-ada-002"

supabase: Client = create_client(os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_KEY"))

urls = []

# Get all urls from sitemap
for url in scrapeUrls:
    urls.extend(get_urls(url))
print("🔎 Found %s pages in total" % len(urls))

# Remove duplicates (dict.fromkeys preserves the original order)
urls = list(dict.fromkeys(urls))
print("🔎 Found %s unique pages" % len(urls))

# Pages to exclude from embedding (the search page has no useful content).
# FIX: this must be a list — with a plain string, `url not in remove_urls`
# performs a substring test and would silently drop any URL that happens to
# be a substring of it instead of an exact match.
remove_urls = ["https://docs.defichain-python.de/build/html/search.html"]

urls = [url for url in urls if url not in remove_urls]

print("🔭 Scrape %s found pages.." % len(urls))
print("---")
docs = []
for url in urls:
    loader = DefichainPythonLoader(url)
    docs.extend(loader.load())

print("✅ Scraped all pages")

for doc in docs:
    print("🌐 Source:", doc.metadata["source"])
    print("🔖 Title:", doc.metadata["title"])
    print("📄 Content:", doc.page_content.replace("\n", " ")[:100] + "...")
    print("---")

print("➖ Remove all old documents from table")
# Supabase requires a filter on delete; `neq` against a freshly generated
# UUID matches every existing row, effectively truncating the table.
supabase.table(vectorTableName).delete().neq("id", uuid.uuid1()).execute()
print("✅ Removed all old documents from table")

print("🔮 Embedding..")
embeddings = OpenAIEmbeddings(model=embedding_model)
upload_chunk_size = 200

# Split the documents in chunks for upload (Did time out when too large).
docs_chunks = [
    docs[x : x + upload_chunk_size] for x in range(0, len(docs), upload_chunk_size)
]

# Iterate over each chunk and upload separately.
for doc_chunk in docs_chunks:
    vector_store = SupabaseVectorStore.from_documents(
        doc_chunk,
        embeddings,
        client=supabase,
        table_name=vectorTableName,
    )
print("✅ Embedded")
Loading
Loading