From 81f14fdd0803cde684b9bed413e4fc77b1db0e0c Mon Sep 17 00:00:00 2001
From: 0ptim
Date: Sun, 6 Aug 2023 21:14:47 +0200
Subject: [PATCH 01/11] embed details into metadata

---
 job/defichainpython_loader.py |  86 ++++++++++++++++++++++++++++
 job/new.py                    | 102 ++++++++++++++++++++++++++++++++++
 2 files changed, 188 insertions(+)
 create mode 100644 job/defichainpython_loader.py
 create mode 100644 job/new.py

diff --git a/job/defichainpython_loader.py b/job/defichainpython_loader.py
new file mode 100644
index 0000000..792fb78
--- /dev/null
+++ b/job/defichainpython_loader.py
@@ -0,0 +1,86 @@
+"""Loader that loads from DefichainPython."""
+from typing import List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.web_base import WebBaseLoader
+
+
+class DefichainPythonLoader(WebBaseLoader):
+    """Loader that loads from DefichainPython."""
+
+    def load(self) -> List[Document]:
+        """Load webpage."""
+        soup = self.scrape()
+
+        title_tag = soup.find("h1")
+        if title_tag:
+            title = title_tag.get_text()
+        else:
+            print(self.web_path)
+            raise ValueError("Title tag not found.")
+
+        documents = []
+
+        method_tags = soup.find_all("dl", class_="method")
+        for method_tag in method_tags:
+            area = self.web_path.split("/")[5]
+            tech = self.web_path.split("/")[6]
+            method_signature = method_tag.find("dt").get_text()
+            method_description = method_tag.find("dd").get_text()
+
+            metadata_methods = {
+                "title": title,
+                "source": self.web_path,
+                "area": area,
+                "tech": tech,
+                "class": title,
+                "method": method_signature.split("(")[0].replace("\n", ""),
+            }
+
+            content = method_signature + "\n" + method_description
+
+            document = Document(page_content=content, metadata=metadata_methods)
+            documents.append(document)
+
+        """Embeddings for classes"""
+        class_tags = soup.find_all("dl", class_="class")
+        for class_tag in class_tags:
+            area = self.web_path.split("/")[5]
+            tech = self.web_path.split("/")[6]
+
+            class_full_tag = class_tag.find("dd")
+            all_tags = class_full_tag.find_all("dl")
+
+            method_tags = [tag for tag in all_tags if "method" in tag["class"]]
+
+            for method_tag in method_tags:
+                method_tag.decompose()
+
+            class_signature = class_tag.find("dt").get_text()
+            class_description = " ".join([all_tag.get_text() for all_tag in all_tags])
+
+            content = class_signature + "\n" + class_description
+
+            metadata_class = {
+                "title": title,
+                "source": self.web_path,
+                "area": area,
+                "tech": tech,
+                "class": title,
+            }
+
+            document = Document(page_content=content, metadata=metadata_class)
+            documents.append(document)
+
+        return documents
+
+
+if __name__ == "__main__":
+    loader = DefichainPythonLoader(
+        "https://docs.defichain-python.de/build/html/sdk/hdwallet/wallet.html"
+    )
+    docs = loader.load()
+    for doc in docs:
+        print("Source:", doc.metadata["source"])
+        print("Title:", doc.metadata["title"])
+        print("Content:", doc.page_content)
diff --git a/job/new.py b/job/new.py
new file mode 100644
index 0000000..900e04b
--- /dev/null
+++ b/job/new.py
@@ -0,0 +1,102 @@
+import os
+import re
+from langchain.vectorstores import SupabaseVectorStore
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from supabase.client import Client, create_client
+from dotenv import load_dotenv
+import uuid
+
+from defichainpython_loader import DefichainPythonLoader
+from sitemap_parser import get_urls
+
+load_dotenv()
+
+vectorTableName = "embeddings"
+scrapeUrls = ["https://docs.defichain-python.de/build/html/sitemap.xml"]
+chunk_size = 1000
+chunk_overlap = 50
+embedding_model = "text-embedding-ada-002"
+
+supabase: Client = create_client(os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_KEY"))
+
+urls = []
+
+# Get all urls from sitemap
+for url in scrapeUrls:
+    urls.extend(get_urls(url))
+print("🔎 Found %s pages in total" % len(urls))
+
+# Remove duplicates
+urls = list(dict.fromkeys(urls))
+print("🔎 Found %s unique pages" % len(urls))
+
+
+# Remove for testing
+urls = [url for url in urls if "/wallet" in url]
+
+print("🔭 Scrape %s found pages.." % len(urls))
+print("---")
+docs = []
+for url in urls:
+    loader = DefichainPythonLoader(url)
+    docs.extend(loader.load())
+
+print(f"✅ Scraped all pages")
+
+for doc in docs:
+    print("🌐 Source:", doc.metadata["source"])
+    print("🔖 Title:", doc.metadata["title"])
+    print("📄 Content:", doc.page_content.replace("\n", " ")[:1000] + "...")
+    print("---")
+
+
+print("➖ Remove long strings")
+for document in docs:
+    document.page_content = re.sub(
+        r"(?<=\S)[^\s]{" + str(chunk_size) + ",}(?=\S)", "", document.page_content
+    )
+print("✅ Removed long strings")
+
+
+print("🗨 Split into chunks..")
+text_splitter = RecursiveCharacterTextSplitter(
+    chunk_size=chunk_size,
+    chunk_overlap=chunk_overlap,
+    length_function=len,
+    separators=["\n\n", "\n", " ", ""],
+)
+docs = text_splitter.split_documents(docs)
+print("✅ Split into %s chunks" % len(docs))
+
+# import tiktoken
+
+# enc = tiktoken.get_encoding("cl100k_base")
+# for doc in docs:
+#     print("🔖 Title:", doc.metadata["title"])
+#     print("📄 Content:", doc.page_content.replace("\n", " ")[:100] + "...")
+#     tokens = enc.encode(doc.page_content)
+#     print("⚡ Tokens:", len(tokens))
+
+print("➖ Remove all old documents from table")
+supabase.table(vectorTableName).delete().neq("id", uuid.uuid1()).execute()
+print("✅ Removed all old documents from table")
+
+print("🔮 Embedding..")
+embeddings = OpenAIEmbeddings(model=embedding_model)
+upload_chunk_size = 200
+
+# Split the documents in chunks for upload (Did time out when too large).
+docs_chunks = [
+    docs[x : x + upload_chunk_size] for x in range(0, len(docs), upload_chunk_size)
+]
+
+# Iterate over each chunk and upload separately.
+for doc_chunk in docs_chunks:
+    vector_store = SupabaseVectorStore.from_documents(
+        doc_chunk,
+        embeddings,
+        client=supabase,
+        table_name=vectorTableName,
+    )
+print("✅ Embedded")
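One detail of new.py above is worth spelling out: the wipe step `delete().neq("id", uuid.uuid1())` is a filter trick, not a bug. The delete builder wants a filter, and no stored row id can equal a UUID generated at call time, so the inequality matches the whole table. A minimal sketch of the same idea (the helper name `clear_table` is ours, not part of the job scripts):

    import os
    import uuid

    from supabase.client import Client, create_client

    def clear_table(supabase: Client, table_name: str) -> None:
        # "id != <fresh uuid>" is true for every existing row, so this
        # clears the table while still giving the builder a filter.
        supabase.table(table_name).delete().neq("id", uuid.uuid1()).execute()

    supabase = create_client(os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_KEY"))
    clear_table(supabase, "embeddings")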
From 23ad7bea255bf66b45cdbbe0caa339ce23be64d8 Mon Sep 17 00:00:00 2001
From: eric-volz
Date: Mon, 7 Aug 2023 01:00:22 +0200
Subject: [PATCH 02/11] remove urls that should not be indexed

---
 job/new.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/job/new.py b/job/new.py
index 900e04b..005c588 100644
--- a/job/new.py
+++ b/job/new.py
@@ -32,8 +32,12 @@
 print("🔎 Found %s unique pages" % len(urls))
 
 
-# Remove for testing
-urls = [url for url in urls if "/wallet" in url]
+# Remove urls
+remove_urls = (
+    "https://docs.defichain-python.de/build/html/search.html"
+)
+
+urls = [url for url in urls if url not in remove_urls]
 
 print("🔭 Scrape %s found pages.." % len(urls))
 print("---")
@@ -47,10 +51,9 @@
 for doc in docs:
     print("🌐 Source:", doc.metadata["source"])
     print("🔖 Title:", doc.metadata["title"])
-    print("📄 Content:", doc.page_content.replace("\n", " ")[:1000] + "...")
+    print("📄 Content:", doc.page_content.replace("\n", " ")[:100] + "...")
     print("---")
 
-
 print("➖ Remove long strings")
 for document in docs:
     document.page_content = re.sub(

From 2d2c0df01a3d7a37c2267b73f859957701646619 Mon Sep 17 00:00:00 2001
From: eric-volz
Date: Tue, 8 Aug 2023 19:21:14 +0200
Subject: [PATCH 03/11] add embeddings for normal text

---
 job/defichainpython_loader.py | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/job/defichainpython_loader.py b/job/defichainpython_loader.py
index 792fb78..b76da03 100644
--- a/job/defichainpython_loader.py
+++ b/job/defichainpython_loader.py
@@ -8,13 +8,19 @@
 class DefichainPythonLoader(WebBaseLoader):
     """Loader that loads from DefichainPython."""
 
+    @staticmethod
+    def replace_enter(text: str) -> str:
+        while text.find("\n\n") != -1:
+            text = text.replace("\n\n", "\n")
+        return text
+
     def load(self) -> List[Document]:
         """Load webpage."""
         soup = self.scrape()
 
-        title_tag = soup.find("h1")
-        if title_tag:
-            title = title_tag.get_text()
+        title_tags = soup.find_all("h1")
+        if title_tags:
+            title = ", ".join([tag.get_text() for tag in title_tags])
         else:
             print(self.web_path)
             raise ValueError("Title tag not found.")
@@ -72,12 +78,30 @@
             document = Document(page_content=content, metadata=metadata_class)
             documents.append(document)
 
+        """Embeddings for normal text"""
+        article_tag = soup.find("article")
+        all_tags = article_tag.find_all("dl")
+
+        class_tags = [tag for tag in all_tags if "class" in tag["class"]]
+
+        for class_tag in class_tags:
+            class_tag.decompose()
+
+        content = DefichainPythonLoader.replace_enter(article_tag.get_text())
+
+        metadata_class = {
+            "title": title,
+            "source": self.web_path,
+        }
+
+        document = Document(page_content=content, metadata=metadata_class)
+        documents.append(document)
         return documents
 
 
 if __name__ == "__main__":
     loader = DefichainPythonLoader(
-        "https://docs.defichain-python.de/build/html/sdk/hdwallet/wallet.html"
+        "https://docs.defichain-python.de/build/html/guides/example/chainedTransactions.html"
     )
     docs = loader.load()
     for doc in docs:

From 4cd43ffb8938ddf61499c3ac8feee27b79027673 Mon Sep 17 00:00:00 2001
From: eric-volz
Date: Tue, 8 Aug 2023 19:23:06 +0200
Subject: [PATCH 04/11] rename files that do the embedding

---
 job/{new.py => defichainpython_embedding.py} | 0
 job/{app.py => wiki_embedding.py}            | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename job/{new.py => defichainpython_embedding.py} (100%)
 rename job/{app.py => wiki_embedding.py} (100%)

diff --git a/job/new.py b/job/defichainpython_embedding.py
similarity index 100%
rename from job/new.py
rename to job/defichainpython_embedding.py
diff --git a/job/app.py b/job/wiki_embedding.py
similarity index 100%
rename from job/app.py
rename to job/wiki_embedding.py
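Since the next patch leans heavily on it, it helps to pin down what the `replace_enter` helper introduced in PATCH 03 does: it collapses every run of consecutive newlines into a single newline. A standalone sketch using the same loop:

    text = "Heading\n\n\n\nBody paragraph."
    while text.find("\n\n") != -1:
        # Each pass shrinks the runs of newlines; the loop stops once
        # no double newline is left.
        text = text.replace("\n\n", "\n")
    print(repr(text))  # 'Heading\nBody paragraph.'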
From c1753462758577e1cc3dc7382986078ba62d69a9 Mon Sep 17 00:00:00 2001
From: eric-volz
Date: Sun, 13 Aug 2023 21:10:46 +0200
Subject: [PATCH 05/11] complete rework of embeddings creation for
 DefichainPython

---
 job/defichainpython_embedding.py |  33 +----
 job/defichainpython_loader.py    | 240 ++++++++++++++++++++++---
 2 files changed, 176 insertions(+), 97 deletions(-)

diff --git a/job/defichainpython_embedding.py b/job/defichainpython_embedding.py
index 005c588..dac507c 100644
--- a/job/defichainpython_embedding.py
+++ b/job/defichainpython_embedding.py
@@ -2,7 +2,6 @@
 import re
 from langchain.vectorstores import SupabaseVectorStore
 from langchain.embeddings.openai import OpenAIEmbeddings
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 from supabase.client import Client, create_client
 from dotenv import load_dotenv
 import uuid
@@ -14,8 +13,6 @@
 
 vectorTableName = "embeddings"
 scrapeUrls = ["https://docs.defichain-python.de/build/html/sitemap.xml"]
-chunk_size = 1000
-chunk_overlap = 50
 embedding_model = "text-embedding-ada-002"
 
 supabase: Client = create_client(os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_KEY"))
@@ -31,7 +28,6 @@
 urls = list(dict.fromkeys(urls))
 print("🔎 Found %s unique pages" % len(urls))
 
-
 # Remove urls
 remove_urls = (
     "https://docs.defichain-python.de/build/html/search.html"
@@ -54,33 +50,6 @@
     print("📄 Content:", doc.page_content.replace("\n", " ")[:100] + "...")
     print("---")
 
-print("➖ Remove long strings")
-for document in docs:
-    document.page_content = re.sub(
-        r"(?<=\S)[^\s]{" + str(chunk_size) + ",}(?=\S)", "", document.page_content
-    )
-print("✅ Removed long strings")
-
-
-print("🗨 Split into chunks..")
-text_splitter = RecursiveCharacterTextSplitter(
-    chunk_size=chunk_size,
-    chunk_overlap=chunk_overlap,
-    length_function=len,
-    separators=["\n\n", "\n", " ", ""],
-)
-docs = text_splitter.split_documents(docs)
-print("✅ Split into %s chunks" % len(docs))
-
-# import tiktoken
-
-# enc = tiktoken.get_encoding("cl100k_base")
-# for doc in docs:
-#     print("🔖 Title:", doc.metadata["title"])
-#     print("📄 Content:", doc.page_content.replace("\n", " ")[:100] + "...")
-#     tokens = enc.encode(doc.page_content)
-#     print("⚡ Tokens:", len(tokens))
-
 print("➖ Remove all old documents from table")
 supabase.table(vectorTableName).delete().neq("id", uuid.uuid1()).execute()
 print("✅ Removed all old documents from table")
@@ -91,7 +60,7 @@
 
 # Split the documents in chunks for upload (Did time out when too large).
 docs_chunks = [
-    docs[x : x + upload_chunk_size] for x in range(0, len(docs), upload_chunk_size)
+    docs[x: x + upload_chunk_size] for x in range(0, len(docs), upload_chunk_size)
 ]
 
 # Iterate over each chunk and upload separately.
diff --git a/job/defichainpython_loader.py b/job/defichainpython_loader.py
index b76da03..38503fe 100644
--- a/job/defichainpython_loader.py
+++ b/job/defichainpython_loader.py
@@ -1,8 +1,9 @@
 """Loader that loads from DefichainPython."""
-from typing import List
+from typing import List, Dict
 
 from langchain.docstore.document import Document
 from langchain.document_loaders.web_base import WebBaseLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 
 class DefichainPythonLoader(WebBaseLoader):
@@ -14,97 +15,206 @@
         text = text.replace("\n\n", "\n")
         return text
 
-    def load(self) -> List[Document]:
-        """Load webpage."""
+    @staticmethod
+    def split_documents(docs: List[Document]):
+        chunk_size = 800
+        chunk_overlap = 50
+
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            length_function=len,
+            separators=["\n\n", "\n", " ", ""],
+        )
+        return text_splitter.split_documents(docs)
+
+    @staticmethod
+    def append_metadata(docs: List[Document]):
+        for doc in docs:
+            header = "---\n"
+
+            for key in doc.metadata:
+                header += f"{key.capitalize()}: {doc.metadata.get(key)}\n"
+
+            header += "---\n"
+            doc.page_content = header + doc.page_content
+        return docs
+
+    def to_json(self) -> Dict:
+        """
+        The Content of a webpage will be compressed into a JSON format:
+
+        {
+            title: str
+            source: str
+            area: str
+            tech: str
+            page_description: str
+            classes: [
+                {
+                    class_name: str
+                    class_signature: str
+                    class_description: str
+                    class_methods: [
+                        method_name: str
+                        method_signature: str
+                        method_description: str
+                    ]
+                }
+            ]
+            functions: [
+                function_name: str
+                function_signature: str
+                function_description: str
+            ]
+        }
+        """
+
+        page_json = {}
+
         soup = self.scrape()
 
+        """Title and Source"""
         title_tags = soup.find_all("h1")
         if title_tags:
-            title = ", ".join([tag.get_text() for tag in title_tags])
+            title = ", ".join([tag.get_text().replace("#", "") for tag in title_tags])
         else:
             print(self.web_path)
             raise ValueError("Title tag not found.")
+        for tag in title_tags:
+            tag.decompose()
 
-        documents = []
+        page_json.update({"title": title, "source": self.web_path})
 
-        method_tags = soup.find_all("dl", class_="method")
-        for method_tag in method_tags:
-            area = self.web_path.split("/")[5]
-            tech = self.web_path.split("/")[6]
-            method_signature = method_tag.find("dt").get_text()
-            method_description = method_tag.find("dd").get_text()
-
-            metadata_methods = {
-                "title": title,
-                "source": self.web_path,
-                "area": area,
-                "tech": tech,
-                "class": title,
-                "method": method_signature.split("(")[0].replace("\n", ""),
-            }
-
-            content = method_signature + "\n" + method_description
-
-            document = Document(page_content=content, metadata=metadata_methods)
-            documents.append(document)
-
-        """Embeddings for classes"""
-        class_tags = soup.find_all("dl", class_="class")
-        for class_tag in class_tags:
-            area = self.web_path.split("/")[5]
-            tech = self.web_path.split("/")[6]
+        """Area"""
+        area = self.web_path.split("/")[5:][:-1]
+        area = area[0] if area else ""
+        page_json.update({"area": area})
 
-            class_full_tag = class_tag.find("dd")
-            all_tags = class_full_tag.find_all("dl")
+        """Tech"""
+        tech = "/".join(self.web_path.split("/")[6:][:-1])
+        page_json.update({"tech": tech})
 
-            method_tags = [tag for tag in all_tags if "method" in tag["class"]]
+        """Classes and Methods"""
+        classes_list = []
+
+        class_tags = soup.find_all("dl", class_="class")  # Find all classes
+        for class_tag in class_tags:
+            class_content = class_tag.find("dd")
+            class_methods_list = []
 
+            method_tags = class_content.find_all("dl", class_="method")  # Find all methods inside the class
             for method_tag in method_tags:
+                method_signature = method_tag.find("dt").get_text().replace("#", "").replace("\n", "")
+                method_name = method_signature.split("(")[0]
+                method_description = DefichainPythonLoader.replace_enter(method_tag.find("dd").get_text())
+
+                class_methods_list.append({"method_name": method_name,
+                                           "method_signature": method_signature,
+                                           "method_description": method_description})
                 method_tag.decompose()
 
-            class_signature = class_tag.find("dt").get_text()
-            class_description = " ".join([all_tag.get_text() for all_tag in all_tags])
+            class_signature = class_tag.find("dt").get_text().replace("#", "").replace("\n", "")
+            class_name = class_signature.split("(")[0].split(".")[-1]
+            class_description = DefichainPythonLoader.replace_enter(class_content.get_text())
 
-            content = class_signature + "\n" + class_description
+            classes_list.append({"class_name": class_name,
+                                 "class_signature": class_signature,
+                                 "class_description": class_description,
+                                 "class_methods": class_methods_list})
 
-            metadata_class = {
-                "title": title,
-                "source": self.web_path,
-                "area": area,
-                "tech": tech,
-                "class": title,
-            }
+            class_tag.decompose()
 
-            document = Document(page_content=content, metadata=metadata_class)
-            documents.append(document)
+        """Functions"""
+        functions_list = []
+        functions_tags = soup.find_all("dl", class_="function")  # Find all functions
+        for function_tag in functions_tags:
+            function_signature = function_tag.find("dt").get_text().replace("#", "").replace("\n", "")
+            function_name = function_signature.split("(")[0].split(".")[-1]
+            function_description = DefichainPythonLoader.replace_enter(function_tag.find("dd").get_text())
 
-        """Embeddings for normal text"""
-        article_tag = soup.find("article")
-        all_tags = article_tag.find_all("dl")
+            functions_list.append({"function_name": function_name,
+                                   "function_signature": function_signature,
+                                   "function_description": function_description})
+            function_tag.decompose()
 
-        class_tags = [tag for tag in all_tags if "class" in tag["class"]]
+        """Page Description"""
+        article = soup.find("article")
+        page_description = DefichainPythonLoader.replace_enter(article.get_text()).replace("#", "")
 
-        for class_tag in class_tags:
-            class_tag.decompose()
+        page_json.update({"page_description": page_description})
+        page_json.update({"classes": classes_list})
+        page_json.update({"functions": functions_list})
 
-        content = DefichainPythonLoader.replace_enter(article_tag.get_text())
+        return page_json
 
-        metadata_class = {
-            "title": title,
-            "source": self.web_path,
-        }
+    def load(self) -> List[Document]:
+        """
+        Load DefichainPython WebPage
+        """
+
+        page_json = self.to_json()
+        documents = []
+
+        base_metadata = {"title": page_json.get("title"),
+                         "source": page_json.get("source"),
+                         "area": page_json.get("area"),
+                         "tech": page_json.get("tech")}
+
+        """Page Content"""
+        page_content_docs = DefichainPythonLoader.split_documents(
+            [Document(page_content=page_json.get("page_description"), metadata=base_metadata)])
+        page_content_docs = DefichainPythonLoader.append_metadata(page_content_docs)
+        documents.extend(page_content_docs)
+
+        """Classes"""
+        classes = page_json.get("classes")
+        for class_ in classes:
+            class_content = f'{class_.get("class_signature")}\n{class_.get("class_description")}'
+            class_metadata = base_metadata.copy()
+            class_metadata.update({"class_name": class_.get("class_name")})
+
+            class_content_docs = DefichainPythonLoader.split_documents(
+                [Document(page_content=class_content, metadata=class_metadata)])
+            class_content_docs = DefichainPythonLoader.append_metadata(class_content_docs)
+
+            documents.extend(class_content_docs)
+
+            """Methods"""
+            methods = class_.get("class_methods")
+            for method in methods:
+                method_content = f'{method.get("method_signature")}\n{method.get("method_description")}'
+                method_metadata = base_metadata.copy()
+                method_metadata.update({"class_name": class_.get("class_name"),
+                                        "method_name": method.get("method_name")})
+
+                method_content_docs = DefichainPythonLoader.split_documents(
+                    [Document(page_content=method_content, metadata=method_metadata)])
+                method_content_docs = DefichainPythonLoader.append_metadata(method_content_docs)
+                documents.extend(method_content_docs)
+
+        """Functions"""
+        functions = page_json.get("functions")
+        for function in functions:
+            function_content = f'{function.get("function_signature")}\n{function.get("function_description")}'
+            function_metadata = base_metadata.copy()
+            function_metadata.update({"function_name": function.get("function_name")})
+
+            function_content_docs = DefichainPythonLoader.split_documents(
+                [Document(page_content=function_content, metadata=function_metadata)])
+            function_content_docs = DefichainPythonLoader.append_metadata(function_content_docs)
+            documents.extend(function_content_docs)
 
-        document = Document(page_content=content, metadata=metadata_class)
-        documents.append(document)
         return documents
 
 
 if __name__ == "__main__":
-    loader = DefichainPythonLoader(
-        "https://docs.defichain-python.de/build/html/guides/example/chainedTransactions.html"
-    )
+    url = "https://docs.defichain-python.de/build/html/api/node/index.html"
+    loader = DefichainPythonLoader(url)
     docs = loader.load()
+
     for doc in docs:
-        print("Source:", doc.metadata["source"])
-        print("Title:", doc.metadata["title"])
-        print("Content:", doc.page_content)
+        for key in doc.metadata:
+            print(f"{key.capitalize()}: {doc.metadata.get(key)}")
+        print("Content:", doc.page_content.split("---")[2].replace("\n", "\\n")[:100])
+        print("----------------------------------------")
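A design note on the rework above: `append_metadata` prepends a front-matter style header built from each chunk's metadata, so every embedded chunk carries its own provenance, and the `__main__` block's `split("---")[2]` strips that header again for display. Roughly, a method chunk comes out like the following (the concrete values are invented for this sketch):

    # Illustrative shape of one chunk after append_metadata; note that
    # str.capitalize() turns "class_name" into "Class_name".
    chunk = (
        "---\n"
        "Title: Node\n"
        "Source: https://docs.defichain-python.de/build/html/api/node/index.html\n"
        "Area: api\n"
        "Tech: node\n"
        "Class_name: Node\n"
        "Method_name: getblockcount\n"
        "---\n"
        "getblockcount()\n"
        "Returns the height of the most-work fully-validated chain."
    )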
From 7355fe84111fbe75be2b24d39e4479c6cc985e55 Mon Sep 17 00:00:00 2001
From: eric-volz
Date: Tue, 5 Sep 2023 01:08:42 +0200
Subject: [PATCH 06/11] add defichain_qa tool

---
 backend/agent/main_agent.py         |   3 +-
 backend/tools/defichainpython_qa.py | 101 ++++++++++++++++++++++++++++
 2 files changed, 103 insertions(+), 1 deletion(-)
 create mode 100644 backend/tools/defichainpython_qa.py

diff --git a/backend/agent/main_agent.py b/backend/agent/main_agent.py
index 82d8fa6..59e4314 100644
--- a/backend/agent/main_agent.py
+++ b/backend/agent/main_agent.py
@@ -8,6 +8,7 @@
 import langchain
 
 from tools.wiki_qa import wikiTool
+from tools.defichainpython_qa import defichainPythonTool
 from tools.ocean import oceanTools
 from agent.prompt import PROMPT
 
@@ -38,7 +39,7 @@ def create_agent(memory, final_output_handler=None):
         temperature=0,
     )
 
-    tools = [wikiTool] + load_tools(["llm-math"], llm=llm_for_math) + oceanTools
+    tools = [wikiTool, defichainPythonTool] + load_tools(["llm-math"], llm=llm_for_math) + oceanTools
 
     system_message = SystemMessage(content=PROMPT)
 
diff --git a/backend/tools/defichainpython_qa.py b/backend/tools/defichainpython_qa.py
new file mode 100644
index 0000000..a26163f
--- /dev/null
+++ b/backend/tools/defichainpython_qa.py
@@ -0,0 +1,101 @@
+import os
+import json
+from typing import List
+from dotenv import load_dotenv
+from pydantic import BaseModel, Field
+from supabase.client import Client, create_client
+from langchain.chat_models import ChatOpenAI
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.tools import StructuredTool
+from langchain.chains.openai_functions import create_structured_output_chain
+from langchain.prompts import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+import langchain
+
+load_dotenv()
+
+# Set debug to True to see A LOT of details of langchain's inner workings
+# langchain.debug = True
+
+# The name of the table in Supabase, where the vectors are stored
+vectorTableName = "embeddings"
+
+# Create the supabase client
+SUPABASE_URL = os.getenv("SUPABASE_URL")
+SUPABASE_KEY = os.getenv("SUPABASE_KEY")
+supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
+
+
+class ToolInputSchema(BaseModel):
+    question: str = Field(..., description="A fully formed question.")
+
+
+class KnowledgeAnswer(BaseModel):
+    answer: str = Field(..., description="The answer to the question.")
+    sources: List[str] = Field(
+        ...,
+        description="The sources which contributed to the answer.",
+    )
+
+
+llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.7)
+
+prompt_msgs = [
+    SystemMessagePromptTemplate.from_template(
+        """You're an elite algorithm, answering queries based solely on given context. If the context lacks the answer, state ignorance. If you are not 100% sure, tell the user.
+
+        Context:
+        {context}"""
+    ),
+    HumanMessagePromptTemplate.from_template("{question}"),
+]
+prompt = ChatPromptTemplate.from_messages(prompt_msgs)
+
+chain = create_structured_output_chain(KnowledgeAnswer, llm, prompt)
+
+
+def get_answer(question: str) -> str:
+    try:
+        vectors = OpenAIEmbeddings().embed_documents([question])
+        embeddings = supabase.rpc(
+            "match_embeddings", dict(query_embedding=vectors[0], match_count=7)
+        ).execute()
+
+        print(f"⚡ Retrieved {len(embeddings.data)} vectors from Supabase:")
+        for entry in embeddings.data:
+            print("🔖 Title:", entry["metadata"]["title"])
+            print("🌐 Source:", entry["metadata"]["source"])
+            print("📊 Similarity:", entry["similarity"])
+            print("📄 Content:", entry["content"].replace("\n", " ")[:100] + "...")
+            print("-" * 50)
+
+        result = chain.run(context=json.dumps(embeddings.data), question=question)
+        print("📝 Result of knowledge extraction chain:", result)
+
+        return f"""Answer: {result.answer}
+        Sources: {json.dumps(result.sources)}
+        """
+
+    except Exception as e:
+        print(e)
+        return "The wiki knowledgebase is currently not available. We are working on it. Tell the user to use the wiki directly. https://www.defichainwiki.com/"
+
+
+description = """Use this if you need to answer any question regarding python for defichain: coding, connection to a defichain node, connection to ocean, creating a wallet, create custom transactions. Make sure to include the source of the answer in your response."""
+
+defichainPythonTool = StructuredTool(
+    name="defichain_python_knowledge",
+    description=description,
+    func=get_answer,
+    args_schema=ToolInputSchema,
+)
+
+if __name__ == "__main__":
+    while True:
+        question = input(
+            "Ask something, that can be answered using information from DeFiChainWiki: "
+        )
+        print("✅", get_answer(question))

From 0393830476732dbca4d517928a3f04636d068b53 Mon Sep 17 00:00:00 2001
From: eric-volz
Date: Mon, 18 Sep 2023 15:54:49 +0200
Subject: [PATCH 07/11] remove text metadata from embeddings

---
 job/defichainpython_loader.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/job/defichainpython_loader.py b/job/defichainpython_loader.py
index 38503fe..9e9560a 100644
--- a/job/defichainpython_loader.py
+++ b/job/defichainpython_loader.py
@@ -164,7 +164,7 @@ def load(self) -> List[Document]:
         """Page Content"""
         page_content_docs = DefichainPythonLoader.split_documents(
             [Document(page_content=page_json.get("page_description"), metadata=base_metadata)])
-        page_content_docs = DefichainPythonLoader.append_metadata(page_content_docs)
+        #page_content_docs = DefichainPythonLoader.append_metadata(page_content_docs)
         documents.extend(page_content_docs)
 
         """Classes"""
@@ -176,7 +176,7 @@ def load(self) -> List[Document]:
 
             class_content_docs = DefichainPythonLoader.split_documents(
                 [Document(page_content=class_content, metadata=class_metadata)])
-            class_content_docs = DefichainPythonLoader.append_metadata(class_content_docs)
+            #class_content_docs = DefichainPythonLoader.append_metadata(class_content_docs)
 
             documents.extend(class_content_docs)
@@ -190,7 +190,7 @@ def load(self) -> List[Document]:
 
                 method_content_docs = DefichainPythonLoader.split_documents(
                     [Document(page_content=method_content, metadata=method_metadata)])
-                method_content_docs = DefichainPythonLoader.append_metadata(method_content_docs)
+                #method_content_docs = DefichainPythonLoader.append_metadata(method_content_docs)
                 documents.extend(method_content_docs)
 
         """Functions"""
@@ -202,7 +202,7 @@ def load(self) -> List[Document]:
 
             function_content_docs = DefichainPythonLoader.split_documents(
                 [Document(page_content=function_content, metadata=function_metadata)])
-            function_content_docs = DefichainPythonLoader.append_metadata(function_content_docs)
+            #function_content_docs = DefichainPythonLoader.append_metadata(function_content_docs)
             documents.extend(function_content_docs)
 
         return documents
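To try the tool from PATCH 06 on its own, outside the agent, a quick smoke test along these lines should work once the OpenAI and Supabase environment variables are set (the question text is only an example):

    from tools.defichainpython_qa import defichainPythonTool

    # StructuredTool validates the input against ToolInputSchema, so the
    # arguments are passed as a dict with a "question" key.
    answer = defichainPythonTool.run(
        {"question": "How do I create a wallet with defichain-python?"}
    )
    print(answer)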
https://www.defichainwiki.com/" -description = """Use this if you need to answer any question reguarding python for defichain:coding, connection to a defichain node, connection to ocean, creating a wallet, create custom transactions. Make sure to include the source of the answer in your response.""" +description = """Use this if you need to answer any question reguarding python and coding in general. Keywords: python, script, coding, connection to a defichain node, connection to ocean API, creating a wallet, create custom transactions. Make sure to include the source of the answer in your response.""" defichainPythonTool = StructuredTool( name="defichain_python_knowledge", From 3cec657fc4ed0ba7693ad6091e686426d883daa0 Mon Sep 17 00:00:00 2001 From: eric-volz Date: Mon, 18 Sep 2023 15:55:31 +0200 Subject: [PATCH 09/11] update embeddings workflow --- .github/workflows/wiki_scraping_staging.yml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/wiki_scraping_staging.yml b/.github/workflows/wiki_scraping_staging.yml index 363261b..f241ef5 100644 --- a/.github/workflows/wiki_scraping_staging.yml +++ b/.github/workflows/wiki_scraping_staging.yml @@ -19,8 +19,15 @@ jobs: pip install -r requirements.txt working-directory: ./job - - name: Run the script - run: python ./job/app.py + - name: Embeddings for DefichainWiki + run: python ./job/wiki_embedding.py + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + SUPABASE_URL: ${{ vars.STAGING_SUPABASE_API_URL }} + SUPABASE_KEY: ${{ secrets.STAGING_SUPABASE_API_ANON_KEY }} + + - name: Embeddings for DefichainPython + run: python ./job/defichainpython_embedding.py env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} SUPABASE_URL: ${{ vars.STAGING_SUPABASE_API_URL }} From ffd8802b59ad93c1e7c8ed2c50b8694a4893612a Mon Sep 17 00:00:00 2001 From: eric-volz Date: Mon, 18 Sep 2023 16:09:55 +0200 Subject: [PATCH 10/11] add scraping when new request on main branch --- .github/workflows/wiki_scraping_staging.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/wiki_scraping_staging.yml b/.github/workflows/wiki_scraping_staging.yml index f241ef5..8dc4bf1 100644 --- a/.github/workflows/wiki_scraping_staging.yml +++ b/.github/workflows/wiki_scraping_staging.yml @@ -1,6 +1,9 @@ name: Wiki scraping Staging on: + pull_request: + branches: + - main workflow_dispatch: jobs: From 6db10acc9c0e2e68747781a983f2cd7e6b02649b Mon Sep 17 00:00:00 2001 From: 0ptim Date: Sun, 15 Oct 2023 17:42:08 +0200 Subject: [PATCH 11/11] Finalize defichain python integration --- .../defichain_python_scraping_production.yml | 27 ++++++++++++++ .../defichain_python_scraping_staging.yml | 30 ++++++++++++++++ .../workflows/wiki_scraping_production.yml | 2 +- .github/workflows/wiki_scraping_staging.yml | 7 ---- backend/tools/defichainpython_qa.py | 7 ++-- backend/tools/wiki_qa.py | 11 +++--- ...150642_add_defichain_python_embeddings.sql | 35 +++++++++++++++++++ job/defichainpython_embedding.py | 8 ++--- 8 files changed, 105 insertions(+), 22 deletions(-) create mode 100644 .github/workflows/defichain_python_scraping_production.yml create mode 100644 .github/workflows/defichain_python_scraping_staging.yml create mode 100644 data/supabase/migrations/20231015150642_add_defichain_python_embeddings.sql diff --git a/.github/workflows/defichain_python_scraping_production.yml b/.github/workflows/defichain_python_scraping_production.yml new file mode 100644 index 0000000..b129f0a --- /dev/null +++ 
b/.github/workflows/defichain_python_scraping_production.yml @@ -0,0 +1,27 @@ +name: Defichain Python scraping Production + +on: + workflow_dispatch: + +jobs: + run_script: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.8 + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + working-directory: ./job + + - name: Run the script + run: python ./job/defichainpython_embedding.py + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + SUPABASE_URL: ${{ vars.PRODUCTION_SUPABASE_API_URL }} + SUPABASE_KEY: ${{ secrets.PRODUCTION_SUPABASE_API_ANON_KEY }} diff --git a/.github/workflows/defichain_python_scraping_staging.yml b/.github/workflows/defichain_python_scraping_staging.yml new file mode 100644 index 0000000..9920664 --- /dev/null +++ b/.github/workflows/defichain_python_scraping_staging.yml @@ -0,0 +1,30 @@ +name: Defichain Python scraping Staging + +on: + pull_request: + branches: + - main + workflow_dispatch: + +jobs: + run_script: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.8 + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + working-directory: ./job + + - name: Embeddings for DefichainPython + run: python ./job/defichainpython_embedding.py + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + SUPABASE_URL: ${{ vars.STAGING_SUPABASE_API_URL }} + SUPABASE_KEY: ${{ secrets.STAGING_SUPABASE_API_ANON_KEY }} diff --git a/.github/workflows/wiki_scraping_production.yml b/.github/workflows/wiki_scraping_production.yml index 3c13db0..b642572 100644 --- a/.github/workflows/wiki_scraping_production.yml +++ b/.github/workflows/wiki_scraping_production.yml @@ -53,7 +53,7 @@ jobs: working-directory: ./job - name: Run the script - run: python ./job/app.py + run: python ./job/wiki_embedding.py env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} SUPABASE_URL: ${{ vars.PRODUCTION_SUPABASE_API_URL }} diff --git a/.github/workflows/wiki_scraping_staging.yml b/.github/workflows/wiki_scraping_staging.yml index 8dc4bf1..fa28b3c 100644 --- a/.github/workflows/wiki_scraping_staging.yml +++ b/.github/workflows/wiki_scraping_staging.yml @@ -28,10 +28,3 @@ jobs: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} SUPABASE_URL: ${{ vars.STAGING_SUPABASE_API_URL }} SUPABASE_KEY: ${{ secrets.STAGING_SUPABASE_API_ANON_KEY }} - - - name: Embeddings for DefichainPython - run: python ./job/defichainpython_embedding.py - env: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - SUPABASE_URL: ${{ vars.STAGING_SUPABASE_API_URL }} - SUPABASE_KEY: ${{ secrets.STAGING_SUPABASE_API_ANON_KEY }} diff --git a/backend/tools/defichainpython_qa.py b/backend/tools/defichainpython_qa.py index 7bfc619..4e15e7f 100644 --- a/backend/tools/defichainpython_qa.py +++ b/backend/tools/defichainpython_qa.py @@ -20,8 +20,8 @@ # Set debug to True to see A LOT of details of langchain's inner workings # langchain.debug = True -# The name of the table in Supabase, where the vectors are stored -vectorTableName = "embeddings" +# The name of the function in Supabase which is used to match the embeddings +matchVectorFunctionName = "match_embeddings_defichain_python" # Create the supabase client SUPABASE_URL = os.getenv("SUPABASE_URL") @@ -61,7 +61,7 @@ def get_answer(question: str) -> str: try: vectors = 
OpenAIEmbeddings().embed_documents([question]) embeddings = supabase.rpc( - "match_embeddings", dict(query_embedding=vectors[0], match_count=7) + matchVectorFunctionName, dict(query_embedding=vectors[0], match_count=7) ).execute() print(f"⚡ Retrieved {len(embeddings.data)} vectors from Supabase:") @@ -93,6 +93,7 @@ def get_answer(question: str) -> str: args_schema=ToolInputSchema, ) + if __name__ == "__main__": while True: question = input( diff --git a/backend/tools/wiki_qa.py b/backend/tools/wiki_qa.py index 607fe11..7188574 100644 --- a/backend/tools/wiki_qa.py +++ b/backend/tools/wiki_qa.py @@ -15,14 +15,13 @@ ) import langchain - load_dotenv() # Set debug to True to see A LOT of details of langchain's inner workings # langchain.debug = True # The name of the table in Supabase, where the vectors are stored -vectorTableName = "embeddings" +matchVectorFunctionName = "match_embeddings" # Create the supabase client SUPABASE_URL = os.getenv("SUPABASE_URL") @@ -42,12 +41,12 @@ class KnowledgeAnswer(BaseModel): ) -llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.7) +llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.3) prompt_msgs = [ SystemMessagePromptTemplate.from_template( - """You're an elite algorithm, answering queries based solely on given context. If the context lacks the answer, state ignorance. - + """You're an elite algorithm, answering queries based solely on given context. If the context lacks the answer, state ignorance. If you are not 100% sure tell the user. + Context: {context}""" ), @@ -62,7 +61,7 @@ def get_answer(question: str) -> str: try: vectors = OpenAIEmbeddings().embed_documents([question]) embeddings = supabase.rpc( - "match_embeddings", dict(query_embedding=vectors[0], match_count=7) + matchVectorFunctionName, dict(query_embedding=vectors[0], match_count=7) ).execute() print(f"⚡ Retrieved {len(embeddings.data)} vectors from Supabase:") diff --git a/data/supabase/migrations/20231015150642_add_defichain_python_embeddings.sql b/data/supabase/migrations/20231015150642_add_defichain_python_embeddings.sql new file mode 100644 index 0000000..433923a --- /dev/null +++ b/data/supabase/migrations/20231015150642_add_defichain_python_embeddings.sql @@ -0,0 +1,35 @@ +-- Create a table to store embeddings +create table embeddings_defichain_python ( + id UUID primary key, + content text, -- corresponds to Document.pageContent + metadata jsonb, -- corresponds to Document.metadata + embedding vector(1536) -- 1536 works for OpenAI embeddings, change if needed +); + +-- Create a function to search for embeddings +create function match_embeddings_defichain_python ( + query_embedding vector(1536), + match_count int default null, + filter jsonb DEFAULT '{}' +) returns table ( + id uuid, + content text, + metadata jsonb, + similarity float +) +language plpgsql +as $$ +#variable_conflict use_column +begin + return query + select + id, + content, + metadata, + 1 - (embeddings_defichain_python.embedding <=> query_embedding) as similarity + from embeddings_defichain_python + where metadata @> filter + order by embeddings_defichain_python.embedding <=> query_embedding + limit match_count; +end; +$$; diff --git a/job/defichainpython_embedding.py b/job/defichainpython_embedding.py index dac507c..e50249f 100644 --- a/job/defichainpython_embedding.py +++ b/job/defichainpython_embedding.py @@ -11,7 +11,7 @@ load_dotenv() -vectorTableName = "embeddings" +vectorTableName = "embeddings_defichain_python" scrapeUrls = 
["https://docs.defichain-python.de/build/html/sitemap.xml"] embedding_model = "text-embedding-ada-002" @@ -29,9 +29,7 @@ print("🔎 Found %s unique pages" % len(urls)) # Remove urls -remove_urls = ( - "https://docs.defichain-python.de/build/html/search.html" -) +remove_urls = "https://docs.defichain-python.de/build/html/search.html" urls = [url for url in urls if url not in remove_urls] @@ -60,7 +58,7 @@ # Split the documents in chunks for upload (Did time out when too large). docs_chunks = [ - docs[x: x + upload_chunk_size] for x in range(0, len(docs), upload_chunk_size) + docs[x : x + upload_chunk_size] for x in range(0, len(docs), upload_chunk_size) ] # Iterate over each chunk and upload separately.