From 81f14fdd0803cde684b9bed413e4fc77b1db0e0c Mon Sep 17 00:00:00 2001
From: 0ptim
Date: Sun, 6 Aug 2023 21:14:47 +0200
Subject: [PATCH 01/11] embed details into metadata

---
 job/defichainpython_loader.py |  86 ++++++++++++++++++++++++++++
 job/new.py                    | 102 ++++++++++++++++++++++++++++++++++
 2 files changed, 188 insertions(+)
 create mode 100644 job/defichainpython_loader.py
 create mode 100644 job/new.py

diff --git a/job/defichainpython_loader.py b/job/defichainpython_loader.py
new file mode 100644
index 0000000..792fb78
--- /dev/null
+++ b/job/defichainpython_loader.py
@@ -0,0 +1,86 @@
+"""Loader that loads from DefichainPython."""
+from typing import List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.web_base import WebBaseLoader
+
+
+class DefichainPythonLoader(WebBaseLoader):
+    """Loader that loads from DefichainPython."""
+
+    def load(self) -> List[Document]:
+        """Load webpage."""
+        soup = self.scrape()
+
+        title_tag = soup.find("h1")
+        if title_tag:
+            title = title_tag.get_text()
+        else:
+            print(self.web_path)
+            raise ValueError("Title tag not found.")
+
+        documents = []
+
+        method_tags = soup.find_all("dl", class_="method")
+        for method_tag in method_tags:
+            area = self.web_path.split("/")[5]
+            tech = self.web_path.split("/")[6]
+            method_signature = method_tag.find("dt").get_text()
+            method_description = method_tag.find("dd").get_text()
+
+            metadata_methods = {
+                "title": title,
+                "source": self.web_path,
+                "area": area,
+                "tech": tech,
+                "class": title,
+                "method": method_signature.split("(")[0].replace("\n", ""),
+            }
+
+            content = method_signature + "\n" + method_description
+
+            document = Document(page_content=content, metadata=metadata_methods)
+            documents.append(document)
+
+        """Embeddings for classes"""
+        class_tags = soup.find_all("dl", class_="class")
+        for class_tag in class_tags:
+            area = self.web_path.split("/")[5]
+            tech = self.web_path.split("/")[6]
+
+            class_full_tag = class_tag.find("dd")
+            all_tags = class_full_tag.find_all("dl")
+
+            method_tags = [tag for tag in all_tags if "method" in tag["class"]]
+
+            for method_tag in method_tags:
+                method_tag.decompose()
+
+            class_signature = class_tag.find("dt").get_text()
+            class_description = " ".join([all_tag.get_text() for all_tag in all_tags])
+
+            content = class_signature + "\n" + class_description
+
+            metadata_class = {
+                "title": title,
+                "source": self.web_path,
+                "area": area,
+                "tech": tech,
+                "class": title,
+            }
+
+            document = Document(page_content=content, metadata=metadata_class)
+            documents.append(document)
+
+        return documents
+
+
+if __name__ == "__main__":
+    loader = DefichainPythonLoader(
+        "https://docs.defichain-python.de/build/html/sdk/hdwallet/wallet.html"
+    )
+    docs = loader.load()
+    for doc in docs:
+        print("Source:", doc.metadata["source"])
+        print("Title:", doc.metadata["title"])
+        print("Content:", doc.page_content)
diff --git a/job/new.py b/job/new.py
new file mode 100644
index 0000000..900e04b
--- /dev/null
+++ b/job/new.py
@@ -0,0 +1,102 @@
+import os
+import re
+from langchain.vectorstores import SupabaseVectorStore
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from supabase.client import Client, create_client
+from dotenv import load_dotenv
+import uuid
+
+from defichainpython_loader import DefichainPythonLoader
+from sitemap_parser import get_urls
+
+load_dotenv()
+
+vectorTableName = "embeddings"
+scrapeUrls = ["https://docs.defichain-python.de/build/html/sitemap.xml"]
+chunk_size = 1000
+chunk_overlap = 50
+embedding_model = "text-embedding-ada-002"
+
+supabase: Client = create_client(os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_KEY"))
+
+urls = []
+
+# Get all urls from sitemap
+for url in scrapeUrls:
+    urls.extend(get_urls(url))
+print("🔎 Found %s pages in total" % len(urls))
+
+# Remove duplicates
+urls = list(dict.fromkeys(urls))
+print("🔎 Found %s unique pages" % len(urls))
+
+
+# Remove for testing
+urls = [url for url in urls if "/wallet" in url]
+
+print("🔭 Scrape %s found pages.." % len(urls))
+print("---")
+docs = []
+for url in urls:
+    loader = DefichainPythonLoader(url)
+    docs.extend(loader.load())
+
+print(f"✅ Scraped all pages")
+
+for doc in docs:
+    print("🌐 Source:", doc.metadata["source"])
+    print("🔖 Title:", doc.metadata["title"])
+    print("📄 Content:", doc.page_content.replace("\n", " ")[:1000] + "...")
+    print("---")
+
+
+print("➖ Remove long strings")
+for document in docs:
+    document.page_content = re.sub(
+        r"(?<=\S)[^\s]{" + str(chunk_size) + ",}(?=\S)", "", document.page_content
+    )
+print("✅ Removed long strings")
+
+
+print("🗨 Split into chunks..")
+text_splitter = RecursiveCharacterTextSplitter(
+    chunk_size=chunk_size,
+    chunk_overlap=chunk_overlap,
+    length_function=len,
+    separators=["\n\n", "\n", " ", ""],
+)
+docs = text_splitter.split_documents(docs)
+print("✅ Split into %s chunks" % len(docs))
+
+# import tiktoken
+
+# enc = tiktoken.get_encoding("cl100k_base")
+# for doc in docs:
+#     print("🔖 Title:", doc.metadata["title"])
+#     print("📄 Content:", doc.page_content.replace("\n", " ")[:100] + "...")
+#     tokens = enc.encode(doc.page_content)
+#     print("⚡ Tokens:", len(tokens))
+
+print("➖ Remove all old documents from table")
+supabase.table(vectorTableName).delete().neq("id", uuid.uuid1()).execute()
+print("✅ Removed all old documents from table")
+
+print("🔮 Embedding..")
+embeddings = OpenAIEmbeddings(model=embedding_model)
+upload_chunk_size = 200
+
+# Split the documents in chunks for upload (Did time out when too large).
+docs_chunks = [
+    docs[x : x + upload_chunk_size] for x in range(0, len(docs), upload_chunk_size)
+]
+
+# Iterate over each chunk and upload separately.
+for doc_chunk in docs_chunks:
+    vector_store = SupabaseVectorStore.from_documents(
+        doc_chunk,
+        embeddings,
+        client=supabase,
+        table_name=vectorTableName,
+    )
+print("✅ Embedded")
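One detail of new.py above is worth spelling out: the wipe step `delete().neq("id", uuid.uuid1())` is a filter trick, not a bug. The delete builder wants a filter, and no stored row id can equal a UUID generated at call time, so the inequality matches the whole table. A minimal sketch of the same idea (the helper name `clear_table` is ours, not part of the job scripts):

    import os
    import uuid

    from supabase.client import Client, create_client

    def clear_table(supabase: Client, table_name: str) -> None:
        # "id != <fresh uuid>" is true for every existing row, so this
        # clears the table while still giving the builder a filter.
        supabase.table(table_name).delete().neq("id", uuid.uuid1()).execute()

    supabase = create_client(os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_KEY"))
    clear_table(supabase, "embeddings")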
From 23ad7bea255bf66b45cdbbe0caa339ce23be64d8 Mon Sep 17 00:00:00 2001
From: eric-volz
Date: Mon, 7 Aug 2023 01:00:22 +0200
Subject: [PATCH 02/11] remove urls that should not be indexed

---
 job/new.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/job/new.py b/job/new.py
index 900e04b..005c588 100644
--- a/job/new.py
+++ b/job/new.py
@@ -32,8 +32,12 @@
 print("🔎 Found %s unique pages" % len(urls))
 
 
-# Remove for testing
-urls = [url for url in urls if "/wallet" in url]
+# Remove urls
+remove_urls = (
+    "https://docs.defichain-python.de/build/html/search.html"
+)
+
+urls = [url for url in urls if url not in remove_urls]
 
 print("🔭 Scrape %s found pages.." % len(urls))
 print("---")
@@ -47,10 +51,9 @@
 for doc in docs:
     print("🌐 Source:", doc.metadata["source"])
     print("🔖 Title:", doc.metadata["title"])
-    print("📄 Content:", doc.page_content.replace("\n", " ")[:1000] + "...")
+    print("📄 Content:", doc.page_content.replace("\n", " ")[:100] + "...")
     print("---")
 
-
 print("➖ Remove long strings")
 for document in docs:
     document.page_content = re.sub(

From 2d2c0df01a3d7a37c2267b73f859957701646619 Mon Sep 17 00:00:00 2001
From: eric-volz
Date: Tue, 8 Aug 2023 19:21:14 +0200
Subject: [PATCH 03/11] add embeddings for normal text

---
 job/defichainpython_loader.py | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/job/defichainpython_loader.py b/job/defichainpython_loader.py
index 792fb78..b76da03 100644
--- a/job/defichainpython_loader.py
+++ b/job/defichainpython_loader.py
@@ -8,13 +8,19 @@
 class DefichainPythonLoader(WebBaseLoader):
     """Loader that loads from DefichainPython."""
 
+    @staticmethod
+    def replace_enter(text: str) -> str:
+        while text.find("\n\n") != -1:
+            text = text.replace("\n\n", "\n")
+        return text
+
     def load(self) -> List[Document]:
         """Load webpage."""
         soup = self.scrape()
 
-        title_tag = soup.find("h1")
-        if title_tag:
-            title = title_tag.get_text()
+        title_tags = soup.find_all("h1")
+        if title_tags:
+            title = ", ".join([tag.get_text() for tag in title_tags])
         else:
             print(self.web_path)
             raise ValueError("Title tag not found.")
@@ -72,12 +78,30 @@
             document = Document(page_content=content, metadata=metadata_class)
             documents.append(document)
 
+        """Embeddings for normal text"""
+        article_tag = soup.find("article")
+        all_tags = article_tag.find_all("dl")
+
+        class_tags = [tag for tag in all_tags if "class" in tag["class"]]
+
+        for class_tag in class_tags:
+            class_tag.decompose()
+
+        content = DefichainPythonLoader.replace_enter(article_tag.get_text())
+
+        metadata_class = {
+            "title": title,
+            "source": self.web_path,
+        }
+
+        document = Document(page_content=content, metadata=metadata_class)
+        documents.append(document)
         return documents
 
 
 if __name__ == "__main__":
     loader = DefichainPythonLoader(
-        "https://docs.defichain-python.de/build/html/sdk/hdwallet/wallet.html"
+        "https://docs.defichain-python.de/build/html/guides/example/chainedTransactions.html"
     )
     docs = loader.load()
     for doc in docs:

From 4cd43ffb8938ddf61499c3ac8feee27b79027673 Mon Sep 17 00:00:00 2001
From: eric-volz
Date: Tue, 8 Aug 2023 19:23:06 +0200
Subject: [PATCH 04/11] rename files that do the embedding

---
 job/{new.py => defichainpython_embedding.py} | 0
 job/{app.py => wiki_embedding.py}            | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename job/{new.py => defichainpython_embedding.py} (100%)
 rename job/{app.py => wiki_embedding.py} (100%)

diff --git a/job/new.py b/job/defichainpython_embedding.py
similarity index 100%
rename from job/new.py
rename to job/defichainpython_embedding.py
diff --git a/job/app.py b/job/wiki_embedding.py
similarity index 100%
rename from job/app.py
rename to job/wiki_embedding.py
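Since the next patch leans heavily on it, it helps to pin down what the `replace_enter` helper introduced in PATCH 03 does: it collapses every run of consecutive newlines into a single newline. A standalone sketch using the same loop:

    text = "Heading\n\n\n\nBody paragraph."
    while text.find("\n\n") != -1:
        # Each pass shrinks the runs of newlines; the loop stops once
        # no double newline is left.
        text = text.replace("\n\n", "\n")
    print(repr(text))  # 'Heading\nBody paragraph.'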
From c1753462758577e1cc3dc7382986078ba62d69a9 Mon Sep 17 00:00:00 2001
From: eric-volz
Date: Sun, 13 Aug 2023 21:10:46 +0200
Subject: [PATCH 05/11] complete rework of embeddings creation for
 DefichainPython

---
 job/defichainpython_embedding.py |  33 +----
 job/defichainpython_loader.py    | 240 ++++++++++++++++++++++---
 2 files changed, 176 insertions(+), 97 deletions(-)

diff --git a/job/defichainpython_embedding.py b/job/defichainpython_embedding.py
index 005c588..dac507c 100644
--- a/job/defichainpython_embedding.py
+++ b/job/defichainpython_embedding.py
@@ -2,7 +2,6 @@
 import re
 from langchain.vectorstores import SupabaseVectorStore
 from langchain.embeddings.openai import OpenAIEmbeddings
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 from supabase.client import Client, create_client
 from dotenv import load_dotenv
 import uuid
@@ -14,8 +13,6 @@
 
 vectorTableName = "embeddings"
 scrapeUrls = ["https://docs.defichain-python.de/build/html/sitemap.xml"]
-chunk_size = 1000
-chunk_overlap = 50
 embedding_model = "text-embedding-ada-002"
 
 supabase: Client = create_client(os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_KEY"))
@@ -31,7 +28,6 @@
 urls = list(dict.fromkeys(urls))
 print("🔎 Found %s unique pages" % len(urls))
 
-
 # Remove urls
 remove_urls = (
     "https://docs.defichain-python.de/build/html/search.html"
@@ -54,33 +50,6 @@
     print("📄 Content:", doc.page_content.replace("\n", " ")[:100] + "...")
     print("---")
 
-print("➖ Remove long strings")
-for document in docs:
-    document.page_content = re.sub(
-        r"(?<=\S)[^\s]{" + str(chunk_size) + ",}(?=\S)", "", document.page_content
-    )
-print("✅ Removed long strings")
-
-
-print("🗨 Split into chunks..")
-text_splitter = RecursiveCharacterTextSplitter(
-    chunk_size=chunk_size,
-    chunk_overlap=chunk_overlap,
-    length_function=len,
-    separators=["\n\n", "\n", " ", ""],
-)
-docs = text_splitter.split_documents(docs)
-print("✅ Split into %s chunks" % len(docs))
-
-# import tiktoken
-
-# enc = tiktoken.get_encoding("cl100k_base")
-# for doc in docs:
-#     print("🔖 Title:", doc.metadata["title"])
-#     print("📄 Content:", doc.page_content.replace("\n", " ")[:100] + "...")
-#     tokens = enc.encode(doc.page_content)
-#     print("⚡ Tokens:", len(tokens))
-
 print("➖ Remove all old documents from table")
 supabase.table(vectorTableName).delete().neq("id", uuid.uuid1()).execute()
 print("✅ Removed all old documents from table")
@@ -91,7 +60,7 @@
 
 # Split the documents in chunks for upload (Did time out when too large).
 docs_chunks = [
-    docs[x : x + upload_chunk_size] for x in range(0, len(docs), upload_chunk_size)
+    docs[x: x + upload_chunk_size] for x in range(0, len(docs), upload_chunk_size)
 ]
 
 # Iterate over each chunk and upload separately.
diff --git a/job/defichainpython_loader.py b/job/defichainpython_loader.py
index b76da03..38503fe 100644
--- a/job/defichainpython_loader.py
+++ b/job/defichainpython_loader.py
@@ -1,8 +1,9 @@
 """Loader that loads from DefichainPython."""
-from typing import List
+from typing import List, Dict
 
 from langchain.docstore.document import Document
 from langchain.document_loaders.web_base import WebBaseLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 
 class DefichainPythonLoader(WebBaseLoader):
@@ -14,97 +15,206 @@
         text = text.replace("\n\n", "\n")
         return text
 
-    def load(self) -> List[Document]:
-        """Load webpage."""
+    @staticmethod
+    def split_documents(docs: List[Document]):
+        chunk_size = 800
+        chunk_overlap = 50
+
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=chunk_size,
+            chunk_overlap=chunk_overlap,
+            length_function=len,
+            separators=["\n\n", "\n", " ", ""],
+        )
+        return text_splitter.split_documents(docs)
+
+    @staticmethod
+    def append_metadata(docs: List[Document]):
+        for doc in docs:
+            header = "---\n"
+
+            for key in doc.metadata:
+                header += f"{key.capitalize()}: {doc.metadata.get(key)}\n"
+
+            header += "---\n"
+            doc.page_content = header + doc.page_content
+        return docs
+
+    def to_json(self) -> Dict:
+        """
+        The Content of a webpage will be compressed into a JSON format:
+
+        {
+            title: str
+            source: str
+            area: str
+            tech: str
+            page_description: str
+            classes: [
+                {
+                    class_name: str
+                    class_signature: str
+                    class_description: str
+                    class_methods: [
+                        method_name: str
+                        method_signature: str
+                        method_description: str
+                    ]
+                }
+            ]
+            functions: [
+                function_name: str
+                function_signature: str
+                function_description: str
+            ]
+        }
+        """
+
+        page_json = {}
+
         soup = self.scrape()
 
+        """Title and Source"""
         title_tags = soup.find_all("h1")
         if title_tags:
-            title = ", ".join([tag.get_text() for tag in title_tags])
+            title = ", ".join([tag.get_text().replace("#", "") for tag in title_tags])
         else:
             print(self.web_path)
             raise ValueError("Title tag not found.")
+        for tag in title_tags:
+            tag.decompose()
 
-        documents = []
+        page_json.update({"title": title, "source": self.web_path})
 
-        method_tags = soup.find_all("dl", class_="method")
-        for method_tag in method_tags:
-            area = self.web_path.split("/")[5]
-            tech = self.web_path.split("/")[6]
-            method_signature = method_tag.find("dt").get_text()
-            method_description = method_tag.find("dd").get_text()
-
-            metadata_methods = {
-                "title": title,
-                "source": self.web_path,
-                "area": area,
-                "tech": tech,
-                "class": title,
-                "method": method_signature.split("(")[0].replace("\n", ""),
-            }
-
-            content = method_signature + "\n" + method_description
-
-            document = Document(page_content=content, metadata=metadata_methods)
-            documents.append(document)
-
-        """Embeddings for classes"""
-        class_tags = soup.find_all("dl", class_="class")
-        for class_tag in class_tags:
-            area = self.web_path.split("/")[5]
-            tech = self.web_path.split("/")[6]
+        """Area"""
+        area = self.web_path.split("/")[5:][:-1]
+        area = area[0] if area else ""
+        page_json.update({"area": area})
 
-            class_full_tag = class_tag.find("dd")
-            all_tags = class_full_tag.find_all("dl")
+        """Tech"""
+        tech = "/".join(self.web_path.split("/")[6:][:-1])
+        page_json.update({"tech": tech})
 
-            method_tags = [tag for tag in all_tags if "method" in tag["class"]]
+        """Classes and Methods"""
+        classes_list = []
+
+        class_tags = soup.find_all("dl", class_="class")  # Find all classes
+        for class_tag in class_tags:
+            class_content = class_tag.find("dd")
+            class_methods_list = []
 
+            method_tags = class_content.find_all("dl", class_="method")  # Find all methods inside the class
             for method_tag in method_tags:
+                method_signature = method_tag.find("dt").get_text().replace("#", "").replace("\n", "")
+                method_name = method_signature.split("(")[0]
+                method_description = DefichainPythonLoader.replace_enter(method_tag.find("dd").get_text())
+
+                class_methods_list.append({"method_name": method_name,
+                                           "method_signature": method_signature,
+                                           "method_description": method_description})
                 method_tag.decompose()
 
-            class_signature = class_tag.find("dt").get_text()
-            class_description = " ".join([all_tag.get_text() for all_tag in all_tags])
+            class_signature = class_tag.find("dt").get_text().replace("#", "").replace("\n", "")
+            class_name = class_signature.split("(")[0].split(".")[-1]
+            class_description = DefichainPythonLoader.replace_enter(class_content.get_text())
 
-            content = class_signature + "\n" + class_description
+            classes_list.append({"class_name": class_name,
+                                 "class_signature": class_signature,
+                                 "class_description": class_description,
+                                 "class_methods": class_methods_list})
 
-            metadata_class = {
-                "title": title,
-                "source": self.web_path,
-                "area": area,
-                "tech": tech,
-                "class": title,
-            }
+            class_tag.decompose()
 
-            document = Document(page_content=content, metadata=metadata_class)
-            documents.append(document)
+        """Functions"""
+        functions_list = []
+        functions_tags = soup.find_all("dl", class_="function")  # Find all functions
+        for function_tag in functions_tags:
+            function_signature = function_tag.find("dt").get_text().replace("#", "").replace("\n", "")
+            function_name = function_signature.split("(")[0].split(".")[-1]
+            function_description = DefichainPythonLoader.replace_enter(function_tag.find("dd").get_text())
 
-        """Embeddings for normal text"""
-        article_tag = soup.find("article")
-        all_tags = article_tag.find_all("dl")
+            functions_list.append({"function_name": function_name,
+                                   "function_signature": function_signature,
+                                   "function_description": function_description})
+            function_tag.decompose()
 
-        class_tags = [tag for tag in all_tags if "class" in tag["class"]]
+        """Page Description"""
+        article = soup.find("article")
+        page_description = DefichainPythonLoader.replace_enter(article.get_text()).replace("#", "")
 
-        for class_tag in class_tags:
-            class_tag.decompose()
+        page_json.update({"page_description": page_description})
+        page_json.update({"classes": classes_list})
+        page_json.update({"functions": functions_list})
 
-        content = DefichainPythonLoader.replace_enter(article_tag.get_text())
+        return page_json
 
-        metadata_class = {
-            "title": title,
-            "source": self.web_path,
-        }
+    def load(self) -> List[Document]:
+        """
+        Load DefichainPython WebPage
+        """
+
+        page_json = self.to_json()
+        documents = []
+
+        base_metadata = {"title": page_json.get("title"),
+                         "source": page_json.get("source"),
+                         "area": page_json.get("area"),
+                         "tech": page_json.get("tech")}
+
+        """Page Content"""
+        page_content_docs = DefichainPythonLoader.split_documents(
+            [Document(page_content=page_json.get("page_description"), metadata=base_metadata)])
+        page_content_docs = DefichainPythonLoader.append_metadata(page_content_docs)
+        documents.extend(page_content_docs)
+
+        """Classes"""
+        classes = page_json.get("classes")
+        for class_ in classes:
+            class_content = f'{class_.get("class_signature")}\n{class_.get("class_description")}'
+            class_metadata = base_metadata.copy()
+            class_metadata.update({"class_name": class_.get("class_name")})
+
+            class_content_docs = DefichainPythonLoader.split_documents(
+                [Document(page_content=class_content, metadata=class_metadata)])
+            class_content_docs = DefichainPythonLoader.append_metadata(class_content_docs)
+
+            documents.extend(class_content_docs)
+
+            """Methods"""
+            methods = class_.get("class_methods")
+            for method in methods:
+                method_content = f'{method.get("method_signature")}\n{method.get("method_description")}'
+                method_metadata = base_metadata.copy()
+                method_metadata.update({"class_name": class_.get("class_name"),
+                                        "method_name": method.get("method_name")})
+
+                method_content_docs = DefichainPythonLoader.split_documents(
+                    [Document(page_content=method_content, metadata=method_metadata)])
+                method_content_docs = DefichainPythonLoader.append_metadata(method_content_docs)
+                documents.extend(method_content_docs)
+
+        """Functions"""
+        functions = page_json.get("functions")
+        for function in functions:
+            function_content = f'{function.get("function_signature")}\n{function.get("function_description")}'
+            function_metadata = base_metadata.copy()
+            function_metadata.update({"function_name": function.get("function_name")})
+
+            function_content_docs = DefichainPythonLoader.split_documents(
+                [Document(page_content=function_content, metadata=function_metadata)])
+            function_content_docs = DefichainPythonLoader.append_metadata(function_content_docs)
+            documents.extend(function_content_docs)
 
-        document = Document(page_content=content, metadata=metadata_class)
-        documents.append(document)
         return documents
 
 
 if __name__ == "__main__":
-    loader = DefichainPythonLoader(
-        "https://docs.defichain-python.de/build/html/guides/example/chainedTransactions.html"
-    )
+    url = "https://docs.defichain-python.de/build/html/api/node/index.html"
+    loader = DefichainPythonLoader(url)
     docs = loader.load()
+
     for doc in docs:
-        print("Source:", doc.metadata["source"])
-        print("Title:", doc.metadata["title"])
-        print("Content:", doc.page_content)
+        for key in doc.metadata:
+            print(f"{key.capitalize()}: {doc.metadata.get(key)}")
+        print("Content:", doc.page_content.split("---")[2].replace("\n", "\\n")[:100])
+        print("----------------------------------------")
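A design note on the rework above: `append_metadata` prepends a front-matter style header built from each chunk's metadata, so every embedded chunk carries its own provenance, and the `__main__` block's `split("---")[2]` strips that header again for display. Roughly, a method chunk comes out like the following (the concrete values are invented for this sketch):

    # Illustrative shape of one chunk after append_metadata; note that
    # str.capitalize() turns "class_name" into "Class_name".
    chunk = (
        "---\n"
        "Title: Node\n"
        "Source: https://docs.defichain-python.de/build/html/api/node/index.html\n"
        "Area: api\n"
        "Tech: node\n"
        "Class_name: Node\n"
        "Method_name: getblockcount\n"
        "---\n"
        "getblockcount()\n"
        "Returns the height of the most-work fully-validated chain."
    )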
From 7355fe84111fbe75be2b24d39e4479c6cc985e55 Mon Sep 17 00:00:00 2001
From: eric-volz
Date: Tue, 5 Sep 2023 01:08:42 +0200
Subject: [PATCH 06/11] add defichain_qa tool

---
 backend/agent/main_agent.py         |   3 +-
 backend/tools/defichainpython_qa.py | 101 ++++++++++++++++++++++++++++
 2 files changed, 103 insertions(+), 1 deletion(-)
 create mode 100644 backend/tools/defichainpython_qa.py

diff --git a/backend/agent/main_agent.py b/backend/agent/main_agent.py
index 82d8fa6..59e4314 100644
--- a/backend/agent/main_agent.py
+++ b/backend/agent/main_agent.py
@@ -8,6 +8,7 @@
 import langchain
 
 from tools.wiki_qa import wikiTool
+from tools.defichainpython_qa import defichainPythonTool
 from tools.ocean import oceanTools
 from agent.prompt import PROMPT
 
@@ -38,7 +39,7 @@ def create_agent(memory, final_output_handler=None):
         temperature=0,
     )
 
-    tools = [wikiTool] + load_tools(["llm-math"], llm=llm_for_math) + oceanTools
+    tools = [wikiTool, defichainPythonTool] + load_tools(["llm-math"], llm=llm_for_math) + oceanTools
 
     system_message = SystemMessage(content=PROMPT)
 
diff --git a/backend/tools/defichainpython_qa.py b/backend/tools/defichainpython_qa.py
new file mode 100644
index 0000000..a26163f
--- /dev/null
+++ b/backend/tools/defichainpython_qa.py
@@ -0,0 +1,101 @@
+import os
+import json
+from typing import List
+from dotenv import load_dotenv
+from pydantic import BaseModel, Field
+from supabase.client import Client, create_client
+from langchain.chat_models import ChatOpenAI
+from langchain.embeddings.openai import OpenAIEmbeddings
+from langchain.tools import StructuredTool
+from langchain.chains.openai_functions import create_structured_output_chain
+from langchain.prompts import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+import langchain
+
+load_dotenv()
+
+# Set debug to True to see A LOT of details of langchain's inner workings
+# langchain.debug = True
+
+# The name of the table in Supabase, where the vectors are stored
+vectorTableName = "embeddings"
+
+# Create the supabase client
+SUPABASE_URL = os.getenv("SUPABASE_URL")
+SUPABASE_KEY = os.getenv("SUPABASE_KEY")
+supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
+
+
+class ToolInputSchema(BaseModel):
+    question: str = Field(..., description="A fully formed question.")
+
+
+class KnowledgeAnswer(BaseModel):
+    answer: str = Field(..., description="The answer to the question.")
+    sources: List[str] = Field(
+        ...,
+        description="The sources which contributed to the answer.",
+    )
+
+
+llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.7)
+
+prompt_msgs = [
+    SystemMessagePromptTemplate.from_template(
+        """You're an elite algorithm, answering queries based solely on given context. If the context lacks the answer, state ignorance. If you are not 100% sure, tell the user.
+
+        Context:
+        {context}"""
+    ),
+    HumanMessagePromptTemplate.from_template("{question}"),
+]
+prompt = ChatPromptTemplate.from_messages(prompt_msgs)
+
+chain = create_structured_output_chain(KnowledgeAnswer, llm, prompt)
+
+
+def get_answer(question: str) -> str:
+    try:
+        vectors = OpenAIEmbeddings().embed_documents([question])
+        embeddings = supabase.rpc(
+            "match_embeddings", dict(query_embedding=vectors[0], match_count=7)
+        ).execute()
+
+        print(f"⚡ Retrieved {len(embeddings.data)} vectors from Supabase:")
+        for entry in embeddings.data:
+            print("🔖 Title:", entry["metadata"]["title"])
+            print("🌐 Source:", entry["metadata"]["source"])
+            print("📊 Similarity:", entry["similarity"])
+            print("📄 Content:", entry["content"].replace("\n", " ")[:100] + "...")
+            print("-" * 50)
+
+        result = chain.run(context=json.dumps(embeddings.data), question=question)
+        print("📝 Result of knowledge extraction chain:", result)
+
+        return f"""Answer: {result.answer}
+        Sources: {json.dumps(result.sources)}
+        """
+
+    except Exception as e:
+        print(e)
+        return "The wiki knowledgebase is currently not available. We are working on it. Tell the user to use the wiki directly. https://www.defichainwiki.com/"
+
+
+description = """Use this if you need to answer any question regarding python for defichain: coding, connection to a defichain node, connection to ocean, creating a wallet, create custom transactions. Make sure to include the source of the answer in your response."""
+
+defichainPythonTool = StructuredTool(
+    name="defichain_python_knowledge",
+    description=description,
+    func=get_answer,
+    args_schema=ToolInputSchema,
+)
+
+if __name__ == "__main__":
+    while True:
+        question = input(
+            "Ask something, that can be answered using information from DeFiChainWiki: "
+        )
+        print("✅", get_answer(question))

From 0393830476732dbca4d517928a3f04636d068b53 Mon Sep 17 00:00:00 2001
From: eric-volz
Date: Mon, 18 Sep 2023 15:54:49 +0200
Subject: [PATCH 07/11] remove text metadata from embeddings

---
 job/defichainpython_loader.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/job/defichainpython_loader.py b/job/defichainpython_loader.py
index 38503fe..9e9560a 100644
--- a/job/defichainpython_loader.py
+++ b/job/defichainpython_loader.py
@@ -164,7 +164,7 @@ def load(self) -> List[Document]:
         """Page Content"""
         page_content_docs = DefichainPythonLoader.split_documents(
             [Document(page_content=page_json.get("page_description"), metadata=base_metadata)])
-        page_content_docs = DefichainPythonLoader.append_metadata(page_content_docs)
+        #page_content_docs = DefichainPythonLoader.append_metadata(page_content_docs)
         documents.extend(page_content_docs)
 
         """Classes"""
@@ -176,7 +176,7 @@ def load(self) -> List[Document]:
 
             class_content_docs = DefichainPythonLoader.split_documents(
                 [Document(page_content=class_content, metadata=class_metadata)])
-            class_content_docs = DefichainPythonLoader.append_metadata(class_content_docs)
+            #class_content_docs = DefichainPythonLoader.append_metadata(class_content_docs)
 
             documents.extend(class_content_docs)
@@ -190,7 +190,7 @@ def load(self) -> List[Document]:
 
                 method_content_docs = DefichainPythonLoader.split_documents(
                     [Document(page_content=method_content, metadata=method_metadata)])
-                method_content_docs = DefichainPythonLoader.append_metadata(method_content_docs)
+                #method_content_docs = DefichainPythonLoader.append_metadata(method_content_docs)
                 documents.extend(method_content_docs)
 
         """Functions"""
@@ -202,7 +202,7 @@ def load(self) -> List[Document]:
 
             function_content_docs = DefichainPythonLoader.split_documents(
                 [Document(page_content=function_content, metadata=function_metadata)])
-            function_content_docs = DefichainPythonLoader.append_metadata(function_content_docs)
+            #function_content_docs = DefichainPythonLoader.append_metadata(function_content_docs)
             documents.extend(function_content_docs)
 
         return documents
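To try the tool from PATCH 06 on its own, outside the agent, a quick smoke test along these lines should work once the OpenAI and Supabase environment variables are set (the question text is only an example):

    from tools.defichainpython_qa import defichainPythonTool

    # StructuredTool validates the input against ToolInputSchema, so the
    # arguments are passed as a dict with a "question" key.
    answer = defichainPythonTool.run(
        {"question": "How do I create a wallet with defichain-python?"}
    )
    print(answer)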
https://www.defichainwiki.com/" -description = """Use this if you need to answer any question reguarding python for defichain:coding, connection to a defichain node, connection to ocean, creating a wallet, create custom transactions. Make sure to include the source of the answer in your response.""" +description = """Use this if you need to answer any question reguarding python and coding in general. Keywords: python, script, coding, connection to a defichain node, connection to ocean API, creating a wallet, create custom transactions. Make sure to include the source of the answer in your response.""" defichainPythonTool = StructuredTool( name="defichain_python_knowledge", From 3cec657fc4ed0ba7693ad6091e686426d883daa0 Mon Sep 17 00:00:00 2001 From: eric-volz Date: Mon, 18 Sep 2023 15:55:31 +0200 Subject: [PATCH 09/11] update embeddings workflow --- .github/workflows/wiki_scraping_staging.yml | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/wiki_scraping_staging.yml b/.github/workflows/wiki_scraping_staging.yml index 363261b..f241ef5 100644 --- a/.github/workflows/wiki_scraping_staging.yml +++ b/.github/workflows/wiki_scraping_staging.yml @@ -19,8 +19,15 @@ jobs: pip install -r requirements.txt working-directory: ./job - - name: Run the script - run: python ./job/app.py + - name: Embeddings for DefichainWiki + run: python ./job/wiki_embedding.py + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + SUPABASE_URL: ${{ vars.STAGING_SUPABASE_API_URL }} + SUPABASE_KEY: ${{ secrets.STAGING_SUPABASE_API_ANON_KEY }} + + - name: Embeddings for DefichainPython + run: python ./job/defichainpython_embedding.py env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} SUPABASE_URL: ${{ vars.STAGING_SUPABASE_API_URL }} From ffd8802b59ad93c1e7c8ed2c50b8694a4893612a Mon Sep 17 00:00:00 2001 From: eric-volz Date: Mon, 18 Sep 2023 16:09:55 +0200 Subject: [PATCH 10/11] add scraping when new request on main branch --- .github/workflows/wiki_scraping_staging.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/wiki_scraping_staging.yml b/.github/workflows/wiki_scraping_staging.yml index f241ef5..8dc4bf1 100644 --- a/.github/workflows/wiki_scraping_staging.yml +++ b/.github/workflows/wiki_scraping_staging.yml @@ -1,6 +1,9 @@ name: Wiki scraping Staging on: + pull_request: + branches: + - main workflow_dispatch: jobs: From 6db10acc9c0e2e68747781a983f2cd7e6b02649b Mon Sep 17 00:00:00 2001 From: 0ptim Date: Sun, 15 Oct 2023 17:42:08 +0200 Subject: [PATCH 11/11] Finalize defichain python integration --- .../defichain_python_scraping_production.yml | 27 ++++++++++++++ .../defichain_python_scraping_staging.yml | 30 ++++++++++++++++ .../workflows/wiki_scraping_production.yml | 2 +- .github/workflows/wiki_scraping_staging.yml | 7 ---- backend/tools/defichainpython_qa.py | 7 ++-- backend/tools/wiki_qa.py | 11 +++--- ...150642_add_defichain_python_embeddings.sql | 35 +++++++++++++++++++ job/defichainpython_embedding.py | 8 ++--- 8 files changed, 105 insertions(+), 22 deletions(-) create mode 100644 .github/workflows/defichain_python_scraping_production.yml create mode 100644 .github/workflows/defichain_python_scraping_staging.yml create mode 100644 data/supabase/migrations/20231015150642_add_defichain_python_embeddings.sql diff --git a/.github/workflows/defichain_python_scraping_production.yml b/.github/workflows/defichain_python_scraping_production.yml new file mode 100644 index 0000000..b129f0a --- /dev/null +++ 
b/.github/workflows/defichain_python_scraping_production.yml @@ -0,0 +1,27 @@ +name: Defichain Python scraping Production + +on: + workflow_dispatch: + +jobs: + run_script: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.8 + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + working-directory: ./job + + - name: Run the script + run: python ./job/defichainpython_embedding.py + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + SUPABASE_URL: ${{ vars.PRODUCTION_SUPABASE_API_URL }} + SUPABASE_KEY: ${{ secrets.PRODUCTION_SUPABASE_API_ANON_KEY }} diff --git a/.github/workflows/defichain_python_scraping_staging.yml b/.github/workflows/defichain_python_scraping_staging.yml new file mode 100644 index 0000000..9920664 --- /dev/null +++ b/.github/workflows/defichain_python_scraping_staging.yml @@ -0,0 +1,30 @@ +name: Defichain Python scraping Staging + +on: + pull_request: + branches: + - main + workflow_dispatch: + +jobs: + run_script: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.8 + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + working-directory: ./job + + - name: Embeddings for DefichainPython + run: python ./job/defichainpython_embedding.py + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + SUPABASE_URL: ${{ vars.STAGING_SUPABASE_API_URL }} + SUPABASE_KEY: ${{ secrets.STAGING_SUPABASE_API_ANON_KEY }} diff --git a/.github/workflows/wiki_scraping_production.yml b/.github/workflows/wiki_scraping_production.yml index 3c13db0..b642572 100644 --- a/.github/workflows/wiki_scraping_production.yml +++ b/.github/workflows/wiki_scraping_production.yml @@ -53,7 +53,7 @@ jobs: working-directory: ./job - name: Run the script - run: python ./job/app.py + run: python ./job/wiki_embedding.py env: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} SUPABASE_URL: ${{ vars.PRODUCTION_SUPABASE_API_URL }} diff --git a/.github/workflows/wiki_scraping_staging.yml b/.github/workflows/wiki_scraping_staging.yml index 8dc4bf1..fa28b3c 100644 --- a/.github/workflows/wiki_scraping_staging.yml +++ b/.github/workflows/wiki_scraping_staging.yml @@ -28,10 +28,3 @@ jobs: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} SUPABASE_URL: ${{ vars.STAGING_SUPABASE_API_URL }} SUPABASE_KEY: ${{ secrets.STAGING_SUPABASE_API_ANON_KEY }} - - - name: Embeddings for DefichainPython - run: python ./job/defichainpython_embedding.py - env: - OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - SUPABASE_URL: ${{ vars.STAGING_SUPABASE_API_URL }} - SUPABASE_KEY: ${{ secrets.STAGING_SUPABASE_API_ANON_KEY }} diff --git a/backend/tools/defichainpython_qa.py b/backend/tools/defichainpython_qa.py index 7bfc619..4e15e7f 100644 --- a/backend/tools/defichainpython_qa.py +++ b/backend/tools/defichainpython_qa.py @@ -20,8 +20,8 @@ # Set debug to True to see A LOT of details of langchain's inner workings # langchain.debug = True -# The name of the table in Supabase, where the vectors are stored -vectorTableName = "embeddings" +# The name of the function in Supabase which is used to match the embeddings +matchVectorFunctionName = "match_embeddings_defichain_python" # Create the supabase client SUPABASE_URL = os.getenv("SUPABASE_URL") @@ -61,7 +61,7 @@ def get_answer(question: str) -> str: try: vectors = 
OpenAIEmbeddings().embed_documents([question]) embeddings = supabase.rpc( - "match_embeddings", dict(query_embedding=vectors[0], match_count=7) + matchVectorFunctionName, dict(query_embedding=vectors[0], match_count=7) ).execute() print(f"⚡ Retrieved {len(embeddings.data)} vectors from Supabase:") @@ -93,6 +93,7 @@ def get_answer(question: str) -> str: args_schema=ToolInputSchema, ) + if __name__ == "__main__": while True: question = input( diff --git a/backend/tools/wiki_qa.py b/backend/tools/wiki_qa.py index 607fe11..7188574 100644 --- a/backend/tools/wiki_qa.py +++ b/backend/tools/wiki_qa.py @@ -15,14 +15,13 @@ ) import langchain - load_dotenv() # Set debug to True to see A LOT of details of langchain's inner workings # langchain.debug = True # The name of the table in Supabase, where the vectors are stored -vectorTableName = "embeddings" +matchVectorFunctionName = "match_embeddings" # Create the supabase client SUPABASE_URL = os.getenv("SUPABASE_URL") @@ -42,12 +41,12 @@ class KnowledgeAnswer(BaseModel): ) -llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.7) +llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.3) prompt_msgs = [ SystemMessagePromptTemplate.from_template( - """You're an elite algorithm, answering queries based solely on given context. If the context lacks the answer, state ignorance. - + """You're an elite algorithm, answering queries based solely on given context. If the context lacks the answer, state ignorance. If you are not 100% sure tell the user. + Context: {context}""" ), @@ -62,7 +61,7 @@ def get_answer(question: str) -> str: try: vectors = OpenAIEmbeddings().embed_documents([question]) embeddings = supabase.rpc( - "match_embeddings", dict(query_embedding=vectors[0], match_count=7) + matchVectorFunctionName, dict(query_embedding=vectors[0], match_count=7) ).execute() print(f"⚡ Retrieved {len(embeddings.data)} vectors from Supabase:") diff --git a/data/supabase/migrations/20231015150642_add_defichain_python_embeddings.sql b/data/supabase/migrations/20231015150642_add_defichain_python_embeddings.sql new file mode 100644 index 0000000..433923a --- /dev/null +++ b/data/supabase/migrations/20231015150642_add_defichain_python_embeddings.sql @@ -0,0 +1,35 @@ +-- Create a table to store embeddings +create table embeddings_defichain_python ( + id UUID primary key, + content text, -- corresponds to Document.pageContent + metadata jsonb, -- corresponds to Document.metadata + embedding vector(1536) -- 1536 works for OpenAI embeddings, change if needed +); + +-- Create a function to search for embeddings +create function match_embeddings_defichain_python ( + query_embedding vector(1536), + match_count int default null, + filter jsonb DEFAULT '{}' +) returns table ( + id uuid, + content text, + metadata jsonb, + similarity float +) +language plpgsql +as $$ +#variable_conflict use_column +begin + return query + select + id, + content, + metadata, + 1 - (embeddings_defichain_python.embedding <=> query_embedding) as similarity + from embeddings_defichain_python + where metadata @> filter + order by embeddings_defichain_python.embedding <=> query_embedding + limit match_count; +end; +$$; diff --git a/job/defichainpython_embedding.py b/job/defichainpython_embedding.py index dac507c..e50249f 100644 --- a/job/defichainpython_embedding.py +++ b/job/defichainpython_embedding.py @@ -11,7 +11,7 @@ load_dotenv() -vectorTableName = "embeddings" +vectorTableName = "embeddings_defichain_python" scrapeUrls = 
["https://docs.defichain-python.de/build/html/sitemap.xml"] embedding_model = "text-embedding-ada-002" @@ -29,9 +29,7 @@ print("🔎 Found %s unique pages" % len(urls)) # Remove urls -remove_urls = ( - "https://docs.defichain-python.de/build/html/search.html" -) +remove_urls = "https://docs.defichain-python.de/build/html/search.html" urls = [url for url in urls if url not in remove_urls] @@ -60,7 +58,7 @@ # Split the documents in chunks for upload (Did time out when too large). docs_chunks = [ - docs[x: x + upload_chunk_size] for x in range(0, len(docs), upload_chunk_size) + docs[x : x + upload_chunk_size] for x in range(0, len(docs), upload_chunk_size) ] # Iterate over each chunk and upload separately.