Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add DefichainPython knowledge to JellyChat #113

Merged
merged 11 commits into main
Oct 15, 2023
27 changes: 27 additions & 0 deletions .github/workflows/defichain_python_scraping_production.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Manually-triggered workflow: scrape the DefichainPython documentation and
# write the embeddings into the PRODUCTION Supabase instance.
name: Defichain Python scraping Production

on:
  # Production embeddings are only refreshed on demand, never automatically.
  workflow_dispatch:

jobs:
  run_script:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: 3.8

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
        working-directory: ./job

      - name: Run the script
        run: python ./job/defichainpython_embedding.py
        env:
          # OpenAI key for computing the embeddings.
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          # Production Supabase instance that stores the vectors.
          SUPABASE_URL: ${{ vars.PRODUCTION_SUPABASE_API_URL }}
          SUPABASE_KEY: ${{ secrets.PRODUCTION_SUPABASE_API_ANON_KEY }}
30 changes: 30 additions & 0 deletions .github/workflows/defichain_python_scraping_staging.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Scrape the DefichainPython documentation and write the embeddings into the
# STAGING Supabase instance. Runs on every PR against main (to validate the
# pipeline) and can also be triggered manually.
name: Defichain Python scraping Staging

on:
  pull_request:
    branches:
      - main
  workflow_dispatch:

jobs:
  run_script:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: 3.8

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
        working-directory: ./job

      - name: Embeddings for DefichainPython
        run: python ./job/defichainpython_embedding.py
        env:
          # OpenAI key for computing the embeddings.
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          # Staging Supabase instance that stores the vectors.
          SUPABASE_URL: ${{ vars.STAGING_SUPABASE_API_URL }}
          SUPABASE_KEY: ${{ secrets.STAGING_SUPABASE_API_ANON_KEY }}
2 changes: 1 addition & 1 deletion .github/workflows/wiki_scraping_production.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ jobs:
working-directory: ./job

- name: Run the script
run: python ./job/app.py
run: python ./job/wiki_embedding.py
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
SUPABASE_URL: ${{ vars.PRODUCTION_SUPABASE_API_URL }}
Expand Down
7 changes: 5 additions & 2 deletions .github/workflows/wiki_scraping_staging.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
name: Wiki scraping Staging

on:
pull_request:
branches:
- main
workflow_dispatch:

jobs:
Expand All @@ -19,8 +22,8 @@ jobs:
pip install -r requirements.txt
working-directory: ./job

- name: Run the script
run: python ./job/app.py
- name: Embeddings for DefichainWiki
run: python ./job/wiki_embedding.py
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
SUPABASE_URL: ${{ vars.STAGING_SUPABASE_API_URL }}
Expand Down
3 changes: 2 additions & 1 deletion backend/agent/main_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import langchain

from tools.wiki_qa import wikiTool
from tools.defichainpython_qa import defichainPythonTool
from tools.ocean import oceanTools

from agent.prompt import PROMPT
Expand Down Expand Up @@ -38,7 +39,7 @@ def create_agent(memory, final_output_handler=None):
temperature=0,
)

tools = [wikiTool] + load_tools(["llm-math"], llm=llm_for_math) + oceanTools
tools = [wikiTool, defichainPythonTool] + load_tools(["llm-math"], llm=llm_for_math) + oceanTools

system_message = SystemMessage(content=PROMPT)

Expand Down
102 changes: 102 additions & 0 deletions backend/tools/defichainpython_qa.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import os
import json
from typing import List
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from supabase.client import Client, create_client
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.tools import StructuredTool
from langchain.chains.openai_functions import create_structured_output_chain
from langchain.prompts import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
import langchain

# Pull SUPABASE_URL / SUPABASE_KEY (and the OpenAI key used implicitly by
# langchain) from a local .env file when present.
load_dotenv()

# Set debug to True to see A LOT of details of langchain's inner workings
# langchain.debug = True

# The name of the function in Supabase which is used to match the embeddings
matchVectorFunctionName = "match_embeddings_defichain_python"

# Create the supabase client
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)


class ToolInputSchema(BaseModel):
    """Input schema for the tool: a single, self-contained question."""

    question: str = Field(..., description="A fully formed question.")


class KnowledgeAnswer(BaseModel):
    """Structured output the extraction chain forces the LLM to produce."""

    answer: str = Field(..., description="The answer to the question.")
    sources: List[str] = Field(
        ...,
        description="The sources which contributed to the answer.",
    )


# Low temperature keeps the extraction mostly deterministic.
llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.3)

# System message instructs the model to answer strictly from the retrieved
# context; the human message carries the user's question.
prompt_msgs = [
    SystemMessagePromptTemplate.from_template(
        """You're an elite algorithm, answering queries based solely on given context. If the context lacks the answer, state ignorance. If you are not 100% sure tell the user.

Context:
{context}"""
    ),
    HumanMessagePromptTemplate.from_template("{question}"),
]
prompt = ChatPromptTemplate.from_messages(prompt_msgs)

# Chain whose output is coerced into the KnowledgeAnswer schema.
chain = create_structured_output_chain(KnowledgeAnswer, llm, prompt)


def get_answer(question: str) -> str:
    """Answer *question* from the DefichainPython docs knowledge base.

    The question is embedded, the 7 most similar document chunks are fetched
    from Supabase via the match function, and the extraction chain distils a
    structured answer (with sources) from them.

    Returns a formatted "Answer/Sources" string, or a fallback message if
    any step fails (the failure is printed, not raised).
    """
    try:
        # Embed the question so it can be compared against the stored vectors.
        vectors = OpenAIEmbeddings().embed_documents([question])
        embeddings = supabase.rpc(
            matchVectorFunctionName, dict(query_embedding=vectors[0], match_count=7)
        ).execute()

        print(f"⚡ Retrieved {len(embeddings.data)} vectors from Supabase:")
        for entry in embeddings.data:
            print("🔖 Title:", entry["metadata"]["title"])
            print("🌐 Source:", entry["metadata"]["source"])
            print("📊 Similarity:", entry["similarity"])
            print("📄 Content:", entry["content"].replace("\n", " ")[:100] + "...")
            print("-" * 50)

        # Hand the raw matches to the LLM chain as JSON context.
        result = chain.run(context=json.dumps(embeddings.data), question=question)
        print("📝 Result of knowledge extraction chain:", result)

        return f"""Answer: {result.answer}
Sources: {json.dumps(result.sources)}
"""

    except Exception as e:
        print(e)
        # Fallback must point at the DefichainPython docs this tool serves —
        # the previous message was copy-pasted from wiki_qa.py and sent users
        # to the wiki instead.
        return "The DefichainPython knowledgebase is currently not available. We are working on it. Tell the user to use the documentation directly. https://docs.defichain-python.de/"


# Tool description shown to the agent's router; the keywords steer python /
# coding questions to this tool. (Typo fix: "reguarding" -> "regarding".)
description = """Use this if you need to answer any question regarding python and coding in general. Keywords: python, script, coding, connection to a defichain node, connection to ocean API, creating a wallet, create custom transactions. Make sure to include the source of the answer in your response."""

# Structured tool the main agent plugs into its tool list.
defichainPythonTool = StructuredTool(
    name="defichain_python_knowledge",
    description=description,
    func=get_answer,
    args_schema=ToolInputSchema,
)


if __name__ == "__main__":
    # Manual smoke-test loop: repeatedly query the knowledge base from the
    # terminal. (Prompt fixed: this tool answers from the DefichainPython
    # docs, not from DeFiChainWiki — the old text was copy-pasted.)
    while True:
        question = input(
            "Ask something, that can be answered using information from the DefichainPython docs: "
        )
        print("✅", get_answer(question))
11 changes: 5 additions & 6 deletions backend/tools/wiki_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,13 @@
)
import langchain


load_dotenv()

# Set debug to True to see A LOT of details of langchain's inner workings
# langchain.debug = True

# The name of the table in Supabase, where the vectors are stored
vectorTableName = "embeddings"
matchVectorFunctionName = "match_embeddings"

# Create the supabase client
SUPABASE_URL = os.getenv("SUPABASE_URL")
Expand All @@ -42,12 +41,12 @@ class KnowledgeAnswer(BaseModel):
)


llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.7)
llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.3)

prompt_msgs = [
SystemMessagePromptTemplate.from_template(
"""You're an elite algorithm, answering queries based solely on given context. If the context lacks the answer, state ignorance.
"""You're an elite algorithm, answering queries based solely on given context. If the context lacks the answer, state ignorance. If you are not 100% sure tell the user.
Context:
{context}"""
),
Expand All @@ -62,7 +61,7 @@ def get_answer(question: str) -> str:
try:
vectors = OpenAIEmbeddings().embed_documents([question])
embeddings = supabase.rpc(
"match_embeddings", dict(query_embedding=vectors[0], match_count=7)
matchVectorFunctionName, dict(query_embedding=vectors[0], match_count=7)
).execute()

print(f"⚡ Retrieved {len(embeddings.data)} vectors from Supabase:")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
-- Create a table to store embeddings
-- One row per scraped chunk of the DefichainPython documentation.
create table embeddings_defichain_python (
  id UUID primary key,
  content text, -- corresponds to Document.pageContent
  metadata jsonb, -- corresponds to Document.metadata
  embedding vector(1536) -- 1536 works for OpenAI embeddings, change if needed
);

-- Create a function to search for embeddings
-- Returns the match_count rows whose embedding is closest (smallest cosine
-- distance, pgvector's <=> operator) to query_embedding, optionally
-- restricted to rows whose metadata contains the given filter object.
create function match_embeddings_defichain_python (
  query_embedding vector(1536),
  match_count int default null,
  filter jsonb DEFAULT '{}'
) returns table (
  id uuid,
  content text,
  metadata jsonb,
  similarity float
)
language plpgsql
as $$
#variable_conflict use_column
begin
  return query
  select
    id,
    content,
    metadata,
    -- convert cosine distance into a similarity score in [0, 1]
    1 - (embeddings_defichain_python.embedding <=> query_embedding) as similarity
  from embeddings_defichain_python
  where metadata @> filter
  order by embeddings_defichain_python.embedding <=> query_embedding
  limit match_count;
end;
$$;
72 changes: 72 additions & 0 deletions job/defichainpython_embedding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import os
import re
from langchain.vectorstores import SupabaseVectorStore
from langchain.embeddings.openai import OpenAIEmbeddings
from supabase.client import Client, create_client
from dotenv import load_dotenv
import uuid

from defichainpython_loader import DefichainPythonLoader
from sitemap_parser import get_urls

# Job script: scrape the DefichainPython docs, wipe the old vectors and
# upload fresh embeddings to Supabase.
load_dotenv()

# Supabase table that stores the document embeddings.
vectorTableName = "embeddings_defichain_python"
# Sitemaps listing every documentation page to scrape.
scrapeUrls = ["https://docs.defichain-python.de/build/html/sitemap.xml"]
embedding_model = "text-embedding-ada-002"

supabase: Client = create_client(os.getenv("SUPABASE_URL"), os.getenv("SUPABASE_KEY"))

urls = []

# Get all urls from sitemap
for url in scrapeUrls:
    urls.extend(get_urls(url))
print("🔎 Found %s pages in total" % len(urls))

# Remove duplicates (dict.fromkeys preserves the original order)
urls = list(dict.fromkeys(urls))
print("🔎 Found %s unique pages" % len(urls))

# Pages to exclude from embedding (the search page has no useful content).
# FIX: this must be a list — with a plain string, `url not in remove_urls`
# performs a substring test and would silently drop any URL that happens to
# be a substring of it instead of an exact match.
remove_urls = ["https://docs.defichain-python.de/build/html/search.html"]

urls = [url for url in urls if url not in remove_urls]

print("🔭 Scrape %s found pages.." % len(urls))
print("---")
docs = []
for url in urls:
    loader = DefichainPythonLoader(url)
    docs.extend(loader.load())

print("✅ Scraped all pages")

for doc in docs:
    print("🌐 Source:", doc.metadata["source"])
    print("🔖 Title:", doc.metadata["title"])
    print("📄 Content:", doc.page_content.replace("\n", " ")[:100] + "...")
    print("---")

print("➖ Remove all old documents from table")
# Supabase requires a filter on delete; `neq` against a freshly generated
# UUID matches every existing row, effectively truncating the table.
supabase.table(vectorTableName).delete().neq("id", uuid.uuid1()).execute()
print("✅ Removed all old documents from table")

print("🔮 Embedding..")
embeddings = OpenAIEmbeddings(model=embedding_model)
upload_chunk_size = 200

# Split the documents in chunks for upload (Did time out when too large).
docs_chunks = [
    docs[x : x + upload_chunk_size] for x in range(0, len(docs), upload_chunk_size)
]

# Iterate over each chunk and upload separately.
for doc_chunk in docs_chunks:
    vector_store = SupabaseVectorStore.from_documents(
        doc_chunk,
        embeddings,
        client=supabase,
        table_name=vectorTableName,
    )
print("✅ Embedded")
Loading
Loading