From 7ced8f06a65644b94503c3ad501ea50902c40ad1 Mon Sep 17 00:00:00 2001
From: 0ptim
Date: Sun, 15 Oct 2023 17:42:08 +0200
Subject: [PATCH] Finalize defichain python integration

---
 .../defichain_python_scraping_production.yml   | 27 ++++++++++++++
 .../defichain_python_scraping_staging.yml      | 30 ++++++++++++++++
 .github/workflows/wiki_scraping_production.yml |  2 +-
 .github/workflows/wiki_scraping_staging.yml    |  7 ----
 backend/tools/defichainpython_qa.py            |  7 ++--
 backend/tools/wiki_qa.py                       | 11 +++---
 ...150642_add_defichain_python_embeddings.sql  | 35 +++++++++++++++++++
 job/defichainpython_embedding.py               |  8 ++---
 8 files changed, 105 insertions(+), 22 deletions(-)
 create mode 100644 .github/workflows/defichain_python_scraping_production.yml
 create mode 100644 .github/workflows/defichain_python_scraping_staging.yml
 create mode 100644 data/supabase/migrations/20231015150642_add_defichain_python_embeddings.sql

diff --git a/.github/workflows/defichain_python_scraping_production.yml b/.github/workflows/defichain_python_scraping_production.yml
new file mode 100644
index 0000000..b129f0a
--- /dev/null
+++ b/.github/workflows/defichain_python_scraping_production.yml
@@ -0,0 +1,27 @@
+name: Defichain Python scraping Production
+
+on:
+  workflow_dispatch:
+
+jobs:
+  run_script:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.8
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+        working-directory: ./job
+
+      - name: Run the script
+        run: python ./job/defichainpython_embedding.py
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          SUPABASE_URL: ${{ vars.PRODUCTION_SUPABASE_API_URL }}
+          SUPABASE_KEY: ${{ secrets.PRODUCTION_SUPABASE_API_ANON_KEY }}
diff --git a/.github/workflows/defichain_python_scraping_staging.yml b/.github/workflows/defichain_python_scraping_staging.yml
new file mode 100644
index 0000000..9920664
--- /dev/null
+++ b/.github/workflows/defichain_python_scraping_staging.yml
@@ -0,0 +1,30 @@
+name: Defichain Python scraping Staging
+
+on:
+  pull_request:
+    branches:
+      - main
+  workflow_dispatch:
+
+jobs:
+  run_script:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.8
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+        working-directory: ./job
+
+      - name: Embeddings for DefichainPython
+        run: python ./job/defichainpython_embedding.py
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          SUPABASE_URL: ${{ vars.STAGING_SUPABASE_API_URL }}
+          SUPABASE_KEY: ${{ secrets.STAGING_SUPABASE_API_ANON_KEY }}
diff --git a/.github/workflows/wiki_scraping_production.yml b/.github/workflows/wiki_scraping_production.yml
index 3c13db0..b642572 100644
--- a/.github/workflows/wiki_scraping_production.yml
+++ b/.github/workflows/wiki_scraping_production.yml
@@ -53,7 +53,7 @@ jobs:
         working-directory: ./job
 
       - name: Run the script
-        run: python ./job/app.py
+        run: python ./job/wiki_embedding.py
         env:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
           SUPABASE_URL: ${{ vars.PRODUCTION_SUPABASE_API_URL }}
diff --git a/.github/workflows/wiki_scraping_staging.yml b/.github/workflows/wiki_scraping_staging.yml
index 8dc4bf1..fa28b3c 100644
--- a/.github/workflows/wiki_scraping_staging.yml
+++ b/.github/workflows/wiki_scraping_staging.yml
@@ -28,10 +28,3 @@ jobs:
           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          SUPABASE_URL: ${{ vars.STAGING_SUPABASE_API_URL }}
          SUPABASE_KEY: ${{ secrets.STAGING_SUPABASE_API_ANON_KEY }}
-
-      - name: Embeddings for DefichainPython
-        run: python ./job/defichainpython_embedding.py
-        env:
-          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
-          SUPABASE_URL: ${{ vars.STAGING_SUPABASE_API_URL }}
-          SUPABASE_KEY: ${{ secrets.STAGING_SUPABASE_API_ANON_KEY }}
diff --git a/backend/tools/defichainpython_qa.py b/backend/tools/defichainpython_qa.py
index 7bfc619..4e15e7f 100644
--- a/backend/tools/defichainpython_qa.py
+++ b/backend/tools/defichainpython_qa.py
@@ -20,8 +20,8 @@
 # Set debug to True to see A LOT of details of langchain's inner workings
 # langchain.debug = True
 
-# The name of the table in Supabase, where the vectors are stored
-vectorTableName = "embeddings"
+# The name of the function in Supabase which is used to match the embeddings
+matchVectorFunctionName = "match_embeddings_defichain_python"
 
 # Create the supabase client
 SUPABASE_URL = os.getenv("SUPABASE_URL")
@@ -61,7 +61,7 @@ def get_answer(question: str) -> str:
     try:
         vectors = OpenAIEmbeddings().embed_documents([question])
         embeddings = supabase.rpc(
-            "match_embeddings", dict(query_embedding=vectors[0], match_count=7)
+            matchVectorFunctionName, dict(query_embedding=vectors[0], match_count=7)
         ).execute()
 
         print(f"⚡ Retrieved {len(embeddings.data)} vectors from Supabase:")
@@ -93,6 +93,7 @@ def get_answer(question: str) -> str:
     args_schema=ToolInputSchema,
 )
 
+
 if __name__ == "__main__":
     while True:
         question = input(
diff --git a/backend/tools/wiki_qa.py b/backend/tools/wiki_qa.py
index 607fe11..7188574 100644
--- a/backend/tools/wiki_qa.py
+++ b/backend/tools/wiki_qa.py
@@ -15,14 +15,13 @@
 )
 import langchain
 
-
 load_dotenv()
 
 # Set debug to True to see A LOT of details of langchain's inner workings
 # langchain.debug = True
 
-# The name of the table in Supabase, where the vectors are stored
-vectorTableName = "embeddings"
+# The name of the function in Supabase which is used to match the embeddings
+matchVectorFunctionName = "match_embeddings"
 
 # Create the supabase client
 SUPABASE_URL = os.getenv("SUPABASE_URL")
@@ -42,12 +41,12 @@ class KnowledgeAnswer(BaseModel):
     )
 
 
-llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.7)
+llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.3)
 
 prompt_msgs = [
     SystemMessagePromptTemplate.from_template(
-        """You're an elite algorithm, answering queries based solely on given context. If the context lacks the answer, state ignorance.
-
+        """You're an elite algorithm, answering queries based solely on given context. If the context lacks the answer, state ignorance. If you are not 100% sure, tell the user.
+
         Context:
         {context}"""
     ),
@@ -62,7 +61,7 @@ def get_answer(question: str) -> str:
     try:
         vectors = OpenAIEmbeddings().embed_documents([question])
         embeddings = supabase.rpc(
-            "match_embeddings", dict(query_embedding=vectors[0], match_count=7)
+            matchVectorFunctionName, dict(query_embedding=vectors[0], match_count=7)
         ).execute()
 
         print(f"⚡ Retrieved {len(embeddings.data)} vectors from Supabase:")
diff --git a/data/supabase/migrations/20231015150642_add_defichain_python_embeddings.sql b/data/supabase/migrations/20231015150642_add_defichain_python_embeddings.sql
new file mode 100644
index 0000000..433923a
--- /dev/null
+++ b/data/supabase/migrations/20231015150642_add_defichain_python_embeddings.sql
@@ -0,0 +1,35 @@
+-- Create a table to store embeddings
+create table embeddings_defichain_python (
+  id UUID primary key,
+  content text, -- corresponds to Document.pageContent
+  metadata jsonb, -- corresponds to Document.metadata
+  embedding vector(1536) -- 1536 works for OpenAI embeddings, change if needed
+);
+
+-- Create a function to search for embeddings
+create function match_embeddings_defichain_python (
+  query_embedding vector(1536),
+  match_count int default null,
+  filter jsonb DEFAULT '{}'
+) returns table (
+  id uuid,
+  content text,
+  metadata jsonb,
+  similarity float
+)
+language plpgsql
+as $$
+#variable_conflict use_column
+begin
+  return query
+  select
+    id,
+    content,
+    metadata,
+    1 - (embeddings_defichain_python.embedding <=> query_embedding) as similarity
+  from embeddings_defichain_python
+  where metadata @> filter
+  order by embeddings_defichain_python.embedding <=> query_embedding
+  limit match_count;
+end;
+$$;
diff --git a/job/defichainpython_embedding.py b/job/defichainpython_embedding.py
index dac507c..e50249f 100644
--- a/job/defichainpython_embedding.py
+++ b/job/defichainpython_embedding.py
@@ -11,7 +11,7 @@
 
 load_dotenv()
 
-vectorTableName = "embeddings"
+vectorTableName = "embeddings_defichain_python"
 
 scrapeUrls = ["https://docs.defichain-python.de/build/html/sitemap.xml"]
 embedding_model = "text-embedding-ada-002"
@@ -29,9 +29,7 @@
 print("🔎 Found %s unique pages" % len(urls))
 
 # Remove urls
-remove_urls = (
-    "https://docs.defichain-python.de/build/html/search.html"
-)
+remove_urls = ["https://docs.defichain-python.de/build/html/search.html"]
 
 urls = [url for url in urls if url not in remove_urls]
 
@@ -60,7 +58,7 @@
 
 # Split the documents in chunks for upload (Did time out when too large).
 docs_chunks = [
-    docs[x : x + upload_chunk_size] for x in range(0, len(docs), upload_chunk_size)
+    docs[x : x + upload_chunk_size] for x in range(0, len(docs), upload_chunk_size)
]
 
 # Iterate over each chunk and upload separately.
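
A quick way to sanity-check the migration after applying this patch is to call the new match function directly in SQL. The snippet below is a minimal sketch, not part of the patch itself: it assumes the pgvector extension is already enabled in the target database and that the embedding job has populated embeddings_defichain_python. The constant dummy vector stands in for a real 1536-dimensional query embedding (the output size of text-embedding-ada-002).

-- Minimal smoke test (a sketch, not part of the patch): assumes pgvector
-- is enabled and the embedding job has populated the table. Replace the
-- constant dummy vector with a real 1536-dimensional query embedding.
select id, content, similarity
from match_embeddings_defichain_python(
    query_embedding := array_fill(0.1::real, array[1536])::vector(1536),
    match_count     := 3
);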