Skip to content

Commit

Permalink
implement missing features in app.py, add retriever with summarizatio…
Browse files Browse the repository at this point in the history
…n to scripts
  • Loading branch information
salgadev committed Apr 19, 2024
1 parent a0ad413 commit d9ef11d
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 23 deletions.
63 changes: 42 additions & 21 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,50 @@
import streamlit as st
import tempfile

from scripts import generate_metadata, ingest, MODEL_NAME
from scripts import analyze_metadata, generate_metadata, ingest, MODEL_NAME


st.title('DocVerifyRAG')
st.write('Anomaly detection for BIM document metadata')
st.title('# DocVerifyRAG')
st.write('## Anomaly detection for BIM document metadata')

uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf","txt"])
st.write('### Enter your file metadata in the following schema:')

if uploaded_file is not None:
user_input = st.text_input(
label='Filename, Description, Discipline',
value="", placeholder=str)

if st.button('Submit'):
try:
with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp:
tmp.write(uploaded_file.read())
file_path = tmp.name
st.write(f'Created temporary file {file_path}')

docs = ingest(file_path)
st.write('## Querying Together.ai API')
metadata = generate_metadata(docs)
st.write(f'## Metadata Generated by {MODEL_NAME}')
st.write(metadata)

# Clean up the temporary file
os.remove(file_path)

except Exception as e:
st.error(f'Error: {e}')
filename, description, discipline = user_input.split(',')

st.write('## Analyzing with Vectara + together.ai')
analysis = analyze_metadata(filename, description, discipline)

st.write(analysis)

st.write('## Generate metadata?')
st.write('### Upload the file that corresponds to the submitted metadata')

uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf","txt"])

if uploaded_file is not None:
try:
with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp:
tmp.write(uploaded_file.read())
file_path = tmp.name
st.write(f'Created temporary file {file_path}')

docs = ingest(file_path)
st.write('## Querying Together.ai API')
metadata = generate_metadata(docs)
st.write(f'## Metadata Generated by {MODEL_NAME}')
st.write(metadata)

# Clean up the temporary file
os.remove(file_path)

except Exception as e:
st.error(f'Error: {e}')
except ValueError:
st.error('Please enter 3 comma separated values')

50 changes: 48 additions & 2 deletions scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,54 @@
from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.embeddings.fake import FakeEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Vectara
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import PromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Load variables from a local .env file into the process environment; this
# must run before the os.environ lookups below.
load_dotenv()

# Model identifier; app.py imports it and shows it in the
# "Metadata Generated by ..." heading.
MODEL_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Vectara credentials. Indexing os.environ directly raises KeyError at import
# time if any of these variables is missing.
vectara_customer_id = os.environ['VECTARA_CUSTOMER_ID']
vectara_corpus_id = os.environ['VECTARA_CORPUS_ID']
vectara_api_key = os.environ['VECTARA_API_KEY']

# NOTE(review): `embeddings` is not referenced anywhere in the visible part of
# this file -- confirm it is still needed, since it is constructed at import.
embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large")

# Vectara vector-store client bound to the corpus configured above.
vectara = Vectara(vectara_customer_id=vectara_customer_id,
vectara_corpus_id=vectara_corpus_id,
vectara_api_key=vectara_api_key)


# Retriever configuration: request 3 results ("k") and enable summarization
# over at most 3 results, with the response language set to "eng".
summary_config = {"is_enabled": True, "max_results": 3, "response_lang": "eng"}
retriever = vectara.as_retriever(
search_kwargs={"k": 3, "summary_config": summary_config}
)

# Prompt text that analyze_metadata() fills in and sends through the Vectara
# retriever. The {filename}, {description} and {discipline} placeholders are
# substituted by PromptTemplate.format().
# NOTE(review): the "passage:" / "query:" line prefixes are kept exactly as
# written; confirm they are intentional for the retrieval backend.
template = """
passage: You are a helpful assistant that understands BIM building documents.
passage: You will analyze BIM document metadata composed of filename, description, and engineering discipline.
passage: The metadata is written in German.
passage: Filename: {filename}, Description: {description}, Engineering discipline: {discipline}.
query: Does the filename match other filenames within the same discipline?
query: Does the description match the engineering discipline?
query: How different is the metadata to your curated information?
query: Highlight any discrepancies and comment on whether or not the metadata is anomalous.
"""

prompt = PromptTemplate(template=template, input_variables=['filename', 'description', 'discipline'])


def get_sources(documents):
    """Return every retrieved document except the last one.

    The last element is the one get_summary() reads; presumably it is the
    summary document appended by the retriever when summarization is
    enabled (verify against the retriever's output).

    Args:
        documents: Sequence of retrieved documents.

    Returns:
        A slice of ``documents`` without its final element; empty when
        ``documents`` has zero or one element.
    """
    return documents[:-1]

def get_summary(documents):
    """Return the ``page_content`` of the last retrieved document.

    Args:
        documents: Sequence of objects exposing a ``page_content``
            attribute. The last one is treated as the summary.

    Returns:
        The ``page_content`` of the final element of ``documents``.

    Raises:
        IndexError: If ``documents`` is empty. The exception type matches
            what plain indexing would raise, but the message states the
            actual cause instead of "list index out of range".
    """
    if not documents:
        raise IndexError("no documents retrieved; cannot extract a summary")
    return documents[-1].page_content

def ingest(file_path):
extension = os.path.splitext(file_path)[1].lower()
Expand Down Expand Up @@ -52,7 +93,7 @@ def generate_metadata(docs):
You are a helpful assistant that understands BIM documents and engineering disciplines. Your answer should be in JSON format and only include the filename, a short description, and the engineering discipline the document belongs to, distinguishing between {[d.value for d in BimDiscipline]} based on the given document."
Analyze the provided document, which could be in either German or English. Extract the filename, its description, and infer the engineering discipline it belongs to. Document:
Analyze the provided document, which could be in either German or English. Extract the filename, its description, and infer the engineering discipline it belongs to. Respond in both English and German. Document:
context="
"""
# plain text
Expand Down Expand Up @@ -89,6 +130,11 @@ def generate_metadata(docs):
return json.loads(chat_completion.choices[0].message.content)


def analyze_metadata(filename, description, discipline):
    """Check user-supplied BIM metadata through the Vectara retriever.

    The three fields are inserted into the module-level ``prompt`` template,
    and the formatted text is sent through ``retriever`` piped into
    ``get_summary``, so the result is the ``page_content`` of the last
    document the retriever returns.

    Args:
        filename: Document filename as entered by the user.
        description: Free-text description of the document.
        discipline: Engineering discipline the document is said to belong to.

    Returns:
        The text produced by ``get_summary`` for the retrieved documents.
    """
    # The caller splits "Filename, Description, Discipline" on bare commas,
    # so values usually arrive with leading spaces; strip them so the
    # whitespace does not leak into the prompt.
    fields = {
        "filename": filename,
        "description": description,
        "discipline": discipline,
    }
    cleaned = {key: str(value).strip() for key, value in fields.items()}

    formatted_prompt = prompt.format(**cleaned)
    return (retriever | get_summary).invoke(formatted_prompt)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Generate metadata for a BIM document")
parser.add_argument("document", metavar="FILEPATH", type=str,
Expand Down

0 comments on commit d9ef11d

Please sign in to comment.