Skip to content

Commit

Permalink
Fall back to pypdf with Unstructured as backup, trim unused flake.nix dependencies, minor UX tweaks
Browse files Browse the repository at this point in the history
  • Loading branch information
salgadev committed Apr 19, 2024
1 parent c40d04b commit e39bb0b
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 64 deletions.
30 changes: 15 additions & 15 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,17 @@ def suggest_metadata(file_upload):

with tempfile.NamedTemporaryFile(delete=False) as tmp:
tmp.write(uploaded_file.read())
file_path = f'{tmp.name}.{extension}'
st.write(f'Created temporary file {file_path}')
st.write(f'Created temporary file {tmp.name}')

st.write('## Processing file with Unstructured')
docs = ingest(file_path)
metadata = generate_metadata(docs)
st.write('## Ingesting Unstructured file')

docs = ingest(tmp.name)
print(f'Ingested {tmp.name}')

metadata = generate_metadata(docs)
st.write('## Querying Together.ai API')
form = st.form(key='generate_form')
st.write(f'## Suggested Metadata Generated by {MODEL_NAME}')
st.write(f'### {metadata}')
st.write(f'### Suggested Metadata Generated by {MODEL_NAME}')
st.write(f'#### {metadata}')

with st.form('analyze_form'):
st.write('Enter your file metadata in the following schema:')
Expand All @@ -38,14 +38,14 @@ def suggest_metadata(file_upload):
analysis = analyze_metadata(filename, description, discipline)

st.write(analysis)
submitted = None

st.write('## Generate metadata?')
uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf","txt"])
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
if uploaded_file is not None:

suggest_metadata(uploaded_file)

delete_file_button = form.form_submit_button(label='Delete file')
if delete_file_button:
os.remove(file_path)
query_api = st.button('Query API')
if query_api:
suggest_metadata(uploaded_file)
query_api = None
22 changes: 4 additions & 18 deletions flake.nix
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
description = "A LLM backend development flake powered by unstructured and langchain";

inputs = {
nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-unstable";
};
Expand All @@ -9,6 +9,7 @@
system = "x86_64-linux";
# ↑ Swap it for your system if needed
# "aarch64-linux" / "x86_64-darwin" / "aarch64-darwin"
debug = true;
pkgs = nixpkgs.legacyPackages.${system};
in {
devShells.${system}.default = pkgs.mkShell {
Expand All @@ -17,33 +18,18 @@
python-pkgs.pip # VsCode starts
python-pkgs.jupyter
python-pkgs.notebook # VsCode ends
python-pkgs.numpy
python-pkgs.pandas
python-pkgs.scipy
python-pkgs.matplotlib
python-pkgs.requests
python-pkgs.langchain-community
python-pkgs.langchain
python-pkgs.langchain-text-splitters
python-pkgs.unstructured
python-pkgs.wrapt # unstructured[local-inference] starts
python-pkgs.iso-639
python-pkgs.emoji
python-pkgs.pillow-heif
python-pkgs.magic
python-pkgs.poppler-qt5
python-pkgs.pytesseract
python-pkgs.langdetect # unstructured[local-inference] ends
python-pkgs.pypdf
python-pkgs.openai
python-pkgs.pydantic
python-pkgs.python-dotenv
python-pkgs.configargparse
python-pkgs.streamlit
python-pkgs.lark
python-pkgs.sentence-transformers
pkgs.unstructured-api
pkgs.poppler
pkgs.haskellPackages.iso639
python-pkgs.unstructured
]))
];

Expand Down
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ streamlit
python-dotenv
sentence-transformers
iso639-lang
poppler
unstructured[all-docs]
unstructured[pdf]
pypdf
62 changes: 33 additions & 29 deletions scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@
import openai
import sys
from dotenv import load_dotenv

from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredPDFLoader

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Vectara
from langchain_core.output_parsers import StrOutputParser
Expand Down Expand Up @@ -56,35 +59,35 @@ def get_sources(documents):
def get_summary(documents):
    """Return the page content of the last document in *documents*.

    The final chunk produced by the loader is treated as the summary.
    """
    last_document = documents[-1]
    return last_document.page_content

def ingest(file_path):
    """Load a PDF at *file_path* and split it into retrieval-sized chunks.

    Tries ``PyPDFLoader`` (pypdf) first; if pypdf cannot parse the file,
    falls back to the heavier ``UnstructuredPDFLoader``.

    Parameters
    ----------
    file_path : str
        Path to the PDF file on disk.

    Returns
    -------
    list
        LangChain ``Document`` chunks produced by the splitter.

    Raises
    ------
    Exception
        Whatever the fallback loader raises when both loaders fail.
    """
    try:
        loader = PyPDFLoader(file_path)
        documents = loader.load()
        print('Loaded PyPDFLoader')
    except Exception as e:
        # pypdf failed on this file — report why and retry with Unstructured.
        print(f'{e}')
        loader = UnstructuredPDFLoader(file_path)
        documents = loader.load()
        print('Loaded UnstructuredPDFLoader')

    # BUGFIX: the previous version returned from inside a ``finally`` block,
    # which silently swallowed any exception raised by the fallback loader,
    # and it called ``loader.load()`` a second time, parsing the file twice.
    # Splitting now happens exactly once, outside the try/except, and
    # fallback-path exceptions propagate to the caller.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
        separators=[
            "\n\n",
            "\n",
            " ",
            ",",
            "\uff0c",  # Fullwidth comma
            "\u3001",  # Ideographic comma
            "\uff0e",  # Fullwidth full stop
            # "\u200B",  # Zero-width space (Asian languages)
            # "\u3002",  # Ideographic full stop (Asian languages)
            "",
        ],
    )
    return text_splitter.split_documents(documents)


def generate_metadata(docs):
Expand Down Expand Up @@ -126,8 +129,9 @@ def generate_metadata(docs):
}
]
)
return chat_completion.choices[0].message.content

return json.loads(chat_completion.choices[0].message.content)
#return json.loads(chat_completion.choices[0].message.content)


def analyze_metadata(filename, description, discipline):
Expand Down

0 comments on commit e39bb0b

Please sign in to comment.