Skip to content

Commit

Permalink
Fall back to pypdf with Unstructured as backup, trim unused flake.nix dependencies, minor UX tweaks
Browse files Browse the repository at this point in the history
  • Loading branch information
salgadev committed Apr 19, 2024
1 parent c40d04b commit e39bb0b
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 64 deletions.
30 changes: 15 additions & 15 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,17 @@ def suggest_metadata(file_upload):

with tempfile.NamedTemporaryFile(delete=False) as tmp:
tmp.write(uploaded_file.read())
file_path = f'{tmp.name}.{extension}'
st.write(f'Created temporary file {file_path}')
st.write(f'Created temporary file {tmp.name}')

st.write('## Processing file with Unstructured')
docs = ingest(file_path)
metadata = generate_metadata(docs)
st.write('## Ingesting Unstructured file')

docs = ingest(tmp.name)
print(f'Ingested {tmp.name}')

metadata = generate_metadata(docs)
st.write('## Querying Together.ai API')
form = st.form(key='generate_form')
st.write(f'## Suggested Metadata Generated by {MODEL_NAME}')
st.write(f'### {metadata}')
st.write(f'### Suggested Metadata Generated by {MODEL_NAME}')
st.write(f'#### {metadata}')

with st.form('analyze_form'):
st.write('Enter your file metadata in the following schema:')
Expand All @@ -38,14 +38,14 @@ def suggest_metadata(file_upload):
analysis = analyze_metadata(filename, description, discipline)

st.write(analysis)
submitted = None

st.write('## Generate metadata?')
uploaded_file = st.file_uploader("Choose a PDF file", type=["pdf","txt"])
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")

if uploaded_file is not None:
if uploaded_file is not None:

suggest_metadata(uploaded_file)

delete_file_button = form.form_submit_button(label='Delete file')
if delete_file_button:
os.remove(file_path)
query_api = st.button('Query API')
if query_api:
suggest_metadata(uploaded_file)
query_api = None
22 changes: 4 additions & 18 deletions flake.nix
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
description = "A LLM backend development flake powered by unstructured and langchain";

inputs = {
nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-unstable";
};
Expand All @@ -9,6 +9,7 @@
system = "x86_64-linux";
# ↑ Swap it for your system if needed
# "aarch64-linux" / "x86_64-darwin" / "aarch64-darwin"
debug = true;
pkgs = nixpkgs.legacyPackages.${system};
in {
devShells.${system}.default = pkgs.mkShell {
Expand All @@ -17,33 +18,18 @@
python-pkgs.pip # VsCode starts
python-pkgs.jupyter
python-pkgs.notebook # VsCode ends
python-pkgs.numpy
python-pkgs.pandas
python-pkgs.scipy
python-pkgs.matplotlib
python-pkgs.requests
python-pkgs.langchain-community
python-pkgs.langchain
python-pkgs.langchain-text-splitters
python-pkgs.unstructured
python-pkgs.wrapt # unstructured[local-inference] starts
python-pkgs.iso-639
python-pkgs.emoji
python-pkgs.pillow-heif
python-pkgs.magic
python-pkgs.poppler-qt5
python-pkgs.pytesseract
python-pkgs.langdetect # unstructured[local-inference] ends
python-pkgs.pypdf
python-pkgs.openai
python-pkgs.pydantic
python-pkgs.python-dotenv
python-pkgs.configargparse
python-pkgs.streamlit
python-pkgs.lark
python-pkgs.sentence-transformers
pkgs.unstructured-api
pkgs.poppler
pkgs.haskellPackages.iso639
python-pkgs.unstructured
]))
];

Expand Down
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ streamlit
python-dotenv
sentence-transformers
iso639-lang
poppler
unstructured[all-docs]
unstructured[pdf]
pypdf
62 changes: 33 additions & 29 deletions scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@
import openai
import sys
from dotenv import load_dotenv

from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredPDFLoader

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Vectara
from langchain_core.output_parsers import StrOutputParser
Expand Down Expand Up @@ -56,35 +59,35 @@ def get_sources(documents):
def get_summary(documents):
    """Return the page content of the last document in *documents*.

    The final chunk produced by the loader is treated as the summary.
    """
    last_document = documents[-1]
    return last_document.page_content

def ingest(file_path):
    """Load a PDF at *file_path* and split it into retrieval-sized chunks.

    Tries ``PyPDFLoader`` (pypdf) first; if pypdf cannot parse the file,
    falls back to the heavier ``UnstructuredPDFLoader``.

    Parameters
    ----------
    file_path : str
        Path to the PDF file on disk.

    Returns
    -------
    list
        LangChain ``Document`` chunks produced by the splitter.

    Raises
    ------
    Exception
        Whatever the fallback loader raises when both loaders fail.
    """
    try:
        loader = PyPDFLoader(file_path)
        documents = loader.load()
        print('Loaded PyPDFLoader')
    except Exception as e:
        # pypdf failed on this file — report why and retry with Unstructured.
        print(f'{e}')
        loader = UnstructuredPDFLoader(file_path)
        documents = loader.load()
        print('Loaded UnstructuredPDFLoader')

    # BUGFIX: the previous version returned from inside a ``finally`` block,
    # which silently swallowed any exception raised by the fallback loader,
    # and it called ``loader.load()`` a second time, parsing the file twice.
    # Splitting now happens exactly once, outside the try/except, and
    # fallback-path exceptions propagate to the caller.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
        separators=[
            "\n\n",
            "\n",
            " ",
            ",",
            "\uff0c",  # Fullwidth comma
            "\u3001",  # Ideographic comma
            "\uff0e",  # Fullwidth full stop
            # "\u200B",  # Zero-width space (Asian languages)
            # "\u3002",  # Ideographic full stop (Asian languages)
            "",
        ],
    )
    return text_splitter.split_documents(documents)


def generate_metadata(docs):
Expand Down Expand Up @@ -126,8 +129,9 @@ def generate_metadata(docs):
}
]
)
return chat_completion.choices[0].message.content

return json.loads(chat_completion.choices[0].message.content)
#return json.loads(chat_completion.choices[0].message.content)


def analyze_metadata(filename, description, discipline):
Expand Down

0 comments on commit e39bb0b

Please sign in to comment.