-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathmain.py
91 lines (82 loc) · 3.67 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import imp
import streamlit as st
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import altair as alt
from PIL import Image
import base64
import tarfile
import os
import requests
from backend import *
# Max number of papers per request. Must match the user-facing text below
# ("Currently supports up to 10 links") and the download-size concern noted
# where the limit is enforced.
predefined_limits = 10
st.set_page_config(page_title="arXiv2Latex Downloader", page_icon=":page_with_curl:", layout="wide", initial_sidebar_state="expanded", menu_items={
"About": "Download the source latex code of multiple arXiv paper with one click"
})
# title
st.title("arXiv2Latex Downloader")
# input arxiv links to download (one arXiv abstract URL per line)
pdf_links_input = st.text_area("Please input the paper links you want to download following the format (Currently supports up to 10 links).", "")
# Help text: the code fence must be closed, otherwise everything after it on
# the page renders inside the code block.
st.markdown("""
Input example:
```Plain Text
https://arxiv.org/abs/1512.03385
https://arxiv.org/abs/1706.03762
https://arxiv.org/abs/2009.09724
```
""")
## one click download
crawling_or_not = st.button("Crawling the latex Code")
if crawling_or_not:
    # One run: parse the pasted links, fetch each paper's e-print tarball,
    # extract it, zip everything, and offer the zip as a data-URI download.
    print("Crawling...")
    pdf_lists = pdf_links_input.split("\n")
    print(pdf_lists)
    # Strip first, then filter: a whitespace-only line would otherwise pass
    # the length check and become an empty "link".
    pdf_lists = [link.strip() for link in pdf_lists if link.strip()]
    try:
        if not pdf_lists:
            # Explicit message instead of falling into a NameError below.
            st.warning("Please input at least one arXiv link.")
        elif len(pdf_lists) > predefined_limits:
            # predefined_limits caps the batch size; base64 inlining of very
            # large archives is untested.
            st.warning(f"Currently only support up to {predefined_limits} papers. Please input less than {predefined_limits} papers.")
        else:
            # One timestamped project directory per run: ./download/<timestamp>/
            base = './download/'
            project_name = get_timestamp().replace(" ", "-")
            base = os.path.join(base, project_name)
            make_dir_if_not_exist(base)
            # input/ holds the downloaded tarballs, output/ the extracted sources.
            # Created once here — they are loop-invariant.
            inp = os.path.join(base, 'input')
            make_dir_if_not_exist(inp)
            out = os.path.join(base, 'output')
            make_dir_if_not_exist(out)
            with st.spinner("Downloading papers..."):
                bar = st.progress(0)          # progress bar
                download_status = st.empty()  # live per-paper status line
                N = len(pdf_lists)
                for i, pdf_link in tqdm(enumerate(pdf_lists)):
                    title = get_name_from_arvix(pdf_link)
                    # arXiv id is the last path component of the abstract URL.
                    file_stamp = pdf_link.split("/")[-1]
                    source_link = "https://arxiv.org/e-print/" + file_stamp
                    response = requests.get(source_link)
                    # Fail fast on HTTP errors instead of writing and
                    # un-tarring an error page.
                    response.raise_for_status()
                    filename = file_stamp + ".tar.gz"
                    filepath = os.path.join(inp, filename)
                    # Context manager guarantees the file handle is closed.
                    with open(filepath, "wb") as tar_file:
                        tar_file.write(response.content)
                    outpath = os.path.join(out, title)
                    untar(filepath, outpath)
                    # finish one paper
                    bar.progress((i + 1) / N)
                    download_status.text(f"Iteration [{i+1}/{N}]: Finish Downloading of " + title)
            with st.spinner("Archiving as Zip Files..."):
                # Bundle all extracted sources into a single zip archive.
                filepath = archive_dir(out, os.path.join(base, project_name))
                # Embed the archive as a base64 data URI; application/zip is
                # the correct media type for a zip download.
                b64 = ToBase64(filepath).decode()
                href = f"<a href='data:application/zip;base64,{b64}' download='arxiv2latex-output-{datetime.datetime.now()}.zip' color='red'>Click here to Download the Output Latex Zip Files</a>"
                st.markdown(href, unsafe_allow_html=True)
            # status
            st.success("Finished")
    except Exception as e:
        st.error("Something goes wrong. Please check the input or concat me to fix this bug. Error message: \n"+str(e))