-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathmain.py
91 lines (82 loc) · 3.67 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import imp
import streamlit as st
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
import altair as alt
from PIL import Image
import base64
import tarfile
import os
import requests
from backend import *
# Max number of papers per request. Must match the user-facing text below
# ("Currently supports up to 10 links") and the download-size concern noted
# where the limit is enforced.
predefined_limits = 10
st.set_page_config(page_title="arXiv2Latex Downloader", page_icon=":page_with_curl:", layout="wide", initial_sidebar_state="expanded", menu_items={
"About": "Download the source latex code of multiple arXiv paper with one click"
})
# title
st.title("arXiv2Latex Downloader")
# input arxiv links to download (one arXiv abstract URL per line)
pdf_links_input = st.text_area("Please input the paper links you want to download following the format (Currently supports up to 10 links).", "")
# Help text: the code fence must be closed, otherwise everything after it on
# the page renders inside the code block.
st.markdown("""
Input example:
```Plain Text
https://arxiv.org/abs/1512.03385
https://arxiv.org/abs/1706.03762
https://arxiv.org/abs/2009.09724
```
""")
## one click download
crawling_or_not = st.button("Crawling the latex Code")
if crawling_or_not:
    # One run: parse the pasted links, fetch each paper's e-print tarball,
    # extract it, zip everything, and offer the zip as a data-URI download.
    print("Crawling...")
    pdf_lists = pdf_links_input.split("\n")
    print(pdf_lists)
    # Strip first, then filter: a whitespace-only line would otherwise pass
    # the length check and become an empty "link".
    pdf_lists = [link.strip() for link in pdf_lists if link.strip()]
    try:
        if not pdf_lists:
            # Explicit message instead of falling into a NameError below.
            st.warning("Please input at least one arXiv link.")
        elif len(pdf_lists) > predefined_limits:
            # predefined_limits caps the batch size; base64 inlining of very
            # large archives is untested.
            st.warning(f"Currently only support up to {predefined_limits} papers. Please input less than {predefined_limits} papers.")
        else:
            # One timestamped project directory per run: ./download/<timestamp>/
            base = './download/'
            project_name = get_timestamp().replace(" ", "-")
            base = os.path.join(base, project_name)
            make_dir_if_not_exist(base)
            # input/ holds the downloaded tarballs, output/ the extracted sources.
            # Created once here — they are loop-invariant.
            inp = os.path.join(base, 'input')
            make_dir_if_not_exist(inp)
            out = os.path.join(base, 'output')
            make_dir_if_not_exist(out)
            with st.spinner("Downloading papers..."):
                bar = st.progress(0)          # progress bar
                download_status = st.empty()  # live per-paper status line
                N = len(pdf_lists)
                for i, pdf_link in tqdm(enumerate(pdf_lists)):
                    title = get_name_from_arvix(pdf_link)
                    # arXiv id is the last path component of the abstract URL.
                    file_stamp = pdf_link.split("/")[-1]
                    source_link = "https://arxiv.org/e-print/" + file_stamp
                    response = requests.get(source_link)
                    # Fail fast on HTTP errors instead of writing and
                    # un-tarring an error page.
                    response.raise_for_status()
                    filename = file_stamp + ".tar.gz"
                    filepath = os.path.join(inp, filename)
                    # Context manager guarantees the file handle is closed.
                    with open(filepath, "wb") as tar_file:
                        tar_file.write(response.content)
                    outpath = os.path.join(out, title)
                    untar(filepath, outpath)
                    # finish one paper
                    bar.progress((i + 1) / N)
                    download_status.text(f"Iteration [{i+1}/{N}]: Finish Downloading of " + title)
            with st.spinner("Archiving as Zip Files..."):
                # Bundle all extracted sources into a single zip archive.
                filepath = archive_dir(out, os.path.join(base, project_name))
                # Embed the archive as a base64 data URI; application/zip is
                # the correct media type for a zip download.
                b64 = ToBase64(filepath).decode()
                href = f"<a href='data:application/zip;base64,{b64}' download='arxiv2latex-output-{datetime.datetime.now()}.zip' color='red'>Click here to Download the Output Latex Zip Files</a>"
                st.markdown(href, unsafe_allow_html=True)
            # status
            st.success("Finished")
    except Exception as e:
        st.error("Something goes wrong. Please check the input or concat me to fix this bug. Error message: \n"+str(e))