-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathepub-to-txt.py
65 lines (55 loc) · 2 KB
/
epub-to-txt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import os
from pathlib import Path
import ebooklib
from ebooklib import epub
from bs4 import BeautifulSoup
import argparse
# https://medium.com/@zazazakaria18/turn-your-ebook-to-text-with-python-in-seconds-2a1e42804913
# Parent tag names whose text nodes chapter_to_text() leaves out of the output.
# NOTE(review): '[document]' presumably matches strings attached directly to
# the BeautifulSoup root object rather than to a real tag -- confirm.
blocklist = ['[document]', 'noscript', 'header', 'html', 'meta', 'head', 'input', 'script']
def main(argv=None):
    """Parse command-line options and convert each requested EPUB to text.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads the arguments from ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 2 (via ``parser.error``) when no
            ``-f/--files`` option was supplied.
    """
    parser = argparse.ArgumentParser(
        description='Convert epub to txt',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument('-f', '--files', action='append', help='epub file')
    # Real defaults are declared here so the defaults-aware help formatter
    # reports them instead of "None".
    parser.add_argument('-s', '--src', action='store', default='',
                        help='Source location')
    parser.add_argument('-d', '--dest', action='store', default='.',
                        help='Destination location')
    args = parser.parse_args(argv)

    if not args.files:
        # parser.error prints the usage text plus this message to stderr and
        # exits with a non-zero status, so callers can detect the failure.
        parser.error('Must provide file')

    for file_name in args.files:
        file_path = os.path.join(args.src, file_name)
        add_file_to_json(file_path, args.dest)
def add_file_to_json(filename, dest):
    """Write the text of one EPUB file to ``<dest>/<stem>.txt``.

    Despite its name, this function produces a plain-text file, not JSON;
    the name is kept so existing callers continue to work.

    Args:
        filename: Path of the EPUB file to read.
        dest: Directory that receives the ``.txt`` file. It is created when
            it does not exist yet.
    """
    book = epub.read_epub(filename)

    # Create the destination only after the EPUB was read successfully.
    # os.makedirs('') raises, so an empty dest (current directory) is skipped.
    if dest:
        os.makedirs(dest, exist_ok=True)

    output_path = os.path.join(dest, Path(filename).stem + '.txt')
    with open(output_path, 'w', encoding='utf-8') as f:
        for item in book.get_items():
            # Only document items are converted; other item types are skipped.
            if item.get_type() == ebooklib.ITEM_DOCUMENT:
                f.write(chapter_to_text(item.get_content()) + '\n')
def chapter_to_text(chap):
    """Convert one chapter's HTML markup into plain text.

    Every text node whose parent tag is not listed in ``blocklist`` and that
    is not whitespace-only is emitted. A blank line (``'\\n\\n'``) is inserted
    before a node unless the previously emitted node ends with a space or the
    current node starts with one.

    Args:
        chap: HTML markup of the chapter, as accepted by ``BeautifulSoup``.

    Returns:
        The concatenated text of the chapter as a ``str``.
    """
    soup = BeautifulSoup(chap, 'html.parser')
    pieces = []
    prev = ''
    # `string=True` is the current spelling of the deprecated `text=True`.
    for node in soup.find_all(string=True):
        if node.parent.name in blocklist or node.isspace():
            continue
        text = str(node)
        if not (prev.endswith(' ') or text.startswith(' ')):
            pieces.append('\n\n')
        pieces.append(text)
        # NOTE(review): nesting reconstructed from a listing that had lost its
        # indentation; `prev` is taken to track only emitted nodes -- confirm.
        prev = text
    return ''.join(pieces)
# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()