#
# This file contains a Scrapy spider. To run it:
#
# scrapy runspider spider.py
#
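# The spider depends on the third-party packages imported below, which can
# be installed with, e.g.:
#
#   pip install scrapy beautifulsoup4 lxml html2text
#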
from pathlib import Path

from scrapy import Request
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
import bs4 as bs
import html2text

URL_PREFIX = "https://www.janelia.org"
OUTPUT_PATH = "./data/janelia.org"
DEBUG = False
# CSS classes whose content should be dropped from the extracted text
IGNORED_CLASSES = ['panels-ipe-label']

# Configure the HTML-to-Markdown converter
h = html2text.HTML2Text()
h.ignore_links = True
h.images_to_alt = True
h.single_line_break = True
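# For illustration, with the settings above a fragment such as
# '<p>See the <a href="/about">About</a> page</p>' converts to plain text
# roughly like 'See the About page': link URLs are dropped (ignore_links),
# images are replaced by their alt text (images_to_alt), and blocks are
# separated by single rather than double line breaks (single_line_break).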
class MySpider(CrawlSpider):
    """ Spider that crawls janelia.org and saves content
    to disk in Markdown format for consumption by an LLM.
    """
    name = "janelia.org"
    allowed_domains = ["www.janelia.org"]
    start_urls = [URL_PREFIX]
    rules = (
        Rule(LinkExtractor(allow=(r".*",)), callback="parse_item"),
    )
    seen = set()

    def get_path(self, url):
        """ Map a URL to a site-relative path, or return None if the
        URL is external, already seen, or should be skipped.
        """
        path = url.replace(URL_PREFIX, '')
        if not path.startswith("/"):
            if DEBUG: print(f"Skipping external link: {url}")
            return None
        path = path.split("#")[0]
        # TODO: add support for query parameters
        path = path.split("?")[0]
        if "/search" in path or "/node" in path:
            if DEBUG: print(f"Skipping: {url}")
            return None
        if path in self.seen:
            if DEBUG: print(f"Already seen: {url}")
            return None
        return path
    def parse_item(self, response):
        url = response.url
        path = self.get_path(url)
        if not path:
            return
        self.seen.add(path)

        headers = response.headers.to_unicode_dict()
        # TODO: add support for non-HTML content
        content_type = str(headers['Content-Type'])
        if not content_type.startswith('text/html'):
            print(f"Content type {content_type} is not HTML: {url}")
            return

        # Recurse into all links on the page
        sel = Selector(response)
        item_urls = sel.xpath(".//*/a/@href").getall()
        for item_url in item_urls:
            if item_url.startswith("/"):
                abs_url = self.start_urls[0] + item_url
                if self.get_path(abs_url):
                    yield Request(abs_url, callback=self.parse_item)

        # Extract content
        content = response.text
        if False:  # set to True to convert just the main content section to Markdown
            body = response.selector.css(".content-section").get()
            if not body:
                print(f"No content found: {url}")
            else:
                soup = bs.BeautifulSoup(body, 'lxml')
                # Certain classes contain metadata that is hidden on
                # janelia.org; remove them from the output text
                for css_class in IGNORED_CLASSES:
                    for div in soup.find_all("div", {'class': css_class}):
                        div.decompose()
                content = h.handle(str(soup))

        # Save to file
        p = Path(OUTPUT_PATH + path)
        p.mkdir(parents=True, exist_ok=True)
        c = p / "content"
        # TODO: store metadata like the URL in a separate YAML file
        c.write_text(url + "\n" + content)
        print(f"Saved {path}")