Merge pull request #32 from nattvara/feature/fix-crawling-bug
Update crawler with google docs support
nattvara authored May 12, 2024
2 parents c19feb2 + 967981b commit 5516574
Showing 15 changed files with 200 additions and 12 deletions.
11 changes: 11 additions & 0 deletions db/actions/url.py
@@ -19,6 +19,17 @@ def exists_any_unvisited_urls_in_snapshot(snapshot):
).exists()


def exists_any_urls_waiting_to_be_indexed_in_snapshot(snapshot):
from db.models.url import Url
return Url.select().filter(
Url.state == Url.States.WAITING_TO_INDEX
).filter(
Url.snapshot == snapshot
).order_by(
Url.created_at.asc()
).exists()


def find_url_with_href_sha_in_snapshot(href_sha: str, snapshot):
from db.models.url import Url
return Url.select().filter(
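The new exists_any_urls_waiting_to_be_indexed_in_snapshot helper mirrors exists_any_unvisited_urls_in_snapshot and is consumed by CrawlerService.current_snapshot further down in this diff. A minimal usage sketch, assuming a snapshot instance is already at hand (the id below is made up):

# Hypothetical sketch: ask whether a snapshot still has URLs queued for indexing.
from db.models import Snapshot
from db.actions.url import exists_any_urls_waiting_to_be_indexed_in_snapshot

snapshot = Snapshot.get_by_id(1)  # made-up id, for illustration only
if exists_any_urls_waiting_to_be_indexed_in_snapshot(snapshot):
    print("snapshot is not fully indexed yet, keep serving the previous one")
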
1 change: 1 addition & 0 deletions db/custom_fields/__init__.py
@@ -7,3 +7,4 @@
from .embedding import EmbeddingField
from .feedback_question_extradata import FeedbackQuestionExtraDataField
from .courses_list import CoursesListField
from .extra_urls import ExtraUrlsField
5 changes: 5 additions & 0 deletions db/custom_fields/extra_urls.py
@@ -0,0 +1,5 @@
from playhouse.sqlite_ext import JSONField


class ExtraUrlsField(JSONField):
pass
18 changes: 18 additions & 0 deletions db/migrations/030_add_extra_urls_column_to_courses_table.py
@@ -0,0 +1,18 @@
from peewee_migrate import Migrator
import peewee as pw

from db.migrations import column_exists, is_sqlite
from db.custom_fields import ExtraUrlsField
from db.models import Course


def migrate(migrator: Migrator, database: pw.Database, fake=False, **kwargs):
if not column_exists(Course, 'extra_urls'):
extra_urls_field = ExtraUrlsField(null=False, default=[])
migrator.add_fields(Course, extra_urls=extra_urls_field)


def rollback(migrator: Migrator, database: pw.Database, fake=False, **kwargs):
# ignore this operation on sqlite, as removing doesn't work
if not is_sqlite(database):
migrator.remove_fields(Course, 'extra_urls', cascade=True)
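As a rough sketch, the migration can be applied with peewee_migrate's Router; the database handle and migrate_dir below are assumptions about the project setup, not something this diff defines:

# Hypothetical sketch: run pending migrations, including
# 030_add_extra_urls_column_to_courses_table.
import peewee as pw
from peewee_migrate import Router

database = pw.SqliteDatabase('app.db')  # assumed database handle
router = Router(database, migrate_dir='db/migrations')
router.run()  # applies every migration that has not been recorded yet
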
2 changes: 2 additions & 0 deletions db/models/course.py
@@ -4,6 +4,7 @@

import peewee

from db.custom_fields import ExtraUrlsField
from . import BaseModel

# Suppress specific DeprecationWarning about db_table, this is needed for migrations to work
@@ -37,3 +38,4 @@ class Language:
name = peewee.CharField(null=False, default='Untitled Course Room')
description = peewee.TextField(null=False, default='Course room has not got any description')
admin_token = peewee.CharField(null=False, index=True, unique=True, default=_generate_admin_token)
extra_urls = ExtraUrlsField(null=False, default=[])
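A short usage sketch of the new column, mirroring how the test fixture below constructs a course (the canvas_id and document id are made up):

# Hypothetical sketch: extra_urls behaves like any other Course column
# and holds a JSON list that the next snapshot will also crawl.
from db.models import Course

course = Course(canvas_id="12345", snapshot_lifetime_in_mins=60, extra_urls=[])
course.extra_urls.append("https://docs.google.com/document/d/some-made-up-id")
course.save()
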
3 changes: 3 additions & 0 deletions jobs/crawl/start_crawler_worker.py
@@ -76,6 +76,9 @@ async def run_worker():

except LockAlreadyAcquiredException:
log().debug("Found a lock on the url. Skipping.")
except Exception as e:
log().error("Encountered an exception while crawling.")
log().error(e)

await redis.aclose()

13 changes: 12 additions & 1 deletion services/crawler/crawler.py
@@ -12,9 +12,9 @@
from db.models import Course, Snapshot, Url
from config.logger import log
from db.actions.url import (
get_most_recent_url, exists_any_urls_waiting_to_be_indexed_in_snapshot,
exists_any_unvisited_urls_in_snapshot,
find_url_with_href_sha_in_snapshot,
get_most_recent_url,
)
import cache.mutex as mutex

@@ -58,6 +58,15 @@ def create_snapshot(course: Course) -> Snapshot:
)
root_url.save()

for extra_url in course.extra_urls:
url = Url(
snapshot=snapshot,
href=extra_url,
root=False,
distance=1,
)
url.save()

return snapshot

@staticmethod
@@ -77,6 +86,8 @@ def current_snapshot(course: Course) -> Snapshot:
for snapshot in snapshots:
if exists_any_unvisited_urls_in_snapshot(snapshot) is True:
continue
elif exists_any_urls_waiting_to_be_indexed_in_snapshot(snapshot) is True:
continue
else:
return snapshot

25 changes: 16 additions & 9 deletions services/crawler/playwright.py
@@ -1,7 +1,7 @@
from playwright.async_api import Browser, BrowserContext, Page
from playwright.async_api import Playwright

from services.crawler.url_filters import CANVAS_DOMAIN, CANVAS_PROFILE_PAGE
from services.crawler.url_filters import CANVAS_DOMAIN, CANVAS_PROFILE_PAGE, KTH_GITHUB_DOMAIN
from db.actions.cookie import find_cookie_by_identifier
import config.settings as settings
from config.logger import log
@@ -25,17 +25,24 @@ async def get_logged_in_browser_context_and_page(playwright: Playwright) -> (Bro
f"with identifier: {settings.get_settings().COOKIE_IDENTIFIER}")

cookies = cookie.value.split('; ')

prepared_cookies = []
domains_with_cookies = [CANVAS_DOMAIN, KTH_GITHUB_DOMAIN]
for cookie in cookies:
name, value = cookie.strip().split('=', 1)
prepared_cookies.append({
'name': name,
'value': value,
'domain': CANVAS_DOMAIN,
'path': '/',
'expires': -1
})

secure = False
if name.startswith('__Host-'):
secure = True

for domain in domains_with_cookies:
prepared_cookies.append({
'name': name.strip(),
'value': value,
'domain': domain,
'path': '/',
'expires': -1,
'secure': secure
})

await context.add_cookies(prepared_cookies)

12 changes: 12 additions & 0 deletions services/crawler/url_filters.py
@@ -4,10 +4,14 @@

CANVAS_DOMAIN = "canvas.kth.se"

KTH_GITHUB_DOMAIN = "gits-15.sys.kth.se"

KATTIS_KTH_DOMAIN = "kth.kattis.com"

KATTIS_OPEN_DOMAIN = "open.kattis.com"

GOOGLE_DOCS = "docs.google.com"

CANVAS_PROFILE_PAGE = f"https://{CANVAS_DOMAIN}/profile"

DOMAIN_DENY_LIST = {
@@ -88,6 +92,10 @@
"https://www.kth.se/student/": True,
"https://www.kth.se": True,
"https://www.kth.se/": True,
"https://www.kth.se/en": True,
"https://www.kth.se/en/": True,
"https://www.kth.se/en/student": True,
"https://www.kth.se/en/student/": True,
}


@@ -104,6 +112,10 @@ def domain_is_kattis(url: str) -> bool:
return get_domain(url) == KATTIS_KTH_DOMAIN or get_domain(url) == KATTIS_OPEN_DOMAIN


def domain_is_google_docs(url: str) -> bool:
return get_domain(url) == GOOGLE_DOCS


def link_begins_with_deny_listed_string(url: str) -> bool:
for key in DENY_URLS_THAT_STARTS_WITH.keys():
if url.startswith(key):
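A tiny illustration of the new predicate, assuming get_domain returns the bare hostname of a link (both example hrefs are made up):

# Hypothetical sketch of the new domain check.
from services.crawler.url_filters import domain_is_google_docs

print(domain_is_google_docs("https://docs.google.com/document/d/abc/edit"))  # True
print(domain_is_google_docs("https://www.kth.se/en/student"))                # False
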
39 changes: 38 additions & 1 deletion services/download/download.py
@@ -1,11 +1,13 @@
from typing import Union
import os

from playwright.async_api import Browser, BrowserContext, Page
from pdfminer.pdfparser import PDFSyntaxError

from services.crawler.url_filters import domain_is_canvas, domain_is_kattis
from services.crawler.url_filters import domain_is_canvas, domain_is_kattis, domain_is_google_docs
from db.actions.url import find_url_referencing_content_in_snapshot
from db.actions.content import find_content_with_sha
import services.download.google_docs as google_docs
import services.download.canvas as canvas
import services.download.pptx as pptx
import services.download.docx as docx
@@ -21,10 +23,17 @@
from config.logger import log


MAX_DOWNLOAD_FILESIZE_MB = 100


class InvalidUrlStateException(Exception):
pass


class FilesizeTooLargeException(Exception):
pass


class DownloadService:

def __init__(
@@ -45,6 +54,13 @@ async def save_url_content(self, url: Url):
f"downloaded, url was in {url.state}")

if url.is_download:
filesize = self._get_filesize(url)
log().debug(f"downloaded filesize was {filesize}MB")

if filesize > MAX_DOWNLOAD_FILESIZE_MB:
raise FilesizeTooLargeException("Filesize was too large, cannot index files that exceed limit of"
f" {MAX_DOWNLOAD_FILESIZE_MB}MB")

log().info("trying to save content from url as a pdf")
content = await self._save_pdf_url_content(url)

@@ -64,6 +80,8 @@ async def save_url_content(self, url: Url):
content = await self._save_canvas_url_content(url)
elif domain_is_kattis(url.href):
content = await self._save_kattis_url_content(url)
elif domain_is_google_docs(url.href):
content = await self._save_google_docs_url_content(url)
else:
content = await self._save_web_url_content(url)

@@ -82,6 +100,12 @@ async def save_url_content(self, url: Url):
url.state = Url.States.DOWNLOADED
url.save()

def _get_filesize(self, url: Url) -> float:
content_filepath, _ = pdf.download_content(url)
filesize_bytes = os.path.getsize(content_filepath)
filesize_mb = filesize_bytes / (1024 * 1024)
return round(filesize_mb, 4)

async def _save_pdf_url_content(self, url: Url) -> Union[Content, None]:
try:
content_filepath, filename = pdf.download_content(url)
@@ -133,6 +157,19 @@ async def _save_kattis_url_content(self, url: Url):
log().error("Failed to parse kattis page custom instructions parser, using fallback", e)
return await self._save_web_url_content(url)

async def _save_google_docs_url_content(self, url: Url):
if google_docs.can_be_exported(url):
log().info("Found a google doc that can be exported")
try:
content_filepath, filename = google_docs.download_doc_as_pdf(url)
text = extract_text_from_pdf_file(content_filepath)
content = Content(text=text, name=filename)
return content
except Exception as e: # noqa
log().error("Failed to parse google doc, using fallback", e)

return await self._save_web_url_content(url)

async def _save_web_url_content(self, url: Url):
html, title = await web.download_content(url, self.page)
text = extract_text_from_html(html)
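As a hedged sketch of how a caller might react to the new size guard (the crawler worker in this PR simply logs all exceptions, so the handler below is illustrative only):

# Hypothetical sketch: skip oversized downloads instead of failing the crawl.
from services.download.download import DownloadService, FilesizeTooLargeException

async def save_or_skip(service: DownloadService, url):
    try:
        await service.save_url_content(url)
    except FilesizeTooLargeException as e:
        print(f"skipping {url.href}: {e}")
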
32 changes: 32 additions & 0 deletions services/download/google_docs.py
@@ -0,0 +1,32 @@
import re

from services.download import pdf
from config.logger import log
from db.models import Url


class GoogleDocsException(Exception):
pass


def can_be_exported(url: Url) -> bool:
if "docs.google.com/presentation" in url.href:
return True
if "docs.google.com/document" in url.href:
return True
if "docs.google.com/spreadsheets" in url.href:
return True

return False


def download_doc_as_pdf(url: Url):
pattern = r'(https://docs\.google\.com/(presentation|document|spreadsheets)/d/[\w-]+)'
match = re.match(pattern, url.href)
if not match:
raise GoogleDocsException(f"Failed to match url: {url.href}")

base_url = match.group(1)
export_url = f"{base_url}/export?format=pdf"
log().debug(f"downloading doc using export url {export_url}")
return pdf.download_content(Url(href=export_url))
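To make the rewrite in download_doc_as_pdf concrete, here is the same pattern applied to a made-up document URL; only the /d/<id> prefix is kept and the PDF export suffix is appended:

# Hypothetical sketch of the export-url rewrite (document id is made up).
import re

pattern = r'(https://docs\.google\.com/(presentation|document|spreadsheets)/d/[\w-]+)'
href = "https://docs.google.com/document/d/1AbC-dEf_234/edit#heading=h.xyz"

match = re.match(pattern, href)
print(f"{match.group(1)}/export?format=pdf")
# https://docs.google.com/document/d/1AbC-dEf_234/export?format=pdf
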
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -156,7 +156,7 @@ def valid_course():
config = ChatConfig(llm_model_name=LLMModel.MISTRAL_7B_INSTRUCT, index_type=IndexType.NO_INDEX)
config.save()

course = Course(canvas_id="41428", snapshot_lifetime_in_mins=60)
course = Course(canvas_id="41428", snapshot_lifetime_in_mins=60, extra_urls=[])
course.save()

snapshot = FaqSnapshot(course=course)
1 change: 1 addition & 0 deletions tests/jobs/crawl/start_crawler_worker_test.py
@@ -18,6 +18,7 @@ async def test_start_crawler_worker_will_checkout_url_and_crawl_url(
mocker.patch("jobs.crawl.start_crawler_worker.get_download_service", return_value=download_service.service)
mocker.patch("services.crawler.content_extraction.get_all_links_from_page", return_value=[])
mocker.patch("services.download.pdf.download_content", return_value=('/tmp/file.pdf', 'somefile.pdf'))
mocker.patch("services.download.download.DownloadService._get_filesize", return_value=10)
mocker.patch("pdfminer.high_level.extract_text", return_value="pdf content...")

url = new_snapshot.add_unvisited_url()
45 changes: 45 additions & 0 deletions tests/services/crawler/crawler_test.py
@@ -20,6 +20,15 @@ def test_creating_new_snapshot_also_creates_the_root_url(valid_course):
assert len(snapshot.urls) == 1


def test_creating_new_snapshot_creates_urls_for_the_extra_urls_added_to_the_course(valid_course):
valid_course.extra_urls.append("http://example.com/a-extra-url")

snapshot = CrawlerService.create_snapshot(valid_course)

assert Url.select().filter(Url.snapshot == snapshot.id).exists()
assert len(snapshot.urls) == 2


def test_the_current_snapshot_is_always_the_most_recent_snapshot_without_any_unvisited_urls(mocker, valid_course):
mocked_time = arrow.get('2024-03-24T00:00:00Z')

@@ -61,6 +70,42 @@ def test_the_current_snapshot_is_always_the_most_recent_snapshot_without_any_unvisited_urls(
assert CrawlerService.current_snapshot(valid_course) == snapshot_3


def test_the_current_snapshot_is_always_the_most_recent_snapshot_without_any_urls_waiting_to_index(
mocker,
valid_course
):
mocked_time = arrow.get('2024-03-24T00:00:00Z')

with mocker.patch('arrow.now', return_value=mocked_time):
snapshot_1 = CrawlerService.create_snapshot(valid_course)
snapshot_1.created_at = arrow.now().shift(hours=-2)
snapshot_1.save()
snapshot_2 = CrawlerService.create_snapshot(valid_course)
snapshot_2.created_at = arrow.now().shift(hours=-1)
snapshot_2.save()

with pytest.raises(NoValidSnapshotException):
CrawlerService.current_snapshot(valid_course)

url = snapshot_1.urls[0]
url.state = Url.States.INDEXED
url.save()

assert CrawlerService.current_snapshot(valid_course) == snapshot_1

url = snapshot_2.urls[0]
url.state = Url.States.WAITING_TO_INDEX
url.save()

assert CrawlerService.current_snapshot(valid_course) == snapshot_1

url = snapshot_2.urls[0]
url.state = Url.States.INDEXED
url.save()

assert CrawlerService.current_snapshot(valid_course) == snapshot_2


@pytest.mark.asyncio
async def test_next_url_from_service_is_the_most_recent_url(get_crawler_service, new_snapshot):
crawler_service = await get_crawler_service
Expand Down