Merge pull request #32 from nattvara/feature/fix-crawling-bug
Update crawler with google docs support
nattvara authored May 12, 2024
2 parents c19feb2 + 967981b commit 5516574
Showing 15 changed files with 200 additions and 12 deletions.
11 changes: 11 additions & 0 deletions db/actions/url.py
@@ -19,6 +19,17 @@ def exists_any_unvisited_urls_in_snapshot(snapshot):
).exists()


def exists_any_urls_waiting_to_be_indexed_in_snapshot(snapshot):
from db.models.url import Url
return Url.select().filter(
Url.state == Url.States.WAITING_TO_INDEX
).filter(
Url.snapshot == snapshot
).order_by(
Url.created_at.asc()
).exists()


def find_url_with_href_sha_in_snapshot(href_sha: str, snapshot):
from db.models.url import Url
return Url.select().filter(
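The new exists_any_urls_waiting_to_be_indexed_in_snapshot helper mirrors exists_any_unvisited_urls_in_snapshot and is consumed by CrawlerService.current_snapshot further down in this diff. A minimal usage sketch, assuming a snapshot instance is already at hand (the id below is made up):

# Hypothetical sketch: ask whether a snapshot still has URLs queued for indexing.
from db.models import Snapshot
from db.actions.url import exists_any_urls_waiting_to_be_indexed_in_snapshot

snapshot = Snapshot.get_by_id(1)  # made-up id, for illustration only
if exists_any_urls_waiting_to_be_indexed_in_snapshot(snapshot):
    print("snapshot is not fully indexed yet, keep serving the previous one")
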
1 change: 1 addition & 0 deletions db/custom_fields/__init__.py
@@ -7,3 +7,4 @@
from .embedding import EmbeddingField
from .feedback_question_extradata import FeedbackQuestionExtraDataField
from .courses_list import CoursesListField
from .extra_urls import ExtraUrlsField
5 changes: 5 additions & 0 deletions db/custom_fields/extra_urls.py
@@ -0,0 +1,5 @@
from playhouse.sqlite_ext import JSONField


class ExtraUrlsField(JSONField):
pass
18 changes: 18 additions & 0 deletions db/migrations/030_add_extra_urls_column_to_courses_table.py
@@ -0,0 +1,18 @@
from peewee_migrate import Migrator
import peewee as pw

from db.migrations import column_exists, is_sqlite
from db.custom_fields import ExtraUrlsField
from db.models import Course


def migrate(migrator: Migrator, database: pw.Database, fake=False, **kwargs):
if not column_exists(Course, 'extra_urls'):
extra_urls_field = ExtraUrlsField(null=False, default=[])
migrator.add_fields(Course, extra_urls=extra_urls_field)


def rollback(migrator: Migrator, database: pw.Database, fake=False, **kwargs):
# ignore this operation on sqlite, as removing doesn't work
if not is_sqlite(database):
migrator.remove_fields(Course, 'extra_urls', cascade=True)
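As a rough sketch, the migration can be applied with peewee_migrate's Router; the database handle and migrate_dir below are assumptions about the project setup, not something this diff defines:

# Hypothetical sketch: run pending migrations, including
# 030_add_extra_urls_column_to_courses_table.
import peewee as pw
from peewee_migrate import Router

database = pw.SqliteDatabase('app.db')  # assumed database handle
router = Router(database, migrate_dir='db/migrations')
router.run()  # applies every migration that has not been recorded yet
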
2 changes: 2 additions & 0 deletions db/models/course.py
@@ -4,6 +4,7 @@

import peewee

from db.custom_fields import ExtraUrlsField
from . import BaseModel

# Suppress specific DeprecationWarning about db_table, this is needed for migrations to work
@@ -37,3 +38,4 @@ class Language:
name = peewee.CharField(null=False, default='Untitled Course Room')
description = peewee.TextField(null=False, default='Course room has not got any description')
admin_token = peewee.CharField(null=False, index=True, unique=True, default=_generate_admin_token)
extra_urls = ExtraUrlsField(null=False, default=[])
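A short usage sketch of the new column, mirroring how the test fixture below constructs a course (the canvas_id and document id are made up):

# Hypothetical sketch: extra_urls behaves like any other Course column
# and holds a JSON list that the next snapshot will also crawl.
from db.models import Course

course = Course(canvas_id="12345", snapshot_lifetime_in_mins=60, extra_urls=[])
course.extra_urls.append("https://docs.google.com/document/d/some-made-up-id")
course.save()
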
3 changes: 3 additions & 0 deletions jobs/crawl/start_crawler_worker.py
@@ -76,6 +76,9 @@ async def run_worker():

except LockAlreadyAcquiredException:
log().debug("Found a lock on the url. Skipping.")
except Exception as e:
log().error("Encountered an exception while crawling.")
log().error(e)

await redis.aclose()

13 changes: 12 additions & 1 deletion services/crawler/crawler.py
@@ -12,9 +12,9 @@
from db.models import Course, Snapshot, Url
from config.logger import log
from db.actions.url import (
get_most_recent_url, exists_any_urls_waiting_to_be_indexed_in_snapshot,
exists_any_unvisited_urls_in_snapshot,
find_url_with_href_sha_in_snapshot,
get_most_recent_url,
)
import cache.mutex as mutex

@@ -58,6 +58,15 @@ def create_snapshot(course: Course) -> Snapshot:
)
root_url.save()

for extra_url in course.extra_urls:
url = Url(
snapshot=snapshot,
href=extra_url,
root=False,
distance=1,
)
url.save()

return snapshot

@staticmethod
@@ -77,6 +86,8 @@ def current_snapshot(course: Course) -> Snapshot:
for snapshot in snapshots:
if exists_any_unvisited_urls_in_snapshot(snapshot) is True:
continue
elif exists_any_urls_waiting_to_be_indexed_in_snapshot(snapshot) is True:
continue
else:
return snapshot

25 changes: 16 additions & 9 deletions services/crawler/playwright.py
@@ -1,7 +1,7 @@
from playwright.async_api import Browser, BrowserContext, Page
from playwright.async_api import Playwright

from services.crawler.url_filters import CANVAS_DOMAIN, CANVAS_PROFILE_PAGE
from services.crawler.url_filters import CANVAS_DOMAIN, CANVAS_PROFILE_PAGE, KTH_GITHUB_DOMAIN
from db.actions.cookie import find_cookie_by_identifier
import config.settings as settings
from config.logger import log
@@ -25,17 +25,24 @@ async def get_logged_in_browser_context_and_page(playwright: Playwright) -> (Bro
f"with identifier: {settings.get_settings().COOKIE_IDENTIFIER}")

cookies = cookie.value.split('; ')

prepared_cookies = []
domains_with_cookies = [CANVAS_DOMAIN, KTH_GITHUB_DOMAIN]
for cookie in cookies:
name, value = cookie.strip().split('=', 1)
prepared_cookies.append({
'name': name,
'value': value,
'domain': CANVAS_DOMAIN,
'path': '/',
'expires': -1
})

secure = False
if name.startswith('__Host-'):
secure = True

for domain in domains_with_cookies:
prepared_cookies.append({
'name': name.strip(),
'value': value,
'domain': domain,
'path': '/',
'expires': -1,
'secure': secure
})

await context.add_cookies(prepared_cookies)

12 changes: 12 additions & 0 deletions services/crawler/url_filters.py
@@ -4,10 +4,14 @@

CANVAS_DOMAIN = "canvas.kth.se"

KTH_GITHUB_DOMAIN = "gits-15.sys.kth.se"

KATTIS_KTH_DOMAIN = "kth.kattis.com"

KATTIS_OPEN_DOMAIN = "open.kattis.com"

GOOGLE_DOCS = "docs.google.com"

CANVAS_PROFILE_PAGE = f"https://{CANVAS_DOMAIN}/profile"

DOMAIN_DENY_LIST = {
@@ -88,6 +92,10 @@
"https://www.kth.se/student/": True,
"https://www.kth.se": True,
"https://www.kth.se/": True,
"https://www.kth.se/en": True,
"https://www.kth.se/en/": True,
"https://www.kth.se/en/student": True,
"https://www.kth.se/en/student/": True,
}


@@ -104,6 +112,10 @@ def domain_is_kattis(url: str) -> bool:
return get_domain(url) == KATTIS_KTH_DOMAIN or get_domain(url) == KATTIS_OPEN_DOMAIN


def domain_is_google_docs(url: str) -> bool:
return get_domain(url) == GOOGLE_DOCS


def link_begins_with_deny_listed_string(url: str) -> bool:
for key in DENY_URLS_THAT_STARTS_WITH.keys():
if url.startswith(key):
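A tiny illustration of the new predicate, assuming get_domain returns the bare hostname of a link (both example hrefs are made up):

# Hypothetical sketch of the new domain check.
from services.crawler.url_filters import domain_is_google_docs

print(domain_is_google_docs("https://docs.google.com/document/d/abc/edit"))  # True
print(domain_is_google_docs("https://www.kth.se/en/student"))                # False
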
39 changes: 38 additions & 1 deletion services/download/download.py
@@ -1,11 +1,13 @@
from typing import Union
import os

from playwright.async_api import Browser, BrowserContext, Page
from pdfminer.pdfparser import PDFSyntaxError

from services.crawler.url_filters import domain_is_canvas, domain_is_kattis
from services.crawler.url_filters import domain_is_canvas, domain_is_kattis, domain_is_google_docs
from db.actions.url import find_url_referencing_content_in_snapshot
from db.actions.content import find_content_with_sha
import services.download.google_docs as google_docs
import services.download.canvas as canvas
import services.download.pptx as pptx
import services.download.docx as docx
@@ -21,10 +23,17 @@
from config.logger import log


MAX_DOWNLOAD_FILESIZE_MB = 100


class InvalidUrlStateException(Exception):
pass


class FilesizeTooLargeException(Exception):
pass


class DownloadService:

def __init__(
@@ -45,6 +54,13 @@ async def save_url_content(self, url: Url):
f"downloaded, url was in {url.state}")

if url.is_download:
filesize = self._get_filesize(url)
log().debug(f"downloaded filesize was {filesize}MB")

if filesize > MAX_DOWNLOAD_FILESIZE_MB:
raise FilesizeTooLargeException("Filesize was too large, cannot index files that exceed limit of"
f" {MAX_DOWNLOAD_FILESIZE_MB}MB")

log().info("trying to save content from url as a pdf")
content = await self._save_pdf_url_content(url)

@@ -64,6 +80,8 @@ async def save_url_content(self, url: Url):
content = await self._save_canvas_url_content(url)
elif domain_is_kattis(url.href):
content = await self._save_kattis_url_content(url)
elif domain_is_google_docs(url.href):
content = await self._save_google_docs_url_content(url)
else:
content = await self._save_web_url_content(url)

@@ -82,6 +100,12 @@ async def save_url_content(self, url: Url):
url.state = Url.States.DOWNLOADED
url.save()

def _get_filesize(self, url: Url) -> float:
content_filepath, _ = pdf.download_content(url)
filesize_bytes = os.path.getsize(content_filepath)
filesize_mb = filesize_bytes / (1024 * 1024)
return round(filesize_mb, 4)

async def _save_pdf_url_content(self, url: Url) -> Union[Content, None]:
try:
content_filepath, filename = pdf.download_content(url)
@@ -133,6 +157,19 @@ async def _save_kattis_url_content(self, url: Url):
log().error("Failed to parse kattis page custom instructions parser, using fallback", e)
return await self._save_web_url_content(url)

async def _save_google_docs_url_content(self, url: Url):
if google_docs.can_be_exported(url):
log().info("Found a google doc that can be exported")
try:
content_filepath, filename = google_docs.download_doc_as_pdf(url)
text = extract_text_from_pdf_file(content_filepath)
content = Content(text=text, name=filename)
return content
except Exception as e: # noqa
log().error("Failed to parse google doc, using fallback", e)

return await self._save_web_url_content(url)

async def _save_web_url_content(self, url: Url):
html, title = await web.download_content(url, self.page)
text = extract_text_from_html(html)
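As a hedged sketch of how a caller might react to the new size guard (the crawler worker in this PR simply logs all exceptions, so the handler below is illustrative only):

# Hypothetical sketch: skip oversized downloads instead of failing the crawl.
from services.download.download import DownloadService, FilesizeTooLargeException

async def save_or_skip(service: DownloadService, url):
    try:
        await service.save_url_content(url)
    except FilesizeTooLargeException as e:
        print(f"skipping {url.href}: {e}")
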
32 changes: 32 additions & 0 deletions services/download/google_docs.py
@@ -0,0 +1,32 @@
import re

from services.download import pdf
from config.logger import log
from db.models import Url


class GoogleDocsException(Exception):
pass


def can_be_exported(url: Url) -> bool:
if "docs.google.com/presentation" in url.href:
return True
if "docs.google.com/document" in url.href:
return True
if "docs.google.com/spreadsheets" in url.href:
return True

return False


def download_doc_as_pdf(url: Url):
pattern = r'(https://docs\.google\.com/(presentation|document|spreadsheets)/d/[\w-]+)'
match = re.match(pattern, url.href)
if not match:
raise GoogleDocsException(f"Failed to match url: {url.href}")

base_url = match.group(1)
export_url = f"{base_url}/export?format=pdf"
log().debug(f"downloading doc using export url {export_url}")
return pdf.download_content(Url(href=export_url))
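To make the rewrite in download_doc_as_pdf concrete, here is the same pattern applied to a made-up document URL; only the /d/<id> prefix is kept and the PDF export suffix is appended:

# Hypothetical sketch of the export-url rewrite (document id is made up).
import re

pattern = r'(https://docs\.google\.com/(presentation|document|spreadsheets)/d/[\w-]+)'
href = "https://docs.google.com/document/d/1AbC-dEf_234/edit#heading=h.xyz"

match = re.match(pattern, href)
print(f"{match.group(1)}/export?format=pdf")
# https://docs.google.com/document/d/1AbC-dEf_234/export?format=pdf
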
2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -156,7 +156,7 @@ def valid_course():
config = ChatConfig(llm_model_name=LLMModel.MISTRAL_7B_INSTRUCT, index_type=IndexType.NO_INDEX)
config.save()

course = Course(canvas_id="41428", snapshot_lifetime_in_mins=60)
course = Course(canvas_id="41428", snapshot_lifetime_in_mins=60, extra_urls=[])
course.save()

snapshot = FaqSnapshot(course=course)
1 change: 1 addition & 0 deletions tests/jobs/crawl/start_crawler_worker_test.py
@@ -18,6 +18,7 @@ async def test_start_crawler_worker_will_checkout_url_and_crawl_url(
mocker.patch("jobs.crawl.start_crawler_worker.get_download_service", return_value=download_service.service)
mocker.patch("services.crawler.content_extraction.get_all_links_from_page", return_value=[])
mocker.patch("services.download.pdf.download_content", return_value=('/tmp/file.pdf', 'somefile.pdf'))
mocker.patch("services.download.download.DownloadService._get_filesize", return_value=10)
mocker.patch("pdfminer.high_level.extract_text", return_value="pdf content...")

url = new_snapshot.add_unvisited_url()
45 changes: 45 additions & 0 deletions tests/services/crawler/crawler_test.py
@@ -20,6 +20,15 @@ def test_creating_new_snapshot_also_creates_the_root_url(valid_course):
assert len(snapshot.urls) == 1


def test_creating_new_snapshot_creates_urls_for_the_extra_urls_added_to_the_course(valid_course):
valid_course.extra_urls.append("http://example.com/a-extra-url")

snapshot = CrawlerService.create_snapshot(valid_course)

assert Url.select().filter(Url.snapshot == snapshot.id).exists()
assert len(snapshot.urls) == 2


def test_the_current_snapshot_is_always_the_most_recent_snapshot_without_any_unvisited_urls(mocker, valid_course):
mocked_time = arrow.get('2024-03-24T00:00:00Z')

@@ -61,6 +70,42 @@ def test_the_current_snapshot_is_always_the_most_recent_snapshot_without_any_unvisited_urls(
assert CrawlerService.current_snapshot(valid_course) == snapshot_3


def test_the_current_snapshot_is_always_the_most_recent_snapshot_without_any_urls_waiting_to_index(
mocker,
valid_course
):
mocked_time = arrow.get('2024-03-24T00:00:00Z')

with mocker.patch('arrow.now', return_value=mocked_time):
snapshot_1 = CrawlerService.create_snapshot(valid_course)
snapshot_1.created_at = arrow.now().shift(hours=-2)
snapshot_1.save()
snapshot_2 = CrawlerService.create_snapshot(valid_course)
snapshot_2.created_at = arrow.now().shift(hours=-1)
snapshot_2.save()

with pytest.raises(NoValidSnapshotException):
CrawlerService.current_snapshot(valid_course)

url = snapshot_1.urls[0]
url.state = Url.States.INDEXED
url.save()

assert CrawlerService.current_snapshot(valid_course) == snapshot_1

url = snapshot_2.urls[0]
url.state = Url.States.WAITING_TO_INDEX
url.save()

assert CrawlerService.current_snapshot(valid_course) == snapshot_1

url = snapshot_2.urls[0]
url.state = Url.States.INDEXED
url.save()

assert CrawlerService.current_snapshot(valid_course) == snapshot_2


@pytest.mark.asyncio
async def test_next_url_from_service_is_the_most_recent_url(get_crawler_service, new_snapshot):
crawler_service = await get_crawler_service
Expand Down