-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #32 from nattvara/feature/fix-crawling-bug
Update crawler with google docs support
- Loading branch information
Showing
15 changed files
with
200 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
from playhouse.sqlite_ext import JSONField | ||
|
||
|
||
class ExtraUrlsField(JSONField):
    """JSONField subclass used for the ``extra_urls`` column.

    Adds no behaviour of its own; everything is inherited from ``JSONField``.
    """
18 changes: 18 additions & 0 deletions
18
db/migrations/030_add_extra_urls_column_to_courses_table.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
from peewee_migrate import Migrator | ||
import peewee as pw | ||
|
||
from db.migrations import column_exists, is_sqlite | ||
from db.custom_fields import ExtraUrlsField | ||
from db.models import Course | ||
|
||
|
||
def migrate(migrator: Migrator, database: pw.Database, fake=False, **kwargs):
    """Add the non-null ``extra_urls`` column to the ``Course`` table.

    The step is skipped when ``column_exists(Course, 'extra_urls')`` is
    truthy.

    Args:
        migrator: Migrator the new field is registered on.
        database: Database being migrated (not used by this step).
        fake: Accepted for the migration interface; not used by this step.
        **kwargs: Accepted for the migration interface; not used.
    """
    if column_exists(Course, 'extra_urls'):
        return

    # Pass the ``list`` callable instead of a literal ``[]`` so that a single
    # list object is never shared as the default value between model instances.
    extra_urls_field = ExtraUrlsField(null=False, default=list)
    migrator.add_fields(Course, extra_urls=extra_urls_field)
|
||
|
||
def rollback(migrator: Migrator, database: pw.Database, fake=False, **kwargs):
    """Remove the ``extra_urls`` column from the ``Course`` table.

    Nothing is done when ``is_sqlite(database)`` is truthy.
    """
    # removing a column doesn't work on sqlite, so the operation is ignored there
    if is_sqlite(database):
        return

    migrator.remove_fields(Course, 'extra_urls', cascade=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
import re | ||
|
||
from services.download import pdf | ||
from config.logger import log | ||
from db.models import Url | ||
|
||
|
||
class GoogleDocsException(Exception):
    """Raised when a URL cannot be turned into a Google Docs export link."""
|
||
|
||
def can_be_exported(url: Url) -> bool:
    """Tell whether ``url.href`` points at an exportable Google Docs resource.

    Returns True when the href contains the Google Docs path of a
    presentation, a document or a spreadsheet, and False otherwise.
    """
    exportable_markers = (
        "docs.google.com/presentation",
        "docs.google.com/document",
        "docs.google.com/spreadsheets",
    )
    return any(marker in url.href for marker in exportable_markers)
|
||
|
||
def download_doc_as_pdf(url: Url):
    """Download a Google Docs presentation, document or spreadsheet as a PDF.

    The document's base address (everything up to and including its id) is
    taken from the start of ``url.href`` and ``/export?format=pdf`` is
    appended to it. The resulting export url is handed to
    ``pdf.download_content``, whose result is returned.

    Raises:
        GoogleDocsException: If ``url.href`` does not start with a
            recognised Google Docs document address.
    """
    doc_url_regex = r'(https://docs\.google\.com/(presentation|document|spreadsheets)/d/[\w-]+)'
    found = re.match(doc_url_regex, url.href)
    if found is None:
        raise GoogleDocsException(f"Failed to match url: {url.href}")

    # group(1) is the whole document address without any trailing path or query
    export_url = f"{found.group(1)}/export?format=pdf"
    log().debug(f"downloading doc using export url {export_url}")
    return pdf.download_content(Url(href=export_url))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.