
Commit

fix issue where urls waiting to be indexed would cause broken snapshots to be used for queries
nattvara committed May 1, 2024
1 parent c19feb2 commit c47397b
Showing 3 changed files with 50 additions and 1 deletion.
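The crawler's current_snapshot selection now also skips snapshots that still have URLs in the WAITING_TO_INDEX state, so queries are never served a snapshot whose index is incomplete. The query-side caller is not part of this diff; the sketch below only illustrates how such a caller presumably consumes the stricter selection. The CrawlerService and NoValidSnapshotException names are taken from the tests in this commit; the import path and the snapshot_for_queries wrapper are assumptions for illustration.

# Hypothetical caller-side sketch (not part of this commit); import path and
# the snapshot_for_queries wrapper are assumed for illustration only.
from services.crawler.crawler import CrawlerService, NoValidSnapshotException


def snapshot_for_queries(course):
    # Only hand a snapshot to the query layer once it is fully crawled and
    # fully indexed; report that none is ready otherwise.
    try:
        return CrawlerService.current_snapshot(course)
    except NoValidSnapshotException:
        return None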
11 changes: 11 additions & 0 deletions db/actions/url.py
@@ -19,6 +19,17 @@ def exists_any_unvisited_urls_in_snapshot(snapshot):
     ).exists()


+def exists_any_urls_waiting_to_be_indexed_in_snapshot(snapshot):
+    from db.models.url import Url
+    return Url.select().filter(
+        Url.state == Url.States.WAITING_TO_INDEX
+    ).filter(
+        Url.snapshot == snapshot
+    ).order_by(
+        Url.created_at.asc()
+    ).exists()
+
+
 def find_url_with_href_sha_in_snapshot(href_sha: str, snapshot):
     from db.models.url import Url
     return Url.select().filter(
4 changes: 3 additions & 1 deletion services/crawler/crawler.py
@@ -12,9 +12,9 @@
 from db.models import Course, Snapshot, Url
 from config.logger import log
 from db.actions.url import (
+    get_most_recent_url, exists_any_urls_waiting_to_be_indexed_in_snapshot,
     exists_any_unvisited_urls_in_snapshot,
     find_url_with_href_sha_in_snapshot,
-    get_most_recent_url,
 )
 import cache.mutex as mutex

@@ -77,6 +77,8 @@ def current_snapshot(course: Course) -> Snapshot:
     for snapshot in snapshots:
         if exists_any_unvisited_urls_in_snapshot(snapshot) is True:
             continue
+        elif exists_any_urls_waiting_to_be_indexed_in_snapshot(snapshot) is True:
+            continue
         else:
             return snapshot

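Taken together, the two guards mean a snapshot is only handed out once nothing is left to visit and nothing is left to index. A minimal standalone sketch of that predicate, mirroring the loop above (the snapshot_is_ready helper is illustrative and not part of the codebase):

# Illustrative only: mirrors the two guards used by current_snapshot above.
from db.actions.url import (
    exists_any_unvisited_urls_in_snapshot,
    exists_any_urls_waiting_to_be_indexed_in_snapshot,
)


def snapshot_is_ready(snapshot) -> bool:
    # A snapshot that is still being crawled is not ready.
    if exists_any_unvisited_urls_in_snapshot(snapshot):
        return False
    # A snapshot whose URLs are crawled but not yet indexed is not ready
    # either; this is the case the commit fixes.
    if exists_any_urls_waiting_to_be_indexed_in_snapshot(snapshot):
        return False
    return True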
36 changes: 36 additions & 0 deletions tests/services/crawler/crawler_test.py
@@ -61,6 +61,42 @@ def test_the_current_snapshot_is_always_the_most_recent_snapshot_without_any_unv
     assert CrawlerService.current_snapshot(valid_course) == snapshot_3


+def test_the_current_snapshot_is_always_the_most_recent_snapshot_without_any_urls_waiting_to_index(
+    mocker,
+    valid_course
+):
+    mocked_time = arrow.get('2024-03-24T00:00:00Z')
+
+    with mocker.patch('arrow.now', return_value=mocked_time):
+        snapshot_1 = CrawlerService.create_snapshot(valid_course)
+        snapshot_1.created_at = arrow.now().shift(hours=-2)
+        snapshot_1.save()
+        snapshot_2 = CrawlerService.create_snapshot(valid_course)
+        snapshot_2.created_at = arrow.now().shift(hours=-1)
+        snapshot_2.save()
+
+    with pytest.raises(NoValidSnapshotException):
+        CrawlerService.current_snapshot(valid_course)
+
+    url = snapshot_1.urls[0]
+    url.state = Url.States.INDEXED
+    url.save()
+
+    assert CrawlerService.current_snapshot(valid_course) == snapshot_1
+
+    url = snapshot_2.urls[0]
+    url.state = Url.States.WAITING_TO_INDEX
+    url.save()
+
+    assert CrawlerService.current_snapshot(valid_course) == snapshot_1
+
+    url = snapshot_2.urls[0]
+    url.state = Url.States.INDEXED
+    url.save()
+
+    assert CrawlerService.current_snapshot(valid_course) == snapshot_2
+
+
 @pytest.mark.asyncio
 async def test_next_url_from_service_is_the_most_recent_url(get_crawler_service, new_snapshot):
     crawler_service = await get_crawler_service
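To run only the added test case, a pytest invocation along these lines should work (a sketch; it assumes the suite is normally run from the repository root):

import pytest

# Select the new test by a unique fragment of its name.
pytest.main([
    "tests/services/crawler/crawler_test.py",
    "-k", "waiting_to_index",
])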
