
Commit

fix issue where urls waiting to be indexed would cause broken snapshots to be used for queries
nattvara committed May 1, 2024
1 parent c19feb2 commit c47397b
Showing 3 changed files with 50 additions and 1 deletion.
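The crawler's current_snapshot selection now also skips snapshots that still have URLs in the WAITING_TO_INDEX state, so queries are never served a snapshot whose index is incomplete. The query-side caller is not part of this diff; the sketch below only illustrates how such a caller presumably consumes the stricter selection. The CrawlerService and NoValidSnapshotException names are taken from the tests in this commit; the import path and the snapshot_for_queries wrapper are assumptions for illustration.

# Hypothetical caller-side sketch (not part of this commit); import path and
# the snapshot_for_queries wrapper are assumed for illustration only.
from services.crawler.crawler import CrawlerService, NoValidSnapshotException


def snapshot_for_queries(course):
    # Only hand a snapshot to the query layer once it is fully crawled and
    # fully indexed; report that none is ready otherwise.
    try:
        return CrawlerService.current_snapshot(course)
    except NoValidSnapshotException:
        return None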
11 changes: 11 additions & 0 deletions db/actions/url.py
@@ -19,6 +19,17 @@ def exists_any_unvisited_urls_in_snapshot(snapshot):
     ).exists()


+def exists_any_urls_waiting_to_be_indexed_in_snapshot(snapshot):
+    from db.models.url import Url
+    return Url.select().filter(
+        Url.state == Url.States.WAITING_TO_INDEX
+    ).filter(
+        Url.snapshot == snapshot
+    ).order_by(
+        Url.created_at.asc()
+    ).exists()
+
+
 def find_url_with_href_sha_in_snapshot(href_sha: str, snapshot):
     from db.models.url import Url
     return Url.select().filter(
4 changes: 3 additions & 1 deletion services/crawler/crawler.py
@@ -12,9 +12,9 @@
 from db.models import Course, Snapshot, Url
 from config.logger import log
 from db.actions.url import (
+    get_most_recent_url, exists_any_urls_waiting_to_be_indexed_in_snapshot,
     exists_any_unvisited_urls_in_snapshot,
     find_url_with_href_sha_in_snapshot,
-    get_most_recent_url,
 )
 import cache.mutex as mutex

@@ -77,6 +77,8 @@ def current_snapshot(course: Course) -> Snapshot:
     for snapshot in snapshots:
         if exists_any_unvisited_urls_in_snapshot(snapshot) is True:
             continue
+        elif exists_any_urls_waiting_to_be_indexed_in_snapshot(snapshot) is True:
+            continue
         else:
             return snapshot

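Taken together, the two guards mean a snapshot is only handed out once nothing is left to visit and nothing is left to index. A minimal standalone sketch of that predicate, mirroring the loop above (the snapshot_is_ready helper is illustrative and not part of the codebase):

# Illustrative only: mirrors the two guards used by current_snapshot above.
from db.actions.url import (
    exists_any_unvisited_urls_in_snapshot,
    exists_any_urls_waiting_to_be_indexed_in_snapshot,
)


def snapshot_is_ready(snapshot) -> bool:
    # A snapshot that is still being crawled is not ready.
    if exists_any_unvisited_urls_in_snapshot(snapshot):
        return False
    # A snapshot whose URLs are crawled but not yet indexed is not ready
    # either; this is the case the commit fixes.
    if exists_any_urls_waiting_to_be_indexed_in_snapshot(snapshot):
        return False
    return True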
36 changes: 36 additions & 0 deletions tests/services/crawler/crawler_test.py
@@ -61,6 +61,42 @@ def test_the_current_snapshot_is_always_the_most_recent_snapshot_without_any_unv
     assert CrawlerService.current_snapshot(valid_course) == snapshot_3


+def test_the_current_snapshot_is_always_the_most_recent_snapshot_without_any_urls_waiting_to_index(
+    mocker,
+    valid_course
+):
+    mocked_time = arrow.get('2024-03-24T00:00:00Z')
+
+    with mocker.patch('arrow.now', return_value=mocked_time):
+        snapshot_1 = CrawlerService.create_snapshot(valid_course)
+        snapshot_1.created_at = arrow.now().shift(hours=-2)
+        snapshot_1.save()
+        snapshot_2 = CrawlerService.create_snapshot(valid_course)
+        snapshot_2.created_at = arrow.now().shift(hours=-1)
+        snapshot_2.save()
+
+    with pytest.raises(NoValidSnapshotException):
+        CrawlerService.current_snapshot(valid_course)
+
+    url = snapshot_1.urls[0]
+    url.state = Url.States.INDEXED
+    url.save()
+
+    assert CrawlerService.current_snapshot(valid_course) == snapshot_1
+
+    url = snapshot_2.urls[0]
+    url.state = Url.States.WAITING_TO_INDEX
+    url.save()
+
+    assert CrawlerService.current_snapshot(valid_course) == snapshot_1
+
+    url = snapshot_2.urls[0]
+    url.state = Url.States.INDEXED
+    url.save()
+
+    assert CrawlerService.current_snapshot(valid_course) == snapshot_2
+
+
 @pytest.mark.asyncio
 async def test_next_url_from_service_is_the_most_recent_url(get_crawler_service, new_snapshot):
     crawler_service = await get_crawler_service
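To run only the added test case, a pytest invocation along these lines should work (a sketch; it assumes the suite is normally run from the repository root):

import pytest

# Select the new test by a unique fragment of its name.
pytest.main([
    "tests/services/crawler/crawler_test.py",
    "-k", "waiting_to_index",
])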
