From a4ae7936d7a282c980b0d525426a3792cf826d55 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 20 Nov 2023 15:41:23 +0100 Subject: [PATCH] [8.10] Remove timestamp optimization for full syncs (#1907) (#1909) Co-authored-by: Sean Story --- connectors/es/sink.py | 18 ++---------------- tests/test_sink.py | 18 +++++++++++------- 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/connectors/es/sink.py b/connectors/es/sink.py index d7a37fe64..d3f01c055 100644 --- a/connectors/es/sink.py +++ b/connectors/es/sink.py @@ -427,22 +427,8 @@ async def get_docs(self, generator): continue if doc_id in existing_ids: - # pop out of existing_ids - ts = existing_ids.pop(doc_id) - - # If the doc has a timestamp, we can use it to see if it has - # been modified. This reduces the bulk size a *lot* - # - # Some backends do not know how to do this, so it's optional. - # For these, we update the docs in any case. - if TIMESTAMP_FIELD in doc and ts == doc[TIMESTAMP_FIELD]: - # cancel the download - if ( - self.content_extraction_enabled - and lazy_download is not None - ): - await lazy_download(doit=False) - continue + # pop out of existing_ids, so they do not get deleted + existing_ids.pop(doc_id) self.total_docs_updated += 1 else: diff --git a/tests/test_sink.py b/tests/test_sink.py index f31838fb6..a90c1fc8b 100644 --- a/tests/test_sink.py +++ b/tests/test_sink.py @@ -433,14 +433,18 @@ async def setup_extractor( total_downloads(0), ), ( - # doc 1 is present, data source also has doc 1 with the same timestamp -> nothing happens + # doc 1 is present, data source also has doc 1 with the same timestamp -> doc one is updated [DOC_ONE], [(DOC_ONE, None, "index")], NO_FILTERING, SYNC_RULES_ENABLED, CONTENT_EXTRACTION_ENABLED, - [end_docs_operation()], - updated(0), + [ + # update happens through overwriting + index_operation(DOC_ONE), + end_docs_operation(), + ], + updated(1), created(0), deleted(0), total_downloads(0), @@ -536,17 +540,17 @@ async def setup_extractor( total_downloads(1), ), ( - # doc 1 present, data source has doc 1 -> no lazy download if timestamps are the same for the docs + # doc 1 present, data source has doc 1 -> lazy download occurs [DOC_ONE], [(DOC_ONE, lazy_download_fake(DOC_ONE), "index")], NO_FILTERING, SYNC_RULES_ENABLED, CONTENT_EXTRACTION_ENABLED, - [end_docs_operation()], - updated(0), + [index_operation(DOC_ONE), end_docs_operation()], + updated(1), created(0), deleted(0), - total_downloads(0), + total_downloads(1), ), ( # doc 1 present, data source has doc 1 with different timestamp