From a4ae7936d7a282c980b0d525426a3792cf826d55 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Mon, 20 Nov 2023 15:41:23 +0100
Subject: [PATCH] [8.10] Remove timestamp optimization for full syncs (#1907)
 (#1909)

Co-authored-by: Sean Story <sean.j.story@gmail.com>
---
 connectors/es/sink.py | 18 ++----------------
 tests/test_sink.py    | 18 +++++++++++-------
 2 files changed, 13 insertions(+), 23 deletions(-)

diff --git a/connectors/es/sink.py b/connectors/es/sink.py
index d7a37fe64..d3f01c055 100644
--- a/connectors/es/sink.py
+++ b/connectors/es/sink.py
@@ -427,22 +427,8 @@ async def get_docs(self, generator):
                     continue
 
                 if doc_id in existing_ids:
-                    # pop out of existing_ids
-                    ts = existing_ids.pop(doc_id)
-
-                    # If the doc has a timestamp, we can use it to see if it has
-                    # been modified. This reduces the bulk size a *lot*
-                    #
-                    # Some backends do not know how to do this, so it's optional.
-                    # For these, we update the docs in any case.
-                    if TIMESTAMP_FIELD in doc and ts == doc[TIMESTAMP_FIELD]:
-                        # cancel the download
-                        if (
-                            self.content_extraction_enabled
-                            and lazy_download is not None
-                        ):
-                            await lazy_download(doit=False)
-                        continue
+                    # pop the doc out of existing_ids, so it does not get deleted
+                    existing_ids.pop(doc_id)
 
                     self.total_docs_updated += 1
                 else:
diff --git a/tests/test_sink.py b/tests/test_sink.py
index f31838fb6..a90c1fc8b 100644
--- a/tests/test_sink.py
+++ b/tests/test_sink.py
@@ -433,14 +433,18 @@ async def setup_extractor(
             total_downloads(0),
         ),
         (
-            # doc 1 is present, data source also has doc 1 with the same timestamp -> nothing happens
+            # doc 1 is present, data source also has doc 1 with the same timestamp -> doc 1 is still updated
             [DOC_ONE],
             [(DOC_ONE, None, "index")],
             NO_FILTERING,
             SYNC_RULES_ENABLED,
             CONTENT_EXTRACTION_ENABLED,
-            [end_docs_operation()],
-            updated(0),
+            [
+                # update happens through overwriting
+                index_operation(DOC_ONE),
+                end_docs_operation(),
+            ],
+            updated(1),
             created(0),
             deleted(0),
             total_downloads(0),
@@ -536,17 +540,17 @@ async def setup_extractor(
             total_downloads(1),
         ),
         (
-            # doc 1 present, data source has doc 1 -> no lazy download if timestamps are the same for the docs
+            # doc 1 present, data source has doc 1 -> lazy download occurs
             [DOC_ONE],
             [(DOC_ONE, lazy_download_fake(DOC_ONE), "index")],
             NO_FILTERING,
             SYNC_RULES_ENABLED,
             CONTENT_EXTRACTION_ENABLED,
-            [end_docs_operation()],
-            updated(0),
+            [index_operation(DOC_ONE), end_docs_operation()],
+            updated(1),
             created(0),
             deleted(0),
-            total_downloads(0),
+            total_downloads(1),
         ),
         (
             # doc 1 present, data source has doc 1 with different timestamp