From 16f3386cee7f378d7fef4ccd449cbbca2e637ef0 Mon Sep 17 00:00:00 2001 From: Jedr Blaszyk Date: Wed, 3 Jan 2024 15:41:28 +0100 Subject: [PATCH 1/3] [Google Drive] Fix permission fetching bug for domain-wide delegation sync --- connectors/sources/google_drive.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/connectors/sources/google_drive.py b/connectors/sources/google_drive.py index c3c8c6414..559011097 100644 --- a/connectors/sources/google_drive.py +++ b/connectors/sources/google_drive.py @@ -174,6 +174,11 @@ async def list_files(self, fetch_permissions=False): async def list_files_from_my_drive(self, fetch_permissions=False): """Get files from Google Drive. Files can have any type. + We optimize by filtering for files a user can edit using "trashed=false and 'me' in writers", + as users with read-only access can't fetch permission lists. During sync, we iterate over all + organizational users, skipping read-only files. Eventually, we impersonate a user with write access + to each file, enabling complete file and permissions retrieval. 
+ Args: include_permissions (bool): flag to select permissions in the request query @@ -191,7 +196,7 @@ async def list_files_from_my_drive(self, fetch_permissions=False): resource="files", method="list", corpora="user", - q="trashed=false", + q="trashed=false and 'me' in writers", orderBy="modifiedTime desc", fields=f"files({files_fields}),incompleteSearch,nextPageToken", includeItemsFromAllDrives=False, From df66bca07985aa5a3932f70e6ff9d05d072708ce Mon Sep 17 00:00:00 2001 From: Jedr Blaszyk Date: Fri, 5 Jan 2024 17:14:34 +0100 Subject: [PATCH 2/3] Adapt files list query --- connectors/sources/google_drive.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/connectors/sources/google_drive.py b/connectors/sources/google_drive.py index 559011097..95f20a570 100644 --- a/connectors/sources/google_drive.py +++ b/connectors/sources/google_drive.py @@ -172,12 +172,13 @@ async def list_files(self, fetch_permissions=False): yield file async def list_files_from_my_drive(self, fetch_permissions=False): - """Get files from Google Drive. Files can have any type. + """Retrieves files from Google Drive, with an option to fetch permissions (DLS). - We optimize by filtering for files a user can edit using "trashed=false and 'me' in writers", - as users with read-only access can't fetch permission lists. During sync, we iterate over all - organizational users, skipping read-only files. Eventually, we impersonate a user with write access - to each file, enabling complete file and permissions retrieval. + This function optimizes the retrieval process based on the 'fetch_permissions' flag. + If 'fetch_permissions' is True, the function filters for files the user can edit + ("trashed=false and 'me' in writers") as permission fetching requires write access. + If 'fetch_permissions' is False, it simply filters out trashed files ("trashed=false"), + allowing a broader file retrieval. 
 Args: include_permissions (bool): flag to select permissions in the request query @@ -186,17 +187,19 @@ async def list_files_from_my_drive(self, fetch_permissions=False): dict: Documents from Google Drive. """ - files_fields = ( - DRIVE_ITEMS_FIELDS_WITH_PERMISSIONS - if fetch_permissions - else DRIVE_ITEMS_FIELDS - ) + if fetch_permissions: + files_fields = DRIVE_ITEMS_FIELDS_WITH_PERMISSIONS + # Google Drive API requires write access to fetch a file's permissions + list_query = f"trashed=false and 'me' in writers" + else: + files_fields = DRIVE_ITEMS_FIELDS + list_query = "trashed=false" async for file in self.api_call_paged( resource="files", method="list", corpora="user", - q="trashed=false and 'me' in writers", + q=list_query, orderBy="modifiedTime desc", fields=f"files({files_fields}),incompleteSearch,nextPageToken", includeItemsFromAllDrives=False, From acf6416d4b08cad8035c17ae55b26a3928ae8972 Mon Sep 17 00:00:00 2001 From: Jedr Blaszyk Date: Fri, 5 Jan 2024 17:19:00 +0100 Subject: [PATCH 3/3] make lint --- connectors/sources/google_drive.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/connectors/sources/google_drive.py b/connectors/sources/google_drive.py index 95f20a570..ead18ac16 100644 --- a/connectors/sources/google_drive.py +++ b/connectors/sources/google_drive.py @@ -190,7 +190,7 @@ async def list_files_from_my_drive(self, fetch_permissions=False): if fetch_permissions: files_fields = DRIVE_ITEMS_FIELDS_WITH_PERMISSIONS # Google Drive API requires write access to fetch a file's permissions - list_query = f"trashed=false and 'me' in writers" + list_query = "trashed=false and 'me' in writers" else: files_fields = DRIVE_ITEMS_FIELDS list_query = "trashed=false"