From bbe6529040e1700f53ca3eabd0c7cc31e895b7a2 Mon Sep 17 00:00:00 2001 From: Nikolai Kondrashov Date: Fri, 1 Nov 2024 18:52:38 +0200 Subject: [PATCH] db: Get last modified timestamp from table rows Instead of solely targeting the get_last_modified() method at databases which have a load rate limit, make all of them support it, and return the maximum `_timestamp` value across all tables, if the schema has it. --- kcidb/db/__init__.py | 14 ++++++---- kcidb/db/abstract.py | 13 ++++++---- kcidb/db/bigquery/v04_00.py | 47 +++++++++++++++++++++------------ kcidb/db/mux.py | 13 ++++++---- kcidb/db/null.py | 13 ++++++---- kcidb/db/postgresql/v04_00.py | 43 ++++++++++++++++++++---------- kcidb/db/schematic.py | 45 ++++++++++++++++++-------------- kcidb/db/sql/schema.py | 22 ++++++++++++++++ kcidb/db/sqlite/v04_00.py | 49 +++++++++++++++++++++++++---------- kcidb/test_db.py | 45 ++++++++++++++++++++++++++++++-- 10 files changed, 220 insertions(+), 84 deletions(-) diff --git a/kcidb/db/__init__.py b/kcidb/db/__init__.py index 1971719c..590cb52b 100644 --- a/kcidb/db/__init__.py +++ b/kcidb/db/__init__.py @@ -182,15 +182,19 @@ def get_current_time(self): def get_last_modified(self): """ - Get the time the data in the connected database was last modified. - Can return the minimum timestamp constant, if the database is not - initialized or its data loading interface is not limited in the amount - of load() method calls. + Get the time data has arrived last into the driven database. Can + return the minimum timestamp constant, if the database is empty. + The database must be initialized. Returns: A timezone-aware datetime object representing the last - modification time. + data arrival time. + + Raises: + NoTimestamps - The database doesn't have row timestamps, and + cannot determine the last data arrival time. 
""" + assert self.is_initialized() last_modified = self.driver.get_last_modified() assert isinstance(last_modified, datetime.datetime) assert last_modified.tzinfo diff --git a/kcidb/db/abstract.py b/kcidb/db/abstract.py index b4915653..2f04420c 100644 --- a/kcidb/db/abstract.py +++ b/kcidb/db/abstract.py @@ -113,14 +113,17 @@ def get_current_time(self): @abstractmethod def get_last_modified(self): """ - Get the time the data in the driven database was last modified. - Can return the minimum timestamp constant, if the database is not - initialized, or its data loading interface is not limited in the - amount of load() method calls. + Get the time data has arrived last into the driven database. Can + return the minimum timestamp constant, if the database is empty. + The database must be initialized. Returns: A timezone-aware datetime object representing the last - modification time. + data arrival time. + + Raises: + NoTimestamps - The database doesn't have row timestamps, and + cannot determine the last data arrival time. """ @abstractmethod diff --git a/kcidb/db/bigquery/v04_00.py b/kcidb/db/bigquery/v04_00.py index a80506a4..f13689bb 100644 --- a/kcidb/db/bigquery/v04_00.py +++ b/kcidb/db/bigquery/v04_00.py @@ -161,23 +161,6 @@ def get_current_time(self): self.query_create("SELECT CURRENT_TIMESTAMP").result() ))[0] - def get_last_modified(self): - """ - Get the time the data in the connected database was last modified. - Can return the minimum timestamp constant, if the database is not - initialized or its data loading interface is not limited in the amount - of load() method calls. - - Returns: - A timezone-aware datetime object representing the last - modification time. 
- """ - return next(iter(self.query_create( - "SELECT TIMESTAMP_MILLIS(MAX(last_modified_time)) " - "FROM __TABLES__" - ).result()))[0] or \ - datetime.datetime.min.replace(tzinfo=datetime.timezone.utc) - class Schema(AbstractSchema): """BigQuery database schema v4.0""" @@ -1215,3 +1198,33 @@ def load(self, data, with_metadata): raise Exception("".join([ f"ERROR: {error['message']}\n" for error in job.errors ])) from exc + + def get_last_modified(self): + """ + Get the time data has arrived last into the driven database. Can + return the minimum timestamp constant, if the database is empty. + The database must be initialized. + + Returns: + A timezone-aware datetime object representing the last + data arrival time. + + Raises: + NoTimestamps - The database doesn't have row timestamps, and + cannot determine the last data arrival time. + """ + if not all( + next((f for f in table_schema if f.name == "_timestamp"), None) + for table_schema in self.TABLE_MAP.values() + ): + raise NoTimestamps("Database is missing timestamps in its schema") + + return next(iter(self.conn.query_create( + "SELECT MAX(last_modified) AS last_modified FROM(\n" + + "UNION ALL\n".join( + f"SELECT MAX(_timestamp) AS last_modified FROM {table_name}\n" + for table_name in self.TABLE_MAP + ) + + ")\n" + ).result()))[0] or \ + datetime.datetime.min.replace(tzinfo=datetime.timezone.utc) diff --git a/kcidb/db/mux.py b/kcidb/db/mux.py index 8c1332fa..d1ca6e68 100644 --- a/kcidb/db/mux.py +++ b/kcidb/db/mux.py @@ -291,14 +291,17 @@ def get_current_time(self): def get_last_modified(self): """ - Get the time the data in the driven databases was last modified. - Can return the minimum timestamp constant, if the databases are not - initialized, or their data loading interface is not limited in the - amount of load() method calls. + Get the time data has arrived last into the driven database. Can + return the minimum timestamp constant, if the database is empty. + The database must be initialized. 
Returns: A timezone-aware datetime object representing the last - modification time. + data arrival time. + + Raises: + NoTimestamps - The database doesn't have row timestamps, and + cannot determine the last data arrival time. """ return max(driver.get_last_modified() for driver in self.drivers) diff --git a/kcidb/db/null.py b/kcidb/db/null.py index 8e9c2fe9..ad82b4fa 100644 --- a/kcidb/db/null.py +++ b/kcidb/db/null.py @@ -118,14 +118,17 @@ def get_current_time(self): def get_last_modified(self): """ - Get the time the data in the driven database was last modified. - Can return the minimum timestamp constant, if the database is not - initialized, or its data loading interface is not limited in the - amount of load() method calls. + Get the time data has arrived last into the driven database. Can + return the minimum timestamp constant, if the database is empty. + The database must be initialized. Returns: A timezone-aware datetime object representing the last - modification time. + data arrival time. + + Raises: + NoTimestamps - The database doesn't have row timestamps, and + cannot determine the last data arrival time. """ return datetime.datetime.min.replace(tzinfo=datetime.timezone.utc) diff --git a/kcidb/db/postgresql/v04_00.py b/kcidb/db/postgresql/v04_00.py index aecad66a..8e37ea4b 100644 --- a/kcidb/db/postgresql/v04_00.py +++ b/kcidb/db/postgresql/v04_00.py @@ -184,19 +184,6 @@ def get_current_time(self): cursor.execute("SELECT CURRENT_TIMESTAMP") return cursor.fetchone()[0] - def get_last_modified(self): - """ - Get the time the data in the connected database was last modified. - Can return the minimum timestamp constant, if the database is not - initialized or its data loading interface is not limited in the amount - of load() method calls. - - Returns: - A timezone-aware datetime object representing the last - modification time. 
- """ - return datetime.datetime.min.replace(tzinfo=datetime.timezone.utc) - class Schema(AbstractSchema): """PostgreSQL database schema v4.0""" @@ -950,3 +937,33 @@ def load(self, data, with_metadata): # Flip priority for the next load to maintain (rough) # parity with non-determinism of BigQuery's ANY_VALUE() self.conn.load_prio_db = not self.conn.load_prio_db + + def get_last_modified(self): + """ + Get the time data has arrived last into the driven database. Can + return the minimum timestamp constant, if the database is empty. + The database must be initialized. + + Returns: + A timezone-aware datetime object representing the last + data arrival time. + + Raises: + NoTimestamps - The database doesn't have row timestamps, and + cannot determine the last data arrival time. + """ + statement = ( + "SELECT MAX(last_modified) AS last_modified\n" + + "FROM (\n" + + textwrap.indent( + "\nUNION ALL\n".join( + table_schema.format_get_last_modified(table_name) + for table_name, table_schema in self.TABLES.items() + ), + " " * 4 + ) + "\n) AS tables\n" + ) + with self.conn, self.conn.cursor() as cursor: + cursor.execute(statement) + return cursor.fetchone()[0] or \ + datetime.datetime.min.replace(tzinfo=datetime.timezone.utc) diff --git a/kcidb/db/schematic.py b/kcidb/db/schematic.py index 870d3def..d734d76c 100644 --- a/kcidb/db/schematic.py +++ b/kcidb/db/schematic.py @@ -89,19 +89,6 @@ def get_current_time(self): time on the database server. """ - @abstractmethod - def get_last_modified(self): - """ - Get the time the data in the connected database was last modified. - Can return the minimum timestamp constant, if the database is not - initialized, or its data loading interface is not limited in the - amount of load() method calls. - - Returns: - A timezone-aware datetime object representing the last - modification time. - """ - def is_initialized(self): """ Check if the connected database is initialized. 
@@ -381,6 +368,22 @@ def load(self, data, with_metadata): """ # Relying on the driver to check compatibility/validity + @abstractmethod + def get_last_modified(self): + """ + Get the time data has arrived last into the database. Can return the + minimum timestamp constant, if the database is empty. + The database must be initialized. + + Returns: + A timezone-aware datetime object representing the last + data arrival time. + + Raises: + NoTimestamps - The database doesn't have row timestamps, and + cannot determine the last data arrival time. + """ + class MetaDriver(ABCMeta): """A schematic metadriver""" @@ -546,16 +549,20 @@ def get_current_time(self): def get_last_modified(self): """ - Get the time the data in the driven database was last modified. - Can return the minimum timestamp constant, if the database is not - initialized, or its data loading interface is not limited in the - amount of load() method calls. + Get the time data has arrived last into the driven database. Can + return the minimum timestamp constant, if the database is empty. + The database must be initialized. Returns: A timezone-aware datetime object representing the last - modification time. + data arrival time. + + Raises: + NoTimestamps - The database doesn't have row timestamps, and + cannot determine the last data arrival time. """ - return self.conn.get_last_modified() + assert self.is_initialized() + return self.schema.get_last_modified() def get_schemas(self): """ diff --git a/kcidb/db/sql/schema.py b/kcidb/db/sql/schema.py index 7cc2d1d4..803aca19 100644 --- a/kcidb/db/sql/schema.py +++ b/kcidb/db/sql/schema.py @@ -343,6 +343,28 @@ def format_dump(self, name, with_metadata, after, until): ] ) + def format_get_last_modified(self, name): + """ + Format the "SELECT" command returning the timestamp of last data + written to the table, or NULL, if the table is empty. + + Args: + name: The name of the target table of the command. 
+ + Returns: + The formatted "SELECT" command, returning the timestamp in + "last_modified" column. + + Raises: + NoTimestamps - The table doesn't have row timestamps. + """ + assert isinstance(name, str) + if not self.timestamp: + raise NoTimestamps("Table has no timestamp column") + return ( + f"SELECT MAX({self.timestamp.name}) AS last_modified FROM {name}" + ) + def format_delete(self, name): """ Format the "DELETE" command for emptying the table (removing all diff --git a/kcidb/db/sqlite/v04_00.py b/kcidb/db/sqlite/v04_00.py index 398e429c..202fd50d 100644 --- a/kcidb/db/sqlite/v04_00.py +++ b/kcidb/db/sqlite/v04_00.py @@ -149,19 +149,6 @@ def get_current_time(self): finally: cursor.close() - def get_last_modified(self): - """ - Get the time the data in the connected database was last modified. - Can return the minimum timestamp constant, if the database is not - initialized or its data loading interface is not limited in the amount - of load() method calls. - - Returns: - A timezone-aware datetime object representing the last - modification time. - """ - return datetime.datetime.min.replace(tzinfo=datetime.timezone.utc) - class Schema(AbstractSchema): """SQLite database schema v4.0""" @@ -905,3 +892,39 @@ def load(self, data, with_metadata): # Flip priority for the next load to maintain (rough) # parity with non-determinism of BigQuery's ANY_VALUE() self.conn.load_prio_db = not self.conn.load_prio_db + + def get_last_modified(self): + """ + Get the time data has arrived last into the driven database. Can + return the minimum timestamp constant, if the database is empty. + The database must be initialized. + + Returns: + A timezone-aware datetime object representing the last + data arrival time. + + Raises: + NoTimestamps - The database doesn't have row timestamps, and + cannot determine the last data arrival time. 
+ """ + statement = ( + "SELECT MAX(last_modified) AS last_modified\n" + + "FROM (\n" + + textwrap.indent( + "\nUNION ALL\n".join( + table_schema.format_get_last_modified(table_name) + for table_name, table_schema in self.TABLES.items() + ), + " " * 4 + ) + "\n) AS tables\n" + ) + with self.conn: + cursor = self.conn.cursor() + try: + cursor.execute(statement) + timestamp = cursor.fetchone()[0] + if timestamp: + return dateutil.parser.isoparse(timestamp) + finally: + cursor.close() + return datetime.datetime.min.replace(tzinfo=datetime.timezone.utc) diff --git a/kcidb/test_db.py b/kcidb/test_db.py index 49a41ff8..eece79e4 100644 --- a/kcidb/test_db.py +++ b/kcidb/test_db.py @@ -408,15 +408,56 @@ def test_get_current_time(clean_database): assert client.get_current_time() > timestamp -def test_get_last_modified(empty_database): +def test_get_last_modified(clean_database): """ Check get_last_modified() works correctly """ - client = empty_database + client = clean_database + # Check a pre-timestamp schema version + client.init(kcidb.io.schema.V4_2) + with pytest.raises(kcidb.db.misc.NoTimestamps): + client.get_last_modified() + client.load({ + **kcidb.io.schema.V4_2.new(), + "checkouts": [ + dict(id="origin:1", origin="origin",), + ], + "builds": [ + dict(checkout_id="origin:1", id="origin:1", origin="origin",), + ], + "tests": [ + dict(build_id="origin:1", id="origin:1", origin="origin",), + ], + "issues": [ + dict(id="origin:1", version=1, origin="origin",), + ], + "incidents": [ + dict( + id="origin:1", + origin="origin", + issue_id="origin:1", + issue_version=1, + ) + ] + }) + with pytest.raises(kcidb.db.misc.NoTimestamps): + client.get_last_modified() + client.cleanup() + + # Check a post-timestamp schema version + time.sleep(1) + client.init() + timestamp = client.get_last_modified() + assert timestamp == \ + datetime.datetime.min.replace(tzinfo=datetime.timezone.utc) + before_load = client.get_current_time() + client.load(COMPREHENSIVE_IO_DATA) timestamp = 
client.get_last_modified() assert timestamp is not None assert isinstance(timestamp, datetime.datetime) assert timestamp.tzinfo is not None + assert timestamp >= before_load + client.cleanup() def test_all_fields(empty_database):