From 60247cca08823bc86c3aac176c9228c65d869c9b Mon Sep 17 00:00:00 2001
From: Michael Gattozzi
Date: Thu, 25 Jan 2024 14:31:57 -0500
Subject: [PATCH] chore(deps): Update arrow and datafusion to 49.0.0

This commit copies in our dependency code from influxdb_iox so that we can
upgrade arrow from a forked 46.0.0 to the upstream 49.0.0 release, along with
the corresponding DataFusion upgrade from 31.0.0 to 34.0.0. Most of the
important changes are in how the crates are consumed in
influxdb3(_server/_write); those diffs are particularly worth reviewing. The
rest is a straight copy, since we do not currently modify those crates as part
of influxdb3 edge development.
---
 Cargo.lock | 1828 ++++--
 Cargo.toml | 61 +-
 arrow_util/Cargo.toml | 16 +-
 arrow_util/src/bitset.rs | 338 +-
 arrow_util/src/string.rs | 65 +
 arrow_util/src/test_util.rs | 5 +-
 authz/Cargo.toml | 11 +-
 authz/src/permission.rs | 8 +-
 backoff/Cargo.toml | 7 +-
 backoff/src/lib.rs | 4 +
 cache_system/Cargo.toml | 10 +-
 cache_system/src/addressable_heap.rs | 5 +-
 cache_system/src/backend/mod.rs | 11 +-
 cache_system/src/backend/policy/mod.rs | 136 +-
 cache_system/src/cache/driver.rs | 26 +-
 cache_system/src/cache/metrics.rs | 7 +-
 cache_system/src/lib.rs | 1 +
 cache_system/src/loader/batch.rs | 7 +-
 catalog_cache/Cargo.toml | 23 +
 catalog_cache/src/api/client.rs | 176 +
 catalog_cache/src/api/list.rs | 467 ++
 catalog_cache/src/api/mod.rs | 159 +
 catalog_cache/src/api/quorum.rs | 459 ++
 catalog_cache/src/api/server.rs | 300 ++
 catalog_cache/src/lib.rs | 143 +
 catalog_cache/src/local/limit.rs | 82 +
 catalog_cache/src/local/mod.rs | 355 ++
 clap_blocks/Cargo.toml | 17 +-
 clap_blocks/src/bulk_ingest.rs | 274 ++
 clap_blocks/src/catalog_cache.rs | 154 +
 clap_blocks/src/catalog_dsn.rs | 18 +-
 clap_blocks/src/compactor.rs | 125 +-
 clap_blocks/src/compactor_scheduler.rs | 192 +
 clap_blocks/src/garbage_collector.rs | 87 +-
 clap_blocks/src/gossip.rs | 15 +-
 clap_blocks/src/ingester.rs | 13 +
 clap_blocks/src/ingester_address.rs | 16 +-
 clap_blocks/src/lib.rs | 3 +
 clap_blocks/src/memory_size.rs | 21 +-
 clap_blocks/src/object_store.rs | 240 +-
 clap_blocks/src/parquet_cache.rs | 57 +
 clap_blocks/src/querier.rs | 10 +-
 clap_blocks/src/router.rs | 16 +
 client_util/Cargo.toml | 11 +-
 data_types/Cargo.toml | 23 +-
 data_types/src/columns.rs | 545 ++-
 data_types/src/lib.rs | 361 +-
 data_types/src/namespace_name.rs | 25 +
 data_types/src/partition.rs | 122 +-
 data_types/src/partition_template.rs | 629 ++-
 data_types/src/service_limits.rs | 312 +-
 data_types/src/snapshot/hash.rs | 219 +
 data_types/src/snapshot/list.rs | 192 +
 data_types/src/snapshot/mask.rs | 71 +
 data_types/src/snapshot/mod.rs | 11 +
 data_types/src/snapshot/partition.rs | 246 +
 data_types/src/snapshot/table.rs | 197 +
 datafusion_util/Cargo.toml | 11 +-
 datafusion_util/src/config.rs | 8 +
 datafusion_util/src/lib.rs | 83 +-
 datafusion_util/src/watch.rs | 2 +-
 dml/Cargo.toml | 3 +
 executor/Cargo.toml | 12 +-
 executor/src/lib.rs | 8 +
 flightsql/Cargo.toml | 11 +-
 flightsql/src/error.rs | 2 +-
 flightsql/src/planner.rs | 12 +-
 flightsql/src/sql_info/meta.rs | 87 +
 flightsql/src/sql_info/mod.rs | 11 +-
 garbage_collector/Cargo.toml | 13 +-
 garbage_collector/src/lib.rs | 15 +-
 garbage_collector/src/objectstore/checker.rs | 136 +-
 garbage_collector/src/objectstore/deleter.rs | 74 +-
 garbage_collector/src/objectstore/lister.rs | 8 +-
 garbage_collector/src/parquetfile/deleter.rs | 11 +-
 garbage_collector/src/retention/flagger.rs | 1 -
 generated_types/Cargo.toml | 14 +-
 generated_types/build.rs | 25 +-
 .../iox/bulk_ingest/v1/service.proto | 73 +
.../iox/catalog/v1/parquet_file.proto | 11 +- .../influxdata/iox/catalog/v1/service.proto | 12 +- .../influxdata/iox/catalog/v2/service.proto | 489 ++ .../iox/catalog_cache/v1/value.proto | 158 + .../influxdata/iox/column_type/v1/type.proto | 14 + .../influxdata/iox/compactor/v1/service.proto | 37 +- .../influxdata/iox/gossip/v1/schema.proto | 8 + .../influxdata/iox/gossip/v1/sort_keys.proto | 20 + .../iox/partition_template/v1/template.proto | 12 + .../influxdata/iox/querier/v1/flight.proto | 21 + .../influxdata/iox/querier/v1/query_log.proto | 73 + .../influxdata/iox/schema/v1/service.proto | 40 +- .../v1/skipped_compaction.proto | 29 + .../influxdata/iox/table/v1/service.proto | 16 + .../protos/influxdata/iox/wal/v1/wal.proto | 49 - generated_types/src/lib.rs | 85 +- grpc-binary-logger-proto/Cargo.toml | 9 +- grpc-binary-logger-test-proto/Cargo.toml | 7 +- grpc-binary-logger-test-proto/src/lib.rs | 7 +- grpc-binary-logger/Cargo.toml | 7 +- grpc-binary-logger/src/lib.rs | 2 +- grpc-binary-logger/src/predicate.rs | 2 +- .../tests/end_to_end_cases/server.rs | 2 +- .../tests/end_to_end_cases/test_utils.rs | 14 +- import_export/Cargo.toml | 11 +- import_export/src/file/export.rs | 29 +- import_export/src/file/import.rs | 119 +- influxdb2_client/Cargo.toml | 15 +- .../tests/common/server_fixture.rs | 2 + influxdb3_server/Cargo.toml | 2 + influxdb3_server/src/http.rs | 9 +- influxdb3_server/src/query_executor.rs | 77 +- influxdb3_write/src/catalog.rs | 22 +- influxdb3_write/src/persister.rs | 20 +- influxdb3_write/src/write_buffer.rs | 20 +- influxdb_influxql_parser/Cargo.toml | 7 +- influxdb_influxql_parser/src/explain.rs | 175 +- influxdb_influxql_parser/src/internal.rs | 10 +- influxdb_influxql_parser/src/keywords.rs | 2 +- ...ser__visit__test__explain_statement-2.snap | 13 + ...ser__visit__test__explain_statement-3.snap | 13 + ...ser__visit__test__explain_statement-4.snap | 15 + ...ser__visit__test__explain_statement-5.snap | 13 + ...ser__visit__test__explain_statement-6.snap | 13 + ...ser__visit__test__explain_statement-7.snap | 13 + ...ser__visit__test__explain_statement-8.snap | 31 + ...arser__visit__test__explain_statement.snap | 2 + ..._visit_mut__test__explain_statement-2.snap | 13 + ..._visit_mut__test__explain_statement-3.snap | 13 + ..._visit_mut__test__explain_statement-4.snap | 15 + ..._visit_mut__test__explain_statement-5.snap | 13 + ..._visit_mut__test__explain_statement-6.snap | 13 + ..._visit_mut__test__explain_statement-7.snap | 13 + ..._visit_mut__test__explain_statement-8.snap | 31 + ...r__visit_mut__test__explain_statement.snap | 2 + influxdb_influxql_parser/src/time_range.rs | 12 +- influxdb_influxql_parser/src/visit.rs | 12 +- influxdb_influxql_parser/src/visit_mut.rs | 13 +- influxdb_iox_client/Cargo.toml | 18 +- influxdb_iox_client/src/client.rs | 3 + influxdb_iox_client/src/client/catalog.rs | 8 +- influxdb_iox_client/src/client/compactor.rs | 7 +- influxdb_iox_client/src/client/error.rs | 20 +- influxdb_iox_client/src/client/flight/mod.rs | 61 + .../src/client/flight/query.rs | 154 + influxdb_iox_client/src/client/query_log.rs | 30 + influxdb_iox_client/src/client/table.rs | 18 + influxdb_iox_client/src/client/write.rs | 126 +- influxdb_iox_client/src/format.rs | 2 +- influxdb_iox_client/src/format/influxql.rs | 18 +- influxdb_line_protocol/Cargo.toml | 8 +- influxdb_line_protocol/README.md | 6 +- influxdb_line_protocol/RELEASE.md | 2 +- influxdb_line_protocol/src/lib.rs | 226 +- influxdb_storage_client/Cargo.toml | 5 +- influxrpc_parser/Cargo.toml | 8 +- 
influxrpc_parser/src/predicate.rs | 6 +- ingester_query_grpc/Cargo.toml | 15 +- .../influxdata/iox/ingester/v2/query.proto | 15 +- ingester_query_grpc/src/lib.rs | 29 +- iox_catalog/Cargo.toml | 23 +- ...1180000_set_partition_sort_key_to_null.sql | 1 + .../20231003120000_drop_sort_key.sql | 1 + .../20231004120000_add_empty_sort_key.sql | 4 + ...0231121120000_add_partition_generation.sql | 1 + .../20231121150000_partition_id_trigger.sql | 20 + ...1123120000_partition_id_from_partition.sql | 47 + .../20240111150000_add_table_generation.sql | 1 + ...6_complete_kafkaless_transition_sqlite.sql | 39 + ...1180000_set_partition_sort_key_to_null.sql | 2 + .../20231003120000_drop_sort_key.sql | 1 + .../20231004120000_add_empty_sort_key.sql | 1 + ...0231121120000_add_partition_generation.sql | 1 + ...1123120000_partition_id_from_partition.sql | 8 + .../20240111150000_add_table_generation.sql | 1 + iox_catalog/src/cache.rs | 831 ++++ iox_catalog/src/constants.rs | 19 + iox_catalog/src/grpc/client.rs | 997 ++++ iox_catalog/src/grpc/mod.rs | 143 + iox_catalog/src/grpc/serialization.rs | 712 +++ iox_catalog/src/grpc/server.rs | 1032 ++++ iox_catalog/src/interface.rs | 3168 +------------ iox_catalog/src/interface_tests.rs | 3203 +++++++++++++ iox_catalog/src/lib.rs | 695 +-- iox_catalog/src/mem.rs | 793 ++-- iox_catalog/src/metrics.rs | 53 +- iox_catalog/src/migrate.rs | 72 +- iox_catalog/src/postgres.rs | 1107 ++--- iox_catalog/src/sqlite.rs | 1131 ++--- iox_catalog/src/test_helpers.rs | 92 + iox_catalog/src/util.rs | 897 ++++ iox_data_generator/Cargo.toml | 17 +- iox_data_generator/src/substitution.rs | 26 +- iox_data_generator/src/tag_pair.rs | 2 +- iox_query/Cargo.toml | 16 +- iox_query/src/chunk_statistics.rs | 185 +- iox_query/src/exec.rs | 32 +- iox_query/src/exec/context.rs | 48 +- iox_query/src/exec/field.rs | 2 +- iox_query/src/exec/fieldlist.rs | 42 +- iox_query/src/exec/gapfill/algo.rs | 123 +- .../src/exec/gapfill/algo/interpolate.rs | 43 +- iox_query/src/exec/gapfill/buffered_input.rs | 6 +- iox_query/src/exec/gapfill/exec_tests.rs | 13 +- iox_query/src/exec/gapfill/mod.rs | 37 +- iox_query/src/exec/gapfill/params.rs | 2 +- iox_query/src/exec/gapfill/stream.rs | 13 +- iox_query/src/exec/non_null_checker.rs | 26 +- iox_query/src/exec/query_tracing.rs | 6 +- iox_query/src/exec/schema_pivot.rs | 5 +- iox_query/src/exec/seriesset/converter.rs | 9 +- iox_query/src/exec/sleep.rs | 265 ++ iox_query/src/exec/split.rs | 32 +- iox_query/src/frontend/reorg.rs | 97 +- iox_query/src/frontend/sql.rs | 8 +- iox_query/src/lib.rs | 181 +- .../src/logical_optimizer/extract_sleep.rs | 100 + .../src/logical_optimizer/handle_gapfill.rs | 243 +- .../handle_gapfill/range_predicate.rs | 82 +- .../influx_regex_to_datafusion_regex.rs | 15 +- iox_query/src/logical_optimizer/mod.rs | 5 +- .../physical_optimizer/chunk_extraction.rs | 26 +- .../src/physical_optimizer/combine_chunks.rs | 296 +- .../dedup/dedup_null_columns.rs | 4 +- .../dedup/dedup_sort_order.rs | 4 +- .../dedup/partition_split.rs | 24 +- .../physical_optimizer/dedup/remove_dedup.rs | 16 +- .../physical_optimizer/dedup/time_split.rs | 18 +- iox_query/src/physical_optimizer/mod.rs | 10 +- .../physical_optimizer/predicate_pushdown.rs | 64 +- .../physical_optimizer/projection_pushdown.rs | 80 +- iox_query/src/physical_optimizer/sort/mod.rs | 2 + .../sort/order_union_sorted_inputs.rs | 1487 ++++++ .../sort/parquet_sortness.rs | 92 +- .../sort/push_sort_through_union.rs | 121 +- iox_query/src/physical_optimizer/sort/util.rs | 102 + 
iox_query/src/physical_optimizer/tests.rs | 210 + .../physical_optimizer/union/nested_union.rs | 34 +- .../src/physical_optimizer/union/one_union.rs | 18 +- iox_query/src/provider.rs | 43 +- iox_query/src/provider/adapter.rs | 8 +- iox_query/src/provider/deduplicate.rs | 23 +- iox_query/src/provider/deduplicate/algo.rs | 24 +- iox_query/src/provider/overlap.rs | 21 +- iox_query/src/provider/physical.rs | 266 +- iox_query/src/provider/progressive_eval.rs | 1206 +++++ iox_query/src/provider/record_batch_exec.rs | 54 +- iox_query/src/pruning.rs | 49 +- iox_query/src/query_log.rs | 704 +++ iox_query/src/statistics.rs | 1427 ++++-- iox_query/src/test.rs | 285 +- iox_query/src/util.rs | 286 +- iox_query_influxql/Cargo.toml | 9 +- .../src/aggregate/percentile.rs | 3 +- iox_query_influxql/src/frontend/planner.rs | 27 +- iox_query_influxql/src/plan/ir.rs | 4 +- iox_query_influxql/src/plan/planner.rs | 939 ++-- iox_query_influxql/src/plan/planner/select.rs | 16 +- .../src/plan/planner_rewrite_expression.rs | 14 +- iox_query_influxql/src/plan/rewriter.rs | 29 +- iox_query_influxql/src/plan/udf.rs | 284 +- iox_query_influxql/src/plan/util.rs | 4 +- iox_query_influxql/src/window.rs | 112 +- .../src/window/cumulative_sum.rs | 61 +- iox_query_influxql/src/window/derivative.rs | 85 +- iox_query_influxql/src/window/difference.rs | 60 +- .../src/window/moving_average.rs | 60 +- iox_query_influxql/src/window/non_negative.rs | 57 +- .../src/window/percent_row_number.rs | 61 +- iox_query_influxrpc/Cargo.toml | 9 +- iox_query_influxrpc/src/lib.rs | 7 +- iox_query_influxrpc/src/missing_columns.rs | 4 +- iox_query_influxrpc/src/scan_plan.rs | 23 +- iox_query_params/Cargo.toml | 22 + iox_query_params/src/lib.rs | 21 + iox_query_params/src/params.rs | 675 +++ iox_tests/Cargo.toml | 3 + iox_tests/src/builders.rs | 50 +- iox_tests/src/catalog.rs | 209 +- iox_tests/src/lib.rs | 10 - iox_time/Cargo.toml | 5 +- iox_time/src/lib.rs | 2 +- ioxd_common/Cargo.toml | 21 +- ioxd_common/src/http/error.rs | 32 +- ioxd_common/src/http/mod.rs | 23 +- ioxd_common/src/http/pprof.rs | 2 +- ioxd_common/src/lib.rs | 38 +- ioxd_common/src/rpc.rs | 14 +- ioxd_common/src/server_type.rs | 14 + ioxd_common/src/service.rs | 14 +- ioxd_test/Cargo.toml | 7 +- ioxd_test/src/lib.rs | 2 +- kube_test/Cargo.toml | 22 + kube_test/src/call.rs | 70 + kube_test/src/error.rs | 57 + kube_test/src/handler.rs | 25 + kube_test/src/lib.rs | 31 + kube_test/src/object_map.rs | 178 + kube_test/src/request.rs | 115 + kube_test/src/resource_handler.rs | 267 ++ kube_test/src/service.rs | 54 + kube_test/src/status.rs | 61 + logfmt/Cargo.toml | 5 +- metric/Cargo.toml | 3 + metric/src/counter.rs | 5 +- metric/src/duration.rs | 8 + metric/src/histogram.rs | 91 + metric/src/lib.rs | 3 +- metric/src/metric.rs | 7 +- metric_exporters/Cargo.toml | 3 + mutable_batch/Cargo.toml | 20 +- mutable_batch/src/column.rs | 878 +++- mutable_batch/src/lib.rs | 231 +- mutable_batch/src/payload.rs | 150 +- mutable_batch/src/writer.rs | 12 +- mutable_batch/tests/writer.rs | 109 +- mutable_batch/tests/writer_fuzz.rs | 5 +- mutable_batch_lp/Cargo.toml | 8 +- mutable_batch_lp/fuzz/.gitignore | 4 + mutable_batch_lp/fuzz/Cargo.lock | 4129 +++++++++++++++++ mutable_batch_lp/fuzz/Cargo.toml | 27 + mutable_batch_lp/fuzz/README.md | 46 + .../fuzz/fuzz_targets/lines_converter.rs | 66 + mutable_batch_lp/src/lib.rs | 291 +- mutable_batch_pb/Cargo.toml | 8 +- mutable_batch_pb/src/decode.rs | 14 +- mutable_batch_pb/src/lib.rs | 2 + mutable_batch_pb/tests/encode.rs | 3 +- 
mutable_batch_tests/Cargo.toml | 12 +- mutable_batch_tests/benches/statistics.rs | 184 + object_store_metrics/Cargo.toml | 11 +- object_store_metrics/src/dummy.rs | 30 +- object_store_metrics/src/lib.rs | 1051 +++-- observability_deps/Cargo.toml | 3 + panic_logging/Cargo.toml | 3 + panic_logging/src/lib.rs | 53 +- parquet_cache/Cargo.toml | 60 + parquet_cache/src/client.rs | 16 + parquet_cache/src/client/cache_connector.rs | 37 + parquet_cache/src/client/http.rs | 62 + parquet_cache/src/client/keyspace.rs | 314 ++ parquet_cache/src/client/mock.rs | 153 + parquet_cache/src/client/object_store.rs | 776 ++++ parquet_cache/src/client/request.rs | 46 + parquet_cache/src/client/write_hints.rs | 223 + parquet_cache/src/controller.rs | 53 + parquet_cache/src/controller/error.rs | 29 + parquet_cache/src/controller/kube_util.rs | 93 + parquet_cache/src/controller/parquet_cache.rs | 139 + .../controller/parquet_cache_controller.rs | 1446 ++++++ .../src/controller/parquet_cache_set.rs | 75 + .../parquet_cache_set_controller.rs | 676 +++ parquet_cache/src/controller/state_service.rs | 109 + parquet_cache/src/data_types.rs | 12 + parquet_cache/src/data_types/keyspace.rs | 164 + parquet_cache/src/data_types/objects.rs | 79 + parquet_cache/src/data_types/policy.rs | 17 + parquet_cache/src/data_types/state.rs | 52 + parquet_cache/src/data_types/write_hints.rs | 81 + parquet_cache/src/lib.rs | 51 + parquet_cache/src/server.rs | 482 ++ parquet_cache/src/server/cache.rs | 113 + parquet_cache/src/server/data.rs | 810 ++++ parquet_cache/src/server/data/manager.rs | 836 ++++ parquet_cache/src/server/data/reads.rs | 23 + parquet_cache/src/server/data/store.rs | 510 ++ parquet_cache/src/server/data/writes.rs | 69 + parquet_cache/src/server/error.rs | 55 + parquet_cache/src/server/keyspace.rs | 957 ++++ parquet_cache/src/server/mock.rs | 217 + parquet_cache/src/server/precondition.rs | 57 + parquet_cache/src/server/response.rs | 83 + parquet_file/Cargo.toml | 23 +- parquet_file/src/chunk.rs | 5 +- parquet_file/src/lib.rs | 183 +- parquet_file/src/metadata.rs | 61 +- parquet_file/src/serialize.rs | 4 +- parquet_file/src/storage.rs | 24 +- parquet_file/tests/metadata.rs | 33 +- parquet_to_line_protocol/Cargo.toml | 8 +- parquet_to_line_protocol/src/batch.rs | 2 +- parquet_to_line_protocol/src/lib.rs | 11 +- partition/Cargo.toml | 37 + partition/benches/partitioner.rs | 246 + partition/src/bucket.rs | 49 + partition/src/filter.rs | 145 + partition/src/lib.rs | 1704 +++++++ partition/src/strftime.rs | 415 ++ partition/src/traits.rs | 61 + partition/src/traits/mutable_batch.rs | 60 + partition/src/traits/record_batch.rs | 82 + predicate/Cargo.toml | 11 +- predicate/src/delete_expr.rs | 21 +- predicate/src/lib.rs | 19 +- predicate/src/rpc_predicate/column_rewrite.rs | 5 +- predicate/src/rpc_predicate/field_rewrite.rs | 2 +- query_functions/Cargo.toml | 13 +- query_functions/src/coalesce_struct.rs | 110 +- query_functions/src/gapfill.rs | 175 +- query_functions/src/lib.rs | 22 +- query_functions/src/regex.rs | 5 +- query_functions/src/registry.rs | 6 +- query_functions/src/selectors/internal.rs | 2 +- query_functions/src/sleep.rs | 94 + query_functions/src/to_timestamp.rs | 85 + query_functions/src/window.rs | 28 +- schema/Cargo.toml | 10 +- schema/src/lib.rs | 20 +- schema/src/sort.rs | 43 +- service_common/Cargo.toml | 15 +- service_common/src/error.rs | 8 +- service_common/src/lib.rs | 35 +- service_grpc_flight/Cargo.toml | 18 +- service_grpc_flight/src/keep_alive.rs | 42 +- service_grpc_flight/src/lib.rs | 380 +- 
service_grpc_flight/src/planner.rs | 113 + service_grpc_flight/src/request.rs | 224 +- service_grpc_testing/Cargo.toml | 3 + sharder/Cargo.toml | 3 + sharder/benches/sharder.rs | 2 +- sqlx-hotswap-pool/Cargo.toml | 7 +- test_fixtures/README.md | 26 + test_fixtures/parquet/influxql_log_1.parquet | Bin 0 -> 309561 bytes test_fixtures/parquet/influxql_log_2.parquet | Bin 0 -> 320350 bytes test_fixtures/parquet/influxql_log_3.parquet | Bin 0 -> 449494 bytes test_fixtures/parquet/sql_query_log_1.parquet | Bin 0 -> 227212 bytes test_fixtures/parquet/sql_query_log_2.parquet | Bin 0 -> 72243 bytes test_fixtures/parquet/sql_query_log_3.parquet | Bin 0 -> 60961 bytes test_fixtures/wal/9.dat | Bin 0 -> 467 bytes test_helpers/Cargo.toml | 11 +- test_helpers_end_to_end/Cargo.toml | 29 +- test_helpers_end_to_end/src/addrs.rs | 114 +- test_helpers_end_to_end/src/client.rs | 120 +- test_helpers_end_to_end/src/config.rs | 111 + test_helpers_end_to_end/src/data_generator.rs | 1 + test_helpers_end_to_end/src/database.rs | 2 +- test_helpers_end_to_end/src/grpc.rs | 8 +- .../src/http_reverse_proxy.rs | 160 + test_helpers_end_to_end/src/lib.rs | 3 + test_helpers_end_to_end/src/mini_cluster.rs | 183 +- test_helpers_end_to_end/src/server_fixture.rs | 369 +- test_helpers_end_to_end/src/server_type.rs | 76 +- test_helpers_end_to_end/src/service_link.rs | 99 + .../src/snapshot_comparison.rs | 153 +- .../src/snapshot_comparison/queries.rs | 80 +- test_helpers_end_to_end/src/steps.rs | 185 +- test_helpers_end_to_end/src/udp_listener.rs | 3 +- tokio_metrics_bridge/Cargo.toml | 5 +- tokio_watchdog/Cargo.toml | 18 + tokio_watchdog/src/lib.rs | 231 + tower_trailer/Cargo.toml | 21 + tower_trailer/src/lib.rs | 194 + trace/Cargo.toml | 3 + trace/src/lib.rs | 2 +- trace/src/span.rs | 57 +- trace_exporters/Cargo.toml | 9 +- trace_exporters/src/jaeger.rs | 97 +- trace_exporters/src/jaeger/span.rs | 24 +- trace_exporters/src/lib.rs | 20 +- trace_http/Cargo.toml | 8 +- trace_http/src/classify.rs | 62 +- trace_http/src/lib.rs | 2 +- trace_http/src/metrics.rs | 160 +- trace_http/src/tower.rs | 94 +- tracker/Cargo.toml | 15 +- tracker/src/async_semaphore.rs | 40 +- tracker/src/disk_metric.rs | 21 +- tracker/src/lock.rs | 123 +- tracker/src/task.rs | 2 +- tracker/src/task/history.rs | 8 +- trogging/Cargo.toml | 7 +- wal/Cargo.toml | 13 +- wal/src/blocking/reader.rs | 60 +- wal/src/lib.rs | 39 +- wal/tests/end_to_end.rs | 14 +- wal_inspect/Cargo.toml | 7 +- workspace-hack/Cargo.toml | 59 +- 476 files changed, 52639 insertions(+), 11570 deletions(-) create mode 100644 catalog_cache/Cargo.toml create mode 100644 catalog_cache/src/api/client.rs create mode 100644 catalog_cache/src/api/list.rs create mode 100644 catalog_cache/src/api/mod.rs create mode 100644 catalog_cache/src/api/quorum.rs create mode 100644 catalog_cache/src/api/server.rs create mode 100644 catalog_cache/src/lib.rs create mode 100644 catalog_cache/src/local/limit.rs create mode 100644 catalog_cache/src/local/mod.rs create mode 100644 clap_blocks/src/bulk_ingest.rs create mode 100644 clap_blocks/src/catalog_cache.rs create mode 100644 clap_blocks/src/parquet_cache.rs create mode 100644 data_types/src/snapshot/hash.rs create mode 100644 data_types/src/snapshot/list.rs create mode 100644 data_types/src/snapshot/mask.rs create mode 100644 data_types/src/snapshot/mod.rs create mode 100644 data_types/src/snapshot/partition.rs create mode 100644 data_types/src/snapshot/table.rs create mode 100644 generated_types/protos/influxdata/iox/bulk_ingest/v1/service.proto create mode 
100644 generated_types/protos/influxdata/iox/catalog/v2/service.proto create mode 100644 generated_types/protos/influxdata/iox/catalog_cache/v1/value.proto create mode 100644 generated_types/protos/influxdata/iox/column_type/v1/type.proto create mode 100644 generated_types/protos/influxdata/iox/gossip/v1/sort_keys.proto create mode 100644 generated_types/protos/influxdata/iox/querier/v1/query_log.proto create mode 100644 generated_types/protos/influxdata/iox/skipped_compaction/v1/skipped_compaction.proto create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit__test__explain_statement-2.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit__test__explain_statement-3.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit__test__explain_statement-4.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit__test__explain_statement-5.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit__test__explain_statement-6.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit__test__explain_statement-7.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit__test__explain_statement-8.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit_mut__test__explain_statement-2.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit_mut__test__explain_statement-3.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit_mut__test__explain_statement-4.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit_mut__test__explain_statement-5.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit_mut__test__explain_statement-6.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit_mut__test__explain_statement-7.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit_mut__test__explain_statement-8.snap create mode 100644 influxdb_iox_client/src/client/flight/query.rs create mode 100644 influxdb_iox_client/src/client/query_log.rs create mode 100644 iox_catalog/migrations/20230921180000_set_partition_sort_key_to_null.sql create mode 100644 iox_catalog/migrations/20231003120000_drop_sort_key.sql create mode 100644 iox_catalog/migrations/20231004120000_add_empty_sort_key.sql create mode 100644 iox_catalog/migrations/20231121120000_add_partition_generation.sql create mode 100644 iox_catalog/migrations/20231121150000_partition_id_trigger.sql create mode 100644 iox_catalog/migrations/20231123120000_partition_id_from_partition.sql create mode 100644 iox_catalog/migrations/20240111150000_add_table_generation.sql create mode 100644 iox_catalog/sqlite/migrations/20230824100746_complete_kafkaless_transition_sqlite.sql create mode 100644 iox_catalog/sqlite/migrations/20230921180000_set_partition_sort_key_to_null.sql create mode 100644 iox_catalog/sqlite/migrations/20231003120000_drop_sort_key.sql create mode 100644 iox_catalog/sqlite/migrations/20231004120000_add_empty_sort_key.sql create mode 100644 iox_catalog/sqlite/migrations/20231121120000_add_partition_generation.sql create mode 100644 iox_catalog/sqlite/migrations/20231123120000_partition_id_from_partition.sql create mode 100644 
iox_catalog/sqlite/migrations/20240111150000_add_table_generation.sql create mode 100644 iox_catalog/src/cache.rs create mode 100644 iox_catalog/src/constants.rs create mode 100644 iox_catalog/src/grpc/client.rs create mode 100644 iox_catalog/src/grpc/mod.rs create mode 100644 iox_catalog/src/grpc/serialization.rs create mode 100644 iox_catalog/src/grpc/server.rs create mode 100644 iox_catalog/src/interface_tests.rs create mode 100644 iox_catalog/src/test_helpers.rs create mode 100644 iox_catalog/src/util.rs create mode 100644 iox_query/src/exec/sleep.rs create mode 100644 iox_query/src/logical_optimizer/extract_sleep.rs create mode 100644 iox_query/src/physical_optimizer/sort/order_union_sorted_inputs.rs create mode 100644 iox_query/src/physical_optimizer/sort/util.rs create mode 100644 iox_query/src/physical_optimizer/tests.rs create mode 100644 iox_query/src/provider/progressive_eval.rs create mode 100644 iox_query/src/query_log.rs create mode 100644 iox_query_params/Cargo.toml create mode 100644 iox_query_params/src/lib.rs create mode 100644 iox_query_params/src/params.rs create mode 100644 kube_test/Cargo.toml create mode 100644 kube_test/src/call.rs create mode 100644 kube_test/src/error.rs create mode 100644 kube_test/src/handler.rs create mode 100644 kube_test/src/lib.rs create mode 100644 kube_test/src/object_map.rs create mode 100644 kube_test/src/request.rs create mode 100644 kube_test/src/resource_handler.rs create mode 100644 kube_test/src/service.rs create mode 100644 kube_test/src/status.rs create mode 100644 mutable_batch_lp/fuzz/.gitignore create mode 100644 mutable_batch_lp/fuzz/Cargo.lock create mode 100644 mutable_batch_lp/fuzz/Cargo.toml create mode 100644 mutable_batch_lp/fuzz/README.md create mode 100644 mutable_batch_lp/fuzz/fuzz_targets/lines_converter.rs create mode 100644 mutable_batch_tests/benches/statistics.rs create mode 100644 parquet_cache/Cargo.toml create mode 100644 parquet_cache/src/client.rs create mode 100644 parquet_cache/src/client/cache_connector.rs create mode 100644 parquet_cache/src/client/http.rs create mode 100644 parquet_cache/src/client/keyspace.rs create mode 100644 parquet_cache/src/client/mock.rs create mode 100644 parquet_cache/src/client/object_store.rs create mode 100644 parquet_cache/src/client/request.rs create mode 100644 parquet_cache/src/client/write_hints.rs create mode 100644 parquet_cache/src/controller.rs create mode 100644 parquet_cache/src/controller/error.rs create mode 100644 parquet_cache/src/controller/kube_util.rs create mode 100644 parquet_cache/src/controller/parquet_cache.rs create mode 100644 parquet_cache/src/controller/parquet_cache_controller.rs create mode 100644 parquet_cache/src/controller/parquet_cache_set.rs create mode 100644 parquet_cache/src/controller/parquet_cache_set_controller.rs create mode 100644 parquet_cache/src/controller/state_service.rs create mode 100644 parquet_cache/src/data_types.rs create mode 100644 parquet_cache/src/data_types/keyspace.rs create mode 100644 parquet_cache/src/data_types/objects.rs create mode 100644 parquet_cache/src/data_types/policy.rs create mode 100644 parquet_cache/src/data_types/state.rs create mode 100644 parquet_cache/src/data_types/write_hints.rs create mode 100644 parquet_cache/src/lib.rs create mode 100644 parquet_cache/src/server.rs create mode 100644 parquet_cache/src/server/cache.rs create mode 100644 parquet_cache/src/server/data.rs create mode 100644 parquet_cache/src/server/data/manager.rs create mode 100644 parquet_cache/src/server/data/reads.rs create 
mode 100644 parquet_cache/src/server/data/store.rs create mode 100644 parquet_cache/src/server/data/writes.rs create mode 100644 parquet_cache/src/server/error.rs create mode 100644 parquet_cache/src/server/keyspace.rs create mode 100644 parquet_cache/src/server/mock.rs create mode 100644 parquet_cache/src/server/precondition.rs create mode 100644 parquet_cache/src/server/response.rs create mode 100644 partition/Cargo.toml create mode 100644 partition/benches/partitioner.rs create mode 100644 partition/src/bucket.rs create mode 100644 partition/src/filter.rs create mode 100644 partition/src/lib.rs create mode 100644 partition/src/strftime.rs create mode 100644 partition/src/traits.rs create mode 100644 partition/src/traits/mutable_batch.rs create mode 100644 partition/src/traits/record_batch.rs create mode 100644 query_functions/src/sleep.rs create mode 100644 query_functions/src/to_timestamp.rs create mode 100644 service_grpc_flight/src/planner.rs create mode 100644 test_fixtures/README.md create mode 100644 test_fixtures/parquet/influxql_log_1.parquet create mode 100644 test_fixtures/parquet/influxql_log_2.parquet create mode 100644 test_fixtures/parquet/influxql_log_3.parquet create mode 100644 test_fixtures/parquet/sql_query_log_1.parquet create mode 100644 test_fixtures/parquet/sql_query_log_2.parquet create mode 100644 test_fixtures/parquet/sql_query_log_3.parquet create mode 100644 test_fixtures/wal/9.dat create mode 100644 test_helpers_end_to_end/src/http_reverse_proxy.rs create mode 100644 test_helpers_end_to_end/src/service_link.rs create mode 100644 tokio_watchdog/Cargo.toml create mode 100644 tokio_watchdog/src/lib.rs create mode 100644 tower_trailer/Cargo.toml create mode 100644 tower_trailer/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index ebc077b1352..71d353372bb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -142,6 +142,12 @@ version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca" +[[package]] +name = "arc-swap" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bddcadddf5e9015d310179a59bb28c4d4b9920ad0f11e8e14dbadf654890c9a6" + [[package]] name = "arrayref" version = "0.3.7" @@ -156,8 +162,9 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" [[package]] name = "arrow" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5bc25126d18a012146a888a0298f2c22e1150327bd2765fc76d710a556b2d614" dependencies = [ "ahash", "arrow-arith", @@ -177,22 +184,24 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34ccd45e217ffa6e53bbb0080990e77113bdd4e91ddb84e97b77649810bcf1a7" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", - "half 2.3.1", + "half", "num", ] [[package]] name = "arrow-array" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bda9acea48b25123c08340f3a8ac361aa0f74469bb36f5ee9acf923fce23e9d" dependencies = [ "ahash", "arrow-buffer", @@ -200,42 +209,46 @@ dependencies = [ "arrow-schema", "chrono", "chrono-tz", - "half 2.3.1", + "half", "hashbrown 0.14.3", "num", ] [[package]] name = "arrow-buffer" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01a0fc21915b00fc6c2667b069c1b64bdd920982f426079bc4a7cab86822886c" dependencies = [ "bytes", - "half 2.3.1", + "half", "num", ] [[package]] name = "arrow-cast" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dc0368ed618d509636c1e3cc20db1281148190a78f43519487b2daf07b63b4a" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", + "base64", "chrono", "comfy-table", - "half 2.3.1", + "half", "lexical-core", "num", ] [[package]] name = "arrow-csv" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e09aa6246a1d6459b3f14baeaa49606cfdbca34435c46320e14054d244987ca" dependencies = [ "arrow-array", "arrow-buffer", @@ -252,19 +265,21 @@ dependencies = [ [[package]] name = "arrow-data" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "907fafe280a3874474678c1858b9ca4cb7fd83fb8034ff5b6d6376205a08c634" dependencies = [ "arrow-buffer", "arrow-schema", - "half 2.3.1", + "half", "num", ] [[package]] name = "arrow-flight" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "624e0dcb6b5a7a06222bfd2be3f7e905ce849a6b714ec989f18cdba330c77d38" dependencies = [ "arrow-arith", "arrow-array", @@ -277,20 +292,21 @@ dependencies = [ "arrow-schema", "arrow-select", "arrow-string", - "base64 0.21.7", + "base64", "bytes", "futures", "once_cell", "paste", - "prost", + "prost 0.12.3", "tokio", - "tonic", + "tonic 0.10.2", ] [[package]] name = "arrow-ipc" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79a43d6808411886b8c7d4f6f7dd477029c1e77ffffffb7923555cc6579639cd" dependencies = [ "arrow-array", "arrow-buffer", @@ -298,12 +314,14 @@ dependencies = [ "arrow-data", "arrow-schema", "flatbuffers", + "lz4_flex", ] [[package]] name = "arrow-json" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d82565c91fd627922ebfe2810ee4e8346841b6f9361b87505a9acea38b614fee" dependencies = [ "arrow-array", "arrow-buffer", @@ -311,7 +329,7 @@ dependencies = [ "arrow-data", "arrow-schema", "chrono", - "half 2.3.1", + "half", "indexmap 2.1.0", "lexical-core", "num", @@ -321,42 +339,47 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b23b0e53c0db57c6749997fd343d4c0354c994be7eca67152dd2bdb9a3e1bb4" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", - "half 2.3.1", + "half", "num", ] [[package]] name = "arrow-row" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "361249898d2d6d4a6eeb7484be6ac74977e48da12a4dd81a708d620cc558117a" dependencies = [ "ahash", "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", - "half 2.3.1", + "half", "hashbrown 0.14.3", ] [[package]] name = "arrow-schema" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09e28a5e781bf1b0f981333684ad13f5901f4cd2f20589eab7cf1797da8fc167" [[package]] name = "arrow-select" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f6208466590960efc1d2a7172bc4ff18a67d6e25c529381d7f96ddaf0dc4036" dependencies = [ + "ahash", "arrow-array", "arrow-buffer", "arrow-data", @@ -366,8 +389,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a48149c63c11c9ff571e50ab8f017d2a7cb71037a882b42f6354ed2da9acc7" dependencies = [ "arrow-array", "arrow-buffer", @@ -376,7 +400,7 @@ dependencies = [ "arrow-select", "num", "regex", - "regex-syntax 0.7.5", + "regex-syntax 0.8.2", ] [[package]] @@ -391,9 +415,10 @@ dependencies = [ "hashbrown 0.14.3", "num-traits", "once_cell", + "proptest", "rand", "regex", - "snafu", + "snafu 0.8.0", "uuid", "workspace-hack", ] @@ -429,6 +454,19 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b34d609dfbaf33d6889b2b7106d3ca345eacad44200913df5ba02bfd31d2ba9" +[[package]] +name = "async-channel" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ca33f4bc4ed1babef42cad36cc1f51fa88be00420404e5b1e80ab1b18f7678c" +dependencies = [ + "concurrent-queue", + "event-listener 4.0.3", + "event-listener-strategy", + "futures-core", + "pin-project-lite", +] + [[package]] name = "async-compression" version = "0.4.6" @@ -443,8 +481,17 @@ dependencies = [ "pin-project-lite", "tokio", "xz2", - "zstd 0.13.0", - 
"zstd-safe 7.0.0", + "zstd", + "zstd-safe", +] + +[[package]] +name = "async-lock" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "287272293e9d8c41773cec55e365490fe034813a2f172f502d6ddcf75b2f582b" +dependencies = [ + "event-listener 2.5.3", ] [[package]] @@ -505,8 +552,8 @@ version = "0.1.0" dependencies = [ "assert_matches", "async-trait", - "backoff", - "base64 0.21.7", + "backoff 0.1.0", + "base64", "generated_types", "http", "iox_time", @@ -514,10 +561,10 @@ dependencies = [ "observability_deps", "parking_lot 0.12.1", "paste", - "snafu", + "snafu 0.8.0", "test_helpers_end_to_end", "tokio", - "tonic", + "tonic 0.10.2", "workspace-hack", ] @@ -578,11 +625,22 @@ version = "0.1.0" dependencies = [ "observability_deps", "rand", - "snafu", + "snafu 0.8.0", "tokio", "workspace-hack", ] +[[package]] +name = "backoff" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b62ddb9cb1ec0a098ad4bbf9344d0713fa193ae1a80af55febcff2627b6a00c1" +dependencies = [ + "getrandom", + "instant", + "rand", +] + [[package]] name = "backtrace" version = "0.3.69" @@ -598,12 +656,6 @@ dependencies = [ "rustc-demangle", ] -[[package]] -name = "base64" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" - [[package]] name = "base64" version = "0.21.7" @@ -690,7 +742,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc" dependencies = [ "memchr", - "regex-automata 0.4.3", + "regex-automata 0.4.5", "serde", ] @@ -700,11 +752,17 @@ version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" +[[package]] +name = "bytecount" +version = "0.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1e5f035d16fc623ae5f74981db80a439803888314e3a555fd6f04acd51a3205" + [[package]] name = "bytemuck" -version = "1.14.0" +version = "1.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "374d28ec25809ee0e23827c2ab573d729e293f281dfe393500e7ad618baa61c6" +checksum = "ed2490600f404f2b94c167e31d3ed1d5f3c225a0f3b80230053b3e0b7b962bd9" [[package]] name = "byteorder" @@ -744,7 +802,7 @@ name = "cache_system" version = "0.1.0" dependencies = [ "async-trait", - "backoff", + "backoff 0.1.0", "criterion", "futures", "iox_time", @@ -759,15 +817,63 @@ dependencies = [ "tokio", "tokio-util", "trace", + "tracker", "workspace-hack", ] +[[package]] +name = "camino" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c" +dependencies = [ + "serde", +] + +[[package]] +name = "cargo-platform" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ceed8ef69d8518a5dda55c07425450b58a4e1946f4951eab6d7191ee86c2443d" +dependencies = [ + "serde", +] + +[[package]] +name = "cargo_metadata" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4acbb09d9ee8e23699b9634375c72795d095bf268439da88562cf9b501f181fa" +dependencies = [ + "camino", + "cargo-platform", + "semver", + "serde", + "serde_json", +] + [[package]] name = "cast" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum 
= "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" +[[package]] +name = "catalog_cache" +version = "0.1.0" +dependencies = [ + "bytes", + "dashmap", + "futures", + "hyper", + "reqwest", + "snafu 0.8.0", + "tokio", + "tokio-util", + "url", + "workspace-hack", +] + [[package]] name = "cc" version = "1.0.83" @@ -786,9 +892,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.31" +version = "0.4.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38" +checksum = "9f13690e35a5e4ace198e7beea2895d29f3a9cc55015fcebe6336bd2010af9eb" dependencies = [ "android-tzdata", "iana-time-zone", @@ -796,7 +902,7 @@ dependencies = [ "num-traits", "serde", "wasm-bindgen", - "windows-targets 0.48.5", + "windows-targets 0.52.0", ] [[package]] @@ -823,9 +929,9 @@ dependencies = [ [[package]] name = "ciborium" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "effd91f6c78e5a4ace8a5d3c0b6bfaec9e2baaef55f3efc00e45fb2e477ee926" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" dependencies = [ "ciborium-io", "ciborium-ll", @@ -834,18 +940,18 @@ dependencies = [ [[package]] name = "ciborium-io" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdf919175532b369853f5d5e20b26b43112613fd6fe7aee757e35f7a44642656" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" [[package]] name = "ciborium-ll" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" dependencies = [ "ciborium-io", - "half 1.8.2", + "half", ] [[package]] @@ -863,19 +969,25 @@ name = "clap_blocks" version = "0.1.0" dependencies = [ "clap", + "ed25519-dalek", "futures", "http", "humantime", "iox_catalog", + "iox_time", + "itertools 0.12.0", "metric", + "non-empty-string", "object_store", "observability_deps", - "snafu", + "parquet_cache", + "snafu 0.8.0", "sysinfo", "tempfile", "test_helpers", "trace_exporters", "trogging", + "url", "uuid", "workspace-hack", ] @@ -919,7 +1031,7 @@ dependencies = [ "reqwest", "thiserror", "tokio", - "tonic", + "tonic 0.10.2", "tower", "workspace-hack", ] @@ -941,6 +1053,15 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "concurrent-queue" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d16048cd947b08fa32c24458a22f5dc5e835264f689f4f5653210c69fd107363" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "console" version = "0.15.8" @@ -959,9 +1080,9 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2895653b4d9f1538a83970077cb01dfc77a4810524e51a110944688e916b18e" dependencies = [ - "prost", - "prost-types", - "tonic", + "prost 0.11.9", + "prost-types 0.11.9", + "tonic 0.9.2", "tracing-core", ] @@ -978,13 +1099,13 @@ dependencies = [ "hdrhistogram", "humantime", "parking_lot 0.12.1", - "prost-types", + "prost-types 0.11.9", "serde", "serde_json", "thread_local", "tokio", "tokio-stream", - "tonic", + "tonic 0.9.2", "tracing", "tracing-core", "tracing-subscriber", @@ -1216,6 +1337,69 @@ dependencies = [ "memchr", ] +[[package]] +name = 
"curve25519-dalek" +version = "4.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89b8c6a2e4b1f45971ad09761aafb85514a84744b67a95e32c3cc1352d1f65c" +dependencies = [ + "cfg-if", + "cpufeatures", + "curve25519-dalek-derive", + "digest", + "fiat-crypto", + "platforms", + "rustc_version", + "subtle", + "zeroize", +] + +[[package]] +name = "curve25519-dalek-derive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] + +[[package]] +name = "darling" +version = "0.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0209d94da627ab5605dcccf08bb18afa5009cfbef48d8a8b7d7bdbc79be25c5e" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "177e3443818124b357d8e76f53be906d60937f0d3a90773a664fa63fa253e621" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.48", +] + +[[package]] +name = "darling_macro" +version = "0.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "836a9bbc7ad63342d6d6e7b815ccab164bc77a2d95d84bc3117a8c0d5c98e2d5" +dependencies = [ + "darling_core", + "quote", + "syn 2.0.48", +] + [[package]] name = "dashmap" version = "5.5.3" @@ -1233,22 +1417,28 @@ dependencies = [ name = "data_types" version = "0.1.0" dependencies = [ + "arrow-buffer", "assert_matches", + "bytes", "chrono", "croaring", "generated_types", "hex", "influxdb-line-protocol", "iox_time", + "murmur3", "observability_deps", "once_cell", - "ordered-float 3.9.2", + "ordered-float 4.2.0", "paste", "percent-encoding", "proptest", + "prost 0.12.3", "schema", - "serde", + "serde_json", "sha2", + "siphasher 1.0.0", + "snafu 0.8.0", "sqlx", "test_helpers", "thiserror", @@ -1258,12 +1448,13 @@ dependencies = [ [[package]] name = "datafusion" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=81f33b0e27f5694348cd953a937203d835b57178#81f33b0e27f5694348cd953a937203d835b57178" +version = "34.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" dependencies = [ "ahash", "arrow", "arrow-array", + "arrow-ipc", "arrow-schema", "async-compression", "async-trait", @@ -1281,16 +1472,15 @@ dependencies = [ "flate2", "futures", "glob", - "half 2.3.1", + "half", "hashbrown 0.14.3", "indexmap 2.1.0", - "itertools 0.11.0", + "itertools 0.12.0", "log", "num_cpus", "object_store", "parking_lot 0.12.1", "parquet", - "percent-encoding", "pin-project-lite", "rand", "sqlparser", @@ -1300,36 +1490,32 @@ dependencies = [ "url", "uuid", "xz2", - "zstd 0.12.4", + "zstd", ] [[package]] name = "datafusion-common" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=81f33b0e27f5694348cd953a937203d835b57178#81f33b0e27f5694348cd953a937203d835b57178" +version = "34.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" dependencies = [ + "ahash", "arrow", "arrow-array", - "async-compression", - "bytes", - "bzip2", + "arrow-buffer", + "arrow-schema", "chrono", - "flate2", - "futures", + "half", + "libc", "num_cpus", "object_store", "parquet", 
"sqlparser", - "tokio", - "tokio-util", - "xz2", - "zstd 0.12.4", ] [[package]] name = "datafusion-execution" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=81f33b0e27f5694348cd953a937203d835b57178#81f33b0e27f5694348cd953a937203d835b57178" +version = "34.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" dependencies = [ "arrow", "chrono", @@ -1348,12 +1534,14 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=81f33b0e27f5694348cd953a937203d835b57178#81f33b0e27f5694348cd953a937203d835b57178" +version = "34.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" dependencies = [ "ahash", "arrow", + "arrow-array", "datafusion-common", + "paste", "sqlparser", "strum", "strum_macros", @@ -1361,8 +1549,8 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=81f33b0e27f5694348cd953a937203d835b57178#81f33b0e27f5694348cd953a937203d835b57178" +version = "34.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" dependencies = [ "arrow", "async-trait", @@ -1371,33 +1559,33 @@ dependencies = [ "datafusion-expr", "datafusion-physical-expr", "hashbrown 0.14.3", - "itertools 0.11.0", + "itertools 0.12.0", "log", - "regex-syntax 0.7.5", + "regex-syntax 0.8.2", ] [[package]] name = "datafusion-physical-expr" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=81f33b0e27f5694348cd953a937203d835b57178#81f33b0e27f5694348cd953a937203d835b57178" +version = "34.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" dependencies = [ "ahash", "arrow", "arrow-array", "arrow-buffer", + "arrow-ord", "arrow-schema", - "base64 0.21.7", + "base64", "blake2", "blake3", "chrono", "datafusion-common", "datafusion-expr", - "half 2.3.1", + "half", "hashbrown 0.14.3", "hex", "indexmap 2.1.0", - "itertools 0.11.0", - "libc", + "itertools 0.12.0", "log", "md-5", "paste", @@ -1411,8 +1599,8 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=81f33b0e27f5694348cd953a937203d835b57178#81f33b0e27f5694348cd953a937203d835b57178" +version = "34.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" dependencies = [ "ahash", "arrow", @@ -1426,26 +1614,23 @@ dependencies = [ "datafusion-expr", "datafusion-physical-expr", "futures", - "half 2.3.1", + "half", "hashbrown 0.14.3", "indexmap 2.1.0", - "itertools 0.11.0", + "itertools 0.12.0", "log", "once_cell", "parking_lot 0.12.1", "pin-project-lite", "rand", - "rstest", - "tempfile", - "termtree", "tokio", "uuid", ] [[package]] name = "datafusion-proto" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=81f33b0e27f5694348cd953a937203d835b57178#81f33b0e27f5694348cd953a937203d835b57178" +version = "34.0.0" +source = 
"git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" dependencies = [ "arrow", "chrono", @@ -1453,13 +1638,13 @@ dependencies = [ "datafusion-common", "datafusion-expr", "object_store", - "prost", + "prost 0.12.3", ] [[package]] name = "datafusion-sql" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=81f33b0e27f5694348cd953a937203d835b57178#81f33b0e27f5694348cd953a937203d835b57178" +version = "34.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" dependencies = [ "arrow", "arrow-schema", @@ -1495,6 +1680,17 @@ dependencies = [ "uuid", ] +[[package]] +name = "delegate" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "082a24a9967533dc5d743c602157637116fc1b52806d694a5a45e6f32567fcdd" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "der" version = "0.7.8" @@ -1506,6 +1702,17 @@ dependencies = [ "zeroize", ] +[[package]] +name = "derivative" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "diff" version = "0.1.13" @@ -1556,6 +1763,36 @@ version = "0.15.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" +[[package]] +name = "dyn-clone" +version = "1.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "545b22097d44f8a9581187cdf93de7a71e4722bf51200cfaba810865b49a495d" + +[[package]] +name = "ed25519" +version = "2.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53" +dependencies = [ + "pkcs8", + "signature", +] + +[[package]] +name = "ed25519-dalek" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f628eaec48bfd21b865dc2950cfa014450c01d2fa2b69a86c2fd5844ec523c0" +dependencies = [ + "curve25519-dalek", + "ed25519", + "serde", + "sha2", + "subtle", + "zeroize", +] + [[package]] name = "either" version = "1.9.0" @@ -1596,6 +1833,15 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "error-chain" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d2f06b9cac1506ece98fe3231e3cc9c4410ec3d5b1f24ae1c8946f0742cdefc" +dependencies = [ + "version_check", +] + [[package]] name = "etcetera" version = "0.8.0" @@ -1613,6 +1859,27 @@ version = "2.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" +[[package]] +name = "event-listener" +version = "4.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b215c49b2b248c855fb73579eb1f4f26c38ffdc12973e20e07b91d78d5646e" +dependencies = [ + "concurrent-queue", + "parking", + "pin-project-lite", +] + +[[package]] +name = "event-listener-strategy" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "958e4d70b6d5e81971bebec42271ec641e7ff4e170a6fa605f2b8a8b65cb97d3" +dependencies = [ + "event-listener 4.0.3", + "pin-project-lite", +] + [[package]] name = "executor" 
version = "0.1.0" @@ -1624,10 +1891,11 @@ dependencies = [ "once_cell", "parking_lot 0.12.1", "pin-project", - "snafu", + "snafu 0.8.0", "tokio", "tokio-util", "tokio_metrics_bridge", + "tokio_watchdog", "workspace-hack", ] @@ -1637,6 +1905,24 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" +[[package]] +name = "fiat-crypto" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27573eac26f4dd11e2b1916c3fe1baa56407c83c71a773a8ba17ec0bca03b6b7" + +[[package]] +name = "filetime" +version = "0.2.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ee447700ac8aa0b2f2bd7bc4462ad686ba06baa6727ac149a2d6277f0d240fd" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall 0.4.1", + "windows-sys 0.52.0", +] + [[package]] name = "findshlibs" version = "0.10.2" @@ -1693,8 +1979,8 @@ dependencies = [ "iox_query", "observability_deps", "once_cell", - "prost", - "snafu", + "prost 0.12.3", + "snafu 0.8.0", "workspace-hack", ] @@ -1725,8 +2011,17 @@ dependencies = [ ] [[package]] -name = "futures" -version = "0.3.30" +name = "fsevent-sys" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76ee7a02da4d231650c7cea31349b889be2f45ddb3ef3032d2ec8185f6313fd2" +dependencies = [ + "libc", +] + +[[package]] +name = "futures" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" dependencies = [ @@ -1806,12 +2101,6 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" -[[package]] -name = "futures-timer" -version = "3.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" - [[package]] name = "futures-util" version = "0.3.30" @@ -1836,14 +2125,15 @@ version = "0.1.0" dependencies = [ "bytes", "observability_deps", - "pbjson 0.6.0", + "pbjson", "pbjson-build", "pbjson-types", - "prost", + "prost 0.12.3", "prost-build", "serde", - "tonic", + "tonic 0.10.2", "tonic-build", + "uuid", "workspace-hack", ] @@ -1885,7 +2175,7 @@ name = "grpc-binary-logger" version = "0.1.0" dependencies = [ "assert_matches", - "base64 0.21.7", + "base64", "byteorder", "bytes", "futures", @@ -1895,11 +2185,11 @@ dependencies = [ "http-body", "hyper", "pin-project", - "prost", + "prost 0.12.3", "prost-build", "tokio", "tokio-stream", - "tonic", + "tonic 0.10.2", "tonic-build", "tower", "workspace-hack", @@ -1909,10 +2199,10 @@ dependencies = [ name = "grpc-binary-logger-proto" version = "0.1.0" dependencies = [ - "prost", + "prost 0.12.3", "prost-build", - "prost-types", - "tonic", + "prost-types 0.12.3", + "tonic 0.10.2", "tonic-build", "workspace-hack", ] @@ -1921,9 +2211,9 @@ dependencies = [ name = "grpc-binary-logger-test-proto" version = "0.1.0" dependencies = [ - "prost", + "prost 0.12.3", "prost-build", - "tonic", + "tonic 0.10.2", "tonic-build", "workspace-hack", ] @@ -1947,12 +2237,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "half" -version = "1.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" - [[package]] name = "half" version = "2.3.1" @@ -1966,9 +2250,9 @@ dependencies = [ 
[[package]] name = "handlebars" -version = "4.5.0" +version = "5.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "faa67bab9ff362228eb3d00bd024a4965d8231bbb7921167f0cfa66c6626b225" +checksum = "c73166c591e67fb4bf9bc04011b4e35f12e89fe8d676193aa263df065955a379" dependencies = [ "log", "pest", @@ -2009,7 +2293,7 @@ version = "7.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d" dependencies = [ - "base64 0.21.7", + "base64", "byteorder", "flate2", "nom", @@ -2019,7 +2303,7 @@ dependencies = [ [[package]] name = "heappy" version = "0.1.0" -source = "git+https://github.com/mkmik/heappy?rev=1de977a241cdd768acc5b6c82c0728b30c7db7b4#1de977a241cdd768acc5b6c82c0728b30c7db7b4" +source = "git+https://github.com/mkmik/heappy?rev=01a1f88e1b404c5894f89eb1a57f813f713d7ad1#01a1f88e1b404c5894f89eb1a57f813f713d7ad1" dependencies = [ "backtrace", "bytes", @@ -2158,7 +2442,9 @@ dependencies = [ "futures-util", "http", "hyper", + "log", "rustls", + "rustls-native-certs", "tokio", "tokio-rustls", ] @@ -2198,6 +2484,12 @@ dependencies = [ "cc", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "0.5.0" @@ -2269,13 +2561,13 @@ dependencies = [ [[package]] name = "influxdb-line-protocol" -version = "2.0.0" +version = "1.0.0" dependencies = [ "bytes", "log", "nom", "smallvec", - "snafu", + "snafu 0.8.0", "test_helpers", ] @@ -2291,7 +2583,7 @@ dependencies = [ "reqwest", "serde", "serde_json", - "snafu", + "snafu 0.8.0", "test_helpers", "tokio", "url", @@ -2339,6 +2631,8 @@ name = "influxdb3_server" version = "0.1.0" dependencies = [ "arrow", + "arrow-json", + "arrow-schema", "async-trait", "authz", "bytes", @@ -2372,7 +2666,7 @@ dependencies = [ "thiserror", "tokio", "tokio-util", - "tonic", + "tonic 0.10.2", "tower", "trace", "trace_exporters", @@ -2443,7 +2737,8 @@ dependencies = [ "generated_types", "influxdb-line-protocol", "insta", - "prost", + "iox_query_params", + "prost 0.12.3", "rand", "reqwest", "schema", @@ -2451,7 +2746,7 @@ dependencies = [ "thiserror", "tokio", "tokio-stream", - "tonic", + "tonic 0.10.2", ] [[package]] @@ -2462,8 +2757,8 @@ dependencies = [ "futures-util", "generated_types", "observability_deps", - "prost", - "tonic", + "prost 0.12.3", + "tonic 0.10.2", "workspace-hack", ] @@ -2476,7 +2771,7 @@ dependencies = [ "integer-encoding 4.0.0", "observability_deps", "rand", - "snafu", + "snafu 0.7.5", "snap", "test_helpers", "workspace-hack", @@ -2487,7 +2782,7 @@ name = "influxrpc_parser" version = "0.1.0" dependencies = [ "generated_types", - "snafu", + "snafu 0.8.0", "sqlparser", "workspace-hack", ] @@ -2497,25 +2792,45 @@ name = "ingester_query_grpc" version = "0.1.0" dependencies = [ "arrow", - "base64 0.21.7", + "base64", "bytes", "data_types", "datafusion", "datafusion-proto", "flatbuffers", - "pbjson 0.6.0", + "pbjson", "pbjson-build", "predicate", - "prost", + "prost 0.12.3", "prost-build", "query_functions", "serde", - "snafu", - "tonic", + "snafu 0.8.0", + "tonic 0.10.2", "tonic-build", "workspace-hack", ] +[[package]] +name = "inotify" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8069d3ec154eb856955c1c0fbffefbf5f3c40a104ec912d4797314c1801abff" +dependencies = [ + "bitflags 1.3.2", + "inotify-sys", + "libc", +] + 
+[[package]] +name = "inotify-sys" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e05c02b5e89bff3b946cedeca278abc628fe811e604f027c45a8aa3cf793d0eb" +dependencies = [ + "libc", +] + [[package]] name = "insta" version = "1.34.0" @@ -2557,6 +2872,8 @@ version = "0.1.0" dependencies = [ "assert_matches", "async-trait", + "backoff 0.1.0", + "catalog_cache", "data_types", "dotenvy", "futures", @@ -2575,13 +2892,15 @@ dependencies = [ "rand", "serde", "siphasher 1.0.0", - "snafu", + "snafu 0.8.0", "sqlx", "sqlx-hotswap-pool", "tempfile", "test_helpers", "thiserror", "tokio", + "tonic 0.10.2", + "trace_http", "uuid", "workspace-hack", ] @@ -2599,7 +2918,7 @@ dependencies = [ "handlebars", "humantime", "influxdb2_client", - "itertools 0.11.0", + "itertools 0.12.0", "mutable_batch", "mutable_batch_lp", "parquet_file", @@ -2608,7 +2927,7 @@ dependencies = [ "schema", "serde", "serde_json", - "snafu", + "snafu 0.8.0", "test_helpers", "tokio", "toml", @@ -2634,7 +2953,8 @@ dependencies = [ "hashbrown 0.14.3", "indexmap 2.1.0", "insta", - "itertools 0.11.0", + "iox_time", + "itertools 0.12.0", "metric", "object_store", "observability_deps", @@ -2645,11 +2965,13 @@ dependencies = [ "query_functions", "schema", "serde", - "snafu", + "snafu 0.8.0", "test_helpers", "tokio", "tokio-stream", "trace", + "tracker", + "uuid", "workspace-hack", ] @@ -2667,7 +2989,7 @@ dependencies = [ "influxdb_influxql_parser", "insta", "iox_query", - "itertools 0.11.0", + "itertools 0.12.0", "observability_deps", "once_cell", "predicate", @@ -2697,12 +3019,26 @@ dependencies = [ "predicate", "query_functions", "schema", - "snafu", + "snafu 0.8.0", "test_helpers", "tokio", "workspace-hack", ] +[[package]] +name = "iox_query_params" +version = "0.1.0" +dependencies = [ + "assert_matches", + "datafusion", + "generated_types", + "observability_deps", + "serde", + "serde_json", + "thiserror", + "workspace-hack", +] + [[package]] name = "iox_tests" version = "0.1.0" @@ -2762,15 +3098,16 @@ dependencies = [ "serde_json", "serde_urlencoded", "service_grpc_testing", - "snafu", + "snafu 0.8.0", "tokio", "tokio-stream", "tokio-util", - "tonic", + "tonic 0.10.2", "tonic-health", "tonic-reflection", "tower", "tower-http", + "tower_trailer", "trace", "trace_exporters", "trace_http", @@ -2786,7 +3123,7 @@ dependencies = [ "hyper", "ioxd_common", "metric", - "snafu", + "snafu 0.8.0", "tokio-util", "trace", "workspace-hack", @@ -2860,6 +3197,188 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "json-patch" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55ff1e1486799e3f64129f8ccad108b38290df9cd7015cd31bed17239f0789d6" +dependencies = [ + "serde", + "serde_json", + "thiserror", + "treediff", +] + +[[package]] +name = "jsonpath-rust" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06cc127b7c3d270be504572364f9569761a180b981919dd0d87693a7f5fb7829" +dependencies = [ + "pest", + "pest_derive", + "regex", + "serde_json", + "thiserror", +] + +[[package]] +name = "k8s-openapi" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edc3606fd16aca7989db2f84bb25684d0270c6d6fa1dbcd0025af7b4130523a6" +dependencies = [ + "base64", + "bytes", + "chrono", + "schemars", + "serde", + "serde-value", + "serde_json", +] + +[[package]] +name = "kqueue" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7447f1ca1b7b563588a205fe93dea8df60fd981423a768bc1c0ded35ed147d0c" +dependencies = [ + "kqueue-sys", + "libc", +] + +[[package]] +name = "kqueue-sys" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed9625ffda8729b85e45cf04090035ac368927b8cebc34898e7c120f52e4838b" +dependencies = [ + "bitflags 1.3.2", + "libc", +] + +[[package]] +name = "kube" +version = "0.87.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3499c8d60c763246c7a213f51caac1e9033f46026904cb89bc8951ae8601f26e" +dependencies = [ + "k8s-openapi", + "kube-client", + "kube-core", + "kube-derive", + "kube-runtime", +] + +[[package]] +name = "kube-client" +version = "0.87.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "033450dfa0762130565890dadf2f8835faedf749376ca13345bcd8ecd6b5f29f" +dependencies = [ + "base64", + "bytes", + "chrono", + "either", + "futures", + "home", + "http", + "http-body", + "hyper", + "hyper-rustls", + "hyper-timeout", + "jsonpath-rust", + "k8s-openapi", + "kube-core", + "pem", + "pin-project", + "rustls", + "rustls-pemfile", + "secrecy", + "serde", + "serde_json", + "serde_yaml", + "thiserror", + "tokio", + "tokio-util", + "tower", + "tower-http", + "tracing", +] + +[[package]] +name = "kube-core" +version = "0.87.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5bba93d054786eba7994d03ce522f368ef7d48c88a1826faa28478d85fb63ae" +dependencies = [ + "chrono", + "form_urlencoded", + "http", + "json-patch", + "k8s-openapi", + "once_cell", + "schemars", + "serde", + "serde_json", + "thiserror", +] + +[[package]] +name = "kube-derive" +version = "0.87.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e98dd5e5767c7b894c1f0e41fd628b145f808e981feb8b08ed66455d47f1a4" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "serde_json", + "syn 2.0.48", +] + +[[package]] +name = "kube-runtime" +version = "0.87.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d8893eb18fbf6bb6c80ef6ee7dd11ec32b1dc3c034c988ac1b3a84d46a230ae" +dependencies = [ + "ahash", + "async-trait", + "backoff 0.4.0", + "derivative", + "futures", + "hashbrown 0.14.3", + "json-patch", + "k8s-openapi", + "kube-client", + "parking_lot 0.12.1", + "pin-project", + "serde", + "serde_json", + "smallvec", + "thiserror", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "kube_test" +version = "0.1.0" +dependencies = [ + "http", + "hyper", + "k8s-openapi", + "kube-core", + "rand", + "serde", + "serde_json", + "serde_yaml", + "tower", + "workspace-hack", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -2997,23 +3516,12 @@ dependencies = [ ] [[package]] -name = "lz4" -version = "1.24.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e9e2dd86df36ce760a60f6ff6ad526f7ba1f14ba0356f8254fb6905e6494df1" -dependencies = [ - "libc", - "lz4-sys", -] - -[[package]] -name = "lz4-sys" -version = "1.9.4" +name = "lz4_flex" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57d27b317e207b10f69f5e75494119e391a96f48861ae870d1da6edac98ca900" +checksum = "912b45c753ff5f7f5208307e8ace7d2a2e30d024e26d3509f3dce546c044ce15" dependencies = [ - "cc", - "libc", + "twox-hash", ] [[package]] @@ -3060,9 +3568,9 @@ checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" [[package]] name = "memmap2" -version = "0.9.3" +version = "0.9.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "45fd3a57831bf88bc63f8cebc0cf956116276e97fef3966103e96416209f7c92" +checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322" dependencies = [ "libc", ] @@ -3114,6 +3622,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09" dependencies = [ "libc", + "log", "wasi", "windows-sys 0.48.0", ] @@ -3136,12 +3645,53 @@ dependencies = [ "tokio", ] +[[package]] +name = "moka" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad9dc9808102655926a6086abd0b9965ebefd4a39ef0d184f074c34ba5049ec6" +dependencies = [ + "async-lock", + "async-trait", + "crossbeam-channel", + "crossbeam-epoch", + "crossbeam-utils", + "futures-util", + "once_cell", + "parking_lot 0.12.1", + "quanta", + "rustc_version", + "skeptic", + "smallvec", + "tagptr", + "thiserror", + "triomphe", + "uuid", +] + +[[package]] +name = "mpchash" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdd8199faa645318222f8aeb383fca4216a3f75b144f1e264ac74c0835d871a9" +dependencies = [ + "num-traits", + "rand", + "xxhash-rust", +] + [[package]] name = "multimap" version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" +[[package]] +name = "murmur3" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9252111cf132ba0929b6f8e030cac2a24b507f3a4d6db6fb2896f27b354c714b" + [[package]] name = "mutable_batch" version = "0.1.0" @@ -3149,20 +3699,17 @@ dependencies = [ "arrow", "arrow_util", "assert_matches", - "chrono", "data_types", "hashbrown 0.14.3", "iox_time", - "itertools 0.11.0", + "itertools 0.12.0", "mutable_batch_lp", - "paste", - "percent-encoding", + "partition", + "pretty_assertions", "proptest", "rand", "schema", - "snafu", - "thiserror", - "unicode-segmentation", + "snafu 0.8.0", "workspace-hack", ] @@ -3175,9 +3722,11 @@ dependencies = [ "criterion", "hashbrown 0.14.3", "influxdb-line-protocol", + "itertools 0.12.0", "mutable_batch", "schema", - "snafu", + "snafu 0.8.0", + "test_helpers", "workspace-hack", ] @@ -3192,8 +3741,9 @@ dependencies = [ "hashbrown 0.14.3", "mutable_batch", "mutable_batch_lp", + "partition", "schema", - "snafu", + "snafu 0.8.0", "workspace-hack", ] @@ -3210,7 +3760,7 @@ dependencies = [ "mutable_batch", "mutable_batch_lp", "mutable_batch_pb", - "prost", + "prost 0.12.3", ] [[package]] @@ -3245,6 +3795,34 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "non-empty-string" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cf0f4060e345ae505219853da9ca1150564158a648a6aa6a528f0d5794bb33" +dependencies = [ + "delegate", +] + +[[package]] +name = "notify" +version = "6.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6205bd8bb1e454ad2e27422015fb5e4f2bcc7e08fa8f27058670d208324a4d2d" +dependencies = [ + "bitflags 2.4.2", + "crossbeam-channel", + "filetime", + "fsevent-sys", + "inotify", + "kqueue", + "libc", + "log", + "mio", + "walkdir", + "windows-sys 0.48.0", +] + [[package]] name = "ntapi" version = "0.4.1" @@ -3389,12 +3967,12 @@ dependencies = [ [[package]] name = "object_store" -version = "0.7.1" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"f930c88a43b1c3f6e776dfe495b4afab89882dbc81530c632db2ed65451ebcb4" +checksum = "2524735495ea1268be33d200e1ee97455096a0846295a21548cd2f3541de7050" dependencies = [ "async-trait", - "base64 0.21.7", + "base64", "bytes", "chrono", "futures", @@ -3403,14 +3981,14 @@ dependencies = [ "itertools 0.11.0", "parking_lot 0.12.1", "percent-encoding", - "quick-xml 0.30.0", + "quick-xml 0.31.0", "rand", "reqwest", - "ring 0.16.20", + "ring", "rustls-pemfile", "serde", "serde_json", - "snafu", + "snafu 0.7.5", "tokio", "tracing", "url", @@ -3428,7 +4006,7 @@ dependencies = [ "metric", "object_store", "pin-project", - "snafu", + "snafu 0.8.0", "tokio", "workspace-hack", ] @@ -3456,6 +4034,12 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + [[package]] name = "ordered-float" version = "2.10.1" @@ -3467,18 +4051,18 @@ dependencies = [ [[package]] name = "ordered-float" -version = "3.9.2" +version = "4.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc" +checksum = "a76df7075c7d4d01fdcb46c912dd17fba5b60c78ea480b475f2b6ab6f666584e" dependencies = [ "num-traits", ] [[package]] name = "ouroboros" -version = "0.18.2" +version = "0.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a50b637ffd883b2733a8483599fb6136b9dcedaa1850f7ac08b9b6f9f2061208" +checksum = "97b7be5a8a3462b752f4be3ff2b2bf2f7f1d00834902e46be2a4d68b87b0573c" dependencies = [ "aliasable", "ouroboros_macro", @@ -3487,9 +4071,9 @@ dependencies = [ [[package]] name = "ouroboros_macro" -version = "0.18.2" +version = "0.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3633d65683f13b9bcfaa3150880b018899fb0e5d0542f4adaea4f503fdb5eabf" +checksum = "b645dcde5f119c2c454a92d0dfa271a2a3b205da92e4292a68ead4bdbfde1f33" dependencies = [ "heck", "itertools 0.12.0", @@ -3515,6 +4099,12 @@ dependencies = [ "workspace-hack", ] +[[package]] +name = "parking" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb813b8af86854136c6922af0598d719255ecb2179515e6e7730d468f05c9cae" + [[package]] name = "parking_lot" version = "0.11.2" @@ -3565,8 +4155,9 @@ dependencies = [ [[package]] name = "parquet" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af88740a842787da39b3d69ce5fbf6fce97d20211d3b299fee0a0da6430c74d4" dependencies = [ "ahash", "arrow-array", @@ -3576,14 +4167,14 @@ dependencies = [ "arrow-ipc", "arrow-schema", "arrow-select", - "base64 0.21.7", + "base64", "brotli", "bytes", "chrono", "flate2", "futures", "hashbrown 0.14.3", - "lz4", + "lz4_flex", "num", "num-bigint", "object_store", @@ -3593,7 +4184,55 @@ dependencies = [ "thrift", "tokio", "twox-hash", - "zstd 0.12.4", + "zstd", +] + +[[package]] +name = "parquet_cache" +version = "0.1.0" +dependencies = [ + "ahash", + "arc-swap", + "assert_matches", + "async-channel", + "async-trait", + "backoff 0.1.0", + "bytes", + "chrono", + "data_types", + "fnv", + "futures", + "http", + "hyper", 
+ "iox_catalog", + "iox_tests", + "iox_time", + "k8s-openapi", + "kube", + "kube_test", + "lazy_static", + "moka", + "mpchash", + "notify", + "object_store", + "observability_deps", + "parking_lot 0.12.1", + "parquet_file", + "pin-project", + "rand", + "reqwest", + "schemars", + "serde", + "serde_json", + "tempfile", + "thiserror", + "tokio", + "tokio-stream", + "tokio-util", + "tower", + "url", + "uuid", + "workspace-hack", ] [[package]] @@ -3601,7 +4240,8 @@ name = "parquet_file" version = "0.1.0" dependencies = [ "arrow", - "base64 0.21.7", + "assert_matches", + "base64", "bytes", "data_types", "datafusion", @@ -3613,17 +4253,17 @@ dependencies = [ "observability_deps", "parquet", "pbjson-types", - "prost", + "prost 0.12.3", "rand", "schema", - "snafu", + "snafu 0.8.0", "test_helpers", "thiserror", "thrift", "tokio", "uuid", "workspace-hack", - "zstd 0.12.4", + "zstd", ] [[package]] @@ -3639,7 +4279,7 @@ dependencies = [ "object_store", "parquet_file", "schema", - "snafu", + "snafu 0.8.0", "tokio", "workspace-hack", ] @@ -3653,55 +4293,69 @@ dependencies = [ "regex", ] +[[package]] +name = "partition" +version = "0.1.0" +dependencies = [ + "arrow", + "assert_matches", + "chrono", + "criterion", + "data_types", + "generated_types", + "hashbrown 0.14.3", + "mutable_batch", + "mutable_batch_lp", + "paste", + "percent-encoding", + "proptest", + "rand", + "schema", + "test_helpers", + "thiserror", + "unicode-segmentation", + "workspace-hack", +] + [[package]] name = "paste" version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" -[[package]] -name = "pbjson" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "048f9ac93c1eab514f9470c4bc8d97ca2a0a236b84f45cc19d69a59fc11467f6" -dependencies = [ - "base64 0.13.1", - "serde", -] - [[package]] name = "pbjson" version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1030c719b0ec2a2d25a5df729d6cff1acf3cc230bf766f4f97833591f7577b90" dependencies = [ - "base64 0.21.7", + "base64", "serde", ] [[package]] name = "pbjson-build" -version = "0.5.1" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdbb7b706f2afc610f3853550cdbbf6372fd324824a087806bd4480ea4996e24" +checksum = "2580e33f2292d34be285c5bc3dba5259542b083cfad6037b6d70345f24dcb735" dependencies = [ "heck", - "itertools 0.10.5", - "prost", - "prost-types", + "itertools 0.11.0", + "prost 0.12.3", + "prost-types 0.12.3", ] [[package]] name = "pbjson-types" -version = "0.5.1" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a88c8d87f99a4ac14325e7a4c24af190fca261956e3b82dd7ed67e77e6c7043" +checksum = "18f596653ba4ac51bdecbb4ef6773bc7f56042dc13927910de1684ad3d32aa12" dependencies = [ "bytes", "chrono", - "pbjson 0.5.1", + "pbjson", "pbjson-build", - "prost", + "prost 0.12.3", "prost-build", "serde", ] @@ -3715,6 +4369,16 @@ dependencies = [ "fixedbitset", ] +[[package]] +name = "pem" +version = "3.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310" +dependencies = [ + "base64", + "serde", +] + [[package]] name = "pem-rfc7468" version = "0.7.0" @@ -3825,18 +4489,18 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.1.3" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "fda4ed1c6c173e3fc7a83629421152e01d7b1f9b7f65fb301e490e8cfc656422" +checksum = "0302c4a0442c456bd56f841aee5c3bfd17967563f6fadc9ceb9f9c23cf3807e0" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.3" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405" +checksum = "266c042b60c9c76b8d53061e52b2e0d1116abc57cefc8c5cd671619a56ac3690" dependencies = [ "proc-macro2", "quote", @@ -3882,11 +4546,17 @@ version = "0.3.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2900ede94e305130c13ddd391e0ab7cbaeb783945ae07a279c268cb05109c6cb" +[[package]] +name = "platforms" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "626dec3cac7cc0e1577a2ec3fc496277ec2baa084bebad95bb6fdbfae235f84c" + [[package]] name = "pprof" -version = "0.12.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "978385d59daf9269189d052ca8a84c1acfd0715c0599a5d5188d4acc078ca46a" +checksum = "ef5c97c51bd34c7e742402e216abdeb44d415fbe6ae41d56b114723e953711cb" dependencies = [ "backtrace", "cfg-if", @@ -3897,9 +4567,9 @@ dependencies = [ "nix 0.26.4", "once_cell", "parking_lot 0.12.1", - "prost", + "prost 0.12.3", "prost-build", - "prost-derive", + "prost-derive 0.12.3", "protobuf", "sha2", "smallvec", @@ -3923,11 +4593,11 @@ dependencies = [ "data_types", "datafusion", "datafusion_util", - "itertools 0.11.0", + "itertools 0.12.0", "observability_deps", "query_functions", "schema", - "snafu", + "snafu 0.8.0", "sqlparser", "test_helpers", "workspace-hack", @@ -3972,19 +4642,19 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.1.25" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c8646e95016a7a6c4adea95bafa8a16baab64b583356217f2c85db4a39d9a86" +checksum = "a41cf62165e97c7f814d2221421dbb9afcbcdb0a88068e5ea206e19951c2cbb5" dependencies = [ "proc-macro2", - "syn 1.0.109", + "syn 2.0.48", ] [[package]] name = "proc-macro2" -version = "1.0.76" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95fc56cda0b5c3325f5fbbd7ff9fda9e02bb00bb3dac51252d2f1bfa1cb8cc8c" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] @@ -4039,27 +4709,37 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b82eaa1d779e9a4bc1c3217db8ffbeabaae1dca241bf70183242128d48681cd" dependencies = [ "bytes", - "prost-derive", + "prost-derive 0.11.9", +] + +[[package]] +name = "prost" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "146c289cda302b98a28d40c8b3b90498d6e526dd24ac2ecea73e4e491685b94a" +dependencies = [ + "bytes", + "prost-derive 0.12.3", ] [[package]] name = "prost-build" -version = "0.11.9" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" +checksum = "c55e02e35260070b6f716a2423c2ff1c3bb1642ddca6f99e1f26d06268a0e2d2" dependencies = [ "bytes", "heck", - "itertools 0.10.5", - "lazy_static", + "itertools 0.11.0", "log", "multimap", + "once_cell", "petgraph", "prettyplease", - "prost", - "prost-types", + "prost 0.12.3", + "prost-types 0.12.3", "regex", - "syn 1.0.109", + "syn 2.0.48", "tempfile", "which", ] @@ -4077,13 +4757,35 @@ 
dependencies = [ "syn 1.0.109", ] +[[package]] +name = "prost-derive" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efb6c9a1dd1def8e2124d17e83a20af56f1570d6c2d2bd9e266ccb768df3840e" +dependencies = [ + "anyhow", + "itertools 0.11.0", + "proc-macro2", + "quote", + "syn 2.0.48", +] + [[package]] name = "prost-types" version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "213622a1460818959ac1181aaeb2dc9c7f63df720db7d788b3e24eacd1983e13" dependencies = [ - "prost", + "prost 0.11.9", +] + +[[package]] +name = "prost-types" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "193898f59edcf43c26227dcd4c8427f00d99d61e95dcde58dabd49fa291d470e" +dependencies = [ + "prost 0.12.3", ] [[package]] @@ -4092,6 +4794,32 @@ version = "2.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94" +[[package]] +name = "pulldown-cmark" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a1a2f1f0a7ecff9c31abbe177637be0e97a0aef46cf8738ece09327985d998" +dependencies = [ + "bitflags 1.3.2", + "memchr", + "unicase", +] + +[[package]] +name = "quanta" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ca0b7bac0b97248c40bb77288fc52029cf1459c0461ea1b05ee32ccf011de2c" +dependencies = [ + "crossbeam-utils", + "libc", + "once_cell", + "raw-cpuid", + "wasi", + "web-sys", + "winapi", +] + [[package]] name = "query_functions" version = "0.1.0" @@ -4100,12 +4828,12 @@ dependencies = [ "chrono", "datafusion", "datafusion_util", - "itertools 0.11.0", + "itertools 0.12.0", "once_cell", "regex", - "regex-syntax 0.7.5", + "regex-syntax 0.8.2", "schema", - "snafu", + "snafu 0.8.0", "tokio", "workspace-hack", ] @@ -4121,9 +4849,9 @@ dependencies = [ [[package]] name = "quick-xml" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eff6510e86862b57b210fd8cbe8ed3f0d7d600b9c2863cd4549a2e033c66e956" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" dependencies = [ "memchr", "serde", @@ -4177,6 +4905,15 @@ dependencies = [ "rand_core", ] +[[package]] +name = "raw-cpuid" +version = "11.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d86a7c4638d42c44551f4791a20e687dbb4c3de1f33c43dd71e355cd429def1" +dependencies = [ + "bitflags 2.4.2", +] + [[package]] name = "rayon" version = "1.8.1" @@ -4217,13 +4954,13 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.2" +version = "1.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.4.3", + "regex-automata 0.4.5", "regex-syntax 0.8.2", ] @@ -4238,9 +4975,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.3" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd" dependencies = [ "aho-corasick", "memchr", @@ -4253,31 +4990,19 @@ version = "0.6.29" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" -[[package]] -name = "regex-syntax" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" - [[package]] name = "regex-syntax" version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" -[[package]] -name = "relative-path" -version = "1.9.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e898588f33fdd5b9420719948f9f2a32c922a246964576f71ba7f24f80610fbc" - [[package]] name = "reqwest" version = "0.11.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b1ae8d9ac08420c66222fb9096fc5de435c3c48542bc5336c51892cffafb41" dependencies = [ - "base64 0.21.7", + "base64", "bytes", "encoding_rs", "futures-core", @@ -4295,6 +5020,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "rustls", + "rustls-native-certs", "rustls-pemfile", "serde", "serde_json", @@ -4309,7 +5035,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams", "web-sys", - "webpki-roots 0.25.3", + "webpki-roots", "winreg", ] @@ -4322,21 +5048,6 @@ dependencies = [ "bytemuck", ] -[[package]] -name = "ring" -version = "0.16.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" -dependencies = [ - "cc", - "libc", - "once_cell", - "spin 0.5.2", - "untrusted 0.7.1", - "web-sys", - "winapi", -] - [[package]] name = "ring" version = "0.17.7" @@ -4347,7 +5058,7 @@ dependencies = [ "getrandom", "libc", "spin 0.9.8", - "untrusted 0.9.0", + "untrusted", "windows-sys 0.48.0", ] @@ -4371,35 +5082,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "rstest" -version = "0.18.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97eeab2f3c0a199bc4be135c36c924b6590b88c377d416494288c14f2db30199" -dependencies = [ - "futures", - "futures-timer", - "rstest_macros", - "rustc_version", -] - -[[package]] -name = "rstest_macros" -version = "0.18.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d428f8247852f894ee1be110b375111b586d4fa431f6c46e64ba5a0dcccbe605" -dependencies = [ - "cfg-if", - "glob", - "proc-macro2", - "quote", - "regex", - "relative-path", - "rustc_version", - "syn 2.0.48", - "unicode-ident", -] - [[package]] name = "rustc-demangle" version = "0.1.23" @@ -4435,28 +5117,30 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9d5a6813c0759e4609cd494e8e725babae6a2ca7b62a5536a13daaec6fcb7ba" dependencies = [ "log", - "ring 0.17.7", - "rustls-webpki 0.101.7", + "ring", + "rustls-webpki", "sct", ] [[package]] -name = "rustls-pemfile" -version = "1.0.4" +name = "rustls-native-certs" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" +checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" dependencies = [ - "base64 0.21.7", + "openssl-probe", + "rustls-pemfile", + "schannel", + "security-framework", ] [[package]] -name = "rustls-webpki" -version = "0.100.3" +name = "rustls-pemfile" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"5f6a5fc258f1c1276dfe3016516945546e2d5383911efc0fc4f1cdc5df3a4ae3" +checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" dependencies = [ - "ring 0.16.20", - "untrusted 0.7.1", + "base64", ] [[package]] @@ -4465,8 +5149,8 @@ version = "0.101.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" dependencies = [ - "ring 0.17.7", - "untrusted 0.9.0", + "ring", + "untrusted", ] [[package]] @@ -4485,9 +5169,18 @@ checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" name = "same-file" version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "schannel" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" dependencies = [ - "winapi-util", + "windows-sys 0.52.0", ] [[package]] @@ -4498,10 +5191,35 @@ dependencies = [ "hashbrown 0.14.3", "indexmap 2.1.0", "observability_deps", - "snafu", + "once_cell", + "snafu 0.8.0", "workspace-hack", ] +[[package]] +name = "schemars" +version = "0.8.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45a28f4c49489add4ce10783f7911893516f15afe45d015608d41faca6bc4d29" +dependencies = [ + "dyn-clone", + "schemars_derive", + "serde", + "serde_json", +] + +[[package]] +name = "schemars_derive" +version = "0.8.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c767fd6fa65d9ccf9cf026122c1b555f2ef9a4f0cea69da4d7dbc3e258d30967" +dependencies = [ + "proc-macro2", + "quote", + "serde_derive_internals", + "syn 1.0.109", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -4514,8 +5232,41 @@ version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" dependencies = [ - "ring 0.17.7", - "untrusted 0.9.0", + "ring", + "untrusted", +] + +[[package]] +name = "secrecy" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9bd1c54ea06cfd2f6b63219704de0b9b4f72dcc2b8fdef820be6cd799780e91e" +dependencies = [ + "serde", + "zeroize", +] + +[[package]] +name = "security-framework" +version = "2.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05b64fb303737d99b81884b2c63433e9ae28abebe5eb5045dcdd175dc2ecf4de" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e932934257d3b408ed8f30db49d85ea163bfe74961f017f405b025af298f0c7a" +dependencies = [ + "core-foundation-sys", + "libc", ] [[package]] @@ -4523,6 +5274,9 @@ name = "semver" version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b97ed7a9823b74f99c7742f5336af7be5ecd3eeafcb1507d1fa93347b1d589b0" +dependencies = [ + "serde", +] [[package]] name = "seq-macro" @@ -4539,6 +5293,16 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "serde-value" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f3a1a3341211875ef120e117ea7fd5228530ae7e7036a779fdc9117be6b3282c" +dependencies = [ + "ordered-float 2.10.1", + "serde", +] + [[package]] name = "serde_derive" version = "1.0.195" @@ -4550,6 +5314,17 @@ dependencies = [ "syn 2.0.48", ] +[[package]] +name = "serde_derive_internals" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85bf8229e7920a9f636479437026331ce11aa132b4dde37d121944a44d6e5f3c" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "serde_json" version = "1.0.111" @@ -4582,24 +5357,27 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_yaml" +version = "0.9.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1bf28c79a99f70ee1f1d83d10c875d2e70618417fda01ad1785e027579d9d38" +dependencies = [ + "indexmap 2.1.0", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + [[package]] name = "service_common" version = "0.1.0" dependencies = [ - "async-trait", - "bytes", + "arrow", "datafusion", "executor", - "flightsql", - "iox_query", - "iox_query_influxql", - "iox_query_influxrpc", - "metric", - "parking_lot 0.12.1", - "predicate", - "tonic", - "trace", - "tracker", + "tonic 0.10.2", "workspace-hack", ] @@ -4619,16 +5397,19 @@ dependencies = [ "futures", "generated_types", "iox_query", + "iox_query_influxql", + "iox_query_params", "metric", "observability_deps", - "prost", + "prost 0.12.3", "serde", "serde_json", "service_common", - "snafu", + "snafu 0.8.0", "test_helpers", "tokio", - "tonic", + "tonic 0.10.2", + "tower_trailer", "trace", "trace_http", "tracker", @@ -4641,7 +5422,7 @@ version = "0.1.0" dependencies = [ "generated_types", "observability_deps", - "tonic", + "tonic 0.10.2", "workspace-hack", ] @@ -4728,6 +5509,21 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "54ac45299ccbd390721be55b412d41931911f654fa99e2cb8bfb57184b2061fe" +[[package]] +name = "skeptic" +version = "0.13.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16d23b015676c90a0f01c197bfdc786c20342c73a0afdda9025adb0bc42940a8" +dependencies = [ + "bytecount", + "cargo_metadata", + "error-chain", + "glob", + "pulldown-cmark", + "tempfile", + "walkdir", +] + [[package]] name = "slab" version = "0.4.9" @@ -4750,7 +5546,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4de37ad025c587a29e8f3f5605c00f70b98715ef90b9061a815b9e59e9042d6" dependencies = [ "doc-comment", - "snafu-derive", + "snafu-derive 0.7.5", +] + +[[package]] +name = "snafu" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d342c51730e54029130d7dc9fd735d28c4cd360f1368c01981d4f03ff207f096" +dependencies = [ + "snafu-derive 0.8.0", ] [[package]] @@ -4765,6 +5570,18 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "snafu-derive" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "080c44971436b1af15d6f61ddd8b543995cf63ab8e677d46b00cc06f4ef267a0" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.48", +] + [[package]] name = "snap" version = "1.1.1" @@ -4819,9 +5636,9 @@ dependencies = [ [[package]] name = "sqlparser" -version = "0.37.0" +version = "0.41.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37ae05a8250b968a3f7db93155a84d68b2e6cea1583949af5ca5b5170c76c075" +checksum = "5cc2c25a6c66789625ef164b4c7d2e548d627902280c13710d33da8222169964" dependencies = [ "log", 
"sqlparser_derive", @@ -4829,13 +5646,13 @@ dependencies = [ [[package]] name = "sqlparser_derive" -version = "0.1.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55fe75cb4a364c7f7ae06c7dbbc8d84bddd85d6cdf9975963c3935bc1991761e" +checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.48", ] [[package]] @@ -4865,7 +5682,7 @@ dependencies = [ "crossbeam-queue", "dotenvy", "either", - "event-listener", + "event-listener 2.5.3", "futures-channel", "futures-core", "futures-intrusive", @@ -4892,7 +5709,7 @@ dependencies = [ "tracing", "url", "uuid", - "webpki-roots 0.25.3", + "webpki-roots", ] [[package]] @@ -4955,7 +5772,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e37195395df71fd068f6e2082247891bc11e3289624bbc776a0cdfa1ca7f1ea4" dependencies = [ "atoi", - "base64 0.21.7", + "base64", "bitflags 2.4.2", "byteorder", "bytes", @@ -4998,7 +5815,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6ac0ac3b7ccd10cc96c7ab29791a7dd236bd94021f31eec7ba3d46a74aa1c24" dependencies = [ "atoi", - "base64 0.21.7", + "base64", "bitflags 2.4.2", "byteorder", "crc", @@ -5177,9 +5994,9 @@ checksum = "d3543ca0810e71767052bdcdd5653f23998b192642a22c5164bfa6581e40a4a2" [[package]] name = "sysinfo" -version = "0.29.11" +version = "0.30.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd727fc423c2060f6c92d9534cef765c65a6ed3f428a03d7def74a8c4348e666" +checksum = "1fb4f3438c8f6389c864e61221cbc97e9bca98b4daf39a5beb7bea660f528bb2" dependencies = [ "cfg-if", "core-foundation-sys", @@ -5187,7 +6004,7 @@ dependencies = [ "ntapi", "once_cell", "rayon", - "winapi", + "windows", ] [[package]] @@ -5211,6 +6028,12 @@ dependencies = [ "libc", ] +[[package]] +name = "tagptr" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" + [[package]] name = "tempfile" version = "3.9.0" @@ -5240,7 +6063,7 @@ dependencies = [ "parking_lot 0.12.1", "tempfile", "tokio", - "tracing-log 0.1.4", + "tracing-log", "tracing-subscriber", "workspace-hack", ] @@ -5253,6 +6076,7 @@ dependencies = [ "arrow-flight", "arrow_util", "assert_cmd", + "assert_matches", "bytes", "data_types", "dml", @@ -5262,24 +6086,27 @@ dependencies = [ "hyper", "influxdb_iox_client", "ingester_query_grpc", + "insta", "iox_catalog", + "iox_query_params", "mutable_batch_lp", "mutable_batch_pb", "nix 0.27.1", "observability_deps", "once_cell", "parking_lot 0.12.1", - "prost", + "prost 0.12.3", "rand", "regex", "reqwest", - "snafu", + "serde_json", + "snafu 0.8.0", "sqlx", "tempfile", "test_helpers", "tokio", "tokio-util", - "tonic", + "tonic 0.10.2", "workspace-hack", ] @@ -5463,6 +6290,7 @@ dependencies = [ "futures-io", "futures-sink", "pin-project-lite", + "slab", "tokio", "tracing", ] @@ -5477,6 +6305,17 @@ dependencies = [ "workspace-hack", ] +[[package]] +name = "tokio_watchdog" +version = "0.1.0" +dependencies = [ + "metric", + "observability_deps", + "test_helpers", + "tokio", + "workspace-hack", +] + [[package]] name = "toml" version = "0.8.8" @@ -5517,10 +6356,9 @@ version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3082666a3a6433f7f511c7192923fa1fe07c69332d3c6a2e6bb040b569199d5a" dependencies = [ - "async-stream", "async-trait", "axum", - "base64 0.21.7", + "base64", "bytes", 
"futures-core", "futures-util", @@ -5531,7 +6369,36 @@ dependencies = [ "hyper-timeout", "percent-encoding", "pin-project", - "prost", + "prost 0.11.9", + "tokio", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d560933a0de61cf715926b9cac824d4c883c2c43142f787595e48280c40a1d0e" +dependencies = [ + "async-stream", + "async-trait", + "axum", + "base64", + "bytes", + "h2", + "http", + "http-body", + "hyper", + "hyper-timeout", + "percent-encoding", + "pin-project", + "prost 0.12.3", + "rustls", + "rustls-native-certs", "rustls-pemfile", "tokio", "tokio-rustls", @@ -5540,46 +6407,45 @@ dependencies = [ "tower-layer", "tower-service", "tracing", - "webpki-roots 0.23.1", ] [[package]] name = "tonic-build" -version = "0.9.2" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6fdaae4c2c638bb70fe42803a26fbd6fc6ac8c72f5c59f67ecc2a2dcabf4b07" +checksum = "9d021fc044c18582b9a2408cd0dd05b1596e3ecdb5c4df822bb0183545683889" dependencies = [ "prettyplease", "proc-macro2", "prost-build", "quote", - "syn 1.0.109", + "syn 2.0.48", ] [[package]] name = "tonic-health" -version = "0.9.2" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "080964d45894b90273d2b1dd755fdd114560db8636bb41cea615213c45043c4d" +checksum = "f80db390246dfb46553481f6024f0082ba00178ea495dbb99e70ba9a4fafb5e1" dependencies = [ "async-stream", - "prost", + "prost 0.12.3", "tokio", "tokio-stream", - "tonic", + "tonic 0.10.2", ] [[package]] name = "tonic-reflection" -version = "0.9.2" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0543d7092032041fbeac1f2c84304537553421a11a623c2301b12ef0264862c7" +checksum = "3fa37c513df1339d197f4ba21d28c918b9ef1ac1768265f11ecb6b7f1cba1b76" dependencies = [ - "prost", - "prost-types", + "prost 0.12.3", + "prost-types 0.12.3", "tokio", "tokio-stream", - "tonic", + "tonic 0.10.2", ] [[package]] @@ -5608,6 +6474,7 @@ version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61c5bb1d698276a2443e5ecfabc1008bf15a36c12e6a7176e7bf089ea9131140" dependencies = [ + "base64", "bitflags 2.4.2", "bytes", "futures-core", @@ -5615,6 +6482,7 @@ dependencies = [ "http", "http-body", "http-range-header", + "mime", "pin-project-lite", "tower-layer", "tower-service", @@ -5633,6 +6501,19 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" +[[package]] +name = "tower_trailer" +version = "0.1.0" +dependencies = [ + "futures", + "http", + "http-body", + "parking_lot 0.12.1", + "pin-project", + "tower", + "workspace-hack", +] + [[package]] name = "trace" version = "0.1.0" @@ -5654,7 +6535,7 @@ dependencies = [ "futures", "iox_time", "observability_deps", - "snafu", + "snafu 0.8.0", "thrift", "tokio", "trace", @@ -5665,16 +6546,17 @@ dependencies = [ name = "trace_http" version = "0.1.0" dependencies = [ + "bytes", "futures", "hashbrown 0.14.3", "http", "http-body", - "itertools 0.11.0", + "itertools 0.12.0", "metric", "observability_deps", "parking_lot 0.12.1", "pin-project", - "snafu", + "snafu 0.8.0", "tower", "trace", "workspace-hack", @@ -5713,17 +6595,6 @@ dependencies = [ "valuable", ] -[[package]] -name = "tracing-log" -version = "0.1.4" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f751112709b4e791d8ce53e32c4ed2d353565a795ce84da2285393f41557bdf2" -dependencies = [ - "log", - "once_cell", - "tracing-core", -] - [[package]] name = "tracing-log" version = "0.2.0" @@ -5763,7 +6634,7 @@ dependencies = [ "thread_local", "tracing", "tracing-core", - "tracing-log 0.2.0", + "tracing-log", "tracing-serde", ] @@ -5788,6 +6659,21 @@ dependencies = [ "workspace-hack", ] +[[package]] +name = "treediff" +version = "4.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52984d277bdf2a751072b5df30ec0377febdb02f7696d64c2d7d54630bac4303" +dependencies = [ + "serde_json", +] + +[[package]] +name = "triomphe" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "859eb650cfee7434994602c3a68b25d77ad9e68c8a6cd491616ef86661382eb3" + [[package]] name = "trogging" version = "0.1.0" @@ -5798,7 +6684,7 @@ dependencies = [ "regex", "synchronized-writer", "thiserror", - "tracing-log 0.1.4", + "tracing-log", "tracing-subscriber", ] @@ -5836,6 +6722,15 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" +[[package]] +name = "unicase" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d2d4dafb69621809a81864c9c1b864479e1235c0dd4e199924b9742439ed89" +dependencies = [ + "version_check", +] + [[package]] name = "unicode-bidi" version = "0.3.15" @@ -5876,10 +6771,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" [[package]] -name = "untrusted" -version = "0.7.1" +name = "unsafe-libyaml" +version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" +checksum = "ab4c90930b95a82d00dc9e9ac071b4991924390d46cbd0dfe566148667605e4b" [[package]] name = "untrusted" @@ -5918,9 +6813,9 @@ checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" [[package]] name = "uuid" -version = "1.6.1" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e395fcf16a7a3d8127ec99782007af141946b4795001f876d54fb0d55978560" +checksum = "f00cc9702ca12d3c81455259621e676d0f7251cec66a21e98fe2e9a37db93b2a" dependencies = [ "getrandom", ] @@ -5968,8 +6863,8 @@ dependencies = [ "mutable_batch_pb", "observability_deps", "parking_lot 0.12.1", - "prost", - "snafu", + "prost 0.12.3", + "snafu 0.8.0", "snap", "test_helpers", "tokio", @@ -6110,15 +7005,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "webpki-roots" -version = "0.23.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b03058f88386e5ff5310d9111d53f48b17d732b401aeb83a8d5190f2ac459338" -dependencies = [ - "rustls-webpki 0.100.3", -] - [[package]] name = "webpki-roots" version = "0.25.3" @@ -6174,6 +7060,16 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" +dependencies = [ + "windows-core", + "windows-targets 0.52.0", +] + [[package]] name = "windows-core" version = "0.52.0" @@ 
-6339,10 +7235,9 @@ name = "workspace-hack" version = "0.1.0" dependencies = [ "ahash", - "arrow", "arrow-array", - "arrow-flight", - "arrow-string", + "arrow-cast", + "arrow-ipc", "bitflags 2.4.2", "byteorder", "bytes", @@ -6352,8 +7247,6 @@ dependencies = [ "clap_builder", "crossbeam-utils", "crypto-common", - "datafusion", - "datafusion-optimizer", "digest", "either", "fixedbitset", @@ -6368,8 +7261,10 @@ dependencies = [ "getrandom", "hashbrown 0.14.3", "heck", + "hyper", "indexmap 2.1.0", "itertools 0.10.5", + "itertools 0.11.0", "libc", "lock_api", "log", @@ -6381,27 +7276,26 @@ dependencies = [ "object_store", "once_cell", "parking_lot 0.12.1", - "parquet", "petgraph", "phf_shared", "proptest", - "prost", - "prost-types", + "prost 0.11.9", + "prost 0.12.3", + "prost-types 0.11.9", + "prost-types 0.12.3", "rand", "rand_core", "regex", - "regex-automata 0.4.3", - "regex-syntax 0.7.5", + "regex-automata 0.4.5", "regex-syntax 0.8.2", "reqwest", - "ring 0.16.20", + "ring", "rustls", "serde", "serde_json", "sha2", "similar", "spin 0.9.8", - "sqlparser", "sqlx", "sqlx-core", "sqlx-macros", @@ -6415,7 +7309,6 @@ dependencies = [ "tokio", "tokio-stream", "tokio-util", - "tonic", "tower", "tracing", "tracing-core", @@ -6429,6 +7322,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "xxhash-rust" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53be06678ed9e83edb1745eb72efc0bbcd7b5c3c35711a860906aed827a13d61" + [[package]] name = "xz2" version = "0.1.7" @@ -6485,32 +7384,13 @@ version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" -[[package]] -name = "zstd" -version = "0.12.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" -dependencies = [ - "zstd-safe 6.0.6", -] - [[package]] name = "zstd" version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bffb3309596d527cfcba7dfc6ed6052f1d39dfbd7c867aa2e865e4a449c10110" dependencies = [ - "zstd-safe 7.0.0", -] - -[[package]] -name = "zstd-safe" -version = "6.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581" -dependencies = [ - "libc", - "zstd-sys", + "zstd-safe", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index d9cedab98ae..3094e83cc7b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -98,18 +98,41 @@ edition = "2021" license = "MIT OR Apache-2.0" [workspace.dependencies] -arrow = { version = "46.0.0" } -arrow-flight = { version = "46.0.0" } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "81f33b0e27f5694348cd953a937203d835b57178", default-features = false } -datafusion-proto = { git = "https://github.com/apache/arrow-datafusion.git", rev = "81f33b0e27f5694348cd953a937203d835b57178" } +arrow = { version = "49.0.0", features = ["prettyprint", "chrono-tz"] } +arrow-buffer = { version = "49.0.0" } +arrow-flight = { version = "49.0.0", features = ["flight-sql-experimental"] } +datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" } +datafusion-proto = { git = "https://github.com/apache/arrow-datafusion.git", rev = "0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" } +hashbrown = { version = "0.14.3" } +object_store = { version = "0.8.0" } +parquet = { version = 
"49.0.0", features = ["object_store"] } +pbjson = { version = "0.6.0" } +pbjson-build = { version = "0.6.2" } +pbjson-types = { version = "0.6.0" } +prost = { version = "0.12.3" } +prost-build = { version = "0.12.2" } +prost-types = { version = "0.12.3" } +sqlparser = { version = "0.41.0" } +tonic = { version = "0.10.2", features = ["tls", "tls-roots"] } +tonic-build = { version = "0.10.2" } +tonic-health = { version = "0.10.2" } +tonic-reflection = { version = "0.10.2" } -hashbrown = { version = "0.14.0" } -object_store = { version = "0.7.0" } -parquet = { version = "46.0.0" } -tonic = { version = "0.9.2", features = ["tls", "tls-webpki-roots"] } -tonic-build = { version = "0.9.2" } -tonic-health = { version = "0.9.2" } -tonic-reflection = { version = "0.9.2" } +[workspace.lints.rust] +rust_2018_idioms = "deny" +unreachable_pub = "deny" +missing_debug_implementations = "deny" +missing_copy_implementations = "deny" + +[workspace.lints.clippy] +dbg_macro = "deny" +todo = "deny" +clone_on_ref_ptr = "deny" +future_not_send = "deny" + +[workspace.lints.rustdoc] +broken_intra_doc_links = "deny" +bare_urls = "deny" # This profile optimizes for runtime performance and small binary size at the expense of longer # build times. It's most suitable for final release builds. @@ -135,19 +158,3 @@ opt-level = 3 [profile.dev.package.similar] opt-level = 3 - -[patch.crates-io] -# Can remove after arrow 47 is released -# Pin to https://github.com/apache/arrow-rs/pull/4790 -# To get fixes for -# - https://github.com/apache/arrow-rs/issues/4788, -# - https://github.com/apache/arrow-rs/pull/4799 -arrow = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5" } -arrow-array = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5" } -arrow-buffer = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5" } -arrow-schema = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5" } -arrow-select = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5" } -arrow-string = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5" } -arrow-ord = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5" } -arrow-flight = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5" } -parquet = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5" } diff --git a/arrow_util/Cargo.toml b/arrow_util/Cargo.toml index 83a826f7e0a..18ac4bf7c2c 100644 --- a/arrow_util/Cargo.toml +++ b/arrow_util/Cargo.toml @@ -6,22 +6,24 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } -# need dyn_cmp_dict feature for comparing dictionary arrays -arrow = { workspace = true, features = ["prettyprint", "dyn_cmp_dict"] } +arrow = { workspace = true } # used by arrow anyway (needed for printing workaround) chrono = { version = "0.4", default-features = false } -comfy-table = { version = "7.0", default-features = false } +comfy-table = { version = "7.1", default-features = false } hashbrown = { workspace = true } num-traits = "0.2" -once_cell = { version = "1.18", features = ["parking_lot"] } -regex = "1.9.5" -snafu = "0.7" 
+once_cell = { version = "1.19", features = ["parking_lot"] } +regex = "1.10.2" +snafu = "0.8" uuid = "1" workspace-hack = { version = "0.1", path = "../workspace-hack" } - [dev-dependencies] datafusion = { workspace = true } +proptest = { version = "1.4.0", default-features = false, features = ["std"] } rand = "0.8.3" diff --git a/arrow_util/src/bitset.rs b/arrow_util/src/bitset.rs index 34f177915ef..7fecee6fe51 100644 --- a/arrow_util/src/bitset.rs +++ b/arrow_util/src/bitset.rs @@ -22,6 +22,15 @@ impl BitSet { Self::default() } + /// Construct an empty [`BitSet`] with a pre-allocated capacity for `n` + /// bits. + pub fn with_capacity(n: usize) -> Self { + Self { + buffer: Vec::with_capacity((n + 7) / 8), + len: 0, + } + } + /// Creates a new BitSet with `count` unset bits. pub fn with_size(count: usize) -> Self { let mut bitset = Self::default(); @@ -31,30 +40,30 @@ impl BitSet { /// Reserve space for `count` further bits pub fn reserve(&mut self, count: usize) { - let new_buf_len = (self.len + count + 7) >> 3; + let new_buf_len = (self.len + count + 7) / 8; self.buffer.reserve(new_buf_len); } /// Appends `count` unset bits pub fn append_unset(&mut self, count: usize) { self.len += count; - let new_buf_len = (self.len + 7) >> 3; + let new_buf_len = (self.len + 7) / 8; self.buffer.resize(new_buf_len, 0); } /// Appends `count` set bits pub fn append_set(&mut self, count: usize) { let new_len = self.len + count; - let new_buf_len = (new_len + 7) >> 3; + let new_buf_len = (new_len + 7) / 8; - let skew = self.len & 7; + let skew = self.len % 8; if skew != 0 { *self.buffer.last_mut().unwrap() |= 0xFF << skew; } self.buffer.resize(new_buf_len, 0xFF); - let rem = new_len & 7; + let rem = new_len % 8; if rem != 0 { *self.buffer.last_mut().unwrap() &= (1 << rem) - 1; } @@ -64,15 +73,27 @@ impl BitSet { /// Truncates the bitset to the provided length pub fn truncate(&mut self, len: usize) { - let new_buf_len = (len + 7) >> 3; + let new_buf_len = (len + 7) / 8; self.buffer.truncate(new_buf_len); - let overrun = len & 7; + let overrun = len % 8; if overrun > 0 { *self.buffer.last_mut().unwrap() &= (1 << overrun) - 1; } self.len = len; } + /// Split this bitmap at the specified bit boundary, such that after this + /// call, `self` contains the range `[0, n)` and the returned value contains + /// `[n, len)`. 
+ pub fn split_off(&mut self, n: usize) -> Self { + let mut right = Self::with_capacity(self.len - n); + right.extend_from_range(self, n..self.len); + + self.truncate(n); + + right + } + /// Extends this [`BitSet`] by the context of `other` pub fn extend_from(&mut self, other: &BitSet) { self.append_bits(other.len, &other.buffer) @@ -85,9 +106,9 @@ impl BitSet { return; } - let start_byte = range.start >> 3; - let end_byte = (range.end + 7) >> 3; - let skew = range.start & 7; + let start_byte = range.start / 8; + let end_byte = (range.end + 7) / 8; + let skew = range.start % 8; // `append_bits` requires the provided `to_set` to be byte aligned, therefore // if the range being copied is not byte aligned we must first append @@ -109,16 +130,16 @@ impl BitSet { /// Appends `count` boolean values from the slice of packed bits pub fn append_bits(&mut self, count: usize, to_set: &[u8]) { - assert_eq!((count + 7) >> 3, to_set.len()); + assert_eq!((count + 7) / 8, to_set.len()); let new_len = self.len + count; - let new_buf_len = (new_len + 7) >> 3; + let new_buf_len = (new_len + 7) / 8; self.buffer.reserve(new_buf_len - self.buffer.len()); - let whole_bytes = count >> 3; - let overrun = count & 7; + let whole_bytes = count / 8; + let overrun = count % 8; - let skew = self.len & 7; + let skew = self.len % 8; if skew == 0 { self.buffer.extend_from_slice(&to_set[..whole_bytes]); if overrun > 0 { @@ -158,8 +179,8 @@ impl BitSet { pub fn set(&mut self, idx: usize) { assert!(idx <= self.len); - let byte_idx = idx >> 3; - let bit_idx = idx & 7; + let byte_idx = idx / 8; + let bit_idx = idx % 8; self.buffer[byte_idx] |= 1 << bit_idx; } @@ -167,8 +188,8 @@ impl BitSet { pub fn get(&self, idx: usize) -> bool { assert!(idx <= self.len); - let byte_idx = idx >> 3; - let bit_idx = idx & 7; + let byte_idx = idx / 8; + let bit_idx = idx % 8; (self.buffer[byte_idx] >> bit_idx) & 1 != 0 } @@ -227,8 +248,97 @@ impl BitSet { pub fn is_all_unset(&self) -> bool { self.buffer.iter().all(|&v| v == 0) } + + /// Returns the number of set bits in this bitmap. + pub fn count_ones(&self) -> usize { + // Invariant: the bits outside of [0, self.len) are always 0 + self.buffer.iter().map(|v| v.count_ones() as usize).sum() + } + + /// Returns the number of unset bits in this bitmap. + pub fn count_zeros(&self) -> usize { + self.len() - self.count_ones() + } + + /// Returns true if any bit is set (short circuiting). + pub fn is_any_set(&self) -> bool { + self.buffer.iter().any(|&v| v != 0) + } + + /// Returns a value [`Iterator`] that yields boolean values encoded in the + /// bitmap. + pub fn iter(&self) -> Iter<'_> { + Iter::new(self) + } + + /// Returns the bitwise AND between the two [`BitSet`] instances. + /// + /// # Panics + /// + /// Panics if the two sets have differing lengths. + pub fn and(&self, other: &Self) -> Self { + assert_eq!(self.len, other.len); + + Self { + buffer: self + .buffer + .iter() + .zip(other.buffer.iter()) + .map(|(a, b)| a & b) + .collect(), + len: self.len, + } + } +} + +/// A value iterator yielding the boolean values encoded in the bitmap. +#[derive(Debug)] +pub struct Iter<'a> { + /// A reference to the bitmap buffer. + buffer: &'a [u8], + /// The index of the next yielded bit in `buffer`. + idx: usize, + /// The number of bits stored in buffer. 
+ len: usize, +} + +impl<'a> Iter<'a> { + fn new(b: &'a BitSet) -> Self { + Self { + buffer: &b.buffer, + idx: 0, + len: b.len(), + } + } +} + +impl<'a> Iterator for Iter<'a> { + type Item = bool; + + fn next(&mut self) -> Option<Self::Item> { + if self.idx >= self.len { + return None; + } + + let byte_idx = self.idx / 8; + let shift = self.idx % 8; + + self.idx += 1; + + let byte = self.buffer[byte_idx]; + let byte = byte >> shift; + + Some(byte & 1 == 1) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + let v = self.len - self.idx; + (v, Some(v)) + } } +impl<'a> ExactSizeIterator for Iter<'a> {} + /// Returns an iterator over set bit positions in increasing order pub fn iter_set_positions(bytes: &[u8]) -> impl Iterator<Item = usize> + '_ { iter_set_positions_with_offset(bytes, 0) @@ -240,17 +350,17 @@ pub fn iter_set_positions_with_offset( bytes: &[u8], offset: usize, ) -> impl Iterator<Item = usize> + '_ { - let mut byte_idx = offset >> 3; + let mut byte_idx = offset / 8; let mut in_progress = bytes.get(byte_idx).cloned().unwrap_or(0); - let skew = offset & 7; + let skew = offset % 8; in_progress &= 0xFF << skew; std::iter::from_fn(move || loop { if in_progress != 0 { let bit_pos = in_progress.trailing_zeros(); in_progress ^= 1 << bit_pos; - return Some((byte_idx << 3) + (bit_pos as usize)); + return Some((byte_idx * 8) + (bit_pos as usize)); } byte_idx += 1; in_progress = *bytes.get(byte_idx)?; @@ -259,11 +369,13 @@ pub fn iter_set_positions_with_offset( #[cfg(test)] mod tests { - use super::*; use arrow::array::BooleanBufferBuilder; + use proptest::prelude::*; use rand::prelude::*; use rand::rngs::OsRng; + use super::*; + /// Computes a compacted representation of a given bool array fn compact_bools(bools: &[bool]) -> Vec<u8> { bools @@ -284,9 +396,8 @@ mod tests { bools .iter() .enumerate() - // Filter out all y that are not true and then return only x - .filter(|&(_, y)| *y) - .map(|(x, _)| x) + .filter(|&(_x, y)| *y) + .map(|(x, _y)| x) } #[test] @@ -304,8 +415,11 @@ fn test_bit_mask() { let mut mask = BitSet::new(); + assert!(!mask.is_any_set()); + mask.append_bits(8, &[0b11111111]); let d1 = mask.buffer.clone(); + assert!(mask.is_any_set()); mask.append_bits(3, &[0b01010010]); let d2 = mask.buffer.clone(); @@ -522,9 +636,17 @@ fn test_all_set_unset() { for i in 1..100 { let mut v = BitSet::new(); + assert!(!v.is_any_set()); v.append_set(i); assert!(v.is_all_set()); assert!(!v.is_all_unset()); + assert!(v.is_any_set()); + + let mut v = BitSet::new(); + v.append_unset(i); + assert!(!v.is_any_set()); + v.append_set(1); + assert!(v.is_any_set()); } } @@ -590,4 +712,168 @@ mod tests { assert!(!v.is_all_set()); assert!(v.is_all_unset()); } + + #[test] + fn test_split_byte_boundary() { + let mut a = BitSet::new(); + + a.append_set(16); + a.append_unset(8); + a.append_set(8); + + let b = a.split_off(16); + + assert_eq!(a.len(), 16); + assert_eq!(b.len(), 16); + + // All the bits in A are set. + assert!(a.is_all_set()); + for i in 0..16 { + assert!(a.get(i)); + } + + // The first 8 bits in b are unset, and the next 8 bits are set. + for i in 0..8 { + assert!(!b.get(i)); + } + for i in 8..16 { + assert!(b.get(i)); + } + } + + #[test] + fn test_split_sub_byte_boundary() { + let mut a = BitSet::new(); + + a.append_set(3); + a.append_unset(3); + a.append_set(1); + + assert_eq!(a.bytes(), &[0b01000111]); + + let b = a.split_off(5); + + assert_eq!(a.len(), 5); + assert_eq!(b.len(), 2); + + // A contains 3 set bits & 2 unset bits, with the rest masked out.
+ assert_eq!(a.bytes(), &[0b00000111]); + + // B contains 1 unset bit, and then 1 set bit + assert_eq!(b.bytes(), &[0b0000010]); + } + + #[test] + fn test_split_multi_byte_unclean_boundary() { + let mut a = BitSet::new(); + + a.append_set(8); + a.append_unset(1); + a.append_set(1); + a.append_unset(1); + a.append_set(1); + + assert_eq!(a.bytes(), &[0b11111111, 0b00001010]); + + let b = a.split_off(10); + + assert_eq!(a.len(), 10); + assert_eq!(b.len(), 2); + + assert_eq!(a.bytes(), &[0b11111111, 0b00000010]); + assert_eq!(b.bytes(), &[0b0000010]); + } + + #[test] + fn test_count_ones_with_truncate() { + // For varying sizes of bitmaps. + for i in 1..150 { + let mut b = BitSet::new(); + + // Set "i" number of bits in 2*i values. + for _ in 0..i { + b.append_unset(1); + b.append_set(1); + } + + assert_eq!(b.len(), 2 * i); + assert_eq!(b.count_ones(), i); + assert_eq!(b.count_zeros(), i); + + // Split it such that the last bit is removed. + let other = b.split_off((2 * i) - 1); + assert_eq!(other.len(), 1); + assert_eq!(other.count_ones(), 1); + assert_eq!(other.count_zeros(), 0); + + // Which means the original bitmap must now have 1 less 1 bit. + assert_eq!(b.len(), (2 * i) - 1); + assert_eq!(b.count_ones(), i - 1); + assert_eq!(b.count_zeros(), i); + } + } + + prop_compose! { + /// Returns a [`BitSet`] of random length and content. + fn arbitrary_bitset()( + values in prop::collection::vec(any::<bool>(), 0..20) + ) -> BitSet { + let mut b = BitSet::new(); + + for v in &values { + match v { + true => b.append_set(1), + false => b.append_unset(1), + } + } + + b + } + } + + proptest! { + #[test] + fn prop_iter( + values in prop::collection::vec(any::<bool>(), 0..20), + ) { + let mut b = BitSet::new(); + + for v in &values { + match v { + true => b.append_set(1), + false => b.append_unset(1), + } + } + + assert_eq!(values.len(), b.len()); + + let got = b.iter().collect::<Vec<_>>(); + assert_eq!(values, got); + + // Exact size iter + assert_eq!(b.iter().len(), values.len()); + } + + #[test] + fn prop_and( + mut a in arbitrary_bitset(), + mut b in arbitrary_bitset(), + ) { + let min_len = a.len().min(b.len()); + // Truncate a and b to the same length. + a.truncate(min_len); + b.truncate(min_len); + + let want = a + .iter() + .zip(b.iter()) + .map(|(a, b)| a & b) + .collect::<Vec<_>>(); + + let c = a.and(&b); + let got = c.iter().collect::<Vec<_>>(); + + assert_eq!(got, want); + } + } } diff --git a/arrow_util/src/string.rs b/arrow_util/src/string.rs index fe3dcc225a7..5460a38b7ff 100644 --- a/arrow_util/src/string.rs +++ b/arrow_util/src/string.rs @@ -154,6 +154,37 @@ impl<K: AsPrimitive<usize> + FromPrimitive + Zero> PackedStringArray<K> { pub fn into_inner(self) -> (Vec<K>, String) { (self.offsets, self.storage) } + + /// Split this [`PackedStringArray`] at `n`, such that `self` contains the + /// elements `[0, n)` and the returned [`PackedStringArray`] contains + /// elements `[n, len)`. + pub fn split_off(&mut self, n: usize) -> Self { + if n > self.len() { + return Default::default(); + } + + let offsets = self.offsets.split_off(n + 1); + + // Figure out where to split the string storage. + let split_point = self.offsets.last().map(|v| v.as_()).unwrap(); + + // Split the storage at the split point, such that the first N values + // appear in self. + let storage = self.storage.split_off(split_point); + + // The new "offsets" now needs remapping such that the first offset + // starts at 0, so that indexing into the new storage string will hit + // the right start point.
+ let offsets = std::iter::once(K::zero()) + .chain( + offsets + .into_iter() + .map(|v| K::from_usize(v.as_() - split_point).unwrap()), + ) + .collect::>(); + + Self { offsets, storage } + } } impl PackedStringArray { @@ -201,6 +232,8 @@ impl<'a, K: AsPrimitive + FromPrimitive + Zero> Iterator for PackedString mod tests { use crate::string::PackedStringArray; + use proptest::prelude::*; + #[test] fn test_storage() { let mut array = PackedStringArray::::new(); @@ -316,4 +349,36 @@ mod tests { vec!["hello", "world", "cupcake", "", "bar", "", "foo", "bar", "", "fiz"] ); } + + proptest! { + #[test] + fn prop_split_off( + a in prop::collection::vec(any::(), 0..20), + b in prop::collection::vec(any::(), 0..20), + ) { + let mut p = PackedStringArray::::new(); + + // Add all the elements in "a" and "b" to the string array. + for v in a.iter().chain(b.iter()) { + p.append(v); + } + + // Split the packed string array at the boundary of "a". + let p2 = p.split_off(a.len()); + + assert_eq!(p.iter().collect::>(), a, "parent"); + assert_eq!(p2.iter().collect::>(), b, "child"); + } + } + + #[test] + fn test_split_off_oob() { + let mut p = PackedStringArray::::new(); + + p.append("bananas"); + + let got = p.split_off(42); + assert_eq!(p.len(), 1); + assert_eq!(got.len(), 0); + } } diff --git a/arrow_util/src/test_util.rs b/arrow_util/src/test_util.rs index 17e80f88c46..8126e251787 100644 --- a/arrow_util/src/test_util.rs +++ b/arrow_util/src/test_util.rs @@ -240,7 +240,7 @@ static REGEX_FILTER: Lazy = Lazy::new(|| { /// Matches things like `time@3 < -9223372036854775808` and `time_min@2 > 1641031200399937022` static REGEX_TIME_OP: Lazy = Lazy::new(|| { - Regex::new("(?Ptime((_min)|(_max))?@[0-9]+ [<>=]=? )(?P-?[0-9]+)") + Regex::new("(?Ptime((_min)|(_max))?@[0-9]+ [<>=]=? (CAST\\()?)(?P-?[0-9]+)(?P AS Timestamp\\(Nanosecond, \"[^\"]\"\\)\\))?") .expect("time opt regex") }); @@ -258,7 +258,8 @@ fn normalize_time_ops(s: &str) -> String { REGEX_TIME_OP .replace_all(s, |c: &Captures<'_>| { let prefix = c.name("prefix").expect("always captures").as_str(); - format!("{prefix}") + let suffix = c.name("suffix").map_or("", |m| m.as_str()); + format!("{prefix}{suffix}") }) .to_string() } diff --git a/authz/Cargo.toml b/authz/Cargo.toml index 06b0a68a5b5..9fc5ed9a961 100644 --- a/authz/Cargo.toml +++ b/authz/Cargo.toml @@ -6,11 +6,12 @@ authors.workspace = true edition.workspace = true license.workspace = true -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[lints] +workspace = true [dependencies] backoff = { path = "../backoff" } -http = {version = "0.2.9", optional = true } +http = {version = "0.2.11", optional = true } iox_time = { version = "0.1.0", path = "../iox_time" } generated_types = { path = "../generated_types" } metric = { version = "0.1.0", path = "../metric" } @@ -19,8 +20,8 @@ workspace-hack = { version = "0.1", path = "../workspace-hack" } # crates.io dependencies in alphabetical order. 
async-trait = "0.1" -base64 = "0.21.4" -snafu = "0.7" +base64 = "0.21.7" +snafu = "0.8" tonic = { workspace = true } [dev-dependencies] @@ -28,7 +29,7 @@ assert_matches = "1.5.0" parking_lot = "0.12.1" paste = "1.0.14" test_helpers_end_to_end = { path = "../test_helpers_end_to_end" } -tokio = "1.32.0" +tokio = "1.35.1" [features] http = ["dep:http"] diff --git a/authz/src/permission.rs b/authz/src/permission.rs index 1836e655cd7..9ffced0e4b1 100644 --- a/authz/src/permission.rs +++ b/authz/src/permission.rs @@ -75,13 +75,13 @@ impl TryFrom for Permission { match value.permission_one_of { Some(proto::permission::PermissionOneOf::ResourceAction(ra)) => { let r = Resource::try_from_proto( - proto::resource_action_permission::ResourceType::from_i32(ra.resource_type) - .ok_or(IncompatiblePermissionError {})?, + proto::resource_action_permission::ResourceType::try_from(ra.resource_type) + .map_err(|_| IncompatiblePermissionError {})?, ra.resource_id, )?; let a = Action::try_from( - proto::resource_action_permission::Action::from_i32(ra.action) - .ok_or(IncompatiblePermissionError {})?, + proto::resource_action_permission::Action::try_from(ra.action) + .map_err(|_| IncompatiblePermissionError {})?, )?; Ok(Self::ResourceAction(r, a)) } diff --git a/backoff/Cargo.toml b/backoff/Cargo.toml index 1bd3cb34351..484412fb187 100644 --- a/backoff/Cargo.toml +++ b/backoff/Cargo.toml @@ -5,9 +5,12 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] -tokio = { version = "1.32", features = ["macros", "time"] } +tokio = { version = "1.35", features = ["macros", "time"] } observability_deps = { path = "../observability_deps" } rand = "0.8" -snafu = "0.7" +snafu = "0.8" workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/backoff/src/lib.rs b/backoff/src/lib.rs index e1264840265..907847b380f 100644 --- a/backoff/src/lib.rs +++ b/backoff/src/lib.rs @@ -163,6 +163,7 @@ impl Backoff { F1: std::future::Future> + Send, E: std::error::Error + Send + 'static, { + let mut fail_count = 0_usize; loop { // first execute `F` and then use it, so we can avoid `F: Sync`. 
let do_stuff = do_stuff(); @@ -182,10 +183,13 @@ impl Backoff { } }; + fail_count += 1; + warn!( error=%e, task_name, backoff_secs = backoff.as_secs(), + fail_count, "request encountered non-fatal error - backing off", ); tokio::time::sleep(backoff).await; diff --git a/cache_system/Cargo.toml b/cache_system/Cargo.toml index d17d0ed6272..bb07eba7480 100644 --- a/cache_system/Cargo.toml +++ b/cache_system/Cargo.toml @@ -5,8 +5,11 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] -async-trait = "0.1.73" +async-trait = "0.1.77" backoff = { path = "../backoff" } futures = "0.3" iox_time = { path = "../iox_time" } @@ -16,9 +19,10 @@ ouroboros = "0.18" parking_lot = { version = "0.12", features = ["arc_lock"] } pdatastructs = { version = "0.7", default-features = false, features = ["fixedbitset"] } rand = "0.8.3" -tokio = { version = "1.32", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } -tokio-util = { version = "0.7.9" } +tokio = { version = "1.35", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } +tokio-util = { version = "0.7.10" } trace = { path = "../trace"} +tracker = { path = "../tracker"} workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] diff --git a/cache_system/src/addressable_heap.rs b/cache_system/src/addressable_heap.rs index 5ce84b6e41c..4f3466fe3d3 100644 --- a/cache_system/src/addressable_heap.rs +++ b/cache_system/src/addressable_heap.rs @@ -171,6 +171,7 @@ fn project_tuple(t: &(A, B)) -> (&A, &B) { } /// Iterator of [`AddressableHeap::iter`]. +#[derive(Debug)] pub struct AddressableHeapIter<'a, K, V, O> where K: Clone + Eq + Hash + Ord, @@ -477,13 +478,11 @@ mod tests { res } + #[allow(clippy::map_identity)] // https://github.com/rust-lang/rust-clippy/issues/11764 fn peek(&self) -> Option<(&u8, &String, &i8)> { - #[allow(clippy::map_identity)] self.inner .iter() .min_by_key(|(k, _v, o)| (o, k)) - // This is a false positive as this actually changes - // Option<&(u8, String, i8)> -> Option<(&u8, &String, &i8)> .map(|(k, v, o)| (k, v, o)) } diff --git a/cache_system/src/backend/mod.rs b/cache_system/src/backend/mod.rs index 09855233e34..8395c830d63 100644 --- a/cache_system/src/backend/mod.rs +++ b/cache_system/src/backend/mod.rs @@ -10,7 +10,7 @@ mod test_util; /// Backend to keep and manage stored entries. /// /// A backend might remove entries at any point, e.g. due to memory pressure or expiration. -pub trait CacheBackend: Debug + Send + 'static { +pub trait CacheBackend: Debug { /// Cache key. 
type K: Clone + Eq + Hash + Ord + Debug + Send + 'static; @@ -37,13 +37,12 @@ pub trait CacheBackend: Debug + Send + 'static { fn as_any(&self) -> &dyn Any; } -impl<K, V> CacheBackend for Box<dyn CacheBackend<K = K, V = V>> +impl<T> CacheBackend for Box<T> where - K: Clone + Eq + Hash + Ord + Debug + Send + 'static, - V: Clone + Debug + Send + 'static, + T: CacheBackend + ?Sized + 'static, { - type K = K; - type V = V; + type K = T::K; + type V = T::V; fn get(&mut self, k: &Self::K) -> Option<Self::V> { self.as_mut().get(k) diff --git a/cache_system/src/backend/policy/mod.rs b/cache_system/src/backend/policy/mod.rs index d525d284394..c503c2a849a 100644 --- a/cache_system/src/backend/policy/mod.rs +++ b/cache_system/src/backend/policy/mod.rs @@ -11,7 +11,7 @@ use std::{ }; use iox_time::{Time, TimeProvider}; -use parking_lot::{lock_api::ArcMutexGuard, Mutex, RawMutex, ReentrantMutex}; +use parking_lot::{lock_api::ArcReentrantMutexGuard, RawMutex, RawThreadId, ReentrantMutex}; use super::CacheBackend; @@ -274,14 +274,14 @@ where /// /// Panics if `inner` is not empty. pub fn new( - inner: Box<dyn CacheBackend<K = K, V = V>>, + inner: Box<dyn CacheBackend<K = K, V = V> + Send>, time_provider: Arc<dyn TimeProvider>, ) -> Self { assert!(inner.is_empty(), "inner backend is not empty"); Self { inner: Arc::new(ReentrantMutex::new(RefCell::new(PolicyBackendInner { - inner: Arc::new(Mutex::new(inner)), + inner, subscribers: Vec::new(), time_provider, }))), @@ -324,11 +324,8 @@ where pub fn inner_ref(&mut self) -> InnerBackendRef<'_, K, V> { // NOTE: We deliberately use a mutable reference here to prevent users from using `` while we hold a lock to the underlying backend. - lock_inner!(guard = self.inner); - InnerBackendRef { - inner: guard.inner.lock_arc(), - _phantom: PhantomData, - } + + inner_ref::build(Arc::clone(&self.inner)) } } @@ -345,8 +342,7 @@ where perform_changes(&mut guard, vec![ChangeRequest::get(k.clone())]); // poll inner backend AFTER everything has settled - let mut inner = guard.inner.lock(); - inner.get(k) + guard.inner.get(k) } fn set(&mut self, k: Self::K, v: Self::V) { @@ -361,8 +357,7 @@ where fn is_empty(&self) -> bool { lock_inner!(guard = self.inner); - let inner = guard.inner.lock(); - inner.is_empty() + guard.inner.is_empty() } fn as_any(&self) -> &dyn std::any::Any { @@ -410,11 +405,7 @@ where V: Clone + Debug + Send + 'static, { /// Underlying cache backend. - /// - /// This is wrapped into another `Arc<Mutex<...>>` construct even though [`PolicyBackendInner`] - /// is already guarded by a lock because we need to reference the underlying backend from - /// [`Recorder`], and [`Recorder`] implements [`CacheBackend`] which is `'static`. - inner: Arc<Mutex<Box<dyn CacheBackend<K = K, V = V>>>>, + inner: Box<dyn CacheBackend<K = K, V = V> + Send>, /// List of subscribers. subscribers: Vec<Arc<dyn Subscriber<K = K, V = V>>>, @@ -439,7 +430,7 @@ fn perform_changes<K, V>( while let Some(change_request) = tasks.pop_front() { let mut recorder = Recorder { - inner: Arc::clone(&inner.inner), + inner: inner.inner.as_mut(), records: vec![], }; @@ -542,7 +533,7 @@ where /// patterns work out of the box without the need to fear interleaving modifications. pub fn from_fn<F>(f: F) -> Self where - F: for<'b> FnOnce(&'b mut Recorder<K, V>) + 'a, + F: for<'b, 'c> FnOnce(&'c mut Recorder<'b, K, V>) + 'a, { Self { fun: Box::new(f) } } @@ -578,13 +569,13 @@ where } /// Execute this change request. - pub fn eval(self, backend: &mut Recorder<K, V>) { - (self.fun)(backend) + pub fn eval(self, backend: &mut Recorder<'_, K, V>) { + (self.fun)(backend); } } /// Function captured within [`ChangeRequest`].
-type ChangeRequestFn<'a, K, V> = Box<dyn for<'b> FnOnce(&'b mut Recorder<K, V>) + 'a>; +type ChangeRequestFn<'a, K, V> = Box<dyn for<'b, 'c> FnOnce(&'c mut Recorder<'b, K, V>) + 'a>; /// Records of interactions with the callback [`CacheBackend`]. #[derive(Debug, PartialEq)] @@ -614,16 +605,16 @@ enum Record { /// Specialized [`CacheBackend`] that forwards changes and requests to the underlying backend of /// [`PolicyBackend`] but also records all changes into [`Record`]s. #[derive(Debug)] -pub struct Recorder<K, V> +pub struct Recorder<'a, K, V> where K: Clone + Eq + Hash + Ord + Debug + Send + 'static, V: Clone + Debug + Send + 'static, { - inner: Arc<Mutex<Box<dyn CacheBackend<K = K, V = V>>>>, + inner: &'a mut (dyn CacheBackend<K = K, V = V> + Send), records: Vec<Record<K, V>>, } -impl<K, V> Recorder<K, V> +impl<'a, K, V> Recorder<'a, K, V> where K: Clone + Eq + Hash + Ord + Debug + Send + 'static, V: Clone + Debug + Send + 'static, @@ -637,11 +628,11 @@ where /// modifying requests like [`SET`](CacheBackend::set) or [`REMOVE`](CacheBackend::remove) /// since they always require policies to be in-sync. pub fn get_untracked(&mut self, k: &K) -> Option<V> { - self.inner.lock().get(k) + self.inner.get(k) } } -impl<K, V> CacheBackend for Recorder<K, V> +impl<'a, K, V> CacheBackend for Recorder<'a, K, V> where K: Clone + Eq + Hash + Ord + Debug + Send + 'static, V: Clone + Debug + Send + 'static, @@ -651,7 +642,7 @@ where fn get(&mut self, k: &Self::K) -> Option<Self::V> { self.records.push(Record::Get { k: k.clone() }); - self.inner.lock().get(k) + self.inner.get(k) } fn set(&mut self, k: Self::K, v: Self::V) { @@ -659,64 +650,75 @@ where k: k.clone(), v: v.clone(), }); - self.inner.lock().set(k, v); + self.inner.set(k, v); } fn remove(&mut self, k: &Self::K) { self.records.push(Record::Remove { k: k.clone() }); - self.inner.lock().remove(k); + self.inner.remove(k); } fn is_empty(&self) -> bool { - self.inner.lock().is_empty() + self.inner.is_empty() } fn as_any(&self) -> &dyn std::any::Any { - self + panic!("don't any-cast the recorder please") } } -/// Read-only ref to the inner backend of [`PolicyBackend`] for debugging. -pub struct InnerBackendRef<'a, K, V> -where - K: Clone + Eq + Hash + Ord + Debug + Send + 'static, - V: Clone + Debug + Send + 'static, -{ - inner: ArcMutexGuard<RawMutex, Box<dyn CacheBackend<K = K, V = V>>>, - _phantom: PhantomData<&'a mut ()>, -} +/// Helper module that wraps the implementation of [`InnerBackendRef`]. +/// +/// This is required because [`ouroboros`] generates a bunch of code that we do not want to leak all over the place. +mod inner_ref { + #![allow(non_snake_case, clippy::future_not_send)] -// Workaround for . -impl<'a, K, V> Drop for InnerBackendRef<'a, K, V> -where - K: Clone + Eq + Hash + Ord + Debug + Send + 'static, - V: Clone + Debug + Send + 'static, -{ - fn drop(&mut self) {} -} + use super::*; + use ouroboros::self_referencing; -impl<'a, K, V> Debug for InnerBackendRef<'a, K, V> -where - K: Clone + Eq + Hash + Ord + Debug + Send + 'static, - V: Clone + Debug + Send + 'static, -{ - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("InnerBackendRef").finish_non_exhaustive() + /// Read-only ref to the inner backend of [`PolicyBackend`] for debugging.
+ #[self_referencing] + pub struct InnerBackendRef<'a, K, V> + where + K: Clone + Eq + Hash + Ord + Debug + Send + 'static, + V: Clone + Debug + Send + 'static, + { + l1: ArcReentrantMutexGuard>>, + #[borrows(l1)] + #[covariant] + l2: std::cell::RefMut<'this, PolicyBackendInner>, + _phantom: PhantomData<&'a mut ()>, } -} -impl<'a, K, V> Deref for InnerBackendRef<'a, K, V> -where - K: Clone + Eq + Hash + Ord + Debug + Send + 'static, - V: Clone + Debug + Send + 'static, -{ - type Target = dyn CacheBackend; + impl<'a, K, V> Deref for InnerBackendRef<'a, K, V> + where + K: Clone + Eq + Hash + Ord + Debug + Send + 'static, + V: Clone + Debug + Send + 'static, + { + type Target = dyn CacheBackend; + + fn deref(&self) -> &Self::Target { + self.borrow_l2().inner.as_ref() + } + } - fn deref(&self) -> &Self::Target { - self.inner.as_ref() + pub(super) fn build<'a, K, V>(inner: StrongSharedInner) -> InnerBackendRef<'a, K, V> + where + K: Clone + Eq + Hash + Ord + Debug + Send + 'static, + V: Clone + Debug + Send + 'static, + { + let inner = inner.lock_arc(); + InnerBackendRefBuilder { + l1: inner, + l2_builder: |l1| l1.borrow_mut(), + _phantom: PhantomData, + } + .build() } } +pub use inner_ref::InnerBackendRef; + #[cfg(test)] mod tests { use std::{collections::HashMap, sync::Barrier, thread::JoinHandle}; @@ -1923,7 +1925,7 @@ mod tests { /// Same as [`ChangeRequestFn`] but implements `Send`. type SendableChangeRequestFn = - Box FnOnce(&'a mut Recorder) + Send + 'static>; + Box FnOnce(&'b mut Recorder<'a, String, usize>) + Send + 'static>; /// Same as [`ChangeRequest`] but implements `Send`. struct SendableChangeRequest { @@ -1940,7 +1942,7 @@ mod tests { impl SendableChangeRequest { fn from_fn(f: F) -> Self where - F: for<'b> FnOnce(&'b mut Recorder) + Send + 'static, + F: for<'b, 'c> FnOnce(&'c mut Recorder<'b, String, usize>) + Send + 'static, { Self { fun: Box::new(f) } } diff --git a/cache_system/src/cache/driver.rs b/cache_system/src/cache/driver.rs index 2b5e17ba06f..c0c9773b677 100644 --- a/cache_system/src/cache/driver.rs +++ b/cache_system/src/cache/driver.rs @@ -12,8 +12,8 @@ use futures::{ FutureExt, TryFutureExt, }; use observability_deps::tracing::debug; -use parking_lot::Mutex; use std::{collections::HashMap, fmt::Debug, future::Future, sync::Arc}; +use tracker::{LockMetrics, Mutex}; use super::{Cache, CacheGetStatus, CachePeekStatus}; @@ -21,7 +21,7 @@ use super::{Cache, CacheGetStatus, CachePeekStatus}; #[derive(Debug)] pub struct CacheDriver where - B: CacheBackend, + B: CacheBackend + Send + 'static, L: Loader, { state: Arc>>, @@ -30,13 +30,18 @@ where impl CacheDriver where - B: CacheBackend, + B: CacheBackend + Send + 'static, L: Loader, { /// Create new, empty cache with given loader function. 
- pub fn new(loader: Arc, backend: B) -> Self { + pub fn new(loader: Arc, backend: B, metrics: &metric::Registry, name: &'static str) -> Self { + let metrics = Arc::new(LockMetrics::new( + metrics, + &[("what", "cache_driver_state"), ("cache", name)], + )); + Self { - state: Arc::new(Mutex::new(CacheState { + state: Arc::new(metrics.new_mutex(CacheState { cached_entries: backend, running_queries: HashMap::new(), tag_counter: 0, @@ -140,7 +145,7 @@ where #[async_trait] impl Cache for CacheDriver where - B: CacheBackend, + B: CacheBackend + Send, L: Loader, { type K = B::K; @@ -257,7 +262,7 @@ where impl Drop for CacheDriver where - B: CacheBackend, + B: CacheBackend + Send, L: Loader, { fn drop(&mut self) { @@ -430,7 +435,12 @@ mod tests { type Cache = CacheDriver, TestLoader>; fn construct(&self, loader: Arc) -> Arc { - Arc::new(CacheDriver::new(Arc::clone(&loader) as _, HashMap::new())) + Arc::new(CacheDriver::new( + Arc::clone(&loader) as _, + HashMap::new(), + &metric::Registry::default(), + "test", + )) } fn get_extra(&self, inner: bool) -> Self::GetExtra { diff --git a/cache_system/src/cache/metrics.rs b/cache_system/src/cache/metrics.rs index 7ebad842a16..c72364aef43 100644 --- a/cache_system/src/cache/metrics.rs +++ b/cache_system/src/cache/metrics.rs @@ -645,7 +645,12 @@ mod tests { } fn new_with_loader(loader: Arc) -> Self { - let inner = CacheDriver::new(Arc::clone(&loader) as _, HashMap::new()); + let inner = CacheDriver::new( + Arc::clone(&loader) as _, + HashMap::new(), + &metric::Registry::default(), + "test", + ); let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_millis(0).unwrap())); let metric_registry = metric::Registry::new(); diff --git a/cache_system/src/lib.rs b/cache_system/src/lib.rs index f20f1a9c8b3..68e60ae3e2c 100644 --- a/cache_system/src/lib.rs +++ b/cache_system/src/lib.rs @@ -12,6 +12,7 @@ clippy::dbg_macro, unused_crate_dependencies )] +#![allow(unreachable_pub)] // Workaround for "unused crate" lint false positives. 
#[cfg(test)] diff --git a/cache_system/src/loader/batch.rs b/cache_system/src/loader/batch.rs index 4d30196a0dc..36ab123929b 100644 --- a/cache_system/src/loader/batch.rs +++ b/cache_system/src/loader/batch.rs @@ -464,7 +464,12 @@ mod tests { #[tokio::test] async fn test_auto_flush_integration_with_cache_driver() { let (inner, batch) = setup(); - let cache = CacheDriver::new(Arc::clone(&batch), HashMap::new()); + let cache = CacheDriver::new( + Arc::clone(&batch), + HashMap::new(), + &metric::Registry::default(), + "test", + ); inner.mock_next(vec![1, 2], vec![String::from("foo"), String::from("bar")]); inner.mock_next(vec![3], vec![String::from("baz")]); diff --git a/catalog_cache/Cargo.toml b/catalog_cache/Cargo.toml new file mode 100644 index 00000000000..cdb79c5347b --- /dev/null +++ b/catalog_cache/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "catalog_cache" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] +bytes = "1.5" +dashmap = "5.5" +futures = "0.3" +hyper = "0.14" +url = "2.5" +reqwest = { version = "0.11", default-features = false } +snafu = "0.8" +tokio = { version = "1.35", default-features = false, features = ["macros", "rt"] } +tokio-util = "0.7" +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] diff --git a/catalog_cache/src/api/client.rs b/catalog_cache/src/api/client.rs new file mode 100644 index 00000000000..94e9bf9b47c --- /dev/null +++ b/catalog_cache/src/api/client.rs @@ -0,0 +1,176 @@ +//! Client for the cache HTTP API + +use crate::api::list::{ListDecoder, ListEntry, MAX_VALUE_SIZE}; +use crate::api::{RequestPath, GENERATION}; +use crate::{CacheKey, CacheValue}; +use bytes::{Buf, Bytes}; +use futures::prelude::*; +use futures::stream::BoxStream; +use reqwest::{Client, Response, StatusCode, Url}; +use snafu::{OptionExt, ResultExt, Snafu}; +use std::time::Duration; + +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(display("Creating client: {source}"))] + Client { source: reqwest::Error }, + + #[snafu(display("Put Reqwest error: {source}"))] + Put { source: reqwest::Error }, + + #[snafu(display("Get Reqwest error: {source}"))] + Get { source: reqwest::Error }, + + #[snafu(display("List Reqwest error: {source}"))] + List { source: reqwest::Error }, + + #[snafu(display("Health Reqwest error: {source}"))] + Health { source: reqwest::Error }, + + #[snafu(display("Missing generation header"))] + MissingGeneration, + + #[snafu(display("Invalid generation value"))] + InvalidGeneration, + + #[snafu(display("Error decoding list stream: {source}"), context(false))] + ListStream { source: crate::api::list::Error }, +} + +/// Result type for [`CatalogCacheClient`] +pub type Result<T, E = Error> = std::result::Result<T, E>; + +/// The type returned by [`CatalogCacheClient::list`] +pub type ListStream = BoxStream<'static, Result<ListEntry>>; + +const RESOURCE_REQUEST_TIMEOUT: Duration = Duration::from_secs(1); + +/// We use a longer timeout for list request as they may transfer a non-trivial amount of data +const LIST_REQUEST_TIMEOUT: Duration = Duration::from_secs(20); + +/// A client for accessing a remote catalog cache +#[derive(Debug)] +pub struct CatalogCacheClient { + client: Client, + endpoint: Url, +} + +impl CatalogCacheClient { + /// Create a new [`CatalogCacheClient`] with the given remote endpoint + pub fn try_new(endpoint: Url) -> Result<Self> { + let client = Client::builder() + .connect_timeout(Duration::from_secs(2)) + .build() +
.context(ClientSnafu)?; + + Ok(Self { endpoint, client }) + } + + /// Retrieve the given value from the remote cache, if present + pub async fn get(&self, key: CacheKey) -> Result<Option<CacheValue>> { + let url = format!("{}{}", self.endpoint, RequestPath::Resource(key)); + let timeout = RESOURCE_REQUEST_TIMEOUT; + let req = self.client.get(url).timeout(timeout); + let resp = req.send().await.context(GetSnafu)?; + + if resp.status() == StatusCode::NOT_FOUND { + return Ok(None); + } + let resp = resp.error_for_status().context(GetSnafu)?; + + let generation = resp + .headers() + .get(&GENERATION) + .context(MissingGenerationSnafu)?; + + let generation = generation + .to_str() + .ok() + .and_then(|v| v.parse().ok()) + .context(InvalidGenerationSnafu)?; + + let data = resp.bytes().await.context(GetSnafu)?; + + Ok(Some(CacheValue::new(data, generation))) + } + + /// Upsert the given key-value pair to the remote cache + /// + /// Returns false if the value had a generation less than or equal to + /// an existing value + pub async fn put(&self, key: CacheKey, value: &CacheValue) -> Result<bool> { + let url = format!("{}{}", self.endpoint, RequestPath::Resource(key)); + + let response = self + .client + .put(url) + .timeout(RESOURCE_REQUEST_TIMEOUT) + .header(&GENERATION, value.generation) + .body(value.data.clone()) + .send() + .await + .context(PutSnafu)? + .error_for_status() + .context(PutSnafu)?; + + Ok(matches!(response.status(), StatusCode::OK)) + } + + /// List the contents of the remote cache + /// + /// Values larger than `max_value_size` will not be returned inline, with only the key + /// and generation returned instead. Defaults to [`MAX_VALUE_SIZE`] + pub fn list(&self, max_value_size: Option<usize>) -> ListStream { + let size = max_value_size.unwrap_or(MAX_VALUE_SIZE); + let url = format!("{}{}?size={size}", self.endpoint, RequestPath::List); + let fut = self.client.get(url).timeout(LIST_REQUEST_TIMEOUT).send(); + + futures::stream::once(fut.map_err(|source| Error::List { source })) + .and_then(move |response| futures::future::ready(list_stream(response, size))) + .try_flatten() + .boxed() + } +} + +struct ListStreamState { + response: Response, + current: Bytes, + decoder: ListDecoder, +} + +impl ListStreamState { + fn new(response: Response, max_value_size: usize) -> Self { + Self { + response, + current: Default::default(), + decoder: ListDecoder::new().with_max_value_size(max_value_size), + } + } +} + +fn list_stream( + response: Response, + max_value_size: usize, +) -> Result<impl Stream<Item = Result<ListEntry>>> { + let resp = response.error_for_status().context(ListSnafu)?; + let state = ListStreamState::new(resp, max_value_size); + Ok(stream::try_unfold(state, |mut state| async move { loop { if state.current.is_empty() { + match state.response.chunk().await.context(ListSnafu)? { + Some(new) => state.current = new, + None => break, + } + } + + let to_read = state.current.len(); + let read = state.decoder.decode(&state.current)?; + state.current.advance(read); + if read != to_read { + break; + } + } + Ok(state.decoder.flush()?.map(|entry| (entry, state))) + })) +} diff --git a/catalog_cache/src/api/list.rs b/catalog_cache/src/api/list.rs new file mode 100644 index 00000000000..155f7949196 --- /dev/null +++ b/catalog_cache/src/api/list.rs @@ -0,0 +1,467 @@ +//! The encoding mechanism for list streams +//! +//! This is capable of streaming both keys and values, this saves round-trips when hydrating +//!
a cache from a remote, and avoids creating a flood of HTTP GET requests + +use bytes::Bytes; +use snafu::{ensure, Snafu}; + +use crate::{CacheKey, CacheValue}; + +/// Error type for list streams +#[derive(Debug, Snafu)] +#[allow(missing_copy_implementations, missing_docs)] +pub enum Error { + #[snafu(display("Unexpected EOF whilst decoding list stream"))] + UnexpectedEOF, + + #[snafu(display("List value of {size} bytes too large"))] + ValueTooLarge { size: usize }, +} + +/// Result type for list streams +pub type Result<T, E = Error> = std::result::Result<T, E>; + +/// The size at which to flush [`Bytes`] from [`ListEncoder`] +pub const FLUSH_SIZE: usize = 1024 * 1024; // Flush in 1MB blocks + +/// The maximum value size to send in a [`ListEntry`] +/// +/// This primarily exists as a self-protection limit to prevent large or corrupted streams +/// from swamping the client, but also mitigates Head-Of-Line blocking resulting from +/// large cache values +pub const MAX_VALUE_SIZE: usize = 1024 * 10; + +/// Encodes [`ListEntry`] as an iterator of [`Bytes`] +/// +/// Each [`ListEntry`] is encoded as a `ListHeader`, followed by the value data +#[derive(Debug)] +pub struct ListEncoder { + /// The current offset into entries + offset: usize, + /// The ListEntry to encode + entries: Vec<ListEntry>, + /// The flush size, made configurable for testing + flush_size: usize, + /// The maximum value size to write + max_value_size: usize, +} + +impl ListEncoder { + /// Create a new [`ListEncoder`] from the provided [`ListEntry`] + pub fn new(entries: Vec<ListEntry>) -> Self { + Self { + entries, + offset: 0, + flush_size: FLUSH_SIZE, + max_value_size: MAX_VALUE_SIZE, + } + } + + /// Override the maximum value size to write + pub fn with_max_value_size(mut self, size: usize) -> Self { + self.max_value_size = size; + self + } +} + +impl Iterator for ListEncoder { + type Item = Bytes; + + fn next(&mut self) -> Option<Self::Item> { + if self.offset == self.entries.len() { + return None; + } + + let mut cap = 0; + let mut end_offset = self.offset; + while end_offset < self.entries.len() && cap < self.flush_size { + match &self.entries[end_offset].data { + Some(d) if d.len() <= self.max_value_size => cap += ListHeader::SIZE + d.len(), + _ => cap += ListHeader::SIZE, + }; + end_offset += 1; + } + + let mut buf = Vec::with_capacity(cap); + for entry in self.entries.iter().take(end_offset).skip(self.offset) { + match &entry.data { + Some(d) if d.len() <= self.max_value_size => { + buf.extend_from_slice(&entry.header(false).encode()); + buf.extend_from_slice(d) + } + _ => buf.extend_from_slice(&entry.header(true).encode()), + } + } + self.offset = end_offset; + Some(buf.into()) + } +} + +#[allow(non_snake_case)] +mod Flags { + /// The value is not included in this response + /// + /// [`ListEncoder`](super::ListEncoder) only sends inline values for values smaller than a + /// configured threshold + pub(crate) const HEAD: u8 = 1; +} + +/// The header encoded in a list stream +#[derive(Debug)] +struct ListHeader { + /// The size of the value + size: u32, + /// Reserved for future usage + reserved: u16, + /// A bitmask of [`Flags`] + flags: u8, + /// The variant of [`CacheKey`] + variant: u8, + /// The generation of this value + generation: u64, + /// The key contents of [`CacheKey`] + key: u128, +} + +impl ListHeader { + /// The encoded size of [`ListHeader`] + const SIZE: usize = 32; + + /// Encodes [`ListHeader`] to an array + fn encode(&self) -> [u8; Self::SIZE] { + let mut out = [0; Self::SIZE]; + out[..4].copy_from_slice(&self.size.to_le_bytes());
out[4..6].copy_from_slice(&self.reserved.to_le_bytes()); + out[6] = self.flags; + out[7] = self.variant; + out[8..16].copy_from_slice(&self.generation.to_le_bytes()); + out[16..32].copy_from_slice(&self.key.to_le_bytes()); + out + } + + /// Decodes [`ListHeader`] from an array + fn decode(buf: [u8; Self::SIZE]) -> Self { + Self { + size: u32::from_le_bytes(buf[..4].try_into().unwrap()), + reserved: u16::from_le_bytes(buf[4..6].try_into().unwrap()), + flags: buf[6], + variant: buf[7], + generation: u64::from_le_bytes(buf[8..16].try_into().unwrap()), + key: u128::from_le_bytes(buf[16..32].try_into().unwrap()), + } + } +} + +/// The state for [`ListDecoder`] +#[derive(Debug)] +enum DecoderState { + /// Decoding a header, contains the decoded data and the current offset + Header([u8; ListHeader::SIZE], usize), + /// Decoding value data for the given [`ListHeader`] + Body(ListHeader, Vec<u8>), +} + +impl Default for DecoderState { + fn default() -> Self { + Self::Header([0; ListHeader::SIZE], 0) + } +} + +/// Decodes [`ListEntry`] from a stream of bytes +#[derive(Debug)] +pub struct ListDecoder { + state: DecoderState, + max_size: usize, +} + +impl Default for ListDecoder { + fn default() -> Self { + Self { + state: DecoderState::default(), + max_size: MAX_VALUE_SIZE, + } + } +} + +impl ListDecoder { + /// Create a new [`ListDecoder`] + pub fn new() -> Self { + Self::default() + } + + /// Overrides the maximum value to deserialize + /// + /// Values larger than this will result in an error + /// Defaults to [`MAX_VALUE_SIZE`] + pub fn with_max_value_size(mut self, size: usize) -> Self { + self.max_size = size; + self + } + + /// Decode an entry from `buf`, returning the number of bytes consumed + /// + /// This is meant to be used in combination with [`Self::flush`] + pub fn decode(&mut self, mut buf: &[u8]) -> Result<usize> { + let initial = buf.len(); + while !buf.is_empty() { + match &mut self.state { + DecoderState::Header(header, offset) => { + let to_read = buf.len().min(ListHeader::SIZE - *offset); + header[*offset..*offset + to_read].copy_from_slice(&buf[..to_read]); + *offset += to_read; + buf = &buf[to_read..]; + + if *offset == ListHeader::SIZE { + let header = ListHeader::decode(*header); + let size = header.size as _; + ensure!(size <= self.max_size, ValueTooLargeSnafu { size }); + self.state = DecoderState::Body(header, Vec::with_capacity(size)) + } + } + DecoderState::Body(header, value) => { + let to_read = buf.len().min(header.size as usize - value.len()); + if to_read == 0 { + break; + } + value.extend_from_slice(&buf[..to_read]); + buf = &buf[to_read..]; + } + } + } + Ok(initial - buf.len()) + } + + /// Flush the contents of this [`ListDecoder`] + /// + /// Returns `Ok(Some(entry))` if a record is fully decoded + /// Returns `Ok(None)` if no in-progress record + /// Otherwise returns an error + pub fn flush(&mut self) -> Result<Option<ListEntry>> { + match std::mem::take(&mut self.state) { + DecoderState::Body(header, value) if value.len() == header.size as usize => { + Ok(Some(ListEntry { + variant: header.variant, + key: header.key, + generation: header.generation, + data: ((header.flags & Flags::HEAD) == 0).then(|| value.into()), + })) + } + DecoderState::Header(_, 0) => Ok(None), + _ => Err(Error::UnexpectedEOF), + } + } +} + +/// A key value pair encoded as part of a list +/// +/// Unlike [`CacheKey`] and [`CacheValue`] this allows: +/// +/// * Non-fatal handling of unknown key variants +/// * The option to not include the value data, e.g.
if too large +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct ListEntry { + variant: u8, + generation: u64, + key: u128, + data: Option<Bytes>, +} + +impl ListEntry { + /// Create a new [`ListEntry`] from the provided key and value + pub fn new(key: CacheKey, value: CacheValue) -> Self { + let (variant, key) = match key { + CacheKey::Namespace(v) => (b'n', v as _), + CacheKey::Table(v) => (b't', v as _), + CacheKey::Partition(v) => (b'p', v as _), + }; + + Self { + key, + variant, + generation: value.generation, + data: Some(value.data), + } + } + + /// The key if it matches a known variant of [`CacheKey`] + /// + /// Returns `None` otherwise + pub fn key(&self) -> Option<CacheKey> { + match self.variant { + b't' => Some(CacheKey::Table(self.key as _)), + b'n' => Some(CacheKey::Namespace(self.key as _)), + b'p' => Some(CacheKey::Partition(self.key as _)), + _ => None, + } + } + + /// The generation of this entry + pub fn generation(&self) -> u64 { + self.generation + } + + /// Returns the value data if present + pub fn value(&self) -> Option<&Bytes> { + self.data.as_ref() + } + + /// Returns the [`ListHeader`] for a given [`ListEntry`] + fn header(&self, head: bool) -> ListHeader { + let generation = self.generation; + let (flags, size) = match (head, &self.data) { + (false, Some(data)) => (0, data.len() as u32), + _ => (Flags::HEAD, 0), + }; + + ListHeader { + size, + flags, + variant: self.variant, + key: self.key, + generation, + reserved: 0, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use bytes::Buf; + use std::io::BufRead; + + fn decode_entries<R: BufRead>(mut r: R) -> Result<Vec<ListEntry>> { + let mut decoder = ListDecoder::default(); + let iter = std::iter::from_fn(move || { + loop { + let buf = r.fill_buf().unwrap(); + if buf.is_empty() { + break; + } + let to_read = buf.len(); + let read = decoder.decode(buf).unwrap(); + r.consume(read); + if read != to_read { + break; + } + } + decoder.flush().transpose() + }); + iter.collect() + } + + #[test] + fn test_roundtrip() { + let expected = vec![ + ListEntry::new(CacheKey::Namespace(2), CacheValue::new("test".into(), 32)), + ListEntry::new(CacheKey::Namespace(6), CacheValue::new("3".into(), 4)), + ListEntry { + variant: 0, + key: u128::MAX, + generation: u64::MAX, + data: Some("unknown".into()), + }, + ListEntry::new(CacheKey::Table(6), CacheValue::new("3".into(), 23)), + ListEntry { + variant: b'p', + key: 45, + generation: 23, + data: None, + }, + ListEntry::new( + CacheKey::Partition(3), + CacheValue::new("bananas".into(), 23), + ), + ]; + + let encoded: Vec<_> = ListEncoder::new(expected.clone()).collect(); + assert_eq!(encoded.len(), 1); // Expect entries to be encoded in single flush + + for buf_size in [3, 5, 12] { + let reader = std::io::BufReader::with_capacity(buf_size, encoded[0].clone().reader()); + let entries = decode_entries(reader).unwrap(); + assert_eq!(entries, expected); + + // Invalid key should not be fatal + assert_eq!(entries[2].key(), None); + // Head response should not be fatal + assert_eq!(entries[4].value(), None); + } + } + + #[test] + fn test_empty() { + let data: Vec<_> = ListEncoder::new(vec![]).collect(); + assert_eq!(data.len(), 0); + + let entries = decode_entries(std::io::Cursor::new([0_u8; 0])).unwrap(); + assert_eq!(entries.len(), 0); + } + + #[test] + fn test_flush_size() { + let data = Bytes::from(vec![0; 128]); + let entries = (0..1024) + .map(|x| ListEntry::new(CacheKey::Namespace(x), CacheValue::new(data.clone(), 0))) + .collect(); + + let mut encoder = ListEncoder::new(entries); + encoder.flush_size = 1024; // Lower
limit for test + + let mut remaining = 1024; + for block in encoder { + let expected = remaining.min(7); + assert_eq!(block.len(), (data.len() + ListHeader::SIZE) * expected); + let decoded = decode_entries(block.reader()).unwrap(); + assert_eq!(decoded.len(), expected); + remaining -= expected; + } + } + + #[test] + fn test_size_limit() { + let entries = vec![ + ListEntry::new( + CacheKey::Namespace(0), + CacheValue::new(vec![0; 128].into(), 0), + ), + ListEntry::new( + CacheKey::Namespace(1), + CacheValue::new(vec![0; 129].into(), 0), + ), + ListEntry::new( + CacheKey::Namespace(2), + CacheValue::new(vec![0; 128].into(), 0), + ), + ]; + + let mut encoder = ListEncoder::new(entries); + encoder.max_value_size = 128; // Artificially lower limit for test + + let encoded: Vec<_> = encoder.collect(); + assert_eq!(encoded.len(), 1); + + let decoded = decode_entries(encoded[0].clone().reader()).unwrap(); + assert_eq!(decoded[0].value().unwrap().len(), 128); + assert_eq!(decoded[1].value(), None); // Should omit value that is too large + assert_eq!(decoded[2].value().unwrap().len(), 128); + + let mut decoder = ListDecoder::new(); + decoder.max_size = 12; + let err = decoder.decode(&encoded[0]).unwrap_err().to_string(); + assert_eq!(err, "List value of 128 bytes too large"); + + let mut decoder = ListDecoder::new(); + decoder.max_size = 128; + + let consumed = decoder.decode(&encoded[0]).unwrap(); + let r = decoder.flush().unwrap().unwrap(); + assert_eq!(r.value().unwrap().len(), 128); + + // Next record skipped by encoder as too large + decoder.decode(&encoded[0][consumed..]).unwrap(); + let r = decoder.flush().unwrap().unwrap(); + assert_eq!(r.value(), None); + } +} diff --git a/catalog_cache/src/api/mod.rs b/catalog_cache/src/api/mod.rs new file mode 100644 index 00000000000..66d404229ae --- /dev/null +++ b/catalog_cache/src/api/mod.rs @@ -0,0 +1,159 @@ +//! 
The remote API for the catalog cache + +use crate::CacheKey; +use hyper::http::HeaderName; + +pub mod client; + +pub mod quorum; + +pub mod server; + +pub mod list; + +/// The header used to encode the generation in a get response +static GENERATION: HeaderName = HeaderName::from_static("x-influx-generation"); + +/// Defines the mapping to HTTP paths for given request types +#[derive(Debug, Eq, PartialEq)] +enum RequestPath { + /// A request addressing a resource identified by [`CacheKey`] + Resource(CacheKey), + /// A list request + List, +} + +impl RequestPath { + fn parse(s: &str) -> Option<Self> { + let s = s.strip_prefix('/').unwrap_or(s); + if s == "v1/" { + return Some(Self::List); + } + + let (prefix, value) = s.rsplit_once('/')?; + let value = u64::from_str_radix(value, 16).ok()?; + match prefix { + "v1/n" => Some(Self::Resource(CacheKey::Namespace(value as i64))), + "v1/t" => Some(Self::Resource(CacheKey::Table(value as i64))), + "v1/p" => Some(Self::Resource(CacheKey::Partition(value as i64))), + _ => None, + } + } +} + +impl std::fmt::Display for RequestPath { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::List => write!(f, "v1/"), + Self::Resource(CacheKey::Namespace(v)) => write!(f, "v1/n/{v:016x}"), + Self::Resource(CacheKey::Table(v)) => write!(f, "v1/t/{v:016x}"), + Self::Resource(CacheKey::Partition(v)) => write!(f, "v1/p/{v:016x}"), + } + } +} + +#[cfg(test)] +mod tests { + use crate::api::list::ListEntry; + use crate::api::server::test_util::TestCacheServer; + use crate::api::RequestPath; + use crate::{CacheKey, CacheValue}; + use futures::TryStreamExt; + use std::collections::HashSet; + + #[test] + fn test_request_path() { + let paths = [ + RequestPath::List, + RequestPath::Resource(CacheKey::Partition(12)), + RequestPath::Resource(CacheKey::Partition(i64::MAX)), + RequestPath::Resource(CacheKey::Partition(i64::MIN)), + RequestPath::Resource(CacheKey::Namespace(12)), + RequestPath::Resource(CacheKey::Namespace(i64::MAX)), + RequestPath::Resource(CacheKey::Namespace(i64::MIN)), + RequestPath::Resource(CacheKey::Table(12)), + RequestPath::Resource(CacheKey::Table(i64::MAX)), + RequestPath::Resource(CacheKey::Table(i64::MIN)), + ]; + + let mut set = HashSet::with_capacity(paths.len()); + for path in paths { + let s = path.to_string(); + let back = RequestPath::parse(&s).unwrap(); + assert_eq!(back, path); + assert!(set.insert(s), "should be unique"); + } + } + + #[tokio::test] + async fn test_basic() { + let serve = TestCacheServer::bind_ephemeral(); + let client = serve.client(); + + let key = CacheKey::Partition(1); + + let v1 = CacheValue::new("1".into(), 2); + assert!(client.put(key, &v1).await.unwrap()); + + let returned = client.get(key).await.unwrap().unwrap(); + assert_eq!(v1, returned); + + // Duplicate upsert ignored + assert!(!client.put(key, &v1).await.unwrap()); + + // Stale upsert ignored + let v2 = CacheValue::new("2".into(), 1); + assert!(!client.put(key, &v2).await.unwrap()); + + let returned = client.get(key).await.unwrap().unwrap(); + assert_eq!(v1, returned); + + let v3 = CacheValue::new("3".into(), 3); + assert!(client.put(key, &v3).await.unwrap()); + + let returned = client.get(key).await.unwrap().unwrap(); + assert_eq!(v3, returned); + + let key2 = CacheKey::Partition(5); + assert!(client.put(key2, &v1).await.unwrap()); + + let mut result = client.list(None).try_collect::<Vec<_>>().await.unwrap(); + result.sort_unstable_by_key(|entry| entry.key()); + + let expected = vec![ListEntry::new(key, v3),
ListEntry::new(key2, v1)]; + assert_eq!(result, expected); + + serve.shutdown().await; + } + + #[tokio::test] + async fn test_list_size() { + let serve = TestCacheServer::bind_ephemeral(); + let client = serve.client(); + + let v1 = CacheValue::new("123".into(), 2); + client.put(CacheKey::Table(1), &v1).await.unwrap(); + + let v2 = CacheValue::new("13".into(), 2); + client.put(CacheKey::Table(2), &v2).await.unwrap(); + + let v3 = CacheValue::new("1".into(), 2); + client.put(CacheKey::Table(3), &v3).await.unwrap(); + + let mut res = client.list(Some(2)).try_collect::<Vec<_>>().await.unwrap(); + res.sort_unstable_by_key(|x| x.key()); + + assert_eq!(res.len(), 3); + + assert_eq!(res[0].value(), None); + assert_eq!(res[1].value(), Some(&v2.data)); + assert_eq!(res[2].value(), Some(&v3.data)); + + let mut res = client.list(Some(3)).try_collect::<Vec<_>>().await.unwrap(); + res.sort_unstable_by_key(|x| x.key()); + + assert_eq!(res[0].value(), Some(&v1.data)); + assert_eq!(res[1].value(), Some(&v2.data)); + assert_eq!(res[2].value(), Some(&v3.data)); + } +} diff --git a/catalog_cache/src/api/quorum.rs b/catalog_cache/src/api/quorum.rs new file mode 100644 index 00000000000..17c4edf8bdd --- /dev/null +++ b/catalog_cache/src/api/quorum.rs @@ -0,0 +1,459 @@ +//! Client for performing quorum catalog reads/writes + +use crate::api::client::{CatalogCacheClient, Error as ClientError}; +use crate::local::CatalogCache; +use crate::{CacheKey, CacheValue}; +use futures::channel::oneshot; +use futures::future::{select, Either}; +use futures::{pin_mut, StreamExt}; +use snafu::{ResultExt, Snafu}; +use std::collections::HashMap; +use std::sync::Arc; +use tokio::task::JoinError; +use tokio_util::sync::CancellationToken; + +/// Error for [`QuorumCatalogCache`] +#[allow(missing_docs)] +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to communicate with any remote replica: {source}"))] + NoRemote { source: ClientError }, + + #[snafu(display("Write task was aborted"))] + Cancelled, + + #[snafu(display("Join Error: {source}"))] + Join { source: JoinError }, + + #[snafu(display("Failed to establish a read quorum: {generations:?}"))] + Quorum { + generations: [Result<Option<u64>, ClientError>; 3], + }, + + #[snafu(display("Failed to list replica: {source}"))] + List { source: ClientError }, + + #[snafu(display("Local cache error: {source}"), context(false))] + Local { source: crate::local::Error }, +} + +/// Result for [`QuorumCatalogCache`] +pub type Result<T, E = Error> = std::result::Result<T, E>; + +/// Performs quorum reads and writes across a local [`CatalogCache`] and two [`CatalogCacheClient`] +#[derive(Debug)] +pub struct QuorumCatalogCache { + local: Arc<CatalogCache>, + replicas: Arc<[CatalogCacheClient; 2]>, + shutdown: CancellationToken, +} + +impl Drop for QuorumCatalogCache { + fn drop(&mut self) { + self.shutdown.cancel() + } +} + +impl QuorumCatalogCache { + /// Create a new [`QuorumCatalogCache`] + pub fn new(local: Arc<CatalogCache>, replicas: Arc<[CatalogCacheClient; 2]>) -> Self { + Self { + local, + replicas, + shutdown: CancellationToken::new(), + } + } + + /// Retrieve the given value from the remote cache + /// + /// Returns `None` if value is not present in a quorum of replicas + /// Returns [`Error::Quorum`] if cannot establish a read quorum + pub async fn get(&self, key: CacheKey) -> Result<Option<CacheValue>> { + let local = self.local.get(key); + + let fut1 = self.replicas[0].get(key); + let fut2 = self.replicas[1].get(key); + pin_mut!(fut1); + pin_mut!(fut2); + + match select(fut1, fut2).await { + Either::Left((result, fut)) | Either::Right((result, fut)) =>
match (local, result) { + (None, Ok(None)) => Ok(None), + (Some(l), Ok(Some(r))) if l.generation <= r.generation => { + // preempt write from remote to local that arrives late + if l.generation < r.generation { + self.local.insert(key, r.clone())?; + } + Ok(Some(r)) + } + (local, r1) => { + // r1 either failed or did not return anything + let r2 = fut.await; + match (local, r1, r2) { + (None, _, Ok(None)) | (_, Ok(None), Ok(None)) => Ok(None), + (Some(l), _, Ok(Some(r))) if l.generation <= r.generation => { + // preempt write from remote to local that arrives late + if l.generation < r.generation { + self.local.insert(key, r.clone())?; + } + Ok(Some(r)) + } + (local, Ok(Some(l)), Ok(Some(r))) if l.generation == r.generation => { + if local.map(|x| x.generation < l.generation).unwrap_or(true) { + self.local.insert(key, l.clone())?; + } + Ok(Some(l)) + } + (l, r1, r2) => Err(Error::Quorum { + generations: [ + Ok(l.map(|x| x.generation)), + r1.map(|x| x.map(|x| x.generation)), + r2.map(|x| x.map(|x| x.generation)), + ], + }), + } + } + }, + } + } + + /// Upsert the given key-value pair + /// + /// Returns Ok if able to replicate the write to a quorum + pub async fn put(&self, key: CacheKey, value: CacheValue) -> Result<()> { + self.local.insert(key, value.clone())?; + + let replicas = Arc::clone(&self.replicas); + let (sender, receiver) = oneshot::channel(); + + let fut = async move { + let fut1 = replicas[0].put(key, &value); + let fut2 = replicas[1].put(key, &value); + pin_mut!(fut1); + pin_mut!(fut2); + + match select(fut1, fut2).await { + Either::Left((r, fut)) | Either::Right((r, fut)) => { + let _ = sender.send(r); + fut.await + } + } + }; + + // We spawn a tokio task so that we can potentially continue to replicate + // to the second replica asynchronously once we receive an ok response + let cancel = self.shutdown.child_token(); + let handle = tokio::spawn(async move { + let cancelled = cancel.cancelled(); + pin_mut!(fut); + pin_mut!(cancelled); + match select(cancelled, fut).await { + Either::Left(_) => Err(Error::Cancelled), + Either::Right((Ok(_), _)) => Ok(()), + Either::Right((Err(source), _)) => Err(Error::NoRemote { source }), + } + }); + + match receiver.await { + Ok(Ok(_)) => Ok(()), + _ => match handle.await { + Ok(r) => r, + Err(source) => Err(Error::Join { source }), + }, + } + } + + /// Warm the local cache by performing quorum reads from the other two replicas + /// + /// This method should be called after this server has been participating in the write quorum + /// for a period of time, e.g. 1 minute. This avoids an issue where a quorum cannot be + /// established for in-progress writes. + pub async fn warm(&self) -> Result<()> { + // List doesn't return keys in any particular order + // + // We therefore build a hashmap with the keys from one replica and compare + // this against those returned by the other + // + // We don't need to consult the local `CatalogCache`, as we only need to insert + // if a read quorum can be established between the replicas and isn't present locally + let mut generations = HashMap::with_capacity(128); + let mut list = self.replicas[0].list(Some(0)); + while let Some(entry) = list.next().await.transpose().context(ListSnafu)? { + if let Some(k) = entry.key() { + generations.insert(k, entry.generation()); + } + } + + let mut list = self.replicas[1].list(None); + while let Some(entry) = list.next().await.transpose().context(ListSnafu)? 
{ + if let Some(k) = entry.key() { + match (generations.get(&k), entry.value()) { + (Some(generation), Some(v)) if *generation == entry.generation() => { + let value = CacheValue::new(v.clone(), *generation); + // In the case that local already has the given version + // this will be a no-op + self.local.insert(k, value)?; + } + _ => {} + } + } + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::api::server::test_util::TestCacheServer; + use std::future::Future; + use std::task::Context; + use std::time::Duration; + + #[tokio::test] + async fn test_basic() { + let local = Arc::new(CatalogCache::default()); + let r1 = TestCacheServer::bind_ephemeral(); + let r2 = TestCacheServer::bind_ephemeral(); + + let replicas = Arc::new([r1.client(), r2.client()]); + let quorum = QuorumCatalogCache::new(Arc::clone(&local), Arc::clone(&replicas)); + + assert_eq!(quorum.get(CacheKey::Table(1)).await.unwrap(), None); + + let k1 = CacheKey::Table(1); + let k2 = CacheKey::Table(2); + let k3 = CacheKey::Table(3); + + let v1 = CacheValue::new("foo".into(), 2); + quorum.put(k1, v1.clone()).await.unwrap(); + quorum.put(k2, v1.clone()).await.unwrap(); + + let r = quorum.get(k2).await.unwrap().unwrap(); + assert_eq!(r, v1); + + // New value + let v2 = CacheValue::new("foo".into(), 4); + quorum.put(k2, v2.clone()).await.unwrap(); + + let r = quorum.get(k1).await.unwrap().unwrap(); + assert_eq!(r, v1); + + let r = quorum.get(k2).await.unwrap().unwrap(); + assert_eq!(r, v2); + + // Can remove value from one replica and still get quorum + r2.cache().delete(k2).unwrap(); + let r = quorum.get(k2).await.unwrap().unwrap(); + assert_eq!(r, v2); + + // Loss of two copies results in not found + r1.cache().delete(k2).unwrap(); + let r = quorum.get(k2).await.unwrap(); + assert_eq!(r, None); + + // Simulate stale value in r1 + r1.cache().insert(k2, v1.clone()).unwrap(); + let err = quorum.get(k2).await.unwrap_err(); + assert!(matches!(err, Error::Quorum { .. }), "{err}"); + + // If quorum has stale value follows quorum + r2.cache().delete(k2); + r2.cache().insert(k2, v1.clone()).unwrap(); + let r = quorum.get(k2).await.unwrap().unwrap(); + assert_eq!(r, v1); + + // Simulate loss of replica 2 + r2.shutdown().await; + + // Can still establish a write quorum + quorum.put(k3, v1.clone()).await.unwrap(); + + // Can read newly inserted value + let r = quorum.get(k3).await.unwrap().unwrap(); + assert_eq!(r, v1); + + // Can still read from quorum of k1 + let r = quorum.get(k1).await.unwrap().unwrap(); + assert_eq!(r, v1); + + // Cannot get quorum as lost single node and local disagrees with replica 1 + let err = quorum.get(k2).await.unwrap_err(); + assert!(matches!(err, Error::Quorum { .. }), "{err}"); + + // Can establish quorum following write + quorum.put(k2, v2.clone()).await.unwrap(); + let r = quorum.get(k2).await.unwrap().unwrap(); + assert_eq!(r, v2); + + // Still cannot establish quorum + r1.cache().delete(k2); + let err = quorum.get(k2).await.unwrap_err(); + assert!(matches!(err, Error::Quorum { .. }), "{err}"); + + // k2 is now no longer present anywhere, can establish quorum + local.delete(k2); + let r = quorum.get(k2).await.unwrap(); + assert_eq!(r, None); + + // Simulate loss of replica 1 (in addition to replica 2) + r1.shutdown().await; + + // Can no longer get quorum for anything + let err = quorum.get(k1).await.unwrap_err(); + assert!(matches!(err, Error::Quorum { .. 
}), "{err}"); + } + + #[tokio::test] + async fn test_read_through() { + let local = Arc::new(CatalogCache::default()); + let r1 = TestCacheServer::bind_ephemeral(); + let r2 = TestCacheServer::bind_ephemeral(); + + let replicas = Arc::new([r1.client(), r2.client()]); + let quorum = QuorumCatalogCache::new(Arc::clone(&local), Arc::clone(&replicas)); + + let key = CacheKey::Table(1); + let v0 = CacheValue::new("v0".into(), 0); + + r1.cache().insert(key, v0.clone()).unwrap(); + r2.cache().insert(key, v0.clone()).unwrap(); + + let result = quorum.get(key).await.unwrap().unwrap(); + assert_eq!(result, v0); + + // Should have read-through to local + assert_eq!(local.get(key).unwrap(), v0); + + let v1 = CacheValue::new("v1".into(), 1); + let v2 = CacheValue::new("v2".into(), 2); + + r1.cache().insert(key, v1.clone()).unwrap(); + r2.cache().insert(key, v2.clone()).unwrap(); + + // A quorum request will get either v1 or v2 depending on which it contacts first + let result = quorum.get(key).await.unwrap().unwrap(); + assert!(result == v1 || result == v2, "{result:?}"); + + // Should read-through + assert_eq!(local.get(key).unwrap(), result); + + // Update r1 with version 2 + r1.cache().insert(key, v2.clone()).unwrap(); + + let result = quorum.get(key).await.unwrap().unwrap(); + assert_eq!(result, v2); + + // Should read-through + assert_eq!(local.get(key).unwrap(), v2); + + let v3 = CacheValue::new("v3".into(), 3); + local.insert(key, v3.clone()).unwrap(); + + // Should establish quorum for v2 even though local is v3 + let result = quorum.get(key).await.unwrap().unwrap(); + assert_eq!(result, v2); + + // Should not read-through + assert_eq!(local.get(key).unwrap(), v3); + + let v4 = CacheValue::new("v4".into(), 4); + let v5 = CacheValue::new("v5".into(), 5); + + local.insert(key, v5.clone()).unwrap(); + r1.cache().insert(key, v4.clone()).unwrap(); + + // Should fail as cannot establish quorum of three different versions of `[5, 4, 2]` + // and has latest version locally + let err = quorum.get(key).await.unwrap_err(); + assert!(matches!(err, Error::Quorum { .. 
}), "{err}"); + assert_eq!(local.get(key).unwrap(), v5); + + let v6 = CacheValue::new("v6".into(), 6); + r1.cache().insert(key, v6.clone()).unwrap(); + + // Should succeed as r1 has newer version than local + let result = quorum.get(key).await.unwrap().unwrap(); + assert_eq!(result, v6); + + // Should read-through + assert_eq!(local.get(key).unwrap(), v6); + } + + #[tokio::test] + async fn test_warm() { + let local = Arc::new(CatalogCache::default()); + let r1 = TestCacheServer::bind_ephemeral(); + let r2 = TestCacheServer::bind_ephemeral(); + + let replicas = Arc::new([r1.client(), r2.client()]); + let quorum = QuorumCatalogCache::new(local, Arc::clone(&replicas)); + + let k1 = CacheKey::Table(1); + let v1 = CacheValue::new("v1".into(), 1); + quorum.put(k1, v1.clone()).await.unwrap(); + + let k2 = CacheKey::Table(2); + let v2 = CacheValue::new("v2".into(), 1); + quorum.put(k2, v2.clone()).await.unwrap(); + + // Simulate local restart + let local = Arc::new(CatalogCache::default()); + let quorum = QuorumCatalogCache::new(Arc::clone(&local), Arc::clone(&replicas)); + + assert_eq!(local.list().count(), 0); + + quorum.warm().await.unwrap(); + + // Should populate both entries + let mut entries: Vec<_> = local.list().collect(); + entries.sort_unstable_by_key(|(k, _)| *k); + assert_eq!(entries, vec![(k1, v1.clone()), (k2, v2.clone())]); + + // Simulate local restart + let local = Arc::new(CatalogCache::default()); + let quorum = QuorumCatalogCache::new(Arc::clone(&local), Arc::clone(&replicas)); + + // Simulate in-progress write + let v3 = CacheValue::new("v3".into(), 2); + assert!(r1.cache().insert(k2, v3.clone()).unwrap()); + + // Cannot establish quorum for k1 so should skip over + quorum.warm().await.unwrap(); + let entries: Vec<_> = local.list().collect(); + assert_eq!(entries.len(), 1); + assert_eq!(entries[0], (k1, v1.clone())); + + // If r2 updated warming should pick up new quorum + assert!(r2.cache().insert(k2, v3.clone()).unwrap()); + quorum.warm().await.unwrap(); + let mut entries: Vec<_> = local.list().collect(); + entries.sort_unstable_by_key(|(k, _)| *k); + assert_eq!(entries, vec![(k1, v1), (k2, v3)]); + + // Test cancellation safety + let k3 = CacheKey::Table(3); + let fut = quorum.put(k3, v2.clone()); + { + // `fut` is dropped (cancelled) on exit from this code block + pin_mut!(fut); + + let noop_waker = futures::task::noop_waker(); + let mut cx = Context::from_waker(&noop_waker); + assert!(fut.poll(&mut cx).is_pending()); + } + + // Write should still propagate asynchronously + let mut attempts = 0; + loop { + tokio::time::sleep(Duration::from_millis(1)).await; + match quorum.get(k3).await { + Ok(Some(_)) => break, + _ => { + assert!(attempts < 100); + attempts += 1; + } + } + } + } +} diff --git a/catalog_cache/src/api/server.rs b/catalog_cache/src/api/server.rs new file mode 100644 index 00000000000..b29d841f880 --- /dev/null +++ b/catalog_cache/src/api/server.rs @@ -0,0 +1,300 @@ +//! 
Server for the cache HTTP API + +use crate::api::list::{ListEncoder, ListEntry}; +use crate::api::{RequestPath, GENERATION}; +use crate::local::CatalogCache; +use crate::CacheValue; +use futures::ready; +use hyper::body::HttpBody; +use hyper::header::ToStrError; +use hyper::http::request::Parts; +use hyper::service::Service; +use hyper::{Body, Method, Request, Response, StatusCode}; +use snafu::{OptionExt, ResultExt, Snafu}; +use std::convert::Infallible; +use std::future::Future; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +enum Error { + #[snafu(display("Http error: {source}"), context(false))] + Http { source: hyper::http::Error }, + + #[snafu(display("Hyper error: {source}"), context(false))] + Hyper { source: hyper::Error }, + + #[snafu(display("Local cache error: {source}"), context(false))] + Local { source: crate::local::Error }, + + #[snafu(display("Non UTF-8 Header: {source}"))] + BadHeader { source: ToStrError }, + + #[snafu(display("Request missing generation header"))] + MissingGeneration, + + #[snafu(display("Invalid generation header: {source}"))] + InvalidGeneration { source: std::num::ParseIntError }, + + #[snafu(display("List query missing size"))] + MissingSize, + + #[snafu(display("List query invalid size: {source}"))] + InvalidSize { source: std::num::ParseIntError }, +} + +impl Error { + /// Convert an error into a [`Response`] + fn response(self) -> Response { + let mut response = Response::new(Body::from(self.to_string())); + *response.status_mut() = match &self { + Self::Http { .. } | Self::Hyper { .. } | Self::Local { .. } => { + StatusCode::INTERNAL_SERVER_ERROR + } + Self::InvalidGeneration { .. } + | Self::MissingGeneration + | Self::InvalidSize { .. } + | Self::MissingSize + | Self::BadHeader { .. 
} => StatusCode::BAD_REQUEST, + }; + response + } +} + +/// A [`Service`] that wraps a [`CatalogCache`] +#[derive(Debug, Clone)] +pub struct CatalogCacheService(Arc); + +/// Shared state for [`CatalogCacheService`] +#[derive(Debug)] +struct ServiceState { + cache: Arc, +} + +impl Service> for CatalogCacheService { + type Response = Response; + + type Error = Infallible; + type Future = CatalogRequestFuture; + + fn poll_ready(&mut self, _cx: &mut Context<'_>) -> Poll> { + Poll::Ready(Ok(())) + } + + fn call(&mut self, req: Request) -> Self::Future { + let (parts, body) = req.into_parts(); + CatalogRequestFuture { + parts, + body, + buffer: vec![], + state: Arc::clone(&self.0), + } + } +} + +/// The future for [`CatalogCacheService`] +#[derive(Debug)] +pub struct CatalogRequestFuture { + /// The request body + body: Body, + /// The request parts + parts: Parts, + /// The in-progress body + /// + /// We use Vec not Bytes to ensure the cache isn't storing slices of large allocations + buffer: Vec, + /// The cache to service requests + state: Arc, +} + +impl Future for CatalogRequestFuture { + type Output = Result, Infallible>; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let r = loop { + match ready!(Pin::new(&mut self.body).poll_data(cx)) { + Some(Ok(b)) => self.buffer.extend_from_slice(&b), + Some(Err(e)) => break Err(e.into()), + None => break Ok(()), + } + }; + Poll::Ready(Ok(match r.and_then(|_| self.call()) { + Ok(resp) => resp, + Err(e) => e.response(), + })) + } +} + +impl CatalogRequestFuture { + fn call(&mut self) -> Result, Error> { + let body = std::mem::take(&mut self.buffer); + + let status = match RequestPath::parse(self.parts.uri.path()) { + Some(RequestPath::List) => match self.parts.method { + Method::GET => { + let query = self.parts.uri.query().context(MissingSizeSnafu)?; + let mut parts = url::form_urlencoded::parse(query.as_bytes()); + let (_, size) = parts.find(|(k, _)| k == "size").context(MissingSizeSnafu)?; + let size = size.parse().context(InvalidSizeSnafu)?; + + let iter = self.state.cache.list(); + let entries = iter.map(|(k, v)| ListEntry::new(k, v)).collect(); + let encoder = ListEncoder::new(entries).with_max_value_size(size); + + let stream = futures::stream::iter(encoder.map(Ok::<_, Error>)); + let response = Response::builder().body(Body::wrap_stream(stream))?; + return Ok(response); + } + _ => StatusCode::METHOD_NOT_ALLOWED, + }, + Some(RequestPath::Resource(key)) => match self.parts.method { + Method::GET => match self.state.cache.get(key) { + Some(value) => { + let response = Response::builder() + .header(&GENERATION, value.generation) + .body(value.data.into())?; + return Ok(response); + } + None => StatusCode::NOT_FOUND, + }, + Method::PUT => { + let headers = &self.parts.headers; + let generation = headers.get(&GENERATION).context(MissingGenerationSnafu)?; + let generation = generation.to_str().context(BadHeaderSnafu)?; + let generation = generation.parse().context(InvalidGenerationSnafu)?; + let value = CacheValue::new(body.into(), generation); + + match self.state.cache.insert(key, value)? 
{ + true => StatusCode::OK, + false => StatusCode::NOT_MODIFIED, + } + } + Method::DELETE => { + self.state.cache.delete(key); + StatusCode::OK + } + _ => StatusCode::METHOD_NOT_ALLOWED, + }, + None => StatusCode::NOT_FOUND, + }; + + let mut response = Response::new(Body::empty()); + *response.status_mut() = status; + Ok(response) + } +} + +/// Runs a [`CatalogCacheService`] in a background task +/// +/// Will abort the background task on drop +#[derive(Debug)] +pub struct CatalogCacheServer { + state: Arc, +} + +impl CatalogCacheServer { + /// Create a new [`CatalogCacheServer`]. + /// + /// Note that the HTTP interface needs to be wired up in some higher-level structure. Use [`service`](Self::service) + /// for that. + pub fn new(cache: Arc) -> Self { + let state = Arc::new(ServiceState { cache }); + + Self { state } + } + + /// Returns HTTP service. + pub fn service(&self) -> CatalogCacheService { + CatalogCacheService(Arc::clone(&self.state)) + } + + /// Returns a reference to the [`CatalogCache`] of this server + pub fn cache(&self) -> &Arc { + &self.state.cache + } +} + +/// Test utilities. +pub mod test_util { + use std::{net::SocketAddr, ops::Deref}; + + use hyper::{service::make_service_fn, Server}; + use tokio::task::JoinHandle; + use tokio_util::sync::CancellationToken; + + use crate::api::client::CatalogCacheClient; + + use super::*; + + /// Test runner for a [`CatalogCacheServer`]. + #[derive(Debug)] + pub struct TestCacheServer { + addr: SocketAddr, + server: CatalogCacheServer, + shutdown: CancellationToken, + handle: Option>, + } + + impl TestCacheServer { + /// Create a new [`TestCacheServer`] bound to an ephemeral port + pub fn bind_ephemeral() -> Self { + Self::bind(&SocketAddr::from(([127, 0, 0, 1], 0))) + } + + /// Create a new [`CatalogCacheServer`] bound to the provided [`SocketAddr`] + pub fn bind(addr: &SocketAddr) -> Self { + let server = CatalogCacheServer::new(Arc::new(CatalogCache::default())); + let service = server.service(); + let make_service = make_service_fn(move |_conn| { + futures::future::ready(Ok::<_, Infallible>(service.clone())) + }); + + let hyper_server = Server::bind(addr).serve(make_service); + let addr = hyper_server.local_addr(); + + let shutdown = CancellationToken::new(); + let signal = shutdown.clone().cancelled_owned(); + let graceful = hyper_server.with_graceful_shutdown(signal); + let handle = Some(tokio::spawn(async move { graceful.await.unwrap() })); + + Self { + addr, + server, + shutdown, + handle, + } + } + + /// Returns a [`CatalogCacheClient`] for communicating with this server + pub fn client(&self) -> CatalogCacheClient { + let addr = format!("http://{}", self.addr); + CatalogCacheClient::try_new(addr.parse().unwrap()).unwrap() + } + + /// Triggers and waits for graceful shutdown + pub async fn shutdown(mut self) { + self.shutdown.cancel(); + if let Some(x) = self.handle.take() { + x.await.unwrap() + } + } + } + + impl Deref for TestCacheServer { + type Target = CatalogCacheServer; + + fn deref(&self) -> &Self::Target { + &self.server + } + } + + impl Drop for TestCacheServer { + fn drop(&mut self) { + if let Some(x) = &self.handle { + x.abort() + } + } + } +} diff --git a/catalog_cache/src/lib.rs b/catalog_cache/src/lib.rs new file mode 100644 index 00000000000..037044899ee --- /dev/null +++ b/catalog_cache/src/lib.rs @@ -0,0 +1,143 @@ +//! Consistent cache system used by the catalog service +//! +//! # Design +//! +//! The catalog service needs to be able to service queries without needing to communicate +//! 
with its underlying backing store. This serves the dual purpose of reducing load on this +//! backing store, and also returning results in a more timely manner. +//! +//! This caching must be transparent to the users of the catalog service, and therefore must not +//! introduce eventually consistent behaviour, or other consistency effects. +//! +//! As such this crate provides a strongly-consistent, distributed key-value cache. +//! +//! In order to keep things simple, this only provides a mapping from [`CacheKey`] to opaque +//! binary payloads, with no support for structured payloads. +//! +//! This avoids: +//! +//! * Complex replicated state machines +//! * Forward compatibility challenges where newer data can't roundtrip through older servers +//! * Simple to introspect, debug and reason about +//! * Predictable and easily quantifiable memory usage +//! +//! However, it does have the following implications: +//! +//! * Care must be taken to ensure that parsing of the cached payloads does not become a bottleneck +//! * Large values (> 1MB) should be avoided, as updates will resend the entire value +//! +//! ## Components +//! +//! This crate is broken into multiple parts +//! +//! * [`CatalogCache`] provides a local key value store +//! * [`CatalogCacheService`] exposes this [`CatalogCache`] over an HTTP API +//! * [`CatalogCacheClient`] communicates with a remote [`CatalogCacheService`] +//! * [`QuorumCatalogCache`] combines the above into a strongly-consistent distributed cache +//! +//! [`CatalogCache`]: local::CatalogCache +//! [`CatalogCacheClient`]: api::client::CatalogCacheClient +//! [`CatalogCacheService`]: api::server::CatalogCacheService +//! [`QuorumCatalogCache`]: api::quorum::QuorumCatalogCache +//! +#![deny(rustdoc::broken_intra_doc_links, rust_2018_idioms)] +#![warn( + missing_copy_implementations, + missing_docs, + clippy::explicit_iter_loop, + // See https://github.com/influxdata/influxdb_iox/pull/1671 + clippy::future_not_send, + clippy::use_self, + clippy::clone_on_ref_ptr, + clippy::todo, + clippy::dbg_macro, + unused_crate_dependencies +)] + +// Workaround for "unused crate" lint false positives. +use workspace_hack as _; + +use bytes::Bytes; +use std::sync::atomic::AtomicBool; + +pub mod api; +pub mod local; + +/// The types of catalog cache key +#[derive(Debug, Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd)] +pub enum CacheKey { + /// A catalog namespace + Namespace(i64), + /// A catalog table + Table(i64), + /// A catalog partition + Partition(i64), +} + +impl CacheKey { + /// Variant as string. + /// + /// This can be used for logging and metrics. + pub fn variant(&self) -> &'static str { + match self { + Self::Namespace(_) => "namespace", + Self::Table(_) => "table", + Self::Partition(_) => "partition", + } + } + + /// Untyped ID. 
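How these pieces compose can be hard to see from the individual files, so here is a minimal usage sketch. It is illustrative only and not part of this patch: the peer URLs are assumptions, and it relies solely on APIs that appear in this diff (`CatalogCache`, `CatalogCacheClient::try_new`, `QuorumCatalogCache::new`, `put`, `get`).

```rust
use std::sync::Arc;

use catalog_cache::api::client::CatalogCacheClient;
use catalog_cache::api::quorum::QuorumCatalogCache;
use catalog_cache::local::CatalogCache;
use catalog_cache::{CacheKey, CacheValue};

async fn example() -> Result<(), Box<dyn std::error::Error>> {
    // The local node participates in the quorum alongside two remote peers
    // (peer addresses here are placeholders).
    let local = Arc::new(CatalogCache::default());
    let replicas = Arc::new([
        CatalogCacheClient::try_new("http://peer1:8080".parse()?)?,
        CatalogCacheClient::try_new("http://peer2:8080".parse()?)?,
    ]);
    let quorum = QuorumCatalogCache::new(Arc::clone(&local), replicas);

    // A write lands in the local cache and at least one replica before returning.
    quorum
        .put(CacheKey::Table(42), CacheValue::new("payload".into(), 1))
        .await?;

    // A read succeeds once two of the three copies agree on a generation.
    let value = quorum.get(CacheKey::Table(42)).await?;
    assert_eq!(value.map(|v| v.generation()), Some(1));
    Ok(())
}
```

In a full deployment the local cache would also be exposed over HTTP via `CatalogCacheServer`, so that the peers can in turn read from this node.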
+ pub fn id(&self) -> i64 { + match self { + Self::Namespace(id) => *id, + Self::Table(id) => *id, + Self::Partition(id) => *id, + } + } +} + +/// A value stored in [`CatalogCache`](local::CatalogCache) +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct CacheValue { + /// The data stored for this cache + data: Bytes, + /// The generation of this cache data + generation: u64, +} + +impl CacheValue { + /// Create a new [`CacheValue`] with the provided `data` and `generation` + pub fn new(data: Bytes, generation: u64) -> Self { + Self { data, generation } + } + + /// The data stored for this cache + pub fn data(&self) -> &Bytes { + &self.data + } + + /// The generation of this cache data + pub fn generation(&self) -> u64 { + self.generation + } +} + +/// Combines a [`CacheValue`] with an [`AtomicBool`] for the purposes of NRU-eviction +#[derive(Debug)] +struct CacheEntry { + /// The value of this cache entry + value: CacheValue, + /// An atomic flag that is set to `true` by `CatalogCache::get` and + /// cleared by `CatalogCache::evict_unused` + used: AtomicBool, +} + +impl From for CacheEntry { + fn from(value: CacheValue) -> Self { + Self { + value, + // Values start used to prevent racing with `evict_unused` + used: AtomicBool::new(true), + } + } +} diff --git a/catalog_cache/src/local/limit.rs b/catalog_cache/src/local/limit.rs new file mode 100644 index 00000000000..6c38fee5a82 --- /dev/null +++ b/catalog_cache/src/local/limit.rs @@ -0,0 +1,82 @@ +//! A memory limiter + +use super::{Error, Result}; +use std::sync::atomic::{AtomicUsize, Ordering}; + +#[derive(Debug)] +pub(crate) struct MemoryLimiter { + current: AtomicUsize, + limit: usize, +} + +impl MemoryLimiter { + /// Create a new [`MemoryLimiter`] limited to `limit` bytes + pub(crate) fn new(limit: usize) -> Self { + Self { + current: AtomicUsize::new(0), + limit, + } + } + + /// Reserve `size` bytes, returning an error if this would exceed the limit + pub(crate) fn reserve(&self, size: usize) -> Result<()> { + let limit = self.limit; + let max = limit + .checked_sub(size) + .ok_or(Error::TooLarge { size, limit })?; + + // We can use relaxed ordering as not relying on this to + // synchronise memory accesses beyond itself + self.current + .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |current| { + // This cannot overflow as current + size <= limit + (current <= max).then_some(current + size) + }) + .map_err(|current| Error::OutOfMemory { + size, + current, + limit, + })?; + Ok(()) + } + + /// Free `size` bytes + pub(crate) fn free(&self, size: usize) { + self.current.fetch_sub(size, Ordering::Relaxed); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_limiter() { + let limiter = MemoryLimiter::new(100); + + limiter.reserve(20).unwrap(); + limiter.reserve(70).unwrap(); + + let err = limiter.reserve(20).unwrap_err().to_string(); + assert_eq!(err, "Cannot reserve additional 20 bytes for cache containing 90 bytes as would exceed limit of 100 bytes"); + + limiter.reserve(10).unwrap(); + limiter.reserve(0).unwrap(); + + let err = limiter.reserve(1).unwrap_err().to_string(); + assert_eq!(err, "Cannot reserve additional 1 bytes for cache containing 100 bytes as would exceed limit of 100 bytes"); + + limiter.free(10); + limiter.reserve(10).unwrap(); + + limiter.free(100); + + // Can add single value taking entire range + limiter.reserve(100).unwrap(); + limiter.free(100); + + // Protected against overflow + let err = limiter.reserve(usize::MAX).unwrap_err(); + assert!(matches!(err, Error::TooLarge { .. 
}), "{err}"); + } +} diff --git a/catalog_cache/src/local/mod.rs b/catalog_cache/src/local/mod.rs new file mode 100644 index 00000000000..373dd628112 --- /dev/null +++ b/catalog_cache/src/local/mod.rs @@ -0,0 +1,355 @@ +//! A local in-memory cache + +mod limit; + +use crate::local::limit::MemoryLimiter; +use crate::{CacheEntry, CacheKey, CacheValue}; +use dashmap::mapref::entry::Entry; +use dashmap::DashMap; +use snafu::Snafu; +use std::sync::atomic::Ordering; +use std::sync::Arc; + +/// Error for [`CatalogCache`] +#[derive(Debug, Snafu)] +#[allow(missing_docs, missing_copy_implementations)] +pub enum Error { + #[snafu(display("Cannot reserve additional {size} bytes for cache containing {current} bytes as would exceed limit of {limit} bytes"))] + OutOfMemory { + size: usize, + current: usize, + limit: usize, + }, + + #[snafu(display("Cannot reserve additional {size} bytes for cache as request exceeds total memory limit of {limit} bytes"))] + TooLarge { size: usize, limit: usize }, +} + +/// Result for [`CatalogCache`] +pub type Result = std::result::Result; + +/// A trait for observing updated to [`CatalogCache`] +/// +/// This can be used for injecting metrics, maintaining secondary indices or otherwise +/// +/// Note: members are invoked under locks in [`CatalogCache`] and should therefore +/// be short-running and not call back into [`CatalogCache`] +pub trait CatalogCacheObserver: std::fmt::Debug + Send + Sync { + /// Called before a value is potentially inserted into [`CatalogCache`] + /// + /// This is called regardless of it [`CatalogCache`] already contains the value + fn insert(&self, key: CacheKey, new: &CacheValue, old: Option<&CacheValue>); + + /// A key removed from the [`CatalogCache`] + fn evict(&self, key: CacheKey, value: &CacheValue); +} + +/// A concurrent Not-Recently-Used cache mapping [`CacheKey`] to [`CacheValue`] +#[derive(Debug, Default)] +pub struct CatalogCache { + map: DashMap, + observer: Option>, + limit: Option, +} + +impl CatalogCache { + /// Create a new `CatalogCache` with an optional memory limit + pub fn new(limit: Option) -> Self { + Self { + limit: limit.map(MemoryLimiter::new), + ..Default::default() + } + } + + /// Sets a [`CatalogCacheObserver`] for this [`CatalogCache`] + pub fn with_observer(self, observer: Arc) -> Self { + Self { + observer: Some(observer), + ..self + } + } + + /// Returns the value for `key` if it exists + pub fn get(&self, key: CacheKey) -> Option { + let entry = self.map.get(&key)?; + entry.used.store(true, Ordering::Relaxed); + Some(entry.value.clone()) + } + + /// Insert the given `value` into the cache + /// + /// Skips insertion and returns false iff an entry already exists with the + /// same or greater generation + pub fn insert(&self, key: CacheKey, value: CacheValue) -> Result { + match self.map.entry(key) { + Entry::Occupied(mut o) => { + let old = &o.get().value; + if value.generation <= old.generation { + return Ok(false); + } + if let Some(l) = &self.limit { + let new_len = value.data.len(); + let cur_len = old.data.len(); + match new_len > cur_len { + true => l.reserve(new_len - cur_len)?, + false => l.free(cur_len - new_len), + } + } + if let Some(v) = &self.observer { + v.insert(key, &value, Some(old)); + } + o.insert(value.into()); + } + Entry::Vacant(v) => { + if let Some(l) = &self.limit { + l.reserve(value.data.len())?; + } + if let Some(v) = &self.observer { + v.insert(key, &value, None); + } + v.insert(value.into()); + } + } + Ok(true) + } + + /// Removes the [`CacheValue`] for the given `key` if any + pub 
fn delete(&self, key: CacheKey) -> Option { + match self.map.entry(key) { + Entry::Occupied(o) => { + let old = &o.get().value; + if let Some(v) = &self.observer { + v.evict(key, old) + } + if let Some(l) = &self.limit { + l.free(old.data.len()) + } + Some(o.remove().value) + } + _ => None, + } + } + + /// Returns an iterator over the items in this cache + pub fn list(&self) -> CacheIterator<'_> { + CacheIterator(self.map.iter()) + } + + /// Evict all entries not accessed with [`CatalogCache::get`] or updated since + /// the last call to this function + /// + /// Periodically calling this provides a Not-Recently-Used eviction policy + pub fn evict_unused(&self) { + self.map.retain(|key, entry| { + let retain = entry.used.swap(false, Ordering::Relaxed); + if !retain { + if let Some(v) = &self.observer { + v.evict(*key, &entry.value); + } + if let Some(l) = &self.limit { + l.free(entry.value.data.len()); + } + } + retain + }); + } +} + +/// Iterator for [`CatalogCache`] +#[allow(missing_debug_implementations)] +pub struct CacheIterator<'a>(dashmap::iter::Iter<'a, CacheKey, CacheEntry>); + +impl<'a> Iterator for CacheIterator<'a> { + type Item = (CacheKey, CacheValue); + + fn next(&mut self) -> Option { + let value = self.0.next()?; + Some((*value.key(), value.value().value.clone())) + } + + fn size_hint(&self) -> (usize, Option) { + self.0.size_hint() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use bytes::Bytes; + use dashmap::DashSet; + + #[derive(Debug, Default)] + struct KeyObserver { + keys: DashSet, + } + + impl KeyObserver { + fn keys(&self) -> Vec { + let mut keys: Vec<_> = self.keys.iter().map(|k| *k).collect(); + keys.sort_unstable(); + keys + } + } + + impl CatalogCacheObserver for KeyObserver { + fn insert(&self, key: CacheKey, _new: &CacheValue, _old: Option<&CacheValue>) { + self.keys.insert(key); + } + + fn evict(&self, key: CacheKey, _value: &CacheValue) { + self.keys.remove(&key); + } + } + + #[test] + fn test_basic() { + let observer = Arc::new(KeyObserver::default()); + let cache = CatalogCache::default().with_observer(Arc::clone(&observer) as _); + + let v1 = CacheValue::new("1".into(), 5); + assert!(cache.insert(CacheKey::Table(0), v1.clone()).unwrap()); + assert_eq!(cache.get(CacheKey::Table(0)).unwrap(), v1); + + // Older generation rejected + assert!(!cache + .insert(CacheKey::Table(0), CacheValue::new("2".into(), 3)) + .unwrap()); + + // Value unchanged + assert_eq!(cache.get(CacheKey::Table(0)).unwrap(), v1); + + // Different key accepted + let v2 = CacheValue::new("2".into(), 5); + assert!(cache.insert(CacheKey::Table(1), v2.clone()).unwrap()); + assert_eq!(cache.get(CacheKey::Table(1)).unwrap(), v2); + + let v3 = CacheValue::new("3".into(), 0); + assert!(cache.insert(CacheKey::Partition(0), v3.clone()).unwrap()); + + // Newer generation updates + let v4 = CacheValue::new("4".into(), 6); + assert!(cache.insert(CacheKey::Table(0), v4.clone()).unwrap()); + + let mut values: Vec<_> = cache.list().collect(); + values.sort_unstable_by(|(a, _), (b, _)| a.cmp(b)); + + assert_eq!( + values, + vec![ + (CacheKey::Table(0), v4.clone()), + (CacheKey::Table(1), v2), + (CacheKey::Partition(0), v3), + ] + ); + assert_eq!( + observer.keys(), + vec![ + CacheKey::Table(0), + CacheKey::Table(1), + CacheKey::Partition(0) + ] + ); + + assert_eq!(cache.get(CacheKey::Namespace(0)), None); + assert_eq!(cache.delete(CacheKey::Namespace(0)), None); + + assert_eq!(cache.get(CacheKey::Table(0)).unwrap(), v4); + assert_eq!(cache.delete(CacheKey::Table(0)).unwrap(), v4); + 
assert_eq!(cache.get(CacheKey::Table(0)), None); + + assert_eq!(cache.list().count(), 2); + assert_eq!(observer.keys.len(), 2); + } + + #[test] + fn test_nru() { + let observer = Arc::new(KeyObserver::default()); + let cache = CatalogCache::default().with_observer(Arc::clone(&observer) as _); + + let value = CacheValue::new("1".into(), 0); + cache.insert(CacheKey::Namespace(0), value.clone()).unwrap(); + cache.insert(CacheKey::Partition(0), value.clone()).unwrap(); + cache.insert(CacheKey::Table(0), value.clone()).unwrap(); + + cache.evict_unused(); + // Inserted records should only be evicted on the next pass + assert_eq!(cache.list().count(), 3); + assert_eq!(observer.keys.len(), 3); + + // Updating a record marks it used + cache + .insert(CacheKey::Table(0), CacheValue::new("2".into(), 1)) + .unwrap(); + + // Fetching a record marks it used + cache.get(CacheKey::Partition(0)).unwrap(); + + // Insert a new record is used + cache.insert(CacheKey::Partition(1), value.clone()).unwrap(); + + cache.evict_unused(); + + // Namespace(0) evicted + let mut values: Vec<_> = cache.list().map(|(k, _)| k).collect(); + values.sort_unstable(); + let expected = vec![ + CacheKey::Table(0), + CacheKey::Partition(0), + CacheKey::Partition(1), + ]; + assert_eq!(values, expected); + assert_eq!(observer.keys(), expected); + + // Stale updates don't count as usage + assert!(!cache.insert(CacheKey::Partition(0), value).unwrap()); + + // Listing does not preserve recently used + assert_eq!(cache.list().count(), 3); + + cache.evict_unused(); + assert_eq!(cache.list().count(), 0); + assert_eq!(observer.keys.len(), 0) + } + + #[test] + fn test_limit() { + let cache = CatalogCache::new(Some(200)); + + let k1 = CacheKey::Table(1); + let k2 = CacheKey::Table(2); + let k3 = CacheKey::Table(3); + + let v_100 = Bytes::from(vec![0; 100]); + let v_20 = Bytes::from(vec![0; 20]); + + cache.insert(k1, CacheValue::new(v_100.clone(), 0)).unwrap(); + cache.insert(k2, CacheValue::new(v_100.clone(), 0)).unwrap(); + + let r = cache.insert(k3, CacheValue::new(v_20.clone(), 0)); + assert_eq!(r.unwrap_err().to_string(), "Cannot reserve additional 20 bytes for cache containing 200 bytes as would exceed limit of 200 bytes"); + + // Upsert k1 to 20 bytes + cache.insert(k1, CacheValue::new(v_20.clone(), 1)).unwrap(); + + // Can now insert k3 + cache.insert(k3, CacheValue::new(v_20.clone(), 0)).unwrap(); + + // Should evict nothing + cache.evict_unused(); + + // Cannot increase size of k3 to 100 + let r = cache.insert(k3, CacheValue::new(v_100.clone(), 1)); + assert_eq!(r.unwrap_err().to_string(), "Cannot reserve additional 80 bytes for cache containing 140 bytes as would exceed limit of 200 bytes"); + + cache.delete(k2).unwrap(); + cache.insert(k3, CacheValue::new(v_100.clone(), 1)).unwrap(); + + let r = cache.insert(k2, CacheValue::new(v_100.clone(), 1)); + assert_eq!(r.unwrap_err().to_string(), "Cannot reserve additional 100 bytes for cache containing 120 bytes as would exceed limit of 200 bytes"); + + // Should evict everything apart from k3 + cache.evict_unused(); + + cache.insert(k2, CacheValue::new(v_100.clone(), 1)).unwrap(); + } +} diff --git a/clap_blocks/Cargo.toml b/clap_blocks/Cargo.toml index f7707ac0848..de5d836c42a 100644 --- a/clap_blocks/Cargo.toml +++ b/clap_blocks/Cargo.toml @@ -5,24 +5,33 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] clap = { version = "4", features = ["derive", "env"] } +ed25519-dalek = { version = "2", features = ["pem"] } 
futures = "0.3" -http = "0.2.9" +http = "0.2.11" humantime = "2.1.0" iox_catalog = { path = "../iox_catalog" } +iox_time = { path = "../iox_time" } +itertools = "0.12.0" metric = { path = "../metric" } +non-empty-string = "0.2.4" object_store = { workspace = true } observability_deps = { path = "../observability_deps" } -snafu = "0.7" -sysinfo = "0.29.10" +parquet_cache = { path = "../parquet_cache" } +snafu = "0.8" +sysinfo = "0.30.5" trace_exporters = { path = "../trace_exporters" } trogging = { path = "../trogging", default-features = false, features = ["clap"] } +url = "2.4" uuid = { version = "1", features = ["v4"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] -tempfile = "3.8.0" +tempfile = "3.9.0" test_helpers = { path = "../test_helpers" } [features] diff --git a/clap_blocks/src/bulk_ingest.rs b/clap_blocks/src/bulk_ingest.rs new file mode 100644 index 00000000000..df383b5f8e0 --- /dev/null +++ b/clap_blocks/src/bulk_ingest.rs @@ -0,0 +1,274 @@ +//! CLI config for the router to enable bulk ingest APIs + +use ed25519_dalek::{ + pkcs8::{DecodePrivateKey, DecodePublicKey}, + SigningKey, VerifyingKey, +}; +use snafu::{ResultExt, Snafu}; +use std::{fs, io, path::PathBuf}; + +/// CLI config for bulk ingest. +#[derive(Debug, Clone, Default, clap::Parser)] +pub struct BulkIngestConfig { + /// Private signing key used for Parquet metadata returned from the `NewParquetMetadata` gRPC + /// API to prevent tampering/corruption of Parquet metadata provided by IOx to the process + /// preparing Parquet files for bulk ingest. + /// + /// This is a path to an Ed25519 private key file generated by OpenSSL with the command: + /// `openssl genpkey -algorithm ed25519 -out private-key-filename.pem` + /// + /// The public key used to verify signatures will be derived from this private key. Additional + /// public verification keys can be specified with + /// `-bulk-ingest-additional-verification-key-files` to support key rotation. + /// + /// If not specified, the `NewParquetMetadata` gRPC API will return unimplemented. + #[clap( + long = "bulk-ingest-metadata-signing-key-file", + env = "INFLUXDB_IOX_BULK_INGEST_METADATA_SIGNING_KEY_FILE" + )] + metadata_signing_key_file: Option, + + /// When in the process of rotating keys, specify paths to files containing public verification + /// keys of previously used private signing keys used for signing metadata in the past. + /// + /// These files can be derived from private key files with this OpenSSL command: + /// `openssl pkey -in private-key-filename.pem -pubout -out public-key-filename.pem` + /// + /// Example: "public-key-1.pem,public-key-2.pem" + /// + /// If verification of the metadata signature fails with the current public key derived from + /// the current signing key, these verification keys will be tested in order to allow older + /// signatures generated with the old key to still be validated. For best performance of + /// signature verification, specify the additional verification keys in order of most likely + /// candidates first (probably most recently used first). + /// + /// If no additional verification keys are specified, only the verification key associated with + /// the current metadata signing key will be used to validate signatures. 
+ #[clap( + long = "bulk-ingest-additional-verification-key-files", + env = "INFLUXDB_IOX_BULK_INGEST_ADDITIONAL_VERIFICATION_KEY_FILES", + required = false, + num_args=1.., + value_delimiter = ',', + )] + additional_verification_key_files: Vec, + + /// Rather than using whatever object store configuration may have been specified as a source + /// of presigned upload URLs for bulk ingest, use a mock implementation that returns an upload + /// URL value that can be inspected but not used. + /// + /// Only useful for testing bulk ingest without setting up S3! Do not use this in production! + #[clap( + hide = true, + long = "bulk-ingest-use-mock-presigned-url-signer", + env = "INFLUXDB_IOX_BULK_INGEST_USE_MOCK_PRESIGNED_URL_SIGNER", + default_value = "false" + )] + pub use_mock_presigned_url_signer: bool, +} + +impl BulkIngestConfig { + /// Constructor for bulk ingest configuration. + pub fn new( + metadata_signing_key_file: Option, + additional_verification_key_files: Vec, + use_mock_presigned_url_signer: bool, + ) -> Self { + Self { + metadata_signing_key_file, + additional_verification_key_files, + use_mock_presigned_url_signer, + } + } +} + +impl TryFrom<&BulkIngestConfig> for Option { + type Error = BulkIngestConfigError; + + fn try_from(config: &BulkIngestConfig) -> Result { + config + .metadata_signing_key_file + .as_ref() + .map(|signing_key_file| { + let signing_key: SigningKey = fs::read_to_string(signing_key_file) + .context(ReadingSigningKeyFileSnafu { + filename: &signing_key_file, + }) + .and_then(|file_contents| { + DecodePrivateKey::from_pkcs8_pem(&file_contents).context( + DecodingSigningKeySnafu { + filename: signing_key_file, + }, + ) + })?; + + let additional_verifying_keys: Vec<_> = config + .additional_verification_key_files + .iter() + .map(|verification_key_file| { + fs::read_to_string(verification_key_file) + .context(ReadingVerifyingKeyFileSnafu { + filename: &verification_key_file, + }) + .and_then(|file_contents| { + DecodePublicKey::from_public_key_pem(&file_contents).context( + DecodingVerifyingKeySnafu { + filename: verification_key_file, + }, + ) + }) + }) + .collect::, _>>()?; + + Ok(BulkIngestKeys { + signing_key, + additional_verifying_keys, + }) + }) + .transpose() + } +} + +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum BulkIngestConfigError { + #[snafu(display("Could not read signing key from {}: {source}", filename.display()))] + ReadingSigningKeyFile { + filename: PathBuf, + source: io::Error, + }, + + #[snafu(display("Could not decode signing key from {}: {source}", filename.display()))] + DecodingSigningKey { + filename: PathBuf, + source: ed25519_dalek::pkcs8::Error, + }, + + #[snafu(display("Could not read verifying key from {}: {source}", filename.display()))] + ReadingVerifyingKeyFile { + filename: PathBuf, + source: io::Error, + }, + + #[snafu(display("Could not decode verifying key from {}: {source}", filename.display()))] + DecodingVerifyingKey { + filename: PathBuf, + source: ed25519_dalek::pkcs8::spki::Error, + }, +} + +/// Key values extracted from the files specified to the CLI. To get an instance, first create a +/// `BulkIngestConfig`, then call `try_from` to get a `Result` containing an +/// `Option` where the `Option` will be `Some` if the `BulkIngestConfig`'s +/// `metadata_signing_key_file` value is `Some`. +/// +/// If any filenames specified anywhere in the `BulkIngestConfig` can't be read or don't contain +/// valid key values, the `try_from` implementation will return an error. 
+#[derive(Debug)] +pub struct BulkIngestKeys { + /// The parsed private signing key value contained in the file specified to + /// `--bulk-ingest-metadata-signing-key-file`. + pub signing_key: SigningKey, + + /// If any files were specified in `--bulk-ingest-additional-verification-key-files`, this list + /// will contain their parsed public verification key values. + pub additional_verifying_keys: Vec, +} + +#[cfg(test)] +mod tests { + use super::*; + use clap::Parser; + use std::process::Command; + use test_helpers::{assert_contains, make_temp_file, tmp_dir}; + + #[test] + fn missing_signing_key_param() { + // No signing key file -> no keys + let config = BulkIngestConfig::try_parse_from(["something"]).unwrap(); + let keys: Option = (&config).try_into().unwrap(); + assert!(keys.is_none(), "expected None, got: {:?}", keys); + + // Even if there are additional verification key files; no signing key file means no keys + let config = BulkIngestConfig::try_parse_from([ + "something", + "--bulk-ingest-additional-verification-key-files", + "some-public-key-filename.pem", + ]) + .unwrap(); + let keys: Option = (&config).try_into().unwrap(); + assert!(keys.is_none(), "expected None, got: {:?}", keys); + } + + #[test] + fn signing_key_file_not_found() { + let nonexistent_filename = "do-not-create-a-file-with-this-name-or-this-test-will-fail"; + let config = BulkIngestConfig::try_parse_from([ + "something", + "--bulk-ingest-metadata-signing-key-file", + nonexistent_filename, + ]) + .unwrap(); + + let keys: Result, _> = (&config).try_into(); + let err = keys.unwrap_err(); + assert_contains!( + err.to_string(), + format!("Could not read signing key from {nonexistent_filename}") + ); + } + + #[test] + fn signing_key_file_contents_invalid() { + let signing_key_file = make_temp_file("not a valid signing key"); + let signing_key_filename = signing_key_file.path().display().to_string(); + + let config = BulkIngestConfig::try_parse_from([ + "something", + "--bulk-ingest-metadata-signing-key-file", + &signing_key_filename, + ]) + .unwrap(); + + let keys: Result, _> = (&config).try_into(); + let err = keys.unwrap_err(); + assert_contains!( + err.to_string(), + format!("Could not decode signing key from {signing_key_filename}") + ); + } + + #[test] + fn valid_signing_key_file_no_additional_key_files() { + let tmp_dir = tmp_dir().unwrap(); + let signing_key_filename = tmp_dir + .path() + .join("test-private-key.pem") + .display() + .to_string(); + Command::new("openssl") + .arg("genpkey") + .arg("-algorithm") + .arg("ed25519") + .arg("-out") + .arg(&signing_key_filename) + .output() + .unwrap(); + + let config = BulkIngestConfig::try_parse_from([ + "something", + "--bulk-ingest-metadata-signing-key-file", + &signing_key_filename, + ]) + .unwrap(); + + let keys: Result, _> = (&config).try_into(); + let keys = keys.unwrap().unwrap(); + let additional_keys = keys.additional_verifying_keys; + assert!( + additional_keys.is_empty(), + "expected additional keys to be empty, got {:?}", + additional_keys + ); + } +} diff --git a/clap_blocks/src/catalog_cache.rs b/clap_blocks/src/catalog_cache.rs new file mode 100644 index 00000000000..a9b85435a8c --- /dev/null +++ b/clap_blocks/src/catalog_cache.rs @@ -0,0 +1,154 @@ +//! Config for the catalog cache server mode. 
+ +use std::time::Duration; + +use itertools::Itertools; +use snafu::{OptionExt, Snafu}; +use url::{Host, Url}; + +use crate::memory_size::MemorySize; + +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(display("host '{host}' is not a prefix of '{prefix}'"))] + NotAPrefix { host: String, prefix: String }, + + #[snafu(display("host '{host}' is not a valid host"))] + NotAValidHost { host: String }, + + #[snafu(display("invalid url: {source}"))] + InvalidUrl { source: url::ParseError }, + + #[snafu(display("Expected exactly two peers"))] + InvalidPeers, +} + +/// CLI config for catalog configuration +#[derive(Debug, Clone, PartialEq, Eq, clap::Parser)] +pub struct CatalogConfig { + /// Host Name + /// + /// If provided, any matching entries in peers will be ignored + #[clap(long = "hostname", env = "INFLUXDB_IOX_HOSTNAME", value_parser = Host::parse)] + pub hostname: Option<Host>, + + /// Peers + /// + /// Can be provided as a comma-separated list, or on the command line multiple times + #[clap( + long = "catalog-cache-peers", + env = "INFLUXDB_IOX_CATALOG_CACHE_PEERS", + required = false, + value_delimiter = ',' + )] + pub peers: Vec<Url>, + + /// Warmup delay. + /// + /// The warm-up (via dumping the cache of our peers) is delayed by the given time to make sure that we already + /// receive quorum writes. This ensures a gapless transition / roll-out w/o any cache MISSes (esp. w/o any backend requests). + #[clap( + long = "catalog-cache-warmup-delay", + env = "INFLUXDB_IOX_CATALOG_CACHE_WARMUP_DELAY", + default_value = default_warmup_delay(), + value_parser = humantime::parse_duration, + )] + pub warmup_delay: Duration, + + /// Garbage collection interval. + /// + /// Every time this interval passes, cache elements that have not been used (i.e. read or updated) since the last time + /// are evicted from the cache. + #[clap( + long = "catalog-cache-gc-interval", + env = "INFLUXDB_IOX_CATALOG_CACHE_GC_INTERVAL", + default_value = default_gc_interval(), + value_parser = humantime::parse_duration, + )] + pub gc_interval: Duration, + + /// Maximum number of bytes that should be cached within the catalog cache. + /// + /// If that limit is exceeded, no new values are accepted. This is meant as a safety measure. You should adjust + /// your pod size and the GC interval (`--catalog-cache-gc-interval` / `INFLUXDB_IOX_CATALOG_CACHE_GC_INTERVAL`) to + /// your workload. + /// + /// Can be given as an absolute value or as a percentage of the total available memory (e.g. `10%`). + #[clap( + long = "catalog-cache-size-limit", + env = "INFLUXDB_IOX_CATALOG_CACHE_SIZE_LIMIT", + default_value = "1073741824", // 1GB + action + )] + pub cache_size_limit: MemorySize, + + /// Number of concurrent quorum operations that a single request can trigger. + #[clap( + long = "catalog-cache-quorum-fanout", + env = "INFLUXDB_IOX_CATALOG_CACHE_QUORUM_FANOUT", + default_value_t = 10 + )] + pub quorum_fanout: usize, +} + +impl CatalogConfig { + /// Returns the URLs of the other catalog cache nodes.
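The GC interval and size limit only take effect if something periodically drives the local cache's NRU eviction against the configured limit. Below is a minimal sketch of such a driver; it assumes the `MemorySize` limit has already been converted to a plain byte count (that conversion is not shown in this diff), and the loop itself is illustrative rather than the server's actual wiring.

```rust
use std::sync::Arc;
use std::time::Duration;

use catalog_cache::local::CatalogCache;

/// Build a size-limited local cache from an already-resolved byte limit.
fn build_cache(cache_size_limit_bytes: usize) -> Arc<CatalogCache> {
    Arc::new(CatalogCache::new(Some(cache_size_limit_bytes)))
}

/// Hypothetical driver: evict not-recently-used entries on every GC tick.
async fn run_gc_loop(cache: Arc<CatalogCache>, gc_interval: Duration) {
    let mut ticker = tokio::time::interval(gc_interval);
    loop {
        ticker.tick().await;
        // Entries neither read nor updated since the previous tick are dropped,
        // which also releases their reservation against the memory limit.
        cache.evict_unused();
    }
}
```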
+ pub fn peers(&self) -> Result<[Url; 2], Error> { + let (peer1, peer2) = self + .peers + .iter() + .filter(|x| match (x.host(), &self.hostname) { + (Some(a), Some(r)) => &a != r, + _ => true, + }) + .collect_tuple() + .context(InvalidPeersSnafu)?; + + Ok([peer1.clone(), peer2.clone()]) + } +} + +fn default_warmup_delay() -> &'static str { + let s = humantime::format_duration(Duration::from_secs(60 * 5)).to_string(); + Box::leak(Box::new(s)) +} + +fn default_gc_interval() -> &'static str { + let s = humantime::format_duration(Duration::from_secs(60 * 15)).to_string(); + Box::leak(Box::new(s)) +} + +#[cfg(test)] +mod tests { + use super::*; + use clap::Parser; + + #[test] + fn test_peers() { + let config = CatalogConfig::parse_from([ + "binary", + "--catalog-cache-peers", + "http://peer1:8080", + "--catalog-cache-peers", + "http://peer2:9090", + ]); + let peer1 = Url::parse("http://peer1:8080").unwrap(); + let peer2 = Url::parse("http://peer2:9090").unwrap(); + + let peers = config.peers().unwrap(); + assert_eq!(peers, [peer1.clone(), peer2.clone()]); + + let mut config = CatalogConfig::parse_from([ + "binary", + "--catalog-cache-peers", + "http://peer1:8080,http://peer2:9090,http://peer3:9091", + ]); + let err = config.peers().unwrap_err(); + assert!(matches!(err, Error::InvalidPeers), "{err}"); + + config.hostname = Some(Host::parse("peer3").unwrap()); + let peers = config.peers().unwrap(); + assert_eq!(peers, [peer1.clone(), peer2.clone()]); + } +} diff --git a/clap_blocks/src/catalog_dsn.rs b/clap_blocks/src/catalog_dsn.rs index a7c9e13cd91..74e84bc73bf 100644 --- a/clap_blocks/src/catalog_dsn.rs +++ b/clap_blocks/src/catalog_dsn.rs @@ -1,10 +1,13 @@ //! Catalog-DSN-related configs. +use http::uri::InvalidUri; +use iox_catalog::grpc::client::GrpcCatalogClient; use iox_catalog::sqlite::{SqliteCatalog, SqliteConnectionOptions}; use iox_catalog::{ interface::Catalog, mem::MemCatalog, postgres::{PostgresCatalog, PostgresConnectionOptions}, }; +use iox_time::TimeProvider; use observability_deps::tracing::*; use snafu::{ResultExt, Snafu}; use std::{sync::Arc, time::Duration}; @@ -18,6 +21,9 @@ pub enum Error { #[snafu(display("Catalog DSN not specified. Expected a string like 'postgresql://postgres@localhost:5432/postgres' or 'sqlite:///tmp/catalog.sqlite'"))] DsnNotSpecified {}, + #[snafu(display("Invalid URI: {source}"))] + InvalidUri { source: InvalidUri }, + #[snafu(display("A catalog error occurred: {}", source))] Catalog { source: iox_catalog::interface::Error, @@ -55,7 +61,9 @@ pub struct CatalogDsnConfig { /// /// PostgreSQL: `postgresql://postgres@localhost:5432/postgres` /// - /// Sqlite (a local filename /tmp/foo.sqlite): `sqlite:///tmp/foo.sqlite` + /// Sqlite (a local filename /tmp/foo.sqlite): `sqlite:///tmp/foo.sqlite` - + /// note sqlite is for development/testing only and should not be used for + /// production workloads. 
/// /// Memory (ephemeral, only useful for testing): `memory` /// @@ -117,6 +125,7 @@ impl CatalogDsnConfig { &self, app_name: &'static str, metrics: Arc, + time_provider: Arc, ) -> Result, Error> { let Some(dsn) = self.dsn.as_ref() else { return Err(Error::DsnNotSpecified {}); @@ -141,7 +150,7 @@ impl CatalogDsnConfig { )) } else if dsn == "memory" { info!("Catalog: In-memory"); - let mem = MemCatalog::new(metrics); + let mem = MemCatalog::new(metrics, time_provider); Ok(Arc::new(mem)) } else if let Some(file_path) = dsn.strip_prefix("sqlite://") { info!(file_path, "Catalog: Sqlite"); @@ -153,6 +162,11 @@ impl CatalogDsnConfig { .await .context(CatalogSnafu)?, )) + } else if dsn.starts_with("http://") || dsn.starts_with("https://") { + info!("Catalog: gRPC"); + let uri = dsn.parse().context(InvalidUriSnafu)?; + let grpc = GrpcCatalogClient::new(uri, metrics, time_provider); + Ok(Arc::new(grpc)) } else { Err(Error::UnknownCatalogDsn { dsn: dsn.to_string(), diff --git a/clap_blocks/src/compactor.rs b/clap_blocks/src/compactor.rs index 55ab6c15d8b..9b63bc82f1a 100644 --- a/clap_blocks/src/compactor.rs +++ b/clap_blocks/src/compactor.rs @@ -73,80 +73,29 @@ pub struct CompactorConfig { #[clap( long = "exec-mem-pool-bytes", env = "INFLUXDB_IOX_EXEC_MEM_POOL_BYTES", - default_value = "8589934592", // 8GB + default_value = "17179869184", // 16GB action )] pub exec_mem_pool_bytes: MemorySize, - /// Desired max size of compacted parquet files. + /// Overrides INFLUXDB_IOX_EXEC_MEM_POOL_BYTES to set the size of memory pool + /// used during compaction DF plan execution. This value is expressed as a percent + /// of the memory limit for the cgroup (e.g. 70 = 70% of the cgroup memory limit). + /// This is converted to a byte limit as the compactor starts. /// - /// Note this is a target desired value, rather than a guarantee. - /// 1024 * 1024 * 100 = 104,857,600 - #[clap( - long = "compaction-max-desired-size-bytes", - env = "INFLUXDB_IOX_COMPACTION_MAX_DESIRED_FILE_SIZE_BYTES", - default_value = "104857600", - action - )] - pub max_desired_file_size_bytes: u64, - - /// Percentage of desired max file size for "leading edge split" - /// optimization. - /// - /// This setting controls the estimated output file size at which - /// the compactor will apply the "leading edge" optimization. - /// - /// When compacting files together, if the output size is - /// estimated to be greater than the following quantity, the - /// "leading edge split" optimization will be applied: - /// - /// percentage_max_file_size * max_desired_file_size_bytes - /// - /// This value must be between (0, 100) - /// - /// Default is 20 - #[clap( - long = "compaction-percentage-max-file_size", - env = "INFLUXDB_IOX_COMPACTION_PERCENTAGE_MAX_FILE_SIZE", - default_value = "20", - action - )] - pub percentage_max_file_size: u16, - - /// Split file percentage for "leading edge split" - /// - /// To reduce the likelihood of recompacting the same data too many - /// times, the compactor uses the "leading edge split" - /// optimization for the common case where the new data written - /// into a partition also has the most recent timestamps. 
- /// - /// When compacting multiple files together, if the compactor - /// estimates the resulting file will be large enough (see - /// `percentage_max_file_size`) it creates two output files - /// rather than one, split by time, like this: - /// - /// `|-------------- older_data -----------------||---- newer_data ----|` - /// - /// In the common case, the file containing `older_data` is less - /// likely to overlap with new data written in. - /// - /// This setting controls what percentage of data is placed into - /// the `older_data` portion. + /// Extreme values (<20% or >90%) are ignored and INFLUXDB_IOX_EXEC_MEM_POOL_BYTES + /// is used. It will also use INFLUXDB_IOX_EXEC_MEM_POOL_BYTES if we fail to read + /// the cgroup limit, or it doesn't parse to a sane value. /// - /// Increasing this value increases the average size of compacted - /// files after the first round of compaction. However, doing so - /// also increase the likelihood that late arriving data will - /// overlap with larger existing files, necessitating additional - /// compaction rounds. - /// - /// This value must be between (0, 100) + /// If compaction plans attempt to allocate more than the computed byte limit + /// during execution, they will error with "ResourcesExhausted". #[clap( - long = "compaction-split-percentage", - env = "INFLUXDB_IOX_COMPACTION_SPLIT_PERCENTAGE", - default_value = "80", + long = "exec-mem-pool-percent", + env = "INFLUXDB_IOX_EXEC_MEM_POOL_PERCENT", + default_value = "70", action )] - pub split_percentage: u16, + pub exec_mem_pool_percent: u64, /// Maximum duration of the per-partition compaction task in seconds. #[clap( @@ -182,39 +131,6 @@ pub struct CompactorConfig { )] pub enable_scratchpad: bool, - /// Maximum number of files that the compactor will try and - /// compact in a single plan. - /// - /// The higher this setting is the fewer compactor plans are run - /// and thus fewer resources over time are consumed by the - /// compactor. Increasing this setting also increases the peak - /// memory used for each compaction plan, and thus if it is set - /// too high, the compactor plans may exceed available memory. - #[clap( - long = "compaction-max-num-files-per-plan", - env = "INFLUXDB_IOX_COMPACTION_MAX_NUM_FILES_PER_PLAN", - default_value = "20", - action - )] - pub max_num_files_per_plan: usize, - - /// Minimum number of L1 files to compact to L2. - /// - /// If there are more than this many L1 (by definition non - /// overlapping) files in a partition, the compactor will compact - /// them together into one or more larger L2 files. - /// - /// Setting this value higher in general results in fewer overall - /// resources spent on compaction but more files per partition (and - /// thus less optimal compression and query performance). - #[clap( - long = "compaction-min-num-l1-files-to-compact", - env = "INFLUXDB_IOX_COMPACTION_MIN_NUM_L1_FILES_TO_COMPACT", - default_value = "10", - action - )] - pub min_num_l1_files_to_compact: usize, - /// Only process all discovered partitions once. /// /// By default the compactor will continuously loop over all @@ -227,19 +143,6 @@ pub struct CompactorConfig { )] pub process_once: bool, - /// Maximum number of columns in a table of a partition that - /// will be able to considered to get compacted - /// - /// If a table has more than this many columns, the compactor will - /// not compact it, to avoid large memory use. 
- #[clap( - long = "compaction-max-num-columns-per-table", - env = "INFLUXDB_IOX_COMPACTION_MAX_NUM_COLUMNS_PER_TABLE", - default_value = "10000", - action - )] - pub max_num_columns_per_table: usize, - /// Limit the number of partition fetch queries to at most the specified /// number of queries per second. /// diff --git a/clap_blocks/src/compactor_scheduler.rs b/clap_blocks/src/compactor_scheduler.rs index 25aeecc6aed..e2b3c8f3cac 100644 --- a/clap_blocks/src/compactor_scheduler.rs +++ b/clap_blocks/src/compactor_scheduler.rs @@ -1,5 +1,8 @@ //! Compactor-Scheduler-related configs. +use crate::socket_addr::SocketAddr; +use std::str::FromStr; + /// Compaction Scheduler type. #[derive(Debug, Default, Clone, Copy, PartialEq, clap::ValueEnum)] pub enum CompactorSchedulerType { @@ -90,6 +93,62 @@ pub struct PartitionSourceConfigForLocalScheduler { pub ignore_partition_skip_marker: bool, } +/// CLI config for scheduler's gossip. +#[derive(Debug, Clone, clap::Parser)] +pub struct CompactorSchedulerGossipConfig { + /// A comma-delimited set of seed gossip peer addresses. + /// + /// Example: "10.0.0.1:4242,10.0.0.2:4242" + /// + /// These seeds will be used to discover all other peers that talk to the + /// same seeds. Typically all nodes in the cluster should use the same set + /// of seeds. + #[clap( + long = "compactor-scheduler-gossip-seed-list", + env = "INFLUXDB_IOX_COMPACTOR_SCHEDULER_GOSSIP_SEED_LIST", + required = false, + num_args=1.., + value_delimiter = ',', + requires = "scheduler_gossip_bind_address", // Field name, not flag + )] + pub scheduler_seed_list: Vec, + + /// The UDP socket address IOx will use for gossip communication between + /// peers. + /// + /// Example: "0.0.0.0:4242" + /// + /// If not provided, the gossip sub-system is disabled. + #[clap( + long = "compactor-scheduler-gossip-bind-address", + env = "INFLUXDB_IOX_COMPACTOR_SCHEDULER_GOSSIP_BIND_ADDR", + default_value = "0.0.0.0:0", + required = false, + action + )] + pub scheduler_gossip_bind_address: SocketAddr, +} + +impl Default for CompactorSchedulerGossipConfig { + fn default() -> Self { + Self { + scheduler_seed_list: vec![], + scheduler_gossip_bind_address: SocketAddr::from_str("0.0.0.0:4324").unwrap(), + } + } +} + +impl CompactorSchedulerGossipConfig { + /// constructor for GossipConfig + /// + pub fn new(bind_address: &str, seed_list: Vec) -> Self { + Self { + scheduler_seed_list: seed_list, + scheduler_gossip_bind_address: SocketAddr::from_str(bind_address).unwrap(), + } + } +} + /// CLI config for compactor scheduler. #[derive(Debug, Clone, Default, clap::Parser)] pub struct CompactorSchedulerConfig { @@ -103,6 +162,135 @@ pub struct CompactorSchedulerConfig { )] pub compactor_scheduler_type: CompactorSchedulerType, + /// Maximum number of files that the compactor will try and + /// compact in a single plan. + /// + /// The higher this setting is the fewer compactor plans are run + /// and thus fewer resources over time are consumed by the + /// compactor. Increasing this setting also increases the peak + /// memory used for each compaction plan, and thus if it is set + /// too high, the compactor plans may exceed available memory. + #[clap( + long = "compaction-max-num-files-per-plan", + env = "INFLUXDB_IOX_COMPACTION_MAX_NUM_FILES_PER_PLAN", + default_value = "20", + action + )] + pub max_num_files_per_plan: usize, + + /// Desired max size of compacted parquet files. + /// + /// Note this is a target desired value, rather than a guarantee. 
+ /// 1024 * 1024 * 100 = 104,857,600 + #[clap( + long = "compaction-max-desired-size-bytes", + env = "INFLUXDB_IOX_COMPACTION_MAX_DESIRED_FILE_SIZE_BYTES", + default_value = "104857600", + action + )] + pub max_desired_file_size_bytes: u64, + + /// Minimum number of L1 files to compact to L2. + /// + /// If there are more than this many L1 (by definition non + /// overlapping) files in a partition, the compactor will compact + /// them together into one or more larger L2 files. + /// + /// Setting this value higher in general results in fewer overall + /// resources spent on compaction but more files per partition (and + /// thus less optimal compression and query performance). + #[clap( + long = "compaction-min-num-l1-files-to-compact", + env = "INFLUXDB_IOX_COMPACTION_MIN_NUM_L1_FILES_TO_COMPACT", + default_value = "10", + action + )] + pub min_num_l1_files_to_compact: usize, + + /// Maximum number of columns in a table of a partition that + /// will be able to considered to get compacted + /// + /// If a table has more than this many columns, the compactor will + /// not compact it, to avoid large memory use. + #[clap( + long = "compaction-max-num-columns-per-table", + env = "INFLUXDB_IOX_COMPACTION_MAX_NUM_COLUMNS_PER_TABLE", + default_value = "10000", + action + )] + pub max_num_columns_per_table: usize, + + /// Percentage of desired max file size for "leading edge split" + /// optimization. + /// + /// This setting controls the estimated output file size at which + /// the compactor will apply the "leading edge" optimization. + /// + /// When compacting files together, if the output size is + /// estimated to be greater than the following quantity, the + /// "leading edge split" optimization will be applied: + /// + /// percentage_max_file_size * target_file_size + /// + /// This value must be between (0, 100) + /// + /// Default is 20 + #[clap( + long = "compaction-percentage-max-file_size", + env = "INFLUXDB_IOX_COMPACTION_PERCENTAGE_MAX_FILE_SIZE", + default_value = "20", + action + )] + pub percentage_max_file_size: u16, + + /// Enable new priority-based compaction selection. + /// + /// Eventually, this will be the only way to select partitions. + /// + /// Default is false + #[clap( + long = "compaction-priority-based-selection", + env = "INFLUXDB_IOX_COMPACTION_PRIORITY_BASED_SELECTION", + default_value = "false", + action + )] + pub priority_based_selection: bool, + + /// Split file percentage for "leading edge split" + /// + /// To reduce the likelihood of recompacting the same data too many + /// times, the compactor uses the "leading edge split" + /// optimization for the common case where the new data written + /// into a partition also has the most recent timestamps. + /// + /// When compacting multiple files together, if the compactor + /// estimates the resulting file will be large enough (see + /// `percentage_max_file_size`) it creates two output files + /// rather than one, split by time, like this: + /// + /// `|-------------- older_data -----------------||---- newer_data ----|` + /// + /// In the common case, the file containing `older_data` is less + /// likely to overlap with new data written in. + /// + /// This setting controls what percentage of data is placed into + /// the `older_data` portion. + /// + /// Increasing this value increases the average size of compacted + /// files after the first round of compaction. 
However, doing so + /// also increase the likelihood that late arriving data will + /// overlap with larger existing files, necessitating additional + /// compaction rounds. + /// + /// This value must be between (0, 100) + #[clap( + long = "compaction-split-percentage", + env = "INFLUXDB_IOX_COMPACTION_SPLIT_PERCENTAGE", + default_value = "80", + action + )] + pub split_percentage: u16, + /// Partition source config used by the local scheduler. #[clap(flatten)] pub partition_source_config: PartitionSourceConfigForLocalScheduler, @@ -110,6 +298,10 @@ pub struct CompactorSchedulerConfig { /// Shard config used by the local scheduler. #[clap(flatten)] pub shard_config: ShardConfigForLocalScheduler, + + /// Gossip config. + #[clap(flatten)] + pub gossip_config: CompactorSchedulerGossipConfig, } #[cfg(test)] diff --git a/clap_blocks/src/garbage_collector.rs b/clap_blocks/src/garbage_collector.rs index 95e6aa3a7d8..0b10d785456 100644 --- a/clap_blocks/src/garbage_collector.rs +++ b/clap_blocks/src/garbage_collector.rs @@ -24,14 +24,6 @@ pub struct GarbageCollectorConfig { )] pub objectstore_cutoff: Duration, - /// Number of concurrent object store deletion tasks - #[clap( - long, - default_value_t = 5, - env = "INFLUXDB_IOX_GC_OBJECTSTORE_CONCURRENT_DELETES" - )] - pub objectstore_concurrent_deletes: usize, - /// Number of minutes to sleep between iterations of the objectstore list loop. /// This is the sleep between entirely fresh list operations. /// Defaults to 30 minutes. @@ -65,13 +57,26 @@ pub struct GarbageCollectorConfig { pub parquetfile_cutoff: Duration, /// Number of minutes to sleep between iterations of the parquet file deletion loop. + /// /// Defaults to 30 minutes. + /// + /// If both INFLUXDB_IOX_GC_PARQUETFILE_SLEEP_INTERVAL_MINUTES and + /// INFLUXDB_IOX_GC_PARQUETFILE_SLEEP_INTERVAL are specified, the smaller is chosen + #[clap(long, env = "INFLUXDB_IOX_GC_PARQUETFILE_SLEEP_INTERVAL_MINUTES")] + pub parquetfile_sleep_interval_minutes: Option, + + /// Duration to sleep between iterations of the parquet file deletion loop. + /// + /// Defaults to 30 minutes. + /// + /// If both INFLUXDB_IOX_GC_PARQUETFILE_SLEEP_INTERVAL_MINUTES and + /// INFLUXDB_IOX_GC_PARQUETFILE_SLEEP_INTERVAL are specified, the smaller is chosen #[clap( long, - default_value_t = 30, - env = "INFLUXDB_IOX_GC_PARQUETFILE_SLEEP_INTERVAL_MINUTES" + value_parser = parse_duration, + env = "INFLUXDB_IOX_GC_PARQUETFILE_SLEEP_INTERVAL" )] - pub parquetfile_sleep_interval_minutes: u64, + pub parquetfile_sleep_interval: Option, /// Number of minutes to sleep between iterations of the retention code. 
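The new INFLUXDB_IOX_GC_PARQUETFILE_SLEEP_INTERVAL flag above is wired through `value_parser = parse_duration`, whose definition lies outside this hunk. A rough sketch of such a parser plus the documented "smaller interval wins" resolution rule, assuming the `humantime` crate; the real helper may be implemented differently.

use std::time::Duration;

// Accepts human-readable durations such as "35m" or "3d".
fn parse_duration(s: &str) -> Result<Duration, humantime::DurationError> {
    humantime::parse_duration(s)
}

// Mirrors the documented rule: default 30 minutes, otherwise whichever of the
// two configured values is smaller.
fn resolve_sleep(minutes: Option<u64>, interval: Option<Duration>) -> Duration {
    match (interval, minutes) {
        (None, None) => Duration::from_secs(30 * 60),
        (Some(d), None) => d,
        (None, Some(m)) => Duration::from_secs(m * 60),
        (Some(d), Some(m)) => d.min(Duration::from_secs(m * 60)),
    }
}

fn main() {
    assert_eq!(parse_duration("35m").unwrap(), Duration::from_secs(35 * 60));
    assert_eq!(
        resolve_sleep(Some(34), Some(Duration::from_secs(35 * 60))),
        Duration::from_secs(34 * 60)
    );
}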
/// Defaults to 35 minutes to reduce incidence of it running at the same time as the parquet @@ -83,3 +88,63 @@ pub struct GarbageCollectorConfig { )] pub retention_sleep_interval_minutes: u64, } + +impl GarbageCollectorConfig { + /// Returns the parquet_file sleep interval + pub fn parquetfile_sleep_interval(&self) -> Duration { + match ( + self.parquetfile_sleep_interval, + self.parquetfile_sleep_interval_minutes, + ) { + (None, None) => Duration::from_secs(30 * 60), + (Some(d), None) => d, + (None, Some(m)) => Duration::from_secs(m * 60), + (Some(d), Some(m)) => d.min(Duration::from_secs(m * 60)), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_gc_config() { + let a: &[&str] = &[]; + let config = GarbageCollectorConfig::parse_from(a); + assert_eq!( + config.parquetfile_sleep_interval(), + Duration::from_secs(30 * 60) + ); + + let config = + GarbageCollectorConfig::parse_from(["something", "--parquetfile-sleep-interval", "3d"]); + + assert_eq!( + config.parquetfile_sleep_interval(), + Duration::from_secs(24 * 60 * 60 * 3) + ); + + let config = GarbageCollectorConfig::parse_from([ + "something", + "--parquetfile-sleep-interval-minutes", + "34", + ]); + assert_eq!( + config.parquetfile_sleep_interval(), + Duration::from_secs(34 * 60) + ); + + let config = GarbageCollectorConfig::parse_from([ + "something", + "--parquetfile-sleep-interval-minutes", + "34", + "--parquetfile-sleep-interval", + "35m", + ]); + assert_eq!( + config.parquetfile_sleep_interval(), + Duration::from_secs(34 * 60) + ); + } +} diff --git a/clap_blocks/src/gossip.rs b/clap_blocks/src/gossip.rs index 90623631c46..47365baec4d 100644 --- a/clap_blocks/src/gossip.rs +++ b/clap_blocks/src/gossip.rs @@ -1,6 +1,7 @@ //! CLI config for cluster gossip communication. use crate::socket_addr::SocketAddr; +use std::str::FromStr; /// Configuration parameters for the cluster gossip communication mechanism. #[derive(Debug, Clone, clap::Parser)] @@ -32,18 +33,20 @@ pub struct GossipConfig { #[clap( long = "gossip-bind-address", env = "INFLUXDB_IOX_GOSSIP_BIND_ADDR", - requires = "seed_list", // Field name, not flag + default_value = "0.0.0.0:4242", + required = false, action )] - pub gossip_bind_address: Option, + pub gossip_bind_address: SocketAddr, } impl GossipConfig { - /// Initialise the gossip config to be disabled. - pub fn disabled() -> Self { + /// constructor for GossipConfig + /// + pub fn new(bind_address: &str, seed_list: Vec) -> Self { Self { - seed_list: vec![], - gossip_bind_address: None, + seed_list, + gossip_bind_address: SocketAddr::from_str(bind_address).unwrap(), } } } diff --git a/clap_blocks/src/ingester.rs b/clap_blocks/src/ingester.rs index d736b16d921..be2ab26022a 100644 --- a/clap_blocks/src/ingester.rs +++ b/clap_blocks/src/ingester.rs @@ -76,6 +76,19 @@ pub struct IngesterConfig { )] pub persist_hot_partition_cost: usize, + /// An optional lower bound byte size limit that buffered data within a + /// partition must reach in order to be converted into an incremental + /// snapshot at query time. + /// + /// Snapshots improve query performance by amortising response generation at + /// the expense of a small memory overhead. Snapshots are retained until the + /// buffer is persisted. + #[clap( + long = "min-partition-snapshot-size", + env = "INFLUXDB_IOX_MIN_PARTITION_SNAPSHOT_SIZE" + )] + pub min_partition_snapshot_size: Option, + /// Limit the number of partitions that may be buffered in a single /// namespace (across all tables) at any one time. 
/// diff --git a/clap_blocks/src/ingester_address.rs b/clap_blocks/src/ingester_address.rs index 96300e92352..90a8e8d489f 100644 --- a/clap_blocks/src/ingester_address.rs +++ b/clap_blocks/src/ingester_address.rs @@ -1,7 +1,7 @@ //! Shared configuration and tests for accepting ingester addresses as arguments. use http::uri::{InvalidUri, InvalidUriParts, Uri}; -use snafu::Snafu; +use snafu::{ResultExt, Snafu}; use std::{fmt::Display, str::FromStr}; /// An address to an ingester's gRPC API. Create by using `IngesterAddress::from_str`. @@ -14,7 +14,7 @@ pub struct IngesterAddress { #[allow(missing_docs)] #[derive(Snafu, Debug)] pub enum Error { - #[snafu(context(false))] + #[snafu(display("{source}"))] Invalid { source: InvalidUri }, #[snafu(display("Port is required; no port found in `{value}`"))] @@ -28,14 +28,14 @@ impl FromStr for IngesterAddress { type Err = Error; fn from_str(s: &str) -> Result { - let uri = Uri::from_str(s)?; + let uri = Uri::from_str(s).context(InvalidSnafu)?; if uri.port().is_none() { return MissingPortSnafu { value: s }.fail(); } let uri = if uri.scheme().is_none() { - Uri::from_str(&format!("http://{s}"))? + Uri::from_str(&format!("http://{s}")).context(InvalidSnafu)? } else { uri }; @@ -67,7 +67,7 @@ mod tests { num_args=1.., value_delimiter = ',' )] - pub ingester_addresses: Vec, + pub(crate) ingester_addresses: Vec, } #[test] @@ -89,7 +89,7 @@ mod tests { num_args=0.., value_delimiter = ',' )] - pub ingester_addresses: Vec, + pub(crate) ingester_addresses: Vec, } #[test] @@ -243,7 +243,7 @@ mod tests { num_args=1.., value_delimiter = ',' )] - pub ingester_addresses: Vec, + pub(crate) ingester_addresses: Vec, } #[test] @@ -281,7 +281,7 @@ mod tests { num_args=0.., value_delimiter = ',' )] - pub ingester_addresses: Vec, + pub(crate) ingester_addresses: Vec, } #[test] diff --git a/clap_blocks/src/lib.rs b/clap_blocks/src/lib.rs index 255870036fc..d9f689133ce 100644 --- a/clap_blocks/src/lib.rs +++ b/clap_blocks/src/lib.rs @@ -18,6 +18,8 @@ // Workaround for "unused crate" lint false positives. use workspace_hack as _; +pub mod bulk_ingest; +pub mod catalog_cache; pub mod catalog_dsn; pub mod compactor; pub mod compactor_scheduler; @@ -27,6 +29,7 @@ pub mod ingester; pub mod ingester_address; pub mod memory_size; pub mod object_store; +pub mod parquet_cache; pub mod querier; pub mod router; pub mod run_config; diff --git a/clap_blocks/src/memory_size.rs b/clap_blocks/src/memory_size.rs index 6204472d5aa..6e7515df599 100644 --- a/clap_blocks/src/memory_size.rs +++ b/clap_blocks/src/memory_size.rs @@ -2,7 +2,7 @@ use std::{str::FromStr, sync::OnceLock}; -use sysinfo::{RefreshKind, System, SystemExt}; +use sysinfo::{MemoryRefreshKind, RefreshKind, System}; /// Memory size. /// @@ -46,10 +46,7 @@ impl FromStr for MemorySize { "relative memory size must be in [0, 100] but is {percentage}" )); } - let total = *TOTAL_MEM_BYTES.get_or_init(|| { - let sys = System::new_with_specifics(RefreshKind::new().with_memory()); - sys.total_memory() as usize - }); + let total = total_mem_bytes(); let bytes = (percentage as f64 / 100f64 * total as f64).round() as usize; Ok(Self(bytes)) } @@ -62,9 +59,17 @@ impl FromStr for MemorySize { } /// Totally available memory size in bytes. -/// -/// Keep this in a global state so that we only need to inspect the system once during IOx startup. -static TOTAL_MEM_BYTES: OnceLock = OnceLock::new(); +pub fn total_mem_bytes() -> usize { + // Keep this in a global state so that we only need to inspect the system once during IOx startup. 
+ static TOTAL_MEM_BYTES: OnceLock = OnceLock::new(); + + *TOTAL_MEM_BYTES.get_or_init(|| { + let sys = System::new_with_specifics( + RefreshKind::new().with_memory(MemoryRefreshKind::everything()), + ); + sys.total_memory() as usize + }) +} #[cfg(test)] mod tests { diff --git a/clap_blocks/src/object_store.rs b/clap_blocks/src/object_store.rs index 38a96bc9aa4..e961357a30c 100644 --- a/clap_blocks/src/object_store.rs +++ b/clap_blocks/src/object_store.rs @@ -1,16 +1,20 @@ //! CLI handling for object store config (via CLI arguments and environment variables). use futures::TryStreamExt; -use object_store::memory::InMemory; -use object_store::path::Path; -use object_store::throttle::ThrottledStore; -use object_store::{throttle::ThrottleConfig, DynObjectStore}; +use non_empty_string::NonEmptyString; +use object_store::{ + memory::InMemory, + path::Path, + throttle::{ThrottleConfig, ThrottledStore}, + DynObjectStore, +}; use observability_deps::tracing::{info, warn}; use snafu::{ResultExt, Snafu}; -use std::sync::Arc; -use std::{fs, num::NonZeroUsize, path::PathBuf, time::Duration}; +use std::{convert::Infallible, fs, num::NonZeroUsize, path::PathBuf, sync::Arc, time::Duration}; use uuid::Uuid; +use crate::parquet_cache::ParquetCacheClientConfig; + #[derive(Debug, Snafu)] #[allow(missing_docs)] pub enum ParseError { @@ -53,6 +57,12 @@ pub enum ParseError { /// specified. pub const FALLBACK_AWS_REGION: &str = "us-east-1"; +/// A `clap` `value_parser` which returns `None` when given an empty string and +/// `Some(NonEmptyString)` otherwise. +fn parse_optional_string(s: &str) -> Result, Infallible> { + Ok(NonEmptyString::new(s.to_string()).ok()) +} + /// CLI config for object stores. #[derive(Debug, Clone, clap::Parser)] pub struct ObjectStoreConfig { @@ -74,7 +84,8 @@ pub struct ObjectStoreConfig { long = "object-store", env = "INFLUXDB_IOX_OBJECT_STORE", ignore_case = true, - action + action, + verbatim_doc_comment )] pub object_store: Option, @@ -108,8 +119,11 @@ pub struct ObjectStoreConfig { /// /// Prefer the environment variable over the command line flag in shared /// environments. - #[clap(long = "aws-access-key-id", env = "AWS_ACCESS_KEY_ID", action)] - pub aws_access_key_id: Option, + /// + /// An empty string value is equivalent to omitting the flag. + /// Note: must refer to std::option::Option explicitly, see + #[clap(long = "aws-access-key-id", env = "AWS_ACCESS_KEY_ID", value_parser = parse_optional_string, default_value="", action)] + pub aws_access_key_id: std::option::Option, /// When using Amazon S3 as the object store, set this to the secret access /// key that goes with the specified access key ID. @@ -119,8 +133,11 @@ pub struct ObjectStoreConfig { /// /// Prefer the environment variable over the command line flag in shared /// environments. - #[clap(long = "aws-secret-access-key", env = "AWS_SECRET_ACCESS_KEY", action)] - pub aws_secret_access_key: Option, + /// + /// An empty string value is equivalent to omitting the flag. + /// Note: must refer to std::option::Option explicitly, see + #[clap(long = "aws-secret-access-key", env = "AWS_SECRET_ACCESS_KEY", value_parser = parse_optional_string, default_value = "", action)] + pub aws_secret_access_key: std::option::Option, /// When using Amazon S3 as the object store, set this to the region /// that goes with the specified bucket if different from the fallback @@ -203,6 +220,10 @@ pub struct ObjectStoreConfig { action )] pub object_store_connection_limit: NonZeroUsize, + + /// Optional config for the cache client. 
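The AWS credential flags above use a value parser that turns an empty string (the `default_value`) into `None`, so an unset environment variable behaves like an omitted flag. A self-contained sketch of the same pattern using plain `String` in place of `NonEmptyString`; the demo struct and env var name are illustrative only.

use clap::Parser;
use std::convert::Infallible;

fn parse_optional_string(s: &str) -> Result<Option<String>, Infallible> {
    Ok((!s.is_empty()).then(|| s.to_string()))
}

#[derive(Debug, Parser)]
struct DemoCreds {
    // `std::option::Option` is spelled out so clap treats the whole Option as
    // the value type produced by the parser (the same trick as the real config).
    #[clap(
        long = "access-key-id",
        env = "DEMO_ACCESS_KEY_ID",
        value_parser = parse_optional_string,
        default_value = ""
    )]
    access_key_id: std::option::Option<String>,
}

fn main() {
    // Assumes DEMO_ACCESS_KEY_ID is not set in the environment.
    let unset = DemoCreds::parse_from(["demo"]);
    assert_eq!(unset.access_key_id, None);

    let set = DemoCreds::parse_from(["demo", "--access-key-id", "abc123"]);
    assert_eq!(set.access_key_id, Some("abc123".to_string()));
}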
+ #[clap(flatten)] + pub cache_config: Option, } impl ObjectStoreConfig { @@ -229,6 +250,7 @@ impl ObjectStoreConfig { google_service_account: Default::default(), object_store, object_store_connection_limit: NonZeroUsize::new(16).unwrap(), + cache_config: Default::default(), } } } @@ -284,10 +306,24 @@ fn new_gcs(_: &ObjectStoreConfig) -> Result, ParseError> { #[cfg(feature = "aws")] fn new_s3(config: &ObjectStoreConfig) -> Result, ParseError> { - use object_store::aws::AmazonS3Builder; use object_store::limit::LimitStore; - info!(bucket=?config.bucket, endpoint=?config.aws_endpoint, object_store_type="S3", "Object Store"); + info!( + bucket=?config.bucket, + endpoint=?config.aws_endpoint, + object_store_type="S3", + "Object Store" + ); + + Ok(Arc::new(LimitStore::new( + build_s3(config)?, + config.object_store_connection_limit.get(), + ))) +} + +#[cfg(feature = "aws")] +fn build_s3(config: &ObjectStoreConfig) -> Result { + use object_store::aws::AmazonS3Builder; let mut builder = AmazonS3Builder::new() .with_allow_http(config.aws_allow_http) @@ -298,22 +334,19 @@ fn new_s3(config: &ObjectStoreConfig) -> Result, ParseError> builder = builder.with_bucket_name(bucket); } if let Some(key_id) = &config.aws_access_key_id { - builder = builder.with_access_key_id(key_id); + builder = builder.with_access_key_id(key_id.get()); } if let Some(token) = &config.aws_session_token { builder = builder.with_token(token); } if let Some(secret) = &config.aws_secret_access_key { - builder = builder.with_secret_access_key(secret); + builder = builder.with_secret_access_key(secret.get()); } if let Some(endpoint) = &config.aws_endpoint { builder = builder.with_endpoint(endpoint); } - Ok(Arc::new(LimitStore::new( - builder.build().context(InvalidS3ConfigSnafu)?, - config.object_store_connection_limit.get(), - ))) + builder.build().context(InvalidS3ConfigSnafu) } #[cfg(not(feature = "aws"))] @@ -361,10 +394,10 @@ pub fn make_object_store(config: &ObjectStoreConfig) -> Result = match &config.object_store { Some(ObjectStoreType::Memory) | None => { info!(object_store_type = "Memory", "Object Store"); - Ok(Arc::new(InMemory::new())) + Arc::new(InMemory::new()) } Some(ObjectStoreType::MemoryThrottled) => { let config = ThrottleConfig { @@ -384,12 +417,12 @@ pub fn make_object_store(config: &ObjectStoreConfig) -> Result new_gcs(config), - Some(ObjectStoreType::S3) => new_s3(config), - Some(ObjectStoreType::Azure) => new_azure(config), + Some(ObjectStoreType::Google) => new_gcs(config)?, + Some(ObjectStoreType::S3) => new_s3(config)?, + Some(ObjectStoreType::Azure) => new_azure(config)?, Some(ObjectStoreType::File) => match config.database_directory.as_ref() { Some(db_dir) => { info!(?db_dir, object_store_type = "Directory", "Object Store"); @@ -398,17 +431,49 @@ pub fn make_object_store(config: &ObjectStoreConfig) -> Result MissingObjectStoreConfigSnafu { object_store: ObjectStoreType::File, missing: "data-dir", } - .fail(), + .fail()?, }, + }; + + if let Some(cache_config) = &config.cache_config { + let cache = parquet_cache::make_client( + cache_config.namespace_addr.clone(), + Arc::clone(&remote_store), + ); + info!(?cache_config, "Parquet cache enabled"); + Ok(cache) + } else { + Ok(remote_store) + } +} + +/// The `object_store::signer::Signer` trait is only implemented for AWS currently, so when the AWS +/// feature is enabled and the configured object store is S3, return a signer. 
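The presigned-URL signer plumbing below follows a cfg-gating pattern used in several places in this diff: the feature-enabled build returns a real implementation, while the non-feature build compiles a stub returning `None`, so callers never need feature checks of their own. A generic sketch of that shape; the names and types are placeholders, not the real `Signer` machinery.

/// Placeholder standing in for a presigned-URL signer implementation.
#[derive(Debug)]
struct DemoSigner;

#[cfg(feature = "aws")]
fn make_demo_signer(object_store: &str) -> Option<DemoSigner> {
    // Only the S3-backed store can sign URLs in this sketch.
    (object_store == "s3").then(|| DemoSigner)
}

#[cfg(not(feature = "aws"))]
fn make_demo_signer(_object_store: &str) -> Option<DemoSigner> {
    None
}

fn main() {
    // Callers handle Option uniformly, regardless of which features were built.
    if let Some(signer) = make_demo_signer("s3") {
        println!("got a signer: {signer:?}");
    } else {
        println!("URL signing unavailable in this build/config");
    }
}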
+#[cfg(feature = "aws")] +pub fn make_presigned_url_signer( + config: &ObjectStoreConfig, +) -> Result>, ParseError> { + match &config.object_store { + Some(ObjectStoreType::S3) => Ok(Some(Arc::new(build_s3(config)?))), + _ => Ok(None), } } +/// The `object_store::signer::Signer` trait is only implemented for AWS currently, so if the AWS +/// feature isn't enabled, don't return a signer. +#[cfg(not(feature = "aws"))] +pub fn make_presigned_url_signer( + _config: &ObjectStoreConfig, +) -> Result>, ParseError> { + Ok(None) +} + #[derive(Debug, Snafu)] #[allow(missing_docs)] pub enum CheckError { @@ -425,10 +490,7 @@ pub async fn check_object_store(object_store: &DynObjectStore) -> Result<(), Che let prefix = Path::from_iter([uuid]); // create stream (this might fail if the store is not readable) - let mut stream = object_store - .list(Some(&prefix)) - .await - .context(CannotReadObjectStoreSnafu)?; + let mut stream = object_store.list(Some(&prefix)); // ... but sometimes it fails only if we use the resulting stream, so try that once stream @@ -464,6 +526,14 @@ mod tests { assert_eq!(&object_store.to_string(), "InMemory") } + #[test] + fn default_url_signer_is_none() { + let config = ObjectStoreConfig::try_parse_from(["server"]).unwrap(); + + let signer = make_presigned_url_signer(&config).unwrap(); + assert!(signer.is_none(), "Expected None, got {signer:?}"); + } + #[test] #[cfg(feature = "aws")] fn valid_s3_config() { @@ -481,7 +551,10 @@ mod tests { .unwrap(); let object_store = make_object_store(&config).unwrap(); - assert_eq!(&object_store.to_string(), "AmazonS3(mybucket)") + assert_eq!( + &object_store.to_string(), + "LimitStore(16, AmazonS3(mybucket))" + ) } #[test] @@ -497,13 +570,73 @@ mod tests { assert_eq!( err, - "Specified S3 for the object store, required configuration missing for bucket" + "Error configuring Amazon S3: Generic S3 error: Missing bucket name" + ); + } + + #[test] + #[cfg(feature = "aws")] + fn valid_s3_url_signer() { + let config = ObjectStoreConfig::try_parse_from([ + "server", + "--object-store", + "s3", + "--bucket", + "mybucket", + "--aws-access-key-id", + "NotARealAWSAccessKey", + "--aws-secret-access-key", + "NotARealAWSSecretAccessKey", + ]) + .unwrap(); + + assert!(make_presigned_url_signer(&config).unwrap().is_some()); + + // Even with the aws feature on, any other object store shouldn't create a signer. 
+ let root = TempDir::new().unwrap(); + let root_path = root.path().to_str().unwrap(); + + let config = ObjectStoreConfig::try_parse_from([ + "server", + "--object-store", + "file", + "--data-dir", + root_path, + ]) + .unwrap(); + + let signer = make_presigned_url_signer(&config).unwrap(); + assert!(signer.is_none(), "Expected None, got {signer:?}"); + } + + #[test] + #[cfg(feature = "aws")] + fn s3_url_signer_config_missing_params() { + let mut config = + ObjectStoreConfig::try_parse_from(["server", "--object-store", "s3"]).unwrap(); + + // clean out eventual leaks via env variables + config.bucket = None; + + let err = make_presigned_url_signer(&config).unwrap_err().to_string(); + + assert_eq!( + err, + "Error configuring Amazon S3: Generic S3 error: Missing bucket name" ); } #[test] #[cfg(feature = "gcp")] fn valid_google_config() { + use std::io::Write; + use tempfile::NamedTempFile; + + let mut file = NamedTempFile::new().expect("tempfile should be created"); + const FAKE_KEY: &str = r#"{"private_key": "private_key", "private_key_id": "private_key_id", "client_email":"client_email", "disable_oauth":true}"#; + writeln!(file, "{FAKE_KEY}").unwrap(); + let path = file.path().to_str().expect("file path should exist"); + let config = ObjectStoreConfig::try_parse_from([ "server", "--object-store", @@ -511,12 +644,15 @@ mod tests { "--bucket", "mybucket", "--google-service-account", - "~/Not/A/Real/path.json", + path, ]) .unwrap(); let object_store = make_object_store(&config).unwrap(); - assert_eq!(&object_store.to_string(), "GoogleCloudStorage(mybucket)") + assert_eq!( + &object_store.to_string(), + "LimitStore(16, GoogleCloudStorage(mybucket))" + ) } #[test] @@ -532,8 +668,7 @@ mod tests { assert_eq!( err, - "Specified Google for the object store, required configuration missing for \ - bucket, google-service-account" + "Error configuring GCS: Generic GCS error: Missing bucket name" ); } @@ -549,12 +684,12 @@ mod tests { "--azure-storage-account", "NotARealStorageAccount", "--azure-storage-access-key", - "NotARealKey", + "Zm9vYmFy", // base64 encoded "foobar" ]) .unwrap(); let object_store = make_object_store(&config).unwrap(); - assert_eq!(&object_store.to_string(), "MicrosoftAzure(mybucket)") + assert_eq!(&object_store.to_string(), "LimitStore(16, MicrosoftAzure { account: NotARealStorageAccount, container: mybucket })") } #[test] @@ -570,8 +705,7 @@ mod tests { assert_eq!( err, - "Specified Azure for the object store, required configuration missing for \ - bucket, azure-storage-account, azure-storage-access-key" + "Error configuring Microsoft Azure: Generic MicrosoftAzure error: Container name must be specified" ); } @@ -614,4 +748,28 @@ mod tests { data-dir" ); } + + #[test] + fn valid_cache_config() { + let root = TempDir::new().unwrap(); + let root_path = root.path().to_str().unwrap(); + + let config = ObjectStoreConfig::try_parse_from([ + "server", + "--object-store", + "file", + "--data-dir", + root_path, + "--parquet-cache-namespace-addr", + "http://k8s-noninstance-general-service-route:8080", + ]) + .unwrap(); + + let object_store = make_object_store(&config).unwrap().to_string(); + assert!( + object_store.starts_with("DataCacheObjectStore"), + "{}", + object_store + ) + } } diff --git a/clap_blocks/src/parquet_cache.rs b/clap_blocks/src/parquet_cache.rs new file mode 100644 index 00000000000..d93aa944a4c --- /dev/null +++ b/clap_blocks/src/parquet_cache.rs @@ -0,0 +1,57 @@ +//! CLI handling for parquet data cache config (via CLI arguments and environment variables). 
+ +/// Config for cache client. +#[derive(Debug, Clone, Default, clap::Parser)] +pub struct ParquetCacheClientConfig { + /// The address for the service namespace (not a given instance). + /// + /// When the client comes online, it discovers the keyspace + /// by issue requests to this address. + #[clap( + long = "parquet-cache-namespace-addr", + env = "INFLUXDB_IOX_PARQUET_CACHE_NAMESPACE_ADDR", + required = false + )] + pub namespace_addr: String, +} + +/// Config for cache instance. +#[derive(Debug, Clone, Default, clap::Parser)] +pub struct ParquetCacheInstanceConfig { + /// The path to the config file for the keyspace. + #[clap( + long = "parquet-cache-keyspace-config-path", + env = "INFLUXDB_IOX_PARQUET_CACHE_KEYSPACE_CONFIG_PATH", + required = true + )] + pub keyspace_config_path: String, + + /// The hostname of the cache instance (k8s pod) running this process. + /// + /// Cache controller should be setting this env var. + #[clap( + long = "parquet-cache-instance-hostname", + env = "HOSTNAME", + required = true + )] + pub instance_hostname: String, + + /// The local directory to store data. + #[clap( + long = "parquet-cache-local-dir", + env = "INFLUXDB_IOX_PARQUET_CACHE_LOCAL_DIR", + required = true + )] + pub local_dir: String, +} + +impl From for parquet_cache::ParquetCacheServerConfig { + fn from(instance_config: ParquetCacheInstanceConfig) -> Self { + Self { + keyspace_config_path: instance_config.keyspace_config_path, + hostname: instance_config.instance_hostname, + local_dir: instance_config.local_dir, + policy_config: Default::default(), + } + } +} diff --git a/clap_blocks/src/querier.rs b/clap_blocks/src/querier.rs index e92b55b3189..4a62455b0ee 100644 --- a/clap_blocks/src/querier.rs +++ b/clap_blocks/src/querier.rs @@ -120,6 +120,14 @@ pub struct QuerierConfig { action )] pub datafusion_config: HashMap, + + /// Use the new V2 API to talk to the ingester. + /// + /// Defaults to "no". + /// + /// See . + #[clap(long = "v2-ingester-api", env = "INFLUXDB_IOX_V2_INGESTER_API", action)] + pub v2_ingester_api: bool, } fn parse_datafusion_config( @@ -213,7 +221,7 @@ mod tests { "error: \ invalid value '\\ingester-0:8082' \ for '--ingester-addresses [...]': \ - Invalid: invalid uri character" + invalid uri character" ); } diff --git a/clap_blocks/src/router.rs b/clap_blocks/src/router.rs index 68381407baf..28442d79e72 100644 --- a/clap_blocks/src/router.rs +++ b/clap_blocks/src/router.rs @@ -1,6 +1,7 @@ //! CLI config for the router using the RPC write path use crate::{ + bulk_ingest::BulkIngestConfig, gossip::GossipConfig, ingester_address::IngesterAddress, single_tenant::{ @@ -20,6 +21,10 @@ pub struct RouterConfig { #[clap(flatten)] pub gossip_config: GossipConfig, + /// Bulk ingest API config. + #[clap(flatten)] + pub bulk_ingest_config: BulkIngestConfig, + /// Addr for connection to authz #[clap( long = CONFIG_AUTHZ_FLAG, @@ -57,6 +62,17 @@ pub struct RouterConfig { )] pub http_request_limit: usize, + /// When writing line protocol data, does an error on a single line + /// reject the write? Or will all individual valid lines be written? + /// Set to true to enable all valid lines to write. + #[clap( + long = "partial-writes-enabled", + env = "INFLUXDB_IOX_PARTIAL_WRITES_ENABLED", + default_value = "false", + action + )] + pub permit_partial_writes: bool, + /// gRPC address for the router to talk with the ingesters. 
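Several structs in this patch are composed via `#[clap(flatten)]` (gossip, bulk ingest, and the parquet cache client config above). A self-contained sketch of that composition pattern; the demo types and flags are invented for illustration.

use clap::Parser;

#[derive(Debug, Parser)]
struct DemoCacheClientConfig {
    /// Address used to discover the cache keyspace.
    #[clap(long = "parquet-cache-namespace-addr", default_value = "")]
    namespace_addr: String,
}

#[derive(Debug, Parser)]
struct DemoServerConfig {
    #[clap(long = "http-bind", default_value = "0.0.0.0:8080")]
    http_bind: String,

    /// All flags of the flattened struct become flags of this command too.
    #[clap(flatten)]
    cache: DemoCacheClientConfig,
}

fn main() {
    let cfg = DemoServerConfig::parse_from([
        "demo",
        "--parquet-cache-namespace-addr",
        "http://cache.svc:8080",
    ]);
    assert_eq!(cfg.cache.namespace_addr, "http://cache.svc:8080");
    assert_eq!(cfg.http_bind, "0.0.0.0:8080");
}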
For /// example: /// diff --git a/client_util/Cargo.toml b/client_util/Cargo.toml index 803a11001fc..8b2e12f6663 100644 --- a/client_util/Cargo.toml +++ b/client_util/Cargo.toml @@ -6,14 +6,17 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] -http = "0.2.9" -reqwest = { version = "0.11", default-features = false, features = ["stream", "rustls-tls"] } -thiserror = "1.0.48" +http = "0.2.11" +reqwest = { version = "0.11", default-features = false, features = ["stream", "rustls-tls-native-roots"] } +thiserror = "1.0.56" tonic = { workspace = true } tower = "0.4" workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] -tokio = { version = "1.32", features = ["macros", "parking_lot", "rt-multi-thread"] } +tokio = { version = "1.35", features = ["macros", "parking_lot", "rt-multi-thread"] } mockito = { version = "1.2", default-features = false } diff --git a/data_types/Cargo.toml b/data_types/Cargo.toml index 5eebe4050da..c38745ce4e5 100644 --- a/data_types/Cargo.toml +++ b/data_types/Cargo.toml @@ -6,27 +6,36 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] +arrow-buffer = { workspace = true } +bytes = "1.5" chrono = { version = "0.4", default-features = false } croaring = "1.0.0" influxdb-line-protocol = { path = "../influxdb_line_protocol" } iox_time = { path = "../iox_time" } generated_types = { path = "../generated_types" } +murmur3 = "0.5.2" observability_deps = { path = "../observability_deps" } once_cell = "1" -ordered-float = "3" +ordered-float = "4" +percent-encoding = "2.3.1" +prost = { workspace = true } schema = { path = "../schema" } -sha2 = "0.10" -sqlx = { version = "0.7.1", features = ["runtime-tokio-rustls", "postgres", "uuid"] } -thiserror = "1.0.48" +serde_json = "1.0" +siphasher = "1.0" +sha2 = { version = "0.10", default-features = false } +snafu = "0.8" +sqlx = { version = "0.7.3", features = ["runtime-tokio-rustls", "postgres", "uuid"] } +thiserror = "1.0.56" uuid = { version = "1", features = ["v4"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } -percent-encoding = "2.2.0" -serde = { version = "1.0.188", features = ["derive"] } [dev-dependencies] # In alphabetical order assert_matches = "1" paste = "1.0.14" -proptest = { version = "1.2.0", default-features = false } +proptest = { version = "1.4.0", default-features = false } test_helpers = { path = "../test_helpers" } hex = "0.4.2" diff --git a/data_types/src/columns.rs b/data_types/src/columns.rs index 958287317ca..1c6b0a91adc 100644 --- a/data_types/src/columns.rs +++ b/data_types/src/columns.rs @@ -1,14 +1,17 @@ //! Types having to do with columns. 
use super::TableId; -use generated_types::influxdata::iox::{gossip, schema::v1 as proto}; +use generated_types::influxdata::iox::{column_type::v1 as proto, gossip}; use influxdb_line_protocol::FieldValue; -use schema::{builder::SchemaBuilder, InfluxColumnType, InfluxFieldType, Schema}; -use serde::{Deserialize, Serialize}; +use schema::{builder::SchemaBuilder, sort::SortKey, InfluxColumnType, InfluxFieldType, Schema}; +use snafu::Snafu; +use std::cmp::Ordering; +use std::collections::HashSet; use std::{ collections::{BTreeMap, BTreeSet, HashMap}, convert::TryFrom, ops::Deref, + sync::Arc, }; /// Unique ID for a `Column` @@ -27,11 +30,11 @@ impl ColumnId { } /// Column definitions for a table indexed by their name -#[derive(Debug, Clone, Eq, PartialEq, Hash)] -pub struct ColumnsByName(BTreeMap); +#[derive(Debug, Clone, Eq, PartialEq, Hash, Default)] +pub struct ColumnsByName(BTreeMap, ColumnSchema>); -impl From> for ColumnsByName { - fn from(value: BTreeMap) -> Self { +impl From, ColumnSchema>> for ColumnsByName { + fn from(value: BTreeMap, ColumnSchema>) -> Self { Self(value) } } @@ -44,7 +47,7 @@ impl ColumnsByName { .into_iter() .map(|c| { ( - c.name, + Arc::from(c.name), ColumnSchema { id: c.id, column_type: c.column_type, @@ -60,13 +63,13 @@ impl ColumnsByName { /// # Panics /// /// This method panics if a column of the same name already exists in `self`. - pub fn add_column(&mut self, column_name: String, column_schema: ColumnSchema) { - let old = self.0.insert(column_name, column_schema); + pub fn add_column(&mut self, column_name: impl Into>, column_schema: ColumnSchema) { + let old = self.0.insert(column_name.into(), column_schema); assert!(old.is_none()); } /// Iterate over the names and columns. - pub fn iter(&self) -> impl Iterator { + pub fn iter(&self) -> impl Iterator, &ColumnSchema)> { self.0.iter() } @@ -83,7 +86,7 @@ impl ColumnsByName { /// Return the set of column names. Used in combination with a write operation's /// column names to determine whether a write would exceed the max allowed columns. pub fn names(&self) -> BTreeSet<&str> { - self.0.keys().map(|name| name.as_str()).collect() + self.0.keys().map(|name| name.as_ref()).collect() } /// Return an iterator of the set of column IDs. @@ -92,9 +95,16 @@ impl ColumnsByName { } /// Return column ids of the given column names - /// Will panic if any of the names are not found - pub fn ids_for_names(&self, names: &[&str]) -> SortedColumnSet { - SortedColumnSet::from(names.iter().map(|name| { + /// + /// # Panics + /// + /// Panics if any of the names are not found in this set. + pub fn ids_for_names(&self, names: impl IntoIterator + Send) -> SortKeyIds + where + T: AsRef, + { + SortKeyIds::from(names.into_iter().map(|name| { + let name = name.as_ref(); self.get(name) .unwrap_or_else(|| panic!("column name not found: {}", name)) .id @@ -107,26 +117,32 @@ impl ColumnsByName { self.0.get(name) } + /// Get the `ColumnId` for the time column, if present (a table created through + /// `table_load_or_create` will always have a time column). + pub fn time_column_id(&self) -> Option { + self.get(schema::TIME_COLUMN_NAME).map(|column| column.id) + } + /// Create `ID->name` map for columns. 
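`ids_for_names` above now accepts any iterator of string-like names rather than `&[&str]`. A standalone sketch of that generic lookup shape, using a plain map of `Arc<str>` keys to integer IDs in place of `ColumnsByName`/`SortKeyIds`.

use std::{collections::BTreeMap, sync::Arc};

fn ids_for_names<T>(
    columns: &BTreeMap<Arc<str>, i64>,
    names: impl IntoIterator<Item = T>,
) -> Vec<i64>
where
    T: AsRef<str>,
{
    names
        .into_iter()
        .map(|name| {
            let name = name.as_ref();
            *columns
                .get(name)
                .unwrap_or_else(|| panic!("column name not found: {name}"))
        })
        .collect()
}

fn main() {
    let columns: BTreeMap<Arc<str>, i64> =
        [("foo".into(), 1), ("bar".into(), 2)].into_iter().collect();

    // &str, String and Arc<str> callers all work through AsRef<str>.
    assert_eq!(ids_for_names(&columns, ["bar", "foo"]), vec![2, 1]);
    assert_eq!(ids_for_names(&columns, vec!["foo".to_string()]), vec![1]);
}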
- pub fn id_map(&self) -> HashMap { + pub fn id_map(&self) -> HashMap> { self.0 .iter() - .map(|(name, c)| (c.id, name.as_str())) + .map(|(name, c)| (c.id, Arc::clone(name))) .collect() } } impl IntoIterator for ColumnsByName { - type Item = (String, ColumnSchema); - type IntoIter = std::collections::btree_map::IntoIter; + type Item = (Arc, ColumnSchema); + type IntoIter = std::collections::btree_map::IntoIter, ColumnSchema>; fn into_iter(self) -> Self::IntoIter { self.0.into_iter() } } -impl FromIterator<(String, ColumnSchema)> for ColumnsByName { - fn from_iter>(iter: T) -> Self { +impl FromIterator<(Arc, ColumnSchema)> for ColumnsByName { + fn from_iter, ColumnSchema)>>(iter: T) -> Self { Self(BTreeMap::from_iter(iter)) } } @@ -140,7 +156,7 @@ impl TryFrom for Schema { for (column_name, column_schema) in value.into_iter() { let t = InfluxColumnType::from(column_schema.column_type); - builder.influx_column(column_name, t); + builder.influx_column(column_name.as_ref(), t); } builder.build() @@ -167,7 +183,7 @@ impl Column { } /// returns true if the column type matches the line protocol field value type - pub fn matches_field_type(&self, field_value: &FieldValue) -> bool { + pub fn matches_field_type(&self, field_value: &FieldValue<'_>) -> bool { match field_value { FieldValue::I64(_) => self.column_type == ColumnType::I64, FieldValue::U64(_) => self.column_type == ColumnType::U64, @@ -194,7 +210,7 @@ impl ColumnSchema { } /// returns true if the column matches the line protocol field value type - pub fn matches_field_type(&self, field_value: &FieldValue) -> bool { + pub fn matches_field_type(&self, field_value: &FieldValue<'_>) -> bool { matches!( (field_value, self.column_type), (FieldValue::I64(_), ColumnType::I64) @@ -224,9 +240,7 @@ impl TryFrom<&gossip::v1::Column> for ColumnSchema { /// The column data type #[allow(missing_docs)] -#[derive( - Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash, sqlx::Type, Serialize, Deserialize, -)] +#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash, sqlx::Type)] #[repr(i16)] pub enum ColumnType { I64 = 1, @@ -261,8 +275,14 @@ impl std::fmt::Display for ColumnType { } } +/// Errors deserialising a protobuf serialised [`ColumnType`]. 
+#[derive(Debug, Snafu)] +#[snafu(display("invalid column value"))] +#[allow(missing_copy_implementations)] +pub struct ColumnTypeProtoError {} + impl TryFrom for ColumnType { - type Error = Box; + type Error = ColumnTypeProtoError; fn try_from(value: i16) -> Result { match value { @@ -273,7 +293,7 @@ impl TryFrom for ColumnType { x if x == Self::String as i16 => Ok(Self::String), x if x == Self::Time as i16 => Ok(Self::Time), x if x == Self::Tag as i16 => Ok(Self::Tag), - _ => Err("invalid column value".into()), + _ => Err(ColumnTypeProtoError {}), } } } @@ -321,7 +341,7 @@ impl PartialEq for ColumnType { } /// Returns the `ColumnType` for the passed in line protocol `FieldValue` type -pub fn column_type_from_field(field_value: &FieldValue) -> ColumnType { +pub fn column_type_from_field(field_value: &FieldValue<'_>) -> ColumnType { match field_value { FieldValue::I64(_) => ColumnType::I64, FieldValue::U64(_) => ColumnType::U64, @@ -331,27 +351,43 @@ pub fn column_type_from_field(field_value: &FieldValue) -> ColumnType { } } -impl TryFrom for ColumnType { - type Error = Box; +impl TryFrom for ColumnType { + type Error = &'static str; - fn try_from(value: proto::column_schema::ColumnType) -> Result { + fn try_from(value: proto::ColumnType) -> Result { Ok(match value { - proto::column_schema::ColumnType::I64 => ColumnType::I64, - proto::column_schema::ColumnType::U64 => ColumnType::U64, - proto::column_schema::ColumnType::F64 => ColumnType::F64, - proto::column_schema::ColumnType::Bool => ColumnType::Bool, - proto::column_schema::ColumnType::String => ColumnType::String, - proto::column_schema::ColumnType::Time => ColumnType::Time, - proto::column_schema::ColumnType::Tag => ColumnType::Tag, - proto::column_schema::ColumnType::Unspecified => { - return Err("unknown column type".into()) - } + proto::ColumnType::I64 => Self::I64, + proto::ColumnType::U64 => Self::U64, + proto::ColumnType::F64 => Self::F64, + proto::ColumnType::Bool => Self::Bool, + proto::ColumnType::String => Self::String, + proto::ColumnType::Time => Self::Time, + proto::ColumnType::Tag => Self::Tag, + proto::ColumnType::Unspecified => return Err("unknown column type"), }) } } +impl From for proto::ColumnType { + fn from(value: ColumnType) -> Self { + match value { + ColumnType::I64 => Self::I64, + ColumnType::U64 => Self::U64, + ColumnType::F64 => Self::F64, + ColumnType::Bool => Self::Bool, + ColumnType::String => Self::String, + ColumnType::Time => Self::Time, + ColumnType::Tag => Self::Tag, + } + } +} + /// Set of columns and used as Set data type. -/// Its inner is implemneted as a vector because postgres does not have set type +/// +/// # Data Structure +/// This is internally implemented as a sorted vector. The sorting allows for fast [`PartialEq`]/[`Eq`]/[`Hash`] and +/// ensures that the PostgreSQL data is deterministic. Note that PostgreSQL does NOT have a set type at the moment, so +/// this is stored as an array. 
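The doc comment above notes that `ColumnSet` keeps its IDs in a sorted `Vec`, which is what makes cheap equality/hashing and merge-style set operations possible. A simplified, standalone illustration of union over two sorted ID lists; the implementation that follows merges in place rather than allocating a new vector.

use std::cmp::Ordering;

/// Union of two ascending, duplicate-free ID lists, also ascending.
fn union_sorted(a: &[i64], b: &[i64]) -> Vec<i64> {
    let mut out = Vec::with_capacity(a.len() + b.len());
    let (mut i, mut j) = (0, 0);
    while i < a.len() && j < b.len() {
        match a[i].cmp(&b[j]) {
            Ordering::Less => {
                out.push(a[i]);
                i += 1;
            }
            Ordering::Greater => {
                out.push(b[j]);
                j += 1;
            }
            Ordering::Equal => {
                out.push(a[i]);
                i += 1;
                j += 1;
            }
        }
    }
    out.extend_from_slice(&a[i..]);
    out.extend_from_slice(&b[j..]);
    out
}

fn main() {
    assert_eq!(
        union_sorted(&[1, 2, 5, 7], &[1, 5, 6, 7, 8]),
        vec![1, 2, 5, 6, 7, 8]
    );
}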
#[derive(Debug, Clone, PartialEq, Eq, Hash, sqlx::Type)] #[sqlx(transparent, no_pg_array)] pub struct ColumnSet(Vec); @@ -370,16 +406,21 @@ impl ColumnSet { let mut columns: Vec = columns.into_iter().collect(); columns.sort(); - let len_pre_dedup = columns.len(); - columns.dedup(); - let len_post_dedup = columns.len(); - assert_eq!(len_pre_dedup, len_post_dedup, "set contains duplicates"); + assert!( + columns.windows(2).all(|w| w[0] != w[1]), + "set contains duplicates" + ); columns.shrink_to_fit(); Self(columns) } + /// Create a new empty [`ColumnSet`] + pub fn empty() -> Self { + Self(Vec::new()) + } + /// Estimate the memory consumption of this object and its contents pub fn size(&self) -> usize { std::mem::size_of_val(self) + (std::mem::size_of::() * self.0.capacity()) @@ -389,6 +430,66 @@ impl ColumnSet { pub fn is_empty(&self) -> bool { self.0.is_empty() } + + /// Computes the union of `self` and `other` + pub fn union(&mut self, other: &Self) { + let mut insert_idx = 0; + let mut src_idx = 0; + + while insert_idx < self.0.len() && src_idx < other.0.len() { + let s = self.0[insert_idx]; + let o = other.0[src_idx]; + + match s.cmp(&o) { + Ordering::Less => insert_idx += 1, + Ordering::Equal => { + insert_idx += 1; + src_idx += 1; + } + Ordering::Greater => { + self.0.insert(insert_idx, o); + insert_idx += 1; + src_idx += 1; + } + } + } + self.0.extend_from_slice(&other.0[src_idx..]); + } + + /// Returns the indices and ids in `self` that are present in both `self` and `other` + /// + /// ``` + /// # use data_types::{ColumnId, ColumnSet}; + /// let a = ColumnSet::new([1, 2, 4, 6, 7].into_iter().map(ColumnId::new)); + /// let b = ColumnSet::new([2, 4, 6].into_iter().map(ColumnId::new)); + /// + /// assert_eq!( + /// a.intersect(&b).collect::>(), + /// vec![(1, b[0]), (2, b[1]), (3, b[2])] + /// ) + /// ``` + pub fn intersect<'a>( + &'a self, + other: &'a Self, + ) -> impl Iterator + 'a { + let mut left_idx = 0; + let mut right_idx = 0; + std::iter::from_fn(move || loop { + let s = self.0.get(left_idx)?; + let o = other.get(right_idx)?; + + match s.cmp(o) { + Ordering::Less => left_idx += 1, + Ordering::Greater => right_idx += 1, + Ordering::Equal => { + let t = left_idx; + left_idx += 1; + right_idx += 1; + return Some((t, *s)); + } + } + }) + } } impl From for Vec { @@ -405,12 +506,13 @@ impl Deref for ColumnSet { } } -/// Set of sorted columns in a specific given order at created time +/// Set of sorted column IDs in a specific given order at creation time, to be used as a +/// [`SortKey`] by looking up the column names in the table's schema. #[derive(Debug, Clone, PartialEq, Eq, Hash, sqlx::Type, Default)] #[sqlx(transparent, no_pg_array)] -pub struct SortedColumnSet(Vec); +pub struct SortKeyIds(Vec); -impl SortedColumnSet { +impl SortKeyIds { /// Create new sorted column set. /// /// The order of the passed columns will be preserved. @@ -423,32 +525,143 @@ impl SortedColumnSet { { let mut columns: Vec = columns.into_iter().collect(); - // verify if there are duplicates - let mut columns_sorted = columns.clone(); - columns_sorted.sort(); - let len_pre_dedup = columns_sorted.len(); - columns_sorted.dedup(); - let len_post_dedup = columns_sorted.len(); - assert_eq!(len_pre_dedup, len_post_dedup, "set contains duplicates"); + // Validate the ID set contains no duplicates. + // + // This validates an invariant in debug builds, skipping the cost + // for release builds. 
+ if cfg!(debug_assertions) { + SortKeyIds::check_for_deplicates(&columns); + } // Must continue with columns in original order columns.shrink_to_fit(); + Self(columns) } + /// Given another set of sort key IDs, merge them together and, if needed, return a value to + /// use to update the catalog. + /// + /// If `other` contains any column IDs that are not present in `self`, create a new + /// `SortKeyIds` instance that includes the new columns in `other` (in the same order they + /// appear in `other`) appended to the existing columns, but keeping the time column ID last. + /// + /// If existing columns appear in `self` in a different order than they appear in `other`, the + /// order in `self` takes precedence and remains unchanged. + /// + /// If `self` contains all the sort keys in `other` already (regardless of order), this will + /// return `None` as no update to the catalog is needed. + pub fn maybe_append(&self, other: &Self, time_column_id: ColumnId) -> Option { + let existing_columns_without_time = self + .iter() + .cloned() + .filter(|&column_id| column_id != time_column_id); + + let mut new_columns = other + .iter() + .cloned() + .filter(|column_id| !self.contains(column_id)) + .peekable(); + + if new_columns.peek().is_none() { + None + } else { + Some(SortKeyIds::new( + existing_columns_without_time + .chain(new_columns) + .chain(std::iter::once(time_column_id)), + )) + } + } + /// Estimate the memory consumption of this object and its contents pub fn size(&self) -> usize { std::mem::size_of_val(self) + (std::mem::size_of::() * self.0.capacity()) } + + /// Build a [`SortKey`] from [`SortKeyIds`]; looking up column names in the provided + /// [`ColumnsByName`] map by converting it to a `HashMap. If you already have + /// an id-to-name column map, use [`SortKeyIds::to_sort_key_using_map`] instead. + /// + /// If you have a [`Partition`][super::Partition], it may be more convenient to call the + /// [`Partition::sort_key`][super::Partition::sort_key] method instead! + /// + /// # Panics + /// + /// Will panic if an ID isn't found in the column map. + pub fn to_sort_key(&self, columns: &ColumnsByName) -> SortKey { + let column_id_map = columns.id_map(); + self.to_sort_key_using_map(&column_id_map) + } + + /// Build a [`SortKey`] from [`SortKeyIds`]; looking up column names in the provided + /// [`HashMap`] map. + /// + /// If you have a [`Partition`][super::Partition], it may be more convenient to call the + /// [`Partition::sort_key`][super::Partition::sort_key] method instead! + /// + /// # Panics + /// + /// Will panic if an ID isn't found in the column map. + pub fn to_sort_key_using_map(&self, column_id_map: &HashMap>) -> SortKey { + SortKey::from_columns(self.0.iter().map(|id| { + Arc::clone( + column_id_map.get(id).unwrap_or_else(|| { + panic!("cannot find column names for sort key id {}", id.get()) + }), + ) + })) + } + + /// Returns `true` if `other` is a monotonic update of `self`. + /// + /// # Panics + /// + /// Assumes "time" is the last column in both sets, and panics if the last + /// columns are not identical. + pub fn is_monotonic_update(&self, other: &Self) -> bool { + // The SortKeyIds always reference the time column last (if set). + if self.0.last().is_some() { + assert_eq!( + self.0.last(), + other.last(), + "last column in sort IDs must be time, and cannot change" + ); + } + + // Ensure the values in other are a prefix match, with the exception of + // the last "time" column. 
+ self.0.len() <= other.len() + && self + .0 + .iter() + .take(self.0.len().saturating_sub(1)) + .zip(other.iter()) + .all(|(a, b)| a == b) + } + + fn check_for_deplicates(columns: &[ColumnId]) { + let mut column_ids: HashSet = HashSet::with_capacity(columns.len()); + for c in columns { + match column_ids.get(&c.0) { + Some(_) => { + panic!("set contains duplicates"); + } + _ => { + column_ids.insert(c.0); + } + } + } + } } -impl From for Vec { - fn from(set: SortedColumnSet) -> Self { +impl From for Vec { + fn from(set: SortKeyIds) -> Self { set.0 } } -impl Deref for SortedColumnSet { +impl Deref for SortKeyIds { type Target = [ColumnId]; fn deref(&self) -> &Self::Target { @@ -456,7 +669,7 @@ impl Deref for SortedColumnSet { } } -impl From for SortedColumnSet +impl From for SortKeyIds where I: IntoIterator, { @@ -465,9 +678,17 @@ where } } -impl From for Vec { - fn from(val: SortedColumnSet) -> Self { - val.0.into_iter().map(|id| id.get()).collect() +impl From<&SortKeyIds> for Vec { + fn from(val: &SortKeyIds) -> Self { + val.0.iter().map(|id| id.get()).collect() + } +} + +impl From<&SortKeyIds> for generated_types::influxdata::iox::catalog::v1::SortKeyIds { + fn from(val: &SortKeyIds) -> Self { + generated_types::influxdata::iox::catalog::v1::SortKeyIds { + array_sort_key_ids: val.into(), + } } } @@ -483,10 +704,46 @@ mod tests { ColumnSet::new([ColumnId::new(1), ColumnId::new(2), ColumnId::new(1)]); } + #[test] + fn test_column_set_eq() { + let set_1 = ColumnSet::new([ColumnId::new(1), ColumnId::new(2)]); + let set_2 = ColumnSet::new([ColumnId::new(2), ColumnId::new(1)]); + assert_eq!(set_1, set_2); + } + + #[test] + fn test_column_set_union_intersect() { + let a = ColumnSet::new([1, 2, 5, 7].into_iter().map(ColumnId::new)); + let b = ColumnSet::new([1, 5, 6, 7, 8].into_iter().map(ColumnId::new)); + + let mut t = ColumnSet::empty(); + t.union(&a); + assert_eq!(t, a); + + assert_eq!( + t.intersect(&a).collect::>(), + vec![(0, a[0]), (1, a[1]), (2, a[2]), (3, a[3])] + ); + + t.union(&b); + let expected = ColumnSet::new([1, 2, 5, 6, 7, 8].into_iter().map(ColumnId::new)); + assert_eq!(t, expected); + + assert_eq!( + t.intersect(&a).collect::>(), + vec![(0, a[0]), (1, a[1]), (2, a[2]), (4, a[3])] + ); + + assert_eq!( + t.intersect(&b).collect::>(), + vec![(0, b[0]), (2, b[1]), (3, b[2]), (4, b[3]), (5, b[4])] + ); + } + #[test] #[should_panic = "set contains duplicates"] fn test_sorted_column_set_duplicates() { - SortedColumnSet::new([ + SortKeyIds::new([ ColumnId::new(2), ColumnId::new(1), ColumnId::new(3), @@ -496,7 +753,7 @@ mod tests { #[test] fn test_sorted_column_set() { - let set = SortedColumnSet::new([ColumnId::new(2), ColumnId::new(1), ColumnId::new(3)]); + let set = SortKeyIds::new([ColumnId::new(2), ColumnId::new(1), ColumnId::new(3)]); // verify the order is preserved assert_eq!(set[0], ColumnId::new(2)); assert_eq!(set[1], ColumnId::new(1)); @@ -506,35 +763,35 @@ mod tests { #[test] fn test_column_schema() { assert_eq!( - ColumnType::try_from(proto::column_schema::ColumnType::I64).unwrap(), + ColumnType::try_from(proto::ColumnType::I64).unwrap(), ColumnType::I64, ); assert_eq!( - ColumnType::try_from(proto::column_schema::ColumnType::U64).unwrap(), + ColumnType::try_from(proto::ColumnType::U64).unwrap(), ColumnType::U64, ); assert_eq!( - ColumnType::try_from(proto::column_schema::ColumnType::F64).unwrap(), + ColumnType::try_from(proto::ColumnType::F64).unwrap(), ColumnType::F64, ); assert_eq!( - ColumnType::try_from(proto::column_schema::ColumnType::Bool).unwrap(), + 
ColumnType::try_from(proto::ColumnType::Bool).unwrap(), ColumnType::Bool, ); assert_eq!( - ColumnType::try_from(proto::column_schema::ColumnType::String).unwrap(), + ColumnType::try_from(proto::ColumnType::String).unwrap(), ColumnType::String, ); assert_eq!( - ColumnType::try_from(proto::column_schema::ColumnType::Time).unwrap(), + ColumnType::try_from(proto::ColumnType::Time).unwrap(), ColumnType::Time, ); assert_eq!( - ColumnType::try_from(proto::column_schema::ColumnType::Tag).unwrap(), + ColumnType::try_from(proto::ColumnType::Tag).unwrap(), ColumnType::Tag, ); - assert!(ColumnType::try_from(proto::column_schema::ColumnType::Unspecified).is_err()); + assert!(ColumnType::try_from(proto::ColumnType::Unspecified).is_err()); } #[test] @@ -567,50 +824,50 @@ mod tests { fn test_columns_by_names_exist() { let columns = build_columns_by_names(); - let ids = columns.ids_for_names(&["foo", "bar"]); - assert_eq!(ids, SortedColumnSet::from([1, 2])); + let ids = columns.ids_for_names(["foo", "bar"]); + assert_eq!(ids, SortKeyIds::from([1, 2])); } #[test] fn test_columns_by_names_exist_different_order() { let columns = build_columns_by_names(); - let ids = columns.ids_for_names(&["bar", "foo"]); - assert_eq!(ids, SortedColumnSet::from([2, 1])); + let ids = columns.ids_for_names(["bar", "foo"]); + assert_eq!(ids, SortKeyIds::from([2, 1])); } #[test] #[should_panic = "column name not found: baz"] fn test_columns_by_names_not_exist() { let columns = build_columns_by_names(); - columns.ids_for_names(&["foo", "baz"]); + columns.ids_for_names(["foo", "baz"]); } fn build_columns_by_names() -> ColumnsByName { - let mut columns: BTreeMap = BTreeMap::new(); + let mut columns: BTreeMap, ColumnSchema> = BTreeMap::new(); columns.insert( - "foo".to_string(), + "foo".into(), ColumnSchema { id: ColumnId::new(1), column_type: ColumnType::I64, }, ); columns.insert( - "bar".to_string(), + "bar".into(), ColumnSchema { id: ColumnId::new(2), column_type: ColumnType::I64, }, ); columns.insert( - "time".to_string(), + "time".into(), ColumnSchema { id: ColumnId::new(3), column_type: ColumnType::Time, }, ); columns.insert( - "tag1".to_string(), + "tag1".into(), ColumnSchema { id: ColumnId::new(4), column_type: ColumnType::Tag, @@ -619,4 +876,122 @@ mod tests { ColumnsByName(columns) } + + // panic if the sort_key_ids are not found in the columns + #[test] + #[should_panic(expected = "cannot find column names for sort key id 3")] + fn test_panic_build_sort_key_from_ids_and_map() { + // table columns + let uno = ColumnSchema { + id: ColumnId::new(1), + column_type: ColumnType::Tag, + }; + let dos = ColumnSchema { + id: ColumnId::new(2), + column_type: ColumnType::Tag, + }; + let mut column_map = ColumnsByName::default(); + column_map.add_column("uno", uno); + column_map.add_column("dos", dos); + + // sort_key_ids include some columns that are not in the columns + let sort_key_ids = SortKeyIds::from([2, 3]); + sort_key_ids.to_sort_key(&column_map); + } + + #[test] + fn test_build_sort_key_from_ids_and_map() { + // table columns + let uno = ColumnSchema { + id: ColumnId::new(1), + column_type: ColumnType::Tag, + }; + let dos = ColumnSchema { + id: ColumnId::new(2), + column_type: ColumnType::Tag, + }; + let tres = ColumnSchema { + id: ColumnId::new(3), + column_type: ColumnType::Tag, + }; + let mut column_map = ColumnsByName::default(); + column_map.add_column("uno", uno); + column_map.add_column("dos", dos); + column_map.add_column("tres", tres); + + // sort_key_ids is empty + let sort_key_ids = SortKeyIds::default(); + let 
sort_key = sort_key_ids.to_sort_key(&column_map); + assert_eq!(sort_key, SortKey::empty()); + + // sort_key_ids include all columns and in the same order + let sort_key_ids = SortKeyIds::from([1, 2, 3]); + let sort_key = sort_key_ids.to_sort_key(&column_map); + assert_eq!(sort_key, SortKey::from_columns(vec!["uno", "dos", "tres"])); + + // sort_key_ids include all columns but in different order + let sort_key_ids = SortKeyIds::from([2, 3, 1]); + let sort_key = sort_key_ids.to_sort_key(&column_map); + assert_eq!(sort_key, SortKey::from_columns(vec!["dos", "tres", "uno"])); + + // sort_key_ids include some columns + let sort_key_ids = SortKeyIds::from([2, 3]); + let sort_key = sort_key_ids.to_sort_key(&column_map); + assert_eq!(sort_key, SortKey::from_columns(vec!["dos", "tres"])); + + // sort_key_ids include some columns in different order + let sort_key_ids = SortKeyIds::from([3, 1]); + let sort_key = sort_key_ids.to_sort_key(&column_map); + assert_eq!(sort_key, SortKey::from_columns(vec!["tres", "uno"])); + } + + #[test] + fn test_sort_key_ids_round_trip_encoding() { + let original = SortKeyIds::from([1, 2, 3]); + + let encoded: generated_types::influxdata::iox::catalog::v1::SortKeyIds = (&original).into(); + + let decoded: SortKeyIds = encoded.array_sort_key_ids.into(); + assert_eq!(decoded, original); + } + + macro_rules! test_is_monotonic_update { + ( + $name:ident, + a = $a:expr, + b = $b:expr, + want = $want:expr + ) => { + paste::paste! { + #[test] + fn []() { + let a = SortKeyIds::new($a.into_iter().map(ColumnId::new)); + let b = SortKeyIds::new($b.into_iter().map(ColumnId::new)); + assert_eq!(a.is_monotonic_update(&b), $want) + } + } + }; + } + + test_is_monotonic_update!(equal, a = [42, 24, 1], b = [42, 24, 1], want = true); + + test_is_monotonic_update!(empty, a = [], b = [42, 24, 1], want = true); + + test_is_monotonic_update!( + longer_with_time, + a = [42, 24, 1], + b = [42, 24, 13, 1], + want = true + ); + + test_is_monotonic_update!(shorter_with_time, a = [42, 24, 1], b = [1], want = false); + + test_is_monotonic_update!( + mismatch_with_time, + a = [42, 24, 1], + b = [24, 42, 1], + want = false + ); + + test_is_monotonic_update!(mismatch, a = [42, 24, 1], b = [24, 42, 1], want = false); } diff --git a/data_types/src/lib.rs b/data_types/src/lib.rs index a158e704497..951af51d965 100644 --- a/data_types/src/lib.rs +++ b/data_types/src/lib.rs @@ -29,10 +29,13 @@ pub mod partition; pub use partition::*; pub mod sequence_number_set; pub mod service_limits; +pub mod snapshot; + pub use service_limits::*; use observability_deps::tracing::warn; use schema::TIME_COLUMN_NAME; +use snafu::Snafu; use std::{ borrow::Borrow, collections::{BTreeMap, BTreeSet, HashMap}, @@ -41,9 +44,16 @@ use std::{ mem::{self, size_of_val}, num::{FpCategory, NonZeroU64}, ops::{Add, Deref, Sub}, + sync::Arc, }; use uuid::Uuid; +/// Errors deserialising a protobuf serialised [`ParquetFile`]. 
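The macro-generated cases above exercise `SortKeyIds::is_monotonic_update`, defined earlier in this hunk: an update is monotonic when the existing key, ignoring its trailing time column, is a prefix of the new key. A standalone restatement of that check over plain integers.

/// `new` monotonically extends `current` if `current` minus its trailing time
/// column is a prefix of `new`, and `new` is at least as long.
fn is_monotonic_update(current: &[i64], new: &[i64]) -> bool {
    current.len() <= new.len()
        && current
            .iter()
            .take(current.len().saturating_sub(1))
            .zip(new.iter())
            .all(|(a, b)| a == b)
}

fn main() {
    // Same cases as the tests above (1 plays the role of the time column).
    assert!(is_monotonic_update(&[42, 24, 1], &[42, 24, 1]));
    assert!(is_monotonic_update(&[], &[42, 24, 1]));
    assert!(is_monotonic_update(&[42, 24, 1], &[42, 24, 13, 1]));
    assert!(!is_monotonic_update(&[42, 24, 1], &[1]));
    assert!(!is_monotonic_update(&[42, 24, 1], &[24, 42, 1]));
}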
+#[derive(Debug, Snafu)] +#[snafu(display("invalid compaction level value"))] +#[allow(missing_copy_implementations)] +pub struct CompactionLevelProtoError {} + /// Compaction levels #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash, sqlx::Type)] #[repr(i16)] @@ -68,14 +78,14 @@ impl Display for CompactionLevel { } impl TryFrom for CompactionLevel { - type Error = Box; + type Error = CompactionLevelProtoError; fn try_from(value: i32) -> Result { match value { x if x == Self::Initial as i32 => Ok(Self::Initial), x if x == Self::FileNonOverlapped as i32 => Ok(Self::FileNonOverlapped), x if x == Self::Final as i32 => Ok(Self::Final), - _ => Err("invalid compaction level value".into()), + _ => Err(CompactionLevelProtoError {}), } } } @@ -131,7 +141,7 @@ impl NamespaceId { } impl std::fmt::Display for NamespaceId { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0) } } @@ -157,7 +167,7 @@ impl TableId { } impl std::fmt::Display for TableId { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0) } } @@ -268,12 +278,48 @@ impl ParquetFileId { } impl std::fmt::Display for ParquetFileId { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { // Use `self.number` to refer to each positional data point. write!(f, "{}", self.0) } } +/// Unique store UUID for a [`ParquetFile`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::Type)] +#[sqlx(transparent)] +pub struct ObjectStoreId(Uuid); + +#[allow(missing_docs)] +impl ObjectStoreId { + #[allow(clippy::new_without_default)] + pub fn new() -> Self { + Self::from_uuid(Uuid::new_v4()) + } + + pub fn from_uuid(uuid: Uuid) -> Self { + Self(uuid) + } + + pub fn get_uuid(&self) -> Uuid { + self.0 + } +} + +impl std::fmt::Display for ObjectStoreId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +impl std::str::FromStr for ObjectStoreId { + type Err = uuid::Error; + + fn from_str(s: &str) -> Result { + let uuid = Uuid::parse_str(s)?; + Ok(Self::from_uuid(uuid)) + } +} + /// Data object for a namespace #[derive(Debug, Clone, PartialEq, sqlx::FromRow)] pub struct Namespace { @@ -352,35 +398,23 @@ impl NamespaceSchema { impl From<&NamespaceSchema> for generated_types::influxdata::iox::schema::v1::NamespaceSchema { fn from(schema: &NamespaceSchema) -> Self { - use generated_types::influxdata::iox::schema::v1 as proto; - Self { - id: schema.id.get(), - tables: schema - .tables - .iter() - .map(|(name, t)| { - ( - name.clone(), - proto::TableSchema { - id: t.id.get(), - columns: t - .columns - .iter() - .map(|(name, c)| { - ( - name.clone(), - proto::ColumnSchema { - id: c.id.get(), - column_type: c.column_type as i32, - }, - ) - }) - .collect(), - }, - ) - }) - .collect(), - } + namespace_schema_proto(schema.id, schema.tables.iter()) + } +} + +/// Generate [`NamespaceSchema`] protobuf from a `NamespaceId` and a list of tables. Useful to +/// filter the tables returned from an API request to a particular table without needing to clone +/// the whole `NamespaceSchema` to use the `From` impl. 
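As a rough usage sketch (not part of this patch) of the helper described in the doc comment above — assuming `namespace_schema_proto` is re-exported from `data_types` and that `tables` yields `(&String, &TableSchema)` pairs, as the `From<&NamespaceSchema>` impl above suggests — a caller can serialise a single table without cloning the whole schema. The function definition itself continues below.

    use data_types::{namespace_schema_proto, NamespaceSchema};
    use generated_types::influxdata::iox::schema::v1 as proto;

    /// Build a proto `NamespaceSchema` containing only `table_name`, without
    /// cloning the in-memory `NamespaceSchema`. Hypothetical helper, for
    /// illustration only.
    fn single_table_proto(schema: &NamespaceSchema, table_name: &str) -> proto::NamespaceSchema {
        namespace_schema_proto(
            schema.id,
            schema
                .tables
                .iter()
                .filter(|(name, _)| name.as_str() == table_name),
        )
    }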
+pub fn namespace_schema_proto<'a>( + id: NamespaceId, + tables: impl Iterator, +) -> generated_types::influxdata::iox::schema::v1::NamespaceSchema { + use generated_types::influxdata::iox::schema::v1 as proto; + proto::NamespaceSchema { + id: id.get(), + tables: tables + .map(|(name, t)| (name.clone(), proto::TableSchema::from(t))) + .collect(), } } @@ -428,7 +462,7 @@ impl TableSchema { Self { id: table.id, partition_template: table.partition_template.clone(), - columns: ColumnsByName::new([]), + columns: ColumnsByName::default(), } } @@ -458,7 +492,11 @@ impl TableSchema { /// /// This method panics if a column of the same name already exists in /// `self`. - pub fn add_column_schema(&mut self, column_name: String, column_schema: ColumnSchema) { + pub fn add_column_schema( + &mut self, + column_name: impl Into>, + column_schema: ColumnSchema, + ) { self.columns.add_column(column_name, column_schema); } @@ -468,12 +506,12 @@ impl TableSchema { + self .columns .iter() - .map(|(k, v)| size_of_val(k) + k.capacity() + size_of_val(v)) + .map(|(k, v)| size_of_val(k) + k.as_ref().len() + size_of_val(v)) .sum::() } /// Create `ID->name` map for columns. - pub fn column_id_map(&self) -> HashMap { + pub fn column_id_map(&self) -> HashMap> { self.columns.id_map() } @@ -494,6 +532,29 @@ impl TableSchema { } } +impl From<&TableSchema> for generated_types::influxdata::iox::schema::v1::TableSchema { + fn from(table_schema: &TableSchema) -> Self { + use generated_types::influxdata::iox::schema::v1 as proto; + + Self { + id: table_schema.id.get(), + columns: table_schema + .columns + .iter() + .map(|(name, c)| { + ( + name.to_string(), + proto::ColumnSchema { + id: c.id.get(), + column_type: c.column_type as i32, + }, + ) + }) + .collect(), + } + } +} + /// Data recorded when compaction skips a partition. 
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::FromRow)] pub struct SkippedCompaction { @@ -515,8 +576,9 @@ pub struct SkippedCompaction { pub limit_num_files_first_in_partition: i64, } -use generated_types::influxdata::iox::compactor::v1 as compactor_proto; -impl From for compactor_proto::SkippedCompaction { +impl From + for generated_types::influxdata::iox::skipped_compaction::v1::SkippedCompaction +{ fn from(skipped_compaction: SkippedCompaction) -> Self { let SkippedCompaction { partition_id, @@ -537,7 +599,27 @@ impl From for compactor_proto::SkippedCompaction { limit_bytes, num_files, limit_num_files, - limit_num_files_first_in_partition: Some(limit_num_files_first_in_partition), + limit_num_files_first_in_partition, + } + } +} + +impl From + for SkippedCompaction +{ + fn from( + skipped_compaction: generated_types::influxdata::iox::skipped_compaction::v1::SkippedCompaction, + ) -> Self { + Self { + partition_id: PartitionId::new(skipped_compaction.partition_id), + reason: skipped_compaction.reason, + skipped_at: Timestamp::new(skipped_compaction.skipped_at), + estimated_bytes: skipped_compaction.estimated_bytes, + limit_bytes: skipped_compaction.limit_bytes, + num_files: skipped_compaction.num_files, + limit_num_files: skipped_compaction.limit_num_files, + limit_num_files_first_in_partition: skipped_compaction + .limit_num_files_first_in_partition, } } } @@ -552,10 +634,11 @@ pub struct ParquetFile { /// the table pub table_id: TableId, /// the partition identifier - #[sqlx(flatten)] - pub partition_id: TransitionPartitionId, + pub partition_id: PartitionId, + /// the optional partition hash id + pub partition_hash_id: Option, /// the uuid used in the object store path for this file - pub object_store_id: Uuid, + pub object_store_id: ObjectStoreId, /// the min timestamp of data in this file pub min_time: Timestamp, /// the max timestamp of data in this file @@ -608,9 +691,10 @@ impl ParquetFile { pub fn from_params(params: ParquetFileParams, id: ParquetFileId) -> Self { Self { id, + partition_id: params.partition_id, + partition_hash_id: params.partition_hash_id, namespace_id: params.namespace_id, table_id: params.table_id, - partition_id: params.partition_id, object_store_id: params.object_store_id, min_time: params.min_time, max_time: params.max_time, @@ -626,7 +710,13 @@ impl ParquetFile { /// Estimate the memory consumption of this object and its contents pub fn size(&self) -> usize { - std::mem::size_of_val(self) + self.partition_id.size() + self.column_set.size() + let hash_id = self + .partition_hash_id + .as_ref() + .map(|x| x.size()) + .unwrap_or_default(); + + std::mem::size_of_val(self) + hash_id + self.column_set.size() - std::mem::size_of_val(&self.column_set) } @@ -661,6 +751,11 @@ impl ParquetFile { } false } + + /// Temporary to aid incremental migration + pub fn transition_partition_id(&self) -> TransitionPartitionId { + TransitionPartitionId::from_parts(self.partition_id, self.partition_hash_id.clone()) + } } impl From for generated_types::influxdata::iox::catalog::v1::ParquetFile { @@ -669,7 +764,11 @@ impl From for generated_types::influxdata::iox::catalog::v1::Parque id: v.id.get(), namespace_id: v.namespace_id.get(), table_id: v.table_id.get(), - partition_identifier: Some(v.partition_id.into()), + partition_id: v.partition_id.get(), + partition_hash_id: v + .partition_hash_id + .map(|x| x.as_bytes().to_vec()) + .unwrap_or_default(), object_store_id: v.object_store_id.to_string(), min_time: v.min_time.get(), max_time: v.max_time.get(), @@ 
-700,40 +799,8 @@ pub enum ParquetFileProtoError { InvalidObjectStoreId(uuid::Error), /// The specified compaction level value is invalid. - #[error("invalid compaction level: {0}")] - InvalidCompactionLevel(Box), -} - -impl TryFrom for ParquetFile { - type Error = ParquetFileProtoError; - - fn try_from( - v: generated_types::influxdata::iox::catalog::v1::ParquetFile, - ) -> Result { - Ok(Self { - id: ParquetFileId::new(v.id), - namespace_id: NamespaceId::new(v.namespace_id), - table_id: TableId::new(v.table_id), - partition_id: TransitionPartitionId::try_from( - v.partition_identifier - .ok_or(ParquetFileProtoError::NoPartitionId)?, - )?, - object_store_id: v - .object_store_id - .parse() - .map_err(ParquetFileProtoError::InvalidObjectStoreId)?, - min_time: Timestamp::new(v.min_time), - max_time: Timestamp::new(v.max_time), - to_delete: v.to_delete.map(Timestamp::new), - file_size_bytes: v.file_size_bytes, - row_count: v.row_count, - compaction_level: CompactionLevel::try_from(v.compaction_level) - .map_err(ParquetFileProtoError::InvalidCompactionLevel)?, - created_at: Timestamp::new(v.created_at), - column_set: ColumnSet::new(v.column_set.into_iter().map(ColumnId::new)), - max_l0_created_at: Timestamp::new(v.max_l0_created_at), - }) - } + #[error(transparent)] + InvalidCompactionLevel(#[from] CompactionLevelProtoError), } /// Data for a parquet file to be inserted into the catalog. @@ -744,9 +811,11 @@ pub struct ParquetFileParams { /// the table pub table_id: TableId, /// the partition identifier - pub partition_id: TransitionPartitionId, + pub partition_id: PartitionId, + /// the partition hash ID + pub partition_hash_id: Option, /// the uuid used in the object store path for this file - pub object_store_id: Uuid, + pub object_store_id: ObjectStoreId, /// the min timestamp of data in this file pub min_time: Timestamp, /// the max timestamp of data in this file @@ -765,25 +834,6 @@ pub struct ParquetFileParams { pub max_l0_created_at: Timestamp, } -impl From for ParquetFileParams { - fn from(value: ParquetFile) -> Self { - Self { - namespace_id: value.namespace_id, - table_id: value.table_id, - partition_id: value.partition_id, - object_store_id: value.object_store_id, - min_time: value.min_time, - max_time: value.max_time, - file_size_bytes: value.file_size_bytes, - row_count: value.row_count, - compaction_level: value.compaction_level, - created_at: value.created_at, - column_set: value.column_set, - max_l0_created_at: value.max_l0_created_at, - } - } -} - /// ID of a chunk. /// /// This ID is unique within a single partition. @@ -835,9 +885,9 @@ impl std::fmt::Display for ChunkId { } } -impl From for ChunkId { - fn from(uuid: Uuid) -> Self { - Self(uuid) +impl From for ChunkId { + fn from(id: ObjectStoreId) -> Self { + Self(id.get_uuid()) } } @@ -1405,9 +1455,12 @@ impl IsNan for f64 { pub enum Statistics { I64(StatValues), U64(StatValues), - F64(StatValues), Bool(StatValues), String(StatValues), + + /// For the purposes of min/max values of floats, NaN values are ignored (no + /// ordering is applied to NaNs). 
+ F64(StatValues), } impl Statistics { @@ -1706,6 +1759,16 @@ impl TimestampMinMax { || range.contains(self.max) || (self.min <= range.start && self.max >= range.end) } + + /// Returns the union of this range with `other` with the minimum of the `min`s + /// and the maximum of the `max`es + + pub fn union(&self, other: &Self) -> Self { + Self { + min: self.min.min(other.min), + max: self.max.max(other.max), + } + } } /// FileRange describes a range of files by the min/max time and the sum of their capacities. @@ -1726,7 +1789,6 @@ mod tests { use std::borrow::Cow; use ordered_float::OrderedFloat; - use proptest::{prelude::*, proptest}; #[test] fn test_chunk_id_new() { @@ -2661,7 +2723,7 @@ mod tests { let schema1 = TableSchema { id: TableId::new(1), partition_template: Default::default(), - columns: ColumnsByName::new([]), + columns: ColumnsByName::default(), }; let schema2 = TableSchema { id: TableId::new(2), @@ -2681,8 +2743,8 @@ mod tests { let schema1 = NamespaceSchema { id: NamespaceId::new(1), tables: BTreeMap::from([]), - max_tables: MaxTables::new(42), - max_columns_per_table: MaxColumnsPerTable::new(4), + max_tables: MaxTables::try_from(42).unwrap(), + max_columns_per_table: MaxColumnsPerTable::try_from(4).unwrap(), retention_period_ns: None, partition_template: Default::default(), }; @@ -2692,12 +2754,12 @@ mod tests { String::from("foo"), TableSchema { id: TableId::new(1), - columns: ColumnsByName::new([]), + columns: ColumnsByName::default(), partition_template: Default::default(), }, )]), - max_tables: MaxTables::new(42), - max_columns_per_table: MaxColumnsPerTable::new(4), + max_tables: MaxTables::try_from(42).unwrap(), + max_columns_per_table: MaxColumnsPerTable::try_from(4).unwrap(), retention_period_ns: None, partition_template: Default::default(), }; @@ -2734,77 +2796,4 @@ mod tests { assert_eq!(tr.start(), 1); assert_eq!(tr.end(), 1); } - - use crate::partition::tests::arbitrary_partition_id; - - prop_compose! { - /// Return an arbitrary [`Timestamp`]. - pub fn arbitrary_timestamp()(value in any::()) -> Timestamp { - Timestamp::new(value) - } - } - - fn arbitrary_compaction_level() -> impl prop::strategy::Strategy { - prop_oneof![ - Just(CompactionLevel::Initial), - Just(CompactionLevel::FileNonOverlapped), - Just(CompactionLevel::Final), - ] - } - - prop_compose! { - /// Return an arbitrary [`ParquetFile`] with a randomised values. - fn arbitrary_parquet_file()( - partition_id in arbitrary_partition_id(), - parquet_file_id in any::(), - namespace_id in any::(), - table_id in any::(), - min_time in arbitrary_timestamp(), - max_time in arbitrary_timestamp(), - to_delete in prop::option::of(arbitrary_timestamp()), - file_size_bytes in any::(), - row_count in any::(), - compaction_level in arbitrary_compaction_level(), - created_at in arbitrary_timestamp(), - column_set in prop::collection::vec(any::(), 0..10), - max_l0_created_at in arbitrary_timestamp(), - ) -> ParquetFile { - let column_set = ColumnSet::new(column_set.into_iter().map(ColumnId::new)); - - ParquetFile { - id: ParquetFileId::new(parquet_file_id), - namespace_id: NamespaceId::new(namespace_id), - table_id: TableId::new(table_id), - partition_id, - object_store_id: Uuid::new_v4(), - min_time, - max_time, - to_delete, - file_size_bytes, - row_count, - compaction_level, - created_at, - column_set, - max_l0_created_at, - } - } - } - - proptest! { - /// Assert a [`ParquetFile`] is round-trippable through proto - /// serialisation. 
- #[test] - fn prop_parquet_file_proto_round_trip(file in arbitrary_parquet_file()) { - use generated_types::influxdata::iox::catalog::v1 as proto; - - // Encoding is infallible - let encoded = proto::ParquetFile::from(file.clone()); - - // Decoding a valid proto ParquetFile is infallible. - let decoded = ParquetFile::try_from(encoded).unwrap(); - - // The deserialised value must match the input (round trippable) - assert_eq!(decoded, file); - } - } } diff --git a/data_types/src/namespace_name.rs b/data_types/src/namespace_name.rs index 04f462a7f90..e9e2e580512 100644 --- a/data_types/src/namespace_name.rs +++ b/data_types/src/namespace_name.rs @@ -131,6 +131,14 @@ impl<'a> NamespaceName<'a> { Ok(Self::new(format!("{}_{}", org, bucket))?) } + + /// Efficiently returns the string representation of this [`NamespaceName`]. + /// + /// If this [`NamespaceName`] contains an owned string, it is returned + /// without cloning. + pub fn into_string(self) -> String { + self.0.into_owned() + } } impl<'a> std::convert::From> for String { @@ -191,6 +199,23 @@ mod tests { .expect("failed on valid DB mapping"); assert_eq!(got.as_str(), "org_bucket"); + assert_eq!(got.into_string(), "org_bucket"); + } + + #[test] + fn test_into_string() { + // Ref type str + assert_eq!( + NamespaceName::new("bananas").unwrap().into_string(), + "bananas" + ); + // Owned type string + assert_eq!( + NamespaceName::new("bananas".to_string()) + .unwrap() + .into_string(), + "bananas" + ); } #[test] diff --git a/data_types/src/partition.rs b/data_types/src/partition.rs index 825fda0d22a..eb095241a28 100644 --- a/data_types/src/partition.rs +++ b/data_types/src/partition.rs @@ -1,8 +1,6 @@ //! Types having to do with partitions. -use crate::SortedColumnSet; - -use super::{TableId, Timestamp}; +use super::{ColumnsByName, SortKeyIds, TableId, Timestamp}; use schema::sort::SortKey; use sha2::Digest; @@ -22,6 +20,14 @@ pub enum TransitionPartitionId { } impl TransitionPartitionId { + /// Create a [`TransitionPartitionId`] from a [`PartitionId`] and optional [`PartitionHashId`] + pub fn from_parts(id: PartitionId, hash_id: Option) -> Self { + match hash_id { + Some(x) => Self::Deterministic(x), + None => Self::Deprecated(id), + } + } + /// Size in bytes including `self`. 
pub fn size(&self) -> usize { match self { @@ -63,15 +69,12 @@ where impl From<(PartitionId, Option<&PartitionHashId>)> for TransitionPartitionId { fn from((partition_id, partition_hash_id): (PartitionId, Option<&PartitionHashId>)) -> Self { - partition_hash_id - .cloned() - .map(TransitionPartitionId::Deterministic) - .unwrap_or_else(|| TransitionPartitionId::Deprecated(partition_id)) + Self::from_parts(partition_id, partition_hash_id.cloned()) } } impl std::fmt::Display for TransitionPartitionId { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Deprecated(old_partition_id) => write!(f, "{}", old_partition_id.0), Self::Deterministic(partition_hash_id) => write!(f, "{}", partition_hash_id), @@ -169,7 +172,7 @@ impl PartitionId { } impl std::fmt::Display for PartitionId { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0) } } @@ -279,7 +282,7 @@ const PARTITION_HASH_ID_SIZE_BYTES: usize = 32; pub struct PartitionHashId(Arc<[u8; PARTITION_HASH_ID_SIZE_BYTES]>); impl std::fmt::Display for PartitionHashId { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { for byte in &*self.0 { write!(f, "{:02x}", byte)?; } @@ -340,6 +343,11 @@ impl TryFrom<&[u8]> for PartitionHashId { impl PartitionHashId { /// Create a new `PartitionHashId`. pub fn new(table_id: TableId, partition_key: &PartitionKey) -> Self { + Self::from_raw(table_id, partition_key.as_bytes()) + } + + /// Create a new `PartitionHashId` + pub fn from_raw(table_id: TableId, key: &[u8]) -> Self { // The hash ID of a partition is the SHA-256 of the `TableId` then the `PartitionKey`. This // particular hash format was chosen so that there won't be collisions and this value can // be used to uniquely identify a Partition without needing to go to the catalog to get a @@ -357,7 +365,7 @@ impl PartitionHashId { assert_eq!(table_bytes.len(), 8); inner.update(table_bytes); - inner.update(partition_key.as_bytes()); + inner.update(key); Self(Arc::new(inner.finalize().into())) } @@ -440,37 +448,32 @@ pub struct Partition { /// the string key of the partition pub partition_key: PartitionKey, - // TODO: remove this field once the sort_key_ids is fully imlemented - /// vector of column names that describes how *every* parquet file - /// in this [`Partition`] is sorted. - pub sort_key: Vec, - - /// vector of column ids that describes how *every* parquet file - /// in this [`Partition`] is sorted. The sort_key contains all the + /// Vector of column IDs that describes how *every* parquet file + /// in this [`Partition`] is sorted. The sort key contains all the /// primary key (PK) columns that have been persisted, and nothing /// else. The PK columns are all `tag` columns and the `time` /// column. /// /// Even though it is possible for both the unpersisted data /// and/or multiple parquet files to contain different subsets of - /// columns, the partition's sort_key is guaranteed to be + /// columns, the partition's sort key is guaranteed to be /// "compatible" across all files. Compatible means that the /// parquet file is sorted in the same order as the partition - /// sort_key after removing any missing columns. + /// sort key after removing any missing columns. 
/// /// Partitions are initially created before any data is persisted - /// with an empty sort_key. The partition sort_key is updated as + /// with an empty sort key. The partition sort key is updated as /// needed when data is persisted to parquet files: both on the /// first persist when the sort key is empty, as on subsequent /// persist operations when new tags occur in newly inserted data. /// - /// Updating inserts new column into the existing order. The order + /// Updating inserts new columns into the existing sort key. The order /// of the existing columns relative to each other is NOT changed. /// /// For example, updating `A,B,C` to either `A,D,B,C` or `A,B,C,D` /// is legal. However, updating to `A,C,D,B` is not because the - /// relative order of B and C have been reversed. - pub sort_key_ids: SortedColumnSet, + /// relative order of B and C has been reversed. + sort_key_ids: SortKeyIds, /// The time at which the newest file of the partition is created pub new_file_at: Option, @@ -480,40 +483,13 @@ impl Partition { /// Create a new Partition data object from the given attributes. This constructor will take /// care of computing the [`PartitionHashId`]. /// - /// This is only appropriate to use in the in-memory catalog or in tests. - pub fn new_in_memory_only( - id: PartitionId, - table_id: TableId, - partition_key: PartitionKey, - sort_key: Vec, - sort_key_ids: SortedColumnSet, - new_file_at: Option, - ) -> Self { - let hash_id = PartitionHashId::new(table_id, &partition_key); - Self { - id, - hash_id: Some(hash_id), - table_id, - partition_key, - sort_key, - sort_key_ids, - new_file_at, - } - } - - /// The sqlite catalog has to define a `PartitionPod` type that's slightly different than - /// `Partition` because of what sqlite serialization is supported. This function is for - /// conversion between the `PartitionPod` type and `Partition` and should not be used anywhere - /// else. - /// - /// The in-memory catalog also creates the `Partition` directly from w - pub fn new_with_hash_id_from_sqlite_catalog_only( + /// This is only appropriate to use in the catalog or in tests. + pub fn new_catalog_only( id: PartitionId, hash_id: Option, table_id: TableId, partition_key: PartitionKey, - sort_key: Vec, - sort_key_ids: SortedColumnSet, + sort_key_ids: SortKeyIds, new_file_at: Option, ) -> Self { Self { @@ -521,7 +497,6 @@ impl Partition { hash_id, table_id, partition_key, - sort_key, sort_key_ids, new_file_at, } @@ -538,30 +513,31 @@ impl Partition { self.hash_id.as_ref() } - // TODO: remove this function after all PRs that teach compactor, ingester, - // and querier to use sort_key_ids are merged. - /// The sort key for the partition, if present, structured as a `SortKey` - pub fn sort_key(&self) -> Option { - if self.sort_key.is_empty() { - return None; - } - - Some(SortKey::from_columns(self.sort_key.iter().map(|s| &**s))) - } - - /// The sort_key_ids if present - pub fn sort_key_ids(&self) -> &SortedColumnSet { - &self.sort_key_ids - } - - /// The sort_key_ids if not empty and None if empty - pub fn sort_key_ids_none_if_empty(&self) -> Option<&SortedColumnSet> { + /// The sort key IDs, if the sort key has been set + pub fn sort_key_ids(&self) -> Option<&SortKeyIds> { if self.sort_key_ids.is_empty() { None } else { Some(&self.sort_key_ids) } } + + /// The sort key containing the column names found in the specified column map. + /// + /// # Panics + /// + /// Will panic if an ID isn't found in the column map. 
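A rough sketch (not part of this patch) of how the ID-based sort key is resolved back to column names, mirroring `test_build_sort_key_from_ids_and_map` earlier in this diff; the `sort_key` accessor it illustrates follows immediately below, and the import paths are assumptions.

    use data_types::{ColumnId, ColumnSchema, ColumnType, ColumnsByName, SortKeyIds};
    use schema::sort::SortKey;

    fn resolve_sort_key() -> SortKey {
        // Table columns keyed by name, as the catalog would supply them.
        let mut columns = ColumnsByName::default();
        columns.add_column(
            "region",
            ColumnSchema {
                id: ColumnId::new(1),
                column_type: ColumnType::Tag,
            },
        );
        columns.add_column(
            "time",
            ColumnSchema {
                id: ColumnId::new(2),
                column_type: ColumnType::Time,
            },
        );

        // The partition stores only column IDs; map them back to names.
        // This panics if an ID is missing from `columns`, per the doc above.
        SortKeyIds::from([1, 2]).to_sort_key(&columns)
        // == SortKey::from_columns(vec!["region", "time"])
    }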
+ pub fn sort_key(&self, columns_by_name: &ColumnsByName) -> Option { + self.sort_key_ids() + .map(|sort_key_ids| sort_key_ids.to_sort_key(columns_by_name)) + } + + /// Change the sort key IDs to the given sort key IDs. This should only be used in the + /// in-memory catalog or in tests; all other sort key updates should go through the catalog + /// functions. + pub fn set_sort_key_ids(&mut self, sort_key_ids: &SortKeyIds) { + self.sort_key_ids = sort_key_ids.clone(); + } } #[cfg(test)] diff --git a/data_types/src/partition_template.rs b/data_types/src/partition_template.rs index 48d82e48cac..bbd063302bd 100644 --- a/data_types/src/partition_template.rs +++ b/data_types/src/partition_template.rs @@ -129,36 +129,43 @@ //! [ //! TemplatePart::TimeFormat("%Y"), //! TemplatePart::TagValue("a"), -//! TemplatePart::TagValue("b") +//! TemplatePart::TagValue("b"), +//! TemplatePart::Bucket("c", 10) //! ] //! ``` //! //! The following partition keys are derived: //! -//! * `time=2023-01-01, a=bananas, b=plátanos` -> `2023|bananas|plátanos` -//! * `time=2023-01-01, b=plátanos` -> `2023|!|plátanos` -//! * `time=2023-01-01, another=cat, b=plátanos` -> `2023|!|plátanos` -//! * `time=2023-01-01` -> `2023|!|!` -//! * `time=2023-01-01, a=cat|dog, b=!` -> `2023|cat%7Cdog|%21` -//! * `time=2023-01-01, a=%50` -> `2023|%2550|!` -//! * `time=2023-01-01, a=` -> `2023|^|!` -//! * `time=2023-01-01, a=` -> `2023|#|!` +//! * `time=2023-01-01, a=bananas, b=plátanos, c=ananas` -> `2023|bananas|plátanos|5` +//! * `time=2023-01-01, b=plátanos` -> `2023|!|plátanos|!` +//! * `time=2023-01-01, another=cat, b=plátanos` -> `2023|!|plátanos|!` +//! * `time=2023-01-01` -> `2023|!|!|!` +//! * `time=2023-01-01, a=cat|dog, b=!, c=!` -> `2023|cat%7Cdog|%21|8` +//! * `time=2023-01-01, a=%50, c=%50` -> `2023|%2550|!|9` +//! * `time=2023-01-01, a=, c=` -> `2023|^|!|0` +//! * `time=2023-01-01, a=` -> `2023|#|!|!` //! //! When using the default partitioning template (YYYY-MM-DD) there is no //! encoding necessary, as the derived partition key contains a single part, and //! no reserved characters. //! //! [percent encoded]: https://url.spec.whatwg.org/#percent-encoded-bytes +use std::{ + borrow::Cow, + fmt::{Display, Formatter}, + ops::Range, + sync::Arc, +}; use chrono::{ format::{Numeric, StrftimeItems}, DateTime, Days, Months, Utc, }; use generated_types::influxdata::iox::partition_template::v1 as proto; +use murmur3::murmur3_32; use once_cell::sync::Lazy; use percent_encoding::{percent_decode_str, AsciiSet, CONTROLS}; use schema::TIME_COLUMN_NAME; -use std::{borrow::Cow, sync::Arc}; use thiserror::Error; /// Reasons a user-specified partition template isn't valid. @@ -187,12 +194,32 @@ pub enum ValidationError { #[error("invalid strftime format in partition template: {0}")] InvalidStrftime(String), - /// The partition template defines a [`TagValue`] part, but the provided - /// value is invalid. + /// The partition template defines a [`TagValue`] part or [`Bucket`] part, + /// but the provided tag name value is invalid. /// /// [`TagValue`]: [`proto::template_part::Part::TagValue`] - #[error("invalid tag value in partition template: {0}")] + /// [`Bucket`]: [`proto::template_part::Part::Bucket`] + #[error("invalid tag name value in partition template: {0}")] InvalidTagValue(String), + + /// The partition template defines a [`Bucket`] part, but the provided + /// number of buckets is invalid. 
+ /// + /// [`Bucket`]: [`proto::template_part::Part::Bucket`] + #[error( + "number of buckets in partition template must be in range \ + [{ALLOWED_BUCKET_QUANTITIES:?}), number specified: {0}" + )] + InvalidNumberOfBuckets(u32), + + /// The partition template defines a [`TagValue`] or [`Bucket`] part + /// which repeats a tag name used in another [`TagValue`] or [`Bucket`] part. + /// This is not allowed + /// + /// [`TagValue`]: [`proto::template_part::Part::TagValue`] + /// [`Bucket`]: [`proto::template_part::Part::Bucket`] + #[error("tag name value cannot be repeated in partition template: {0}")] + RepeatedTagValue(String), } /// The maximum number of template parts a custom partition template may specify, to limit the @@ -234,6 +261,14 @@ pub const PARTITION_KEY_PART_TRUNCATED: char = '#'; /// data point. pub const TAG_VALUE_KEY_TIME: &str = "time"; +/// The range of bucket quantities allowed for [`Bucket`] template parts. +/// +/// [`Bucket`]: [`proto::template_part::Part::Bucket`] +pub const ALLOWED_BUCKET_QUANTITIES: Range = Range { + start: 1, + end: 100_000, +}; + /// The minimal set of characters that must be encoded during partition key /// generation when they form part of a partition key part, in order to be /// unambiguously reversible. @@ -249,10 +284,23 @@ pub const ENCODED_PARTITION_KEY_CHARS: AsciiSet = CONTROLS /// Allocationless and protobufless access to the parts of a template needed to /// actually do partitioning. #[derive(Debug, Clone)] -#[allow(missing_docs)] pub enum TemplatePart<'a> { + /// A tag-value partition part. + /// + /// Specifies the name of the tag column. TagValue(&'a str), + + /// A strftime formatter. + /// + /// Specifies the formatter spec applied to the [`TIME_COLUMN_NAME`] column. TimeFormat(&'a str), + + /// A bucketing partition part. + /// + /// Specifies the name of the tag column used to derive which of the `n` + /// buckets the data belongs in, through the mechanism implemented by the + /// [`bucket_for_tag_value`] function. + Bucket(&'a str, u32), } /// The default partitioning scheme is by each day according to the "time" column. @@ -266,6 +314,37 @@ pub static PARTITION_BY_DAY_PROTO: Lazy> = Lazy::n }) }); +// This applies murmur3 32 bit hashing to the tag value string, as Iceberg would. +// +// * +fn iceberg_hash(tag_value: &str) -> u32 { + murmur3_32(&mut tag_value.as_bytes(), 0).expect("read of tag value string must never error") +} + +/// Hash bucket the provided tag value to a bucket ID in the range `[0,num_buckets)`. +/// +/// This applies murmur3 32 bit hashing to the tag value string, zero-ing the sign bit +/// then modulo assigning it to a bucket as Iceberg would. +/// +/// * +/// * +/// +/// +/// # Panics +/// +/// If `num_buckets` is zero, this will panic. Validation MUST prevent +/// [`TemplatePart::Bucket`] from being constructed with a zero bucket count. It just +/// makes no sense and shouldn't need to be checked here. +#[inline(always)] +pub fn bucket_for_tag_value(tag_value: &str, num_buckets: u32) -> u32 { + // Hash the tag value as iceberg would. + let hash = iceberg_hash(tag_value); + // Then bucket it as iceberg would, by removing the sign bit from the + // 32 bit murmur hash and modulo by the number of buckets to assign + // across. + (hash & i32::MAX as u32) % num_buckets +} + /// A partition template specified by a namespace record. 
/// /// Internally this type is [`None`] when no namespace-level override is @@ -344,6 +423,10 @@ impl TablePartitionTemplateOverride { .map(|part| match part { proto::template_part::Part::TagValue(value) => TemplatePart::TagValue(value), proto::template_part::Part::TimeFormat(fmt) => TemplatePart::TimeFormat(fmt), + proto::template_part::Part::Bucket(proto::Bucket { + tag_name, + num_buckets, + }) => TemplatePart::Bucket(tag_name, *num_buckets), }) } @@ -370,6 +453,10 @@ impl TablePartitionTemplateOverride { .map(|part| match part { proto::template_part::Part::TagValue(s) => s.capacity(), proto::template_part::Part::TimeFormat(s) => s.capacity(), + proto::template_part::Part::Bucket(proto::Bucket { + tag_name, + num_buckets: _, + }) => tag_name.capacity() + std::mem::size_of::(), }) .unwrap_or_default() }) @@ -384,15 +471,42 @@ impl TablePartitionTemplateOverride { } } +/// Display the serde_json representation so that the output +/// can be copy/pasted into CLI tools, etc as the partition +/// template is specified as JSON +impl Display for TablePartitionTemplateOverride { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}", + self.as_proto() + .map(|proto| serde_json::to_string(proto) + .expect("serialization should be infallible")) + .unwrap_or_default() + ) + } +} + +impl TryFrom> for TablePartitionTemplateOverride { + type Error = ValidationError; + + fn try_from(p: Option) -> Result { + Ok(Self(p.map(serialization::Wrapper::try_from).transpose()?)) + } +} + /// This manages the serialization/deserialization of the `proto::PartitionTemplate` type to and /// from the database through `sqlx` for the `NamespacePartitionTemplateOverride` and /// `TablePartitionTemplateOverride` types. It's an internal implementation detail to minimize code /// duplication. mod serialization { - use super::{ValidationError, MAXIMUM_NUMBER_OF_TEMPLATE_PARTS, TAG_VALUE_KEY_TIME}; + use super::{ + ValidationError, ALLOWED_BUCKET_QUANTITIES, MAXIMUM_NUMBER_OF_TEMPLATE_PARTS, + TAG_VALUE_KEY_TIME, + }; use chrono::{format::StrftimeItems, Utc}; use generated_types::influxdata::iox::partition_template::v1 as proto; - use std::{fmt::Write, sync::Arc}; + use std::{collections::HashSet, fmt::Write, sync::Arc}; #[derive(Debug, Clone, PartialEq, Hash)] pub struct Wrapper(Arc); @@ -437,6 +551,8 @@ mod serialization { return Err(ValidationError::TooManyParts { specified }); } + let mut seen_tags: HashSet<&str> = HashSet::with_capacity(specified); + // All time formats must be valid and tag values may not specify any // restricted values. for part in &partition_template.parts { @@ -479,6 +595,32 @@ mod serialization { "{TAG_VALUE_KEY_TIME} cannot be used" ))); } + + if !seen_tags.insert(value.as_str()) { + return Err(ValidationError::RepeatedTagValue(value.into())); + } + } + Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name, + num_buckets, + })) => { + if tag_name.is_empty() { + return Err(ValidationError::InvalidTagValue(tag_name.into())); + } + + if tag_name.contains(TAG_VALUE_KEY_TIME) { + return Err(ValidationError::InvalidTagValue(format!( + "{TAG_VALUE_KEY_TIME} cannot be used" + ))); + } + + if !seen_tags.insert(tag_name.as_str()) { + return Err(ValidationError::RepeatedTagValue(tag_name.into())); + } + + if !ALLOWED_BUCKET_QUANTITIES.contains(num_buckets) { + return Err(ValidationError::InvalidNumberOfBuckets(*num_buckets)); + } } None => {} } @@ -558,6 +700,10 @@ pub enum ColumnValue<'a> { /// Exclusive end of the datatime partition range. 
end: DateTime, }, + + /// The inner value is the ID of the bucket selected through a modulo hash + /// of the input column value. + Bucket(u32), } impl<'a> ColumnValue<'a> { @@ -572,7 +718,7 @@ impl<'a> ColumnValue<'a> { let this = match self { ColumnValue::Identity(v) => v.as_bytes(), ColumnValue::Prefix(v) => v.as_bytes(), - ColumnValue::Datetime { .. } => { + ColumnValue::Datetime { .. } | ColumnValue::Bucket(..) => { return false; } }; @@ -590,6 +736,7 @@ where ColumnValue::Identity(v) => other.as_ref().eq(v.as_ref()), ColumnValue::Prefix(_) => false, ColumnValue::Datetime { .. } => false, + ColumnValue::Bucket(..) => false, } } } @@ -605,7 +752,9 @@ where /// /// # Panics /// -/// This method panics if a column value is not valid UTF8 after decoding. +/// This method panics if a column value is not valid UTF8 after decoding, or +/// when a bucket ID is not valid (not a u32 or within the expected number of +/// buckets). pub fn build_column_values<'a>( template: &'a TablePartitionTemplateOverride, partition_key: &'a str, @@ -629,10 +778,21 @@ pub fn build_column_values<'a>( // Produce an iterator of (template_part, template_value) template_parts .zip(key_parts) - .filter_map(|(template, value)| match template { - TemplatePart::TagValue(col_name) => Some((col_name, parse_part_tag_value(value)?)), - TemplatePart::TimeFormat(format) => { - Some((TIME_COLUMN_NAME, parse_part_time_format(value, format)?)) + .filter_map(|(template, value)| { + if value == PARTITION_KEY_VALUE_NULL_STR { + None + } else { + match template { + TemplatePart::TagValue(col_name) => { + Some((col_name, parse_part_tag_value(value)?)) + } + TemplatePart::TimeFormat(format) => { + Some((TIME_COLUMN_NAME, parse_part_time_format(value, format)?)) + } + TemplatePart::Bucket(col_name, num_buckets) => { + Some((col_name, parse_part_bucket(value, num_buckets)?)) + } + } } }) } @@ -640,11 +800,6 @@ pub fn build_column_values<'a>( fn parse_part_tag_value(value: &str) -> Option> { // Perform re-mapping of sentinel values. let value = match value { - PARTITION_KEY_VALUE_NULL_STR => { - // Skip null or empty partition key parts, indicated by the - // presence of a single "!" character as the part value. - return None; - } PARTITION_KEY_VALUE_EMPTY_STR => { // Re-map the empty string sentinel "^"" to an empty string // value. @@ -736,6 +891,18 @@ fn parse_part_time_format(value: &str, format: &str) -> Option Option> { + // Parse the bucket ID from the given value string. + let bucket_id = value + .parse::() + .expect("invalid partition key bucket encoding"); + // Invariant: If the bucket ID (0 indexed) is greater than the number of + // buckets to spread data across the partition key is invalid. 
+ assert!(bucket_id < num_buckets); + + Some(ColumnValue::Bucket(bucket_id)) +} + fn parsed_implicit_defaults(mut parsed: chrono::format::Parsed) -> Option { parsed.year?; @@ -800,6 +967,12 @@ pub fn test_table_partition_override( let part = match part { TemplatePart::TagValue(value) => proto::template_part::Part::TagValue(value.into()), TemplatePart::TimeFormat(fmt) => proto::template_part::Part::TimeFormat(fmt.into()), + TemplatePart::Bucket(value, num_buckets) => { + proto::template_part::Part::Bucket(proto::Bucket { + tag_name: value.into(), + num_buckets, + }) + } }; proto::TemplatePart { part: Some(part) } @@ -814,12 +987,32 @@ pub fn test_table_partition_override( #[cfg(test)] mod tests { - use super::*; use assert_matches::assert_matches; use chrono::TimeZone; + use proptest::prelude::*; use sqlx::Encode; use test_helpers::assert_error; + use super::*; + + #[test] + fn test_partition_template_to_string() { + let template_empty: TablePartitionTemplateOverride = + TablePartitionTemplateOverride::default(); + + let template: Vec> = + [TemplatePart::TimeFormat("%Y"), TemplatePart::TagValue("a")] + .into_iter() + .collect::>(); + let template: TablePartitionTemplateOverride = test_table_partition_override(template); + + assert_eq!(template_empty.to_string(), ""); + assert_eq!( + template.to_string(), + "{\"parts\":[{\"timeFormat\":\"%Y\"},{\"tagValue\":\"a\"}]}" + ); + } + #[test] fn test_max_partition_key_len() { let max_len: usize = @@ -879,6 +1072,60 @@ mod tests { assert_error!(err, ValidationError::TooManyParts { specified } if specified == 9); } + #[test] + fn repeated_tag_name_value_is_invalid() { + // Test [`TagValue`] + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![ + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("bananas".into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("bananas".into())), + }, + ], + }); + + assert_error!(err, ValidationError::RepeatedTagValue ( ref specified ) if specified == "bananas"); + + // Test [`Bucket`] + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![ + proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: "bananas".into(), + num_buckets: 42, + })), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: "bananas".into(), + num_buckets: 42, + })), + }, + ], + }); + + assert_error!(err, ValidationError::RepeatedTagValue ( ref specified ) if specified == "bananas"); + + // Test a combination of [`TagValue`] and [`Bucket`] + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![ + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("bananas".into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: "bananas".into(), + num_buckets: 42, + })), + }, + ], + }); + + assert_error!(err, ValidationError::RepeatedTagValue ( ref specified ) if specified == "bananas"); + } + /// Chrono will panic when formatting a timestamp if the "%#z" formatting /// directive is used... #[test] @@ -947,10 +1194,74 @@ mod tests { assert_error!(err, ValidationError::InvalidTagValue(ref value) if value.is_empty()); } + /// "time" is a special column already covered by strftime, being a time + /// series database and all. 
+ #[test] + fn bucket_time_tag_name_is_invalid() { + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: "time".into(), + num_buckets: 42, + })), + }], + }); + + assert_error!(err, ValidationError::InvalidTagValue(_)); + } + + #[test] + fn bucket_empty_tag_name_is_invalid() { + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: "".into(), + num_buckets: 42, + })), + }], + }); + + assert_error!(err, ValidationError::InvalidTagValue(ref value) if value.is_empty()); + } + + #[test] + fn bucket_zero_num_buckets_is_invalid() { + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: "arán".into(), + num_buckets: 0, + })), + }], + }); + + assert_error!(err, ValidationError::InvalidNumberOfBuckets(0)); + } + + #[test] + fn bucket_too_high_num_buckets_is_invalid() { + const TOO_HIGH: u32 = 100_000; + + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: "arán".into(), + num_buckets: TOO_HIGH, + })), + }], + }); + + assert_error!(err, ValidationError::InvalidNumberOfBuckets(TOO_HIGH)); + } + fn identity(s: &str) -> ColumnValue<'_> { ColumnValue::Identity(s.into()) } + fn bucket(bucket_id: u32) -> ColumnValue<'static> { + ColumnValue::Bucket(bucket_id) + } + fn prefix<'a, T>(s: T) -> ColumnValue<'a> where T: Into>, @@ -965,14 +1276,76 @@ mod tests { } } + #[test] + fn test_iceberg_string_hash() { + assert_eq!(iceberg_hash("iceberg"), 1210000089); + } + + // This is a test fixture designed to catch accidental changes to the + // Iceberg-like hash-bucket partitioning behaviour. + // + // You shouldn't be changing this! + #[test] + fn test_hash_bucket_fixture() { + // These are values lifted from the iceberg spark test suite for + // `BucketString`, sadly not provided in the reference/spec: + // + // https://github.com/apache/iceberg/blob/31e31fd819c846f49d2bd459b8bfadfdc3c2bc3a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/sql/TestSparkBucketFunction.java#L151-L169 + // + assert_eq!(bucket_for_tag_value("abcdefg", 5), 4); + assert_eq!(bucket_for_tag_value("abc", 128), 122); + assert_eq!(bucket_for_tag_value("abcde", 64), 54); + assert_eq!(bucket_for_tag_value("测试", 12), 8); + assert_eq!(bucket_for_tag_value("测试raul试测", 16), 1); + assert_eq!(bucket_for_tag_value("", 16), 0); + + // These are pre-existing arbitrary fixture values + assert_eq!(bucket_for_tag_value("bananas", 10), 1); + assert_eq!(bucket_for_tag_value("plátanos", 100), 98); + assert_eq!(bucket_for_tag_value("crobhaing bananaí", 1000), 166); + assert_eq!(bucket_for_tag_value("bread", 42), 9); + assert_eq!(bucket_for_tag_value("arán", 76), 72); + assert_eq!(bucket_for_tag_value("banana arán", 1337), 1284); + assert_eq!( + bucket_for_tag_value("uasmhéid bananaí", u32::MAX), + 1109892861 + ); + } + + /// Test to approximate and show how the tag value maps to the partition key + /// for the example cases in the mod-doc. The behaviour that renders the key + /// itself is a combination of this bucket assignment and the render logic. 
+ #[test] + fn test_bucket_for_mod_doc() { + assert_eq!(bucket_for_tag_value("ananas", 10), 5); + assert_eq!(bucket_for_tag_value("!", 10), 8); + assert_eq!(bucket_for_tag_value("%50", 10), 9); + assert_eq!(bucket_for_tag_value("", 10), 0); + } + + proptest! { + #[test] + fn prop_consistent_bucketing_within_limits(tag_values in proptest::collection::vec(any::(), (1, 10)), num_buckets in any::()) { + for value in tag_values { + // First pass assign + let want_bucket = bucket_for_tag_value(&value, num_buckets); + // The assigned bucket must fit within the domain given to the bucketer. + assert!(want_bucket < num_buckets); + // Feed in the same tag value, expect the same result. + let got_bucket = bucket_for_tag_value(&value, num_buckets); + assert_eq!(want_bucket, got_bucket); + } + } + } + /// Generate a test that asserts "partition_key" is reversible, yielding /// "want" assuming the partition "template" was used. macro_rules! test_build_column_values { ( $name:ident, - template = $template:expr, // Array/vec of TemplatePart - partition_key = $partition_key:expr, // String derived partition key - want = $want:expr // Expected build_column_values() output + template = $template:expr, // Array/vec of TemplatePart + partition_key = $partition_key:expr, // String derived partition key + want = $want:expr // Expected build_column_values() output ) => { paste::paste! { #[test] @@ -1001,23 +1374,26 @@ mod tests { TemplatePart::TimeFormat("%Y"), TemplatePart::TagValue("a"), TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10), ], - partition_key = "2023|bananas|plátanos", + partition_key = "2023|bananas|plátanos|5", want = [ (TIME_COLUMN_NAME, year(2023)), ("a", identity("bananas")), ("b", identity("plátanos")), + ("c", bucket(5)), ] ); test_build_column_values!( - module_doc_example_2, + module_doc_example_2, // Examples 2 and 3 are the same partition key template = [ TemplatePart::TimeFormat("%Y"), TemplatePart::TagValue("a"), TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10), ], - partition_key = "2023|!|plátanos", + partition_key = "2023|!|plátanos|!", want = [(TIME_COLUMN_NAME, year(2023)), ("b", identity("plátanos")),] ); @@ -1027,8 +1403,9 @@ mod tests { TemplatePart::TimeFormat("%Y"), TemplatePart::TagValue("a"), TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10), ], - partition_key = "2023|!|!", + partition_key = "2023|!|!|!", want = [(TIME_COLUMN_NAME, year(2023)),] ); @@ -1038,12 +1415,14 @@ mod tests { TemplatePart::TimeFormat("%Y"), TemplatePart::TagValue("a"), TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10), ], - partition_key = "2023|cat%7Cdog|%21", + partition_key = "2023|cat%7Cdog|%21|8", want = [ (TIME_COLUMN_NAME, year(2023)), ("a", identity("cat|dog")), ("b", identity("!")), + ("c", bucket(8)), ] ); @@ -1053,9 +1432,14 @@ mod tests { TemplatePart::TimeFormat("%Y"), TemplatePart::TagValue("a"), TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10), ], - partition_key = "2023|%2550|!", - want = [(TIME_COLUMN_NAME, year(2023)), ("a", identity("%50")),] + partition_key = "2023|%2550|!|9", + want = [ + (TIME_COLUMN_NAME, year(2023)), + ("a", identity("%50")), + ("c", bucket(9)), + ] ); test_build_column_values!( @@ -1064,8 +1448,25 @@ mod tests { TemplatePart::TimeFormat("%Y"), TemplatePart::TagValue("a"), TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10), ], - partition_key = "2023|BANANAS#|!", + partition_key = "2023|^|!|0", + want = [ + (TIME_COLUMN_NAME, year(2023)), + ("a", identity("")), + ("c", bucket(0)), + ] + ); + 
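A rough end-to-end sketch (not part of this patch) of how a `Bucket` template part appears in a derived partition key and how `build_column_values` recovers it. The "bananas" → bucket 1 value comes from `test_hash_bucket_fixture` above; the import paths and the exact rendered key are assumptions.

    use data_types::partition_template::{
        bucket_for_tag_value, build_column_values, test_table_partition_override, ColumnValue,
        TemplatePart,
    };

    fn bucket_round_trip() {
        // A template whose only part buckets tag "c" across 10 buckets.
        let template = test_table_partition_override(vec![TemplatePart::Bucket("c", 10)]);

        // A write with c=bananas hashes into bucket 1, so the derived partition
        // key for this single-part template would simply be "1".
        assert_eq!(bucket_for_tag_value("bananas", 10), 1);

        // Reversing the key yields the bucket ID, not the original tag value.
        let got: Vec<_> = build_column_values(&template, "1").collect();
        assert!(matches!(got.as_slice(), [("c", ColumnValue::Bucket(1))]));
    }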
+ test_build_column_values!( + module_doc_example_8, + template = [ + TemplatePart::TimeFormat("%Y"), + TemplatePart::TagValue("a"), + TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10), + ], + partition_key = "2023|BANANAS#|!|!|!", want = [(TIME_COLUMN_NAME, year(2023)), ("a", prefix("BANANAS")),] ); @@ -1075,8 +1476,9 @@ mod tests { TemplatePart::TimeFormat("%Y"), TemplatePart::TagValue("a"), TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10), ], - partition_key = "2023|%28%E3%83%8E%E0%B2%A0%E7%9B%8A%E0%B2%A0%29%E3%83%8E%E5%BD%A1%E2%94%BB%E2%94%81%E2%94%BB#|!", + partition_key = "2023|%28%E3%83%8E%E0%B2%A0%E7%9B%8A%E0%B2%A0%29%E3%83%8E%E5%BD%A1%E2%94%BB%E2%94%81%E2%94%BB#|!|!", want = [ (TIME_COLUMN_NAME, year(2023)), ("a", prefix("(ノಠ益ಠ)ノ彡┻━┻")), @@ -1115,6 +1517,13 @@ mod tests { want = [] ); + test_build_column_values!( + datetime_null, + template = [TemplatePart::TimeFormat("%Y"),], + partition_key = "!", + want = [] + ); + test_build_column_values!( datetime_range_y, template = [TemplatePart::TimeFormat("%Y"),], @@ -1206,6 +1615,51 @@ mod tests { )] ); + test_build_column_values!( + bucket_part_fixture, + template = [ + TemplatePart::Bucket("a", 41), + TemplatePart::Bucket("b", 91), + TemplatePart::Bucket("c", 144) + ], + partition_key = "1|2|3", + want = [("a", bucket(1)), ("b", bucket(2)), ("c", bucket(3)),] + ); + + #[test] + #[should_panic] + fn test_build_column_values_bucket_part_out_of_range_panics() { + let template = [ + TemplatePart::Bucket("a", 42), + TemplatePart::Bucket("b", 42), + TemplatePart::Bucket("c", 42), + ] + .into_iter() + .collect::>(); + let template = test_table_partition_override(template); + + // normalise the values into a (str, ColumnValue) for the comparison + let input = String::from("1|1|43"); + let _ = build_column_values(&template, input.as_str()).collect::>(); + } + + #[test] + #[should_panic] + fn test_build_column_values_bucket_part_not_u32_panics() { + let template = [ + TemplatePart::Bucket("a", 42), + TemplatePart::Bucket("b", 42), + TemplatePart::Bucket("c", 42), + ] + .into_iter() + .collect::>(); + let template = test_table_partition_override(template); + + // normalise the values into a (str, ColumnValue) for the comparison + let input = String::from("1|1|bananas"); + let _ = build_column_values(&template, input.as_str()).collect::>(); + } + test_build_column_values!( datetime_not_compact_y_d, template = [TemplatePart::TimeFormat("%Y-%d"),], @@ -1369,11 +1823,18 @@ mod tests { proto::TemplatePart { part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())), }, + proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: "bananas".into(), + num_buckets: 42, + })), + }, ], }; let expected_json_str = "{\"parts\":[\ {\"tagValue\":\"region\"},\ - {\"timeFormat\":\"year-%Y\"}\ + {\"timeFormat\":\"year-%Y\"},\ + {\"bucket\":{\"tagName\":\"bananas\",\"numBuckets\":42}}\ ]}"; let namespace = NamespacePartitionTemplateOverride::try_from(custom_template).unwrap(); @@ -1383,7 +1844,7 @@ mod tests { ); fn extract_sqlite_argument_text( - argument_value: &sqlx::sqlite::SqliteArgumentValue, + argument_value: &sqlx::sqlite::SqliteArgumentValue<'_>, ) -> String { match argument_value { sqlx::sqlite::SqliteArgumentValue::Text(cow) => cow.to_string(), @@ -1401,6 +1862,88 @@ mod tests { ); let table_json_str: String = buf.iter().map(extract_sqlite_argument_text).collect(); assert_eq!(table_json_str, expected_json_str); - assert_eq!(table.len(), 2); + assert_eq!(table.len(), 3); + } + + 
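A small sketch (not part of this patch) of the `Display` behaviour added for `TablePartitionTemplateOverride` earlier in this file: the template renders as the serde_json form of its protobuf, ready to paste into CLI tooling. The expected output mirrors `test_partition_template_to_string` and the sqlite encoding test above; the import paths are assumptions.

    use data_types::partition_template::{test_table_partition_override, TemplatePart};

    fn print_template_json() {
        let template = test_table_partition_override(vec![
            TemplatePart::TimeFormat("%Y"),
            TemplatePart::Bucket("region", 42),
        ]);

        // Prints:
        // {"parts":[{"timeFormat":"%Y"},{"bucket":{"tagName":"region","numBuckets":42}}]}
        println!("{template}");
    }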
#[test] + fn test_template_size_reporting() { + const BASE_SIZE: usize = std::mem::size_of::() + + std::mem::size_of::(); + + let first_string = "^"; + let template = TablePartitionTemplateOverride::try_new( + Some(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue(first_string.into())), + }], + }), + &NamespacePartitionTemplateOverride::default(), + ) + .expect("failed to create table partition template "); + + assert_eq!( + template.size(), + BASE_SIZE + std::mem::size_of::() + first_string.len() + ); + + let second_string = "region"; + let template = TablePartitionTemplateOverride::try_new( + Some(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue(second_string.into())), + }], + }), + &NamespacePartitionTemplateOverride::default(), + ) + .expect("failed to create table partition template "); + + assert_eq!( + template.size(), + BASE_SIZE + std::mem::size_of::() + second_string.len() + ); + + let time_string = "year-%Y"; + let template = TablePartitionTemplateOverride::try_new( + Some(proto::PartitionTemplate { + parts: vec![ + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue(second_string.into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat(time_string.into())), + }, + ], + }), + &NamespacePartitionTemplateOverride::default(), + ) + .expect("failed to create table partition template "); + assert_eq!( + template.size(), + BASE_SIZE + + std::mem::size_of::() + + second_string.len() + + std::mem::size_of::() + + time_string.len() + ); + + let template = TablePartitionTemplateOverride::try_new( + Some(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: second_string.into(), + num_buckets: 42, + })), + }], + }), + &NamespacePartitionTemplateOverride::default(), + ) + .expect("failed to create table partition template"); + assert_eq!( + template.size(), + BASE_SIZE + + std::mem::size_of::() + + second_string.len() + + std::mem::size_of::() + ); } } diff --git a/data_types/src/service_limits.rs b/data_types/src/service_limits.rs index 4e4acc14914..7c00b6a1e90 100644 --- a/data_types/src/service_limits.rs +++ b/data_types/src/service_limits.rs @@ -3,74 +3,164 @@ use generated_types::influxdata::iox::namespace::{ v1 as namespace_proto, v1::update_namespace_service_protection_limit_request::LimitUpdate, }; +use observability_deps::tracing::*; +use std::num::NonZeroUsize; use thiserror::Error; -/// Max tables allowed in a namespace. -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::Type)] -#[sqlx(transparent)] -pub struct MaxTables(i32); +/// Definitions that apply to both MaxColumnsPerTable and MaxTables. Note that the hardcoded +/// default value specified in the macro invocation must be greater than 0 and fit in an `i32`. +macro_rules! 
define_service_limit { + ($type_name:ident, $default_value:expr, $documentation:expr) => { + /// $documentation + #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] + pub struct $type_name(NonZeroUsize); -#[allow(missing_docs)] -impl MaxTables { - pub const fn new(v: i32) -> Self { - Self(v) - } + impl TryFrom for $type_name { + type Error = ServiceLimitError; - pub fn get(&self) -> i32 { - self.0 - } + fn try_from(value: usize) -> Result { + // Even though the value is stored as a `usize`, service limits are stored as `i32` + // in the database and transferred as i32 over protobuf. So try to convert to an + // `i32` (and throw away the result) so that we know about invalid values before + // trying to use them. + if i32::try_from(value).is_err() { + return Err(ServiceLimitError::MustFitInI32); + } - /// Default per-namespace table count service protection limit. - pub const fn const_default() -> Self { - Self(500) - } -} + let nonzero_value = + NonZeroUsize::new(value).ok_or(ServiceLimitError::MustBeGreaterThanZero)?; -impl Default for MaxTables { - fn default() -> Self { - Self::const_default() - } -} + Ok(Self(nonzero_value)) + } + } -impl std::fmt::Display for MaxTables { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{}", self.0) - } -} + impl TryFrom for $type_name { + type Error = ServiceLimitError; -/// Max columns per table allowed in a namespace. -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::Type)] -#[sqlx(transparent)] -pub struct MaxColumnsPerTable(i32); + fn try_from(value: u64) -> Result { + // Even though the value is stored as a `usize`, service limits are stored as `i32` + // in the database and transferred as i32 over protobuf. So try to convert to an + // `i32` (and throw away the result) so that we know about invalid values before + // trying to use them. + if i32::try_from(value).is_err() { + return Err(ServiceLimitError::MustFitInI32); + } -#[allow(missing_docs)] -impl MaxColumnsPerTable { - pub const fn new(v: i32) -> Self { - Self(v) - } + let nonzero_value = usize::try_from(value) + .ok() + .and_then(NonZeroUsize::new) + .ok_or(ServiceLimitError::MustBeGreaterThanZero)?; - pub fn get(&self) -> i32 { - self.0 - } + Ok(Self(nonzero_value)) + } + } - /// Default per-table column count service protection limit. - pub const fn const_default() -> Self { - Self(200) - } -} + impl TryFrom for $type_name { + type Error = ServiceLimitError; -impl Default for MaxColumnsPerTable { - fn default() -> Self { - Self::const_default() - } -} + fn try_from(value: i32) -> Result { + let nonzero_value = usize::try_from(value) + .ok() + .and_then(NonZeroUsize::new) + .ok_or(ServiceLimitError::MustBeGreaterThanZero)?; -impl std::fmt::Display for MaxColumnsPerTable { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{}", self.0) - } + Ok(Self(nonzero_value)) + } + } + + #[allow(missing_docs)] + impl $type_name { + pub fn get(&self) -> usize { + self.0.get() + } + + /// For use by the database and some protobuf representations. It should not be + /// possible to construct an instance that contains a `NonZeroUsize` that won't fit in + /// an `i32`. + pub fn get_i32(&self) -> i32 { + self.0.get() as i32 + } + + /// Constant-time default for use in constructing test constants. + pub const fn const_default() -> Self { + // This is safe because the hardcoded value is not 0. 
+ let value = unsafe { NonZeroUsize::new_unchecked($default_value) }; + + Self(value) + } + } + + impl Default for $type_name { + fn default() -> Self { + Self::const_default() + } + } + + impl std::fmt::Display for $type_name { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } + } + + // Tell sqlx this is an i32 in the database. + impl sqlx::Type for $type_name + where + i32: sqlx::Type, + DB: sqlx::Database, + { + fn type_info() -> DB::TypeInfo { + >::type_info() + } + } + + impl<'q, DB> sqlx::Encode<'q, DB> for $type_name + where + DB: sqlx::Database, + i32: sqlx::Encode<'q, DB>, + { + fn encode_by_ref( + &self, + buf: &mut >::ArgumentBuffer, + ) -> sqlx::encode::IsNull { + >::encode_by_ref(&self.get_i32(), buf) + } + } + + // The database stores i32s, so there's a chance of invalid values already being stored in + // there. When deserializing those values, rather than panicking or returning an error, log + // and use the default instead. + impl<'r, DB: ::sqlx::Database> ::sqlx::decode::Decode<'r, DB> for $type_name + where + i32: sqlx::Decode<'r, DB>, + { + fn decode( + value: >::ValueRef, + ) -> ::std::result::Result< + Self, + ::std::boxed::Box< + dyn ::std::error::Error + 'static + ::std::marker::Send + ::std::marker::Sync, + >, + > { + let data = >::decode(value)?; + + let data = Self::try_from(data).unwrap_or_else(|_| { + error!("database contains invalid $type_name value {data}, using default value"); + Self::default() + }); + + Ok(data) + } + } + }; } +define_service_limit!(MaxTables, 500, "Max tables allowed in a namespace."); +define_service_limit!( + MaxColumnsPerTable, + 200, + "Max columns per table allowed in a namespace." +); + /// Overrides for service protection limits. #[derive(Debug, Copy, Clone)] pub struct NamespaceServiceProtectionLimitsOverride { @@ -80,16 +170,23 @@ pub struct NamespaceServiceProtectionLimitsOverride { pub max_columns_per_table: Option, } -impl From for NamespaceServiceProtectionLimitsOverride { - fn from(value: namespace_proto::ServiceProtectionLimits) -> Self { +impl TryFrom + for NamespaceServiceProtectionLimitsOverride +{ + type Error = ServiceLimitError; + + fn try_from(value: namespace_proto::ServiceProtectionLimits) -> Result { let namespace_proto::ServiceProtectionLimits { max_tables, max_columns_per_table, } = value; - Self { - max_tables: max_tables.map(MaxTables::new), - max_columns_per_table: max_columns_per_table.map(MaxColumnsPerTable::new), - } + + Ok(Self { + max_tables: max_tables.map(MaxTables::try_from).transpose()?, + max_columns_per_table: max_columns_per_table + .map(MaxColumnsPerTable::try_from) + .transpose()?, + }) } } @@ -114,6 +211,11 @@ pub enum ServiceLimitError { /// No value was provided so we can't update anything #[error("a supported service limit value is required")] NoValueSpecified, + + /// Limits are stored as `i32` in the database and transferred as i32 over protobuf, so even + /// though they are stored as `usize` in Rust, the `usize` value must be less than `i32::MAX`. 
+ #[error("service limit values must fit in a 32-bit signed integer (`i32`)")] + MustFitInI32, } impl TryFrom> for ServiceLimitUpdate { @@ -122,20 +224,88 @@ impl TryFrom> for ServiceLimitUpdate { fn try_from(limit_update: Option) -> Result { match limit_update { Some(LimitUpdate::MaxTables(n)) => { - if n == 0 { - return Err(ServiceLimitError::MustBeGreaterThanZero); - } - Ok(ServiceLimitUpdate::MaxTables(MaxTables::new(n))) - } - Some(LimitUpdate::MaxColumnsPerTable(n)) => { - if n == 0 { - return Err(ServiceLimitError::MustBeGreaterThanZero); - } - Ok(ServiceLimitUpdate::MaxColumnsPerTable( - MaxColumnsPerTable::new(n), - )) + Ok(ServiceLimitUpdate::MaxTables(MaxTables::try_from(n)?)) } + Some(LimitUpdate::MaxColumnsPerTable(n)) => Ok(ServiceLimitUpdate::MaxColumnsPerTable( + MaxColumnsPerTable::try_from(n)?, + )), None => Err(ServiceLimitError::NoValueSpecified), } } } + +#[cfg(test)] +mod tests { + use super::*; + + fn extract_sqlite_argument_i32(argument_value: &sqlx::sqlite::SqliteArgumentValue<'_>) -> i32 { + match argument_value { + sqlx::sqlite::SqliteArgumentValue::Int(i) => *i, + other => panic!("Expected Int values, got: {other:?}"), + } + } + + macro_rules! service_limit_test { + ($type_name:ident, $module_name: ident) => { + mod $module_name { + use super::*; + + fn success>(value: T, expected: usize) + where + >::Error: std::fmt::Debug, + { + assert_eq!(value.try_into().unwrap().get(), expected); + } + + #[test] + fn successful_conversions() { + success(1usize, 1); + success(1u64, 1); + success(1i32, 1); + success(i32::MAX, i32::MAX as usize); + } + + fn failure>(value: T, expected_error_message: &str) + where + >::Error: std::fmt::Debug + std::fmt::Display, + { + assert_eq!( + value.try_into().unwrap_err().to_string(), + expected_error_message + ); + } + + #[test] + fn failed_conversions() { + failure(0usize, "service limit values must be greater than 0"); + failure(0u64, "service limit values must be greater than 0"); + failure(0i32, "service limit values must be greater than 0"); + failure(-1i32, "service limit values must be greater than 0"); + failure( + i32::MAX as usize + 1, + "service limit values must fit in a 32-bit signed integer (`i32`)", + ); + failure( + i32::MAX as u64 + 1, + "service limit values must fit in a 32-bit signed integer (`i32`)", + ); + } + + #[test] + fn encode() { + let value = $type_name::try_from(10).unwrap(); + let mut buf = Default::default(); + let _ = <$type_name as sqlx::Encode<'_, sqlx::Sqlite>>::encode_by_ref( + &value, &mut buf, + ); + + let encoded: Vec<_> = buf.iter().map(extract_sqlite_argument_i32).collect(); + assert_eq!(encoded, &[value.get_i32()]); + } + } + }; + } + + service_limit_test!(MaxTables, max_tables); + service_limit_test!(MaxColumnsPerTable, max_columns_per_table); +} diff --git a/data_types/src/snapshot/hash.rs b/data_types/src/snapshot/hash.rs new file mode 100644 index 00000000000..adf8c24c96f --- /dev/null +++ b/data_types/src/snapshot/hash.rs @@ -0,0 +1,219 @@ +//! 
A primitive hash table supporting linear probing + +use bytes::Bytes; +use generated_types::influxdata::iox::catalog_cache::v1 as generated; +use siphasher::sip::SipHasher24; + +use snafu::{ensure, Snafu}; + +/// Error for [`HashBuckets`] +#[derive(Debug, Snafu)] +#[allow(missing_docs, missing_copy_implementations)] +pub enum Error { + #[snafu(display("Bucket length not a power of two"))] + BucketsNotPower, + #[snafu(display("Unrecognized hash function"))] + UnrecognizedHash, +} + +/// Result for [`HashBuckets`] +pub type Result = std::result::Result; + +/// A primitive hash table supporting [linear probing] +/// +/// [linear probing](https://en.wikipedia.org/wiki/Linear_probing) +#[derive(Debug, Clone)] +pub struct HashBuckets { + /// The mask to yield index in `buckets` from a u64 hash + mask: usize, + /// A sequence of u32 encoding the value index + 1, or 0 if empty + buckets: Bytes, + /// The hash function to use + hash: SipHasher24, +} + +impl HashBuckets { + /// Performs a lookup of `value` + pub fn lookup(&self, value: &[u8]) -> HashProbe<'_> { + self.lookup_raw(self.hash.hash(value)) + } + + fn lookup_raw(&self, hash: u64) -> HashProbe<'_> { + let idx = (hash as usize) & self.mask; + HashProbe { + idx, + buckets: self, + mask: self.mask as _, + } + } +} + +impl TryFrom for HashBuckets { + type Error = Error; + + fn try_from(value: generated::HashBuckets) -> std::result::Result { + let buckets_len = value.buckets.len(); + ensure!(buckets_len.count_ones() == 1, BucketsNotPowerSnafu); + let mask = buckets_len.wrapping_sub(1) ^ 3; + match value.hash_function { + Some(generated::hash_buckets::HashFunction::SipHash24(s)) => Ok(Self { + mask, + buckets: value.buckets, + hash: SipHasher24::new_with_keys(s.key0, s.key1), + }), + _ => Err(Error::UnrecognizedHash), + } + } +} + +impl From for generated::HashBuckets { + fn from(value: HashBuckets) -> Self { + let (key0, key1) = value.hash.keys(); + Self { + buckets: value.buckets, + hash_function: Some(generated::hash_buckets::HashFunction::SipHash24( + generated::SipHash24 { key0, key1 }, + )), + } + } +} + +/// Yields the indices to probe for equality +#[derive(Debug)] +pub struct HashProbe<'a> { + buckets: &'a HashBuckets, + idx: usize, + mask: usize, +} + +impl<'a> Iterator for HashProbe<'a> { + type Item = usize; + + fn next(&mut self) -> Option { + let slice = self.buckets.buckets.get(self.idx..self.idx + 4)?; + let entry = u32::from_le_bytes(slice.try_into().unwrap()); + self.idx = (self.idx + 4) & self.mask; + + // Empty entries are encoded as 0 + Some(entry.checked_sub(1)? 
as usize) + } +} + +/// An encoder for [`HashBuckets`] +#[derive(Debug)] +pub struct HashBucketsEncoder { + mask: usize, + buckets: Vec, + hash: SipHasher24, + len: u32, + capacity: u32, +} + +impl HashBucketsEncoder { + /// Create a new [`HashBucketsEncoder`] + /// + /// # Panics + /// + /// Panics if capacity >= u32::MAX + pub fn new(capacity: usize) -> Self { + assert!(capacity < u32::MAX as usize); + + let buckets_len = (capacity * 2).next_power_of_two() * 4; + let mask = buckets_len.wrapping_sub(1) ^ 3; + Self { + mask, + len: 0, + capacity: capacity as u32, + buckets: vec![0; buckets_len], + // Note: this uses keys (0, 0) + hash: SipHasher24::new(), + } + } + + /// Append a new value + /// + /// # Panics + /// + /// Panics if this would exceed the capacity provided to new + pub fn push(&mut self, v: &[u8]) { + self.push_raw(self.hash.hash(v)); + } + + /// Append a new value by hash, returning the bucket index + fn push_raw(&mut self, hash: u64) -> usize { + assert_ne!(self.len, self.capacity); + self.len += 1; + let entry = self.len; + let mut idx = (hash as usize) & self.mask; + loop { + let s = &mut self.buckets[idx..idx + 4]; + let s: &mut [u8; 4] = s.try_into().unwrap(); + if s.iter().all(|x| *x == 0) { + *s = entry.to_le_bytes(); + return idx / 4; + } + idx = (idx + 4) & self.mask; + } + } + + /// Construct the output [`HashBuckets`] + pub fn finish(self) -> HashBuckets { + HashBuckets { + mask: self.mask, + hash: self.hash, + buckets: self.buckets.into(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_collision() { + let mut builder = HashBucketsEncoder::new(6); + + assert_eq!(builder.push_raw(14), 3); + assert_eq!(builder.push_raw(297), 10); + assert_eq!(builder.push_raw(43), 11); // Hashes to occupied bucket 10 + assert_eq!(builder.push_raw(60), 15); + assert_eq!(builder.push_raw(124), 0); // Hashes to occupied bucket 15 + assert_eq!(builder.push_raw(0), 1); // Hashes to occupied bucket 0 + + let buckets = builder.finish(); + + let l = buckets.lookup_raw(14).collect::>(); + assert_eq!(l, vec![0]); + + let l = buckets.lookup_raw(297).collect::>(); + assert_eq!(l, vec![1, 2]); + + let l = buckets.lookup_raw(43).collect::>(); + assert_eq!(l, vec![1, 2]); + + let l = buckets.lookup_raw(60).collect::>(); + assert_eq!(l, vec![3, 4, 5]); + + let l = buckets.lookup_raw(0).collect::>(); + assert_eq!(l, vec![4, 5]); + } + + #[test] + fn test_basic() { + let data = ["a", "", "bongos", "cupcakes", "bananas"]; + let mut builder = HashBucketsEncoder::new(data.len()); + for s in &data { + builder.push(s.as_bytes()); + } + let buckets = builder.finish(); + + let contains = |s: &str| -> bool { buckets.lookup(s.as_bytes()).any(|idx| data[idx] == s) }; + + assert!(contains("a")); + assert!(contains("")); + assert!(contains("bongos")); + assert!(contains("bananas")); + assert!(!contains("windows")); + } +} diff --git a/data_types/src/snapshot/list.rs b/data_types/src/snapshot/list.rs new file mode 100644 index 00000000000..bd86b98dd30 --- /dev/null +++ b/data_types/src/snapshot/list.rs @@ -0,0 +1,192 @@ +//! 
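The sizing in `HashBucketsEncoder::new` above is what makes the probe arithmetic cheap: the bucket array holds a power-of-two number of 4-byte slots, so a single AND with `(buckets_len - 1) ^ 3` wraps an index and keeps it 4-byte aligned at the same time. A self-contained illustration, with the concrete numbers chosen for the example only:

fn main() {
    let slots = 8usize; // a power of two, as produced by next_power_of_two()
    let buckets_len = slots * 4; // 32 bytes of 4-byte little-endian entries
    let mask = buckets_len.wrapping_sub(1) ^ 3; // 0b11111 ^ 0b00011 = 0b11100

    // An arbitrary hash is aligned down to the start of its slot...
    assert_eq!(13 & mask, 12);
    // ...the last slot maps to itself...
    assert_eq!(28 & mask, 28);
    // ...and stepping 4 bytes past the end wraps back to slot 0, which is what
    // lets the linear probe in push_raw/HashProbe walk the table circularly.
    assert_eq!((28 + 4) & mask, 0);
}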
A list of [`Message`] supporting efficient skipping + +use bytes::Bytes; +use prost::Message; +use snafu::{ensure, Snafu}; +use std::marker::PhantomData; +use std::ops::Range; + +use generated_types::influxdata::iox::catalog_cache::v1 as generated; + +/// Error type for [`MessageList`] +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(context(false), display("PackedList decode error: {source}"))] + DecodeError { source: prost::DecodeError }, + + #[snafu(context(false), display("PackedList encode error: {source}"))] + EncodeError { source: prost::EncodeError }, + + #[snafu(display("Invalid MessageList offsets: {start}..{end}"))] + InvalidSlice { start: usize, end: usize }, + + #[snafu(display("MessageList slice {start}..{end} out of bounds 0..{bounds}"))] + SliceOutOfBounds { + start: usize, + end: usize, + bounds: usize, + }, +} + +/// Error type for [`MessageList`] +pub type Result = std::result::Result; + +/// A packed list of [`Message`] +/// +/// Normally protobuf encodes repeated fields by simply encoding the tag multiple times, +/// see [here](https://protobuf.dev/programming-guides/encoding/#optional). +/// +/// Unfortunately this means it is not possible to locate a value at a given index without +/// decoding all prior records. [`MessageList`] therefore provides a list encoding, inspired +/// by arrow, that provides this and is designed to be combined with [`prost`]'s support +/// for zero-copy decoding of [`Bytes`] +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct MessageList { + len: usize, + offsets: Bytes, + values: Bytes, + phantom: PhantomData, +} + +impl MessageList { + /// Encode `values` to a [`MessageList`] + pub fn encode(values: &[T]) -> Result { + let cap = (values.len() + 1) * 4; + let mut offsets: Vec = Vec::with_capacity(cap); + offsets.extend_from_slice(&0_u32.to_le_bytes()); + + let mut cap = 0; + for x in values { + cap += x.encoded_len(); + let offset = u32::try_from(cap).unwrap(); + offsets.extend_from_slice(&offset.to_le_bytes()); + } + + let mut data = Vec::with_capacity(cap); + values.iter().try_for_each(|x| x.encode(&mut data))?; + + Ok(Self { + len: values.len(), + offsets: offsets.into(), + values: data.into(), + phantom: Default::default(), + }) + } + + /// Returns true if this list is empty + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Returns the number of elements in this list + pub fn len(&self) -> usize { + self.len + } + + /// Returns the element at index `idx` + pub fn get(&self, idx: usize) -> Result { + let offset_start = idx * 4; + let offset_slice = &self.offsets[offset_start..offset_start + 8]; + let start = u32::from_le_bytes(offset_slice[0..4].try_into().unwrap()) as usize; + let end = u32::from_le_bytes(offset_slice[4..8].try_into().unwrap()) as usize; + + let bounds = self.values.len(); + ensure!(end >= start, InvalidSliceSnafu { start, end }); + ensure!(end <= bounds, SliceOutOfBoundsSnafu { start, end, bounds }); + + // We slice `Bytes` to preserve zero-copy + let data = self.values.slice(start..end); + Ok(T::decode(data)?) 
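The `offsets`/`values` split above is what gives `get` constant-time access: element `i` occupies the byte range between offsets `i` and `i + 1`, so nothing before it has to be decoded. The same layout in miniature over plain byte slices; the helper names are illustrative only:

// (n + 1) little-endian u32 offsets followed by the concatenated values.
fn encode_packed(values: &[&[u8]]) -> (Vec<u8>, Vec<u8>) {
    let mut offsets = Vec::with_capacity((values.len() + 1) * 4);
    offsets.extend_from_slice(&0u32.to_le_bytes());
    let mut data = Vec::new();
    for v in values {
        data.extend_from_slice(v);
        offsets.extend_from_slice(&u32::try_from(data.len()).unwrap().to_le_bytes());
    }
    (offsets, data)
}

// Random access: element `idx` is the byte range offsets[idx]..offsets[idx + 1].
fn get_packed<'a>(offsets: &[u8], data: &'a [u8], idx: usize) -> &'a [u8] {
    let at = |i: usize| u32::from_le_bytes(offsets[i * 4..i * 4 + 4].try_into().unwrap()) as usize;
    &data[at(idx)..at(idx + 1)]
}

fn main() {
    let (offsets, data) = encode_packed(&[b"abc".as_slice(), b"".as_slice(), b"de".as_slice()]);
    assert_eq!(get_packed(&offsets, &data, 0), b"abc".as_slice());
    assert_eq!(get_packed(&offsets, &data, 2), b"de".as_slice());
}

The real `MessageList` additionally keeps both buffers in `Bytes`, so that slicing, and prost's decoding of those slices, stays zero-copy.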
+ } +} + +impl From for MessageList { + fn from(proto: generated::MessageList) -> Self { + let len = (proto.offsets.len() / 4).saturating_sub(1); + Self { + len, + offsets: proto.offsets, + values: proto.values, + phantom: Default::default(), + } + } +} + +impl From> for generated::MessageList { + fn from(value: MessageList) -> Self { + Self { + offsets: value.offsets, + values: value.values, + } + } +} + +impl IntoIterator for MessageList { + type Item = Result; + type IntoIter = MessageListIter; + + fn into_iter(self) -> Self::IntoIter { + MessageListIter { + iter: (0..self.len), + list: self, + } + } +} + +/// [`Iterator`] for [`MessageList`] +#[derive(Debug)] +pub struct MessageListIter { + iter: Range, + list: MessageList, +} + +impl Iterator for MessageListIter { + type Item = Result; + + fn next(&mut self) -> Option { + Some(self.list.get(self.iter.next()?)) + } + + fn size_hint(&self) -> (usize, Option) { + self.iter.size_hint() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simple() { + let strings = ["", "test", "foo", "abc", "", "skd"]; + let strings: Vec<_> = strings.into_iter().map(ToString::to_string).collect(); + + let encoded = MessageList::encode(&strings).unwrap(); + + assert_eq!(encoded.get(5).unwrap().as_str(), "skd"); + assert_eq!(encoded.get(2).unwrap().as_str(), "foo"); + assert_eq!(encoded.get(0).unwrap().as_str(), ""); + + let decoded: Vec<_> = encoded.clone().into_iter().map(Result::unwrap).collect(); + assert_eq!(strings, decoded); + + let proto = generated::MessageList::from(encoded.clone()); + let back = MessageList::::from(proto.clone()); + assert_eq!(encoded, back); + + // Invalid decode should return error not panic + let invalid = MessageList::::from(proto); + invalid.get(2).unwrap_err(); + + let strings: Vec = vec![]; + let encoded = MessageList::encode(&strings).unwrap(); + assert_eq!(encoded.len(), 0); + assert!(encoded.is_empty()); + + let proto = generated::MessageList::default(); + let encoded = MessageList::::from(proto); + assert_eq!(encoded.len(), 0); + assert!(encoded.is_empty()); + } +} diff --git a/data_types/src/snapshot/mask.rs b/data_types/src/snapshot/mask.rs new file mode 100644 index 00000000000..ae9dc3bc0ba --- /dev/null +++ b/data_types/src/snapshot/mask.rs @@ -0,0 +1,71 @@ +//! 
A packed bitmask + +use arrow_buffer::bit_iterator::BitIndexIterator; +use arrow_buffer::bit_util::{ceil, set_bit}; +use bytes::Bytes; +use generated_types::influxdata::iox::catalog_cache::v1 as generated; + +/// A packed bitmask +#[derive(Debug, Clone)] +pub struct BitMask { + mask: Bytes, + len: usize, +} + +impl BitMask { + /// Returns an iterator of the set indices in this mask + pub fn set_indices(&self) -> BitIndexIterator<'_> { + BitIndexIterator::new(&self.mask, 0, self.len) + } +} + +impl From for BitMask { + fn from(value: generated::BitMask) -> Self { + Self { + mask: value.mask, + len: value.len as _, + } + } +} + +impl From for generated::BitMask { + fn from(value: BitMask) -> Self { + Self { + mask: value.mask, + len: value.len as _, + } + } +} + +/// A builder for [`BitMask`] +#[derive(Debug)] +pub struct BitMaskBuilder { + values: Vec, + len: usize, +} + +impl BitMaskBuilder { + /// Create a new bitmask able to store `len` boolean values + #[inline] + pub fn new(len: usize) -> Self { + Self { + values: vec![0; ceil(len, 8)], + len, + } + } + + /// Set the bit at index `idx` + #[inline] + pub fn set_bit(&mut self, idx: usize) { + set_bit(&mut self.values, idx) + } + + /// Return the built [`BitMask`] + #[inline] + pub fn finish(self) -> BitMask { + BitMask { + mask: self.values.into(), + len: self.len, + } + } +} diff --git a/data_types/src/snapshot/mod.rs b/data_types/src/snapshot/mod.rs new file mode 100644 index 00000000000..7be5a937954 --- /dev/null +++ b/data_types/src/snapshot/mod.rs @@ -0,0 +1,11 @@ +//! Definitions of catalog snapshots +//! +//! Snapshots are read-optimised, that is they are designed to be inexpensive to +//! decode, making extensive use of zero-copy [`Bytes`](bytes::Bytes) in place of +//! allocating structures such as `String` and `Vec` + +pub mod hash; +pub mod list; +pub mod mask; +pub mod partition; +pub mod table; diff --git a/data_types/src/snapshot/partition.rs b/data_types/src/snapshot/partition.rs new file mode 100644 index 00000000000..d1838e57acf --- /dev/null +++ b/data_types/src/snapshot/partition.rs @@ -0,0 +1,246 @@ +//! 
Snapshot definition for partitions + +use crate::snapshot::list::MessageList; +use crate::snapshot::mask::{BitMask, BitMaskBuilder}; +use crate::{ + ColumnId, ColumnSet, CompactionLevelProtoError, NamespaceId, ObjectStoreId, ParquetFile, + ParquetFileId, Partition, PartitionHashId, PartitionHashIdError, PartitionId, + SkippedCompaction, SortKeyIds, TableId, Timestamp, +}; +use bytes::Bytes; +use generated_types::influxdata::iox::{ + catalog_cache::v1 as proto, skipped_compaction::v1 as skipped_compaction_proto, +}; +use snafu::{OptionExt, ResultExt, Snafu}; + +/// Error for [`PartitionSnapshot`] +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(display("Error decoding PartitionFile: {source}"))] + FileDecode { + source: crate::snapshot::list::Error, + }, + + #[snafu(display("Error encoding ParquetFile: {source}"))] + FileEncode { + source: crate::snapshot::list::Error, + }, + + #[snafu(display("Missing required field {field}"))] + RequiredField { field: &'static str }, + + #[snafu(context(false))] + CompactionLevel { source: CompactionLevelProtoError }, + + #[snafu(context(false))] + PartitionHashId { source: PartitionHashIdError }, + + #[snafu(display("Invalid partition key: {source}"))] + PartitionKey { source: std::str::Utf8Error }, +} + +/// Result for [`PartitionSnapshot`] +pub type Result = std::result::Result; + +/// A snapshot of a partition +#[derive(Debug, Clone)] +pub struct PartitionSnapshot { + /// The [`NamespaceId`] + namespace_id: NamespaceId, + /// The [`TableId`] + table_id: TableId, + /// The [`PartitionId`] + partition_id: PartitionId, + /// The [`PartitionHashId`] + partition_hash_id: Option, + /// The generation of this snapshot + generation: u64, + /// The partition key + key: Bytes, + /// The files + files: MessageList, + /// The columns for this partition + columns: ColumnSet, + /// The sort key ids + sort_key: SortKeyIds, + /// The time of a new file + new_file_at: Option, + /// Skipped compaction. 
+ skipped_compaction: Option, +} + +impl PartitionSnapshot { + /// Create a new [`PartitionSnapshot`] from the provided state + pub fn encode( + namespace_id: NamespaceId, + partition: Partition, + files: Vec, + skipped_compaction: Option, + generation: u64, + ) -> Result { + // Iterate in reverse order as schema additions are normally additive and + // so the later files will typically have more columns + let columns = files.iter().rev().fold(ColumnSet::empty(), |mut acc, v| { + acc.union(&v.column_set); + acc + }); + + let files = files + .into_iter() + .map(|file| { + let mut mask = BitMaskBuilder::new(columns.len()); + for (idx, _) in columns.intersect(&file.column_set) { + mask.set_bit(idx); + } + + proto::PartitionFile { + id: file.id.get(), + object_store_uuid: Some(file.object_store_id.get_uuid().into()), + min_time: file.min_time.0, + max_time: file.max_time.0, + file_size_bytes: file.file_size_bytes, + row_count: file.row_count, + compaction_level: file.compaction_level as _, + created_at: file.created_at.0, + max_l0_created_at: file.max_l0_created_at.0, + column_mask: Some(mask.finish().into()), + } + }) + .collect::>(); + + Ok(Self { + generation, + columns, + namespace_id, + partition_id: partition.id, + partition_hash_id: partition.hash_id().cloned(), + key: partition.partition_key.as_bytes().to_vec().into(), + files: MessageList::encode(&files).context(FileEncodeSnafu)?, + sort_key: partition.sort_key_ids().cloned().unwrap_or_default(), + table_id: partition.table_id, + new_file_at: partition.new_file_at, + skipped_compaction: skipped_compaction.map(|sc| sc.into()), + }) + } + + /// Create a new [`PartitionSnapshot`] from a `proto` and generation + pub fn decode(proto: proto::Partition, generation: u64) -> Self { + let table_id = TableId::new(proto.table_id); + let partition_hash_id = proto + .partition_hash_id + .then(|| PartitionHashId::from_raw(table_id, proto.key.as_ref())); + + Self { + generation, + table_id, + partition_hash_id, + key: proto.key, + files: MessageList::from(proto.files.unwrap_or_default()), + namespace_id: NamespaceId::new(proto.namespace_id), + partition_id: PartitionId::new(proto.partition_id), + columns: ColumnSet::new(proto.column_ids.into_iter().map(ColumnId::new)), + sort_key: SortKeyIds::new(proto.sort_key_ids.into_iter().map(ColumnId::new)), + new_file_at: proto.new_file_at.map(Timestamp::new), + skipped_compaction: proto.skipped_compaction, + } + } + + /// Returns the generation of this snapshot + pub fn generation(&self) -> u64 { + self.generation + } + + /// Returns the [`PartitionId`] + pub fn partition_id(&self) -> PartitionId { + self.partition_id + } + + /// Returns the [`PartitionHashId`] if any + pub fn partition_hash_id(&self) -> Option<&PartitionHashId> { + self.partition_hash_id.as_ref() + } + + /// Returns the file at index `idx` + pub fn file(&self, idx: usize) -> Result { + let file = self.files.get(idx).context(FileDecodeSnafu)?; + + let uuid = file.object_store_uuid.context(RequiredFieldSnafu { + field: "object_store_uuid", + })?; + + let column_set = match file.column_mask { + Some(mask) => { + let mask = BitMask::from(mask); + ColumnSet::new(mask.set_indices().map(|idx| self.columns[idx])) + } + None => self.columns.clone(), + }; + + Ok(ParquetFile { + id: ParquetFileId(file.id), + namespace_id: self.namespace_id, + table_id: self.table_id, + partition_id: self.partition_id, + partition_hash_id: self.partition_hash_id.clone(), + object_store_id: ObjectStoreId::from_uuid(uuid.into()), + min_time: Timestamp(file.min_time), + 
max_time: Timestamp(file.max_time), + to_delete: None, + file_size_bytes: file.file_size_bytes, + row_count: file.row_count, + compaction_level: file.compaction_level.try_into()?, + created_at: Timestamp(file.created_at), + column_set, + max_l0_created_at: Timestamp(file.max_l0_created_at), + }) + } + + /// Returns an iterator over the files in this snapshot + pub fn files(&self) -> impl Iterator> + '_ { + (0..self.files.len()).map(|idx| self.file(idx)) + } + + /// Returns the [`Partition`] for this snapshot + pub fn partition(&self) -> Result { + let key = std::str::from_utf8(&self.key).context(PartitionKeySnafu)?; + Ok(Partition::new_catalog_only( + self.partition_id, + self.partition_hash_id.clone(), + self.table_id, + key.into(), + self.sort_key.clone(), + self.new_file_at, + )) + } + + /// Returns the columns IDs + pub fn column_ids(&self) -> &ColumnSet { + &self.columns + } + + /// Return skipped compaction for this partition, if any. + pub fn skipped_compaction(&self) -> Option { + self.skipped_compaction + .as_ref() + .cloned() + .map(|sc| sc.into()) + } +} + +impl From for proto::Partition { + fn from(value: PartitionSnapshot) -> Self { + Self { + key: value.key, + files: Some(value.files.into()), + namespace_id: value.namespace_id.get(), + table_id: value.table_id.get(), + partition_id: value.partition_id.get(), + partition_hash_id: value.partition_hash_id.is_some(), + column_ids: value.columns.iter().map(|x| x.get()).collect(), + sort_key_ids: value.sort_key.iter().map(|x| x.get()).collect(), + new_file_at: value.new_file_at.map(|x| x.get()), + skipped_compaction: value.skipped_compaction, + } + } +} diff --git a/data_types/src/snapshot/table.rs b/data_types/src/snapshot/table.rs new file mode 100644 index 00000000000..08c235d2dff --- /dev/null +++ b/data_types/src/snapshot/table.rs @@ -0,0 +1,197 @@ +//! 
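The `column_mask` handling above avoids repeating column IDs per file: the snapshot stores the union of all files' column sets once, and each file carries only a bitmask over positions in that union. The round trip, sketched with plain integers and a byte mask; the helper names are made up for the example:

// Encode a file's columns as set bits over their positions in the union.
fn encode_mask(union: &[u32], file_columns: &[u32]) -> Vec<u8> {
    let mut mask = vec![0u8; (union.len() + 7) / 8];
    for (idx, id) in union.iter().enumerate() {
        if file_columns.contains(id) {
            mask[idx / 8] |= 1 << (idx % 8);
        }
    }
    mask
}

// Decode by mapping set bits back to the column IDs at those positions.
fn decode_mask(union: &[u32], mask: &[u8]) -> Vec<u32> {
    union
        .iter()
        .enumerate()
        .filter_map(|(idx, id)| (mask[idx / 8] & (1 << (idx % 8)) != 0).then_some(*id))
        .collect()
}

fn main() {
    let union = [1, 2, 5, 7];
    let mask = encode_mask(&union, &[2, 7]);
    assert_eq!(decode_mask(&union, &mask), vec![2, 7]);
}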
Snapshot definition for tables +use crate::snapshot::list::MessageList; +use crate::{ + Column, ColumnId, ColumnTypeProtoError, NamespaceId, Partition, PartitionId, Table, TableId, +}; +use bytes::Bytes; +use generated_types::influxdata::iox::catalog_cache::v1 as proto; +use generated_types::influxdata::iox::column_type::v1::ColumnType; +use generated_types::influxdata::iox::partition_template::v1::PartitionTemplate; +use snafu::{ResultExt, Snafu}; + +/// Error for [`TableSnapshot`] +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(display("Error decoding TablePartition: {source}"))] + PartitionDecode { + source: crate::snapshot::list::Error, + }, + + #[snafu(display("Error encoding TablePartition: {source}"))] + PartitionEncode { + source: crate::snapshot::list::Error, + }, + + #[snafu(display("Error decoding TableColumn: {source}"))] + ColumnDecode { + source: crate::snapshot::list::Error, + }, + + #[snafu(display("Error encoding TableColumn: {source}"))] + ColumnEncode { + source: crate::snapshot::list::Error, + }, + + #[snafu(display("Invalid column name: {source}"))] + ColumnName { source: std::str::Utf8Error }, + + #[snafu(display("Invalid table name: {source}"))] + TableName { source: std::str::Utf8Error }, + + #[snafu(display("Invalid partition template: {source}"))] + PartitionTemplate { + source: crate::partition_template::ValidationError, + }, + + #[snafu(context(false))] + ColumnType { source: ColumnTypeProtoError }, +} + +/// Result for [`TableSnapshot`] +pub type Result = std::result::Result; + +/// A snapshot of a table +#[derive(Debug, Clone)] +pub struct TableSnapshot { + table_id: TableId, + namespace_id: NamespaceId, + table_name: Bytes, + partitions: MessageList, + columns: MessageList, + partition_template: Option, + generation: u64, +} + +impl TableSnapshot { + /// Create a new [`TableSnapshot`] from the provided state + pub fn encode( + table: Table, + partitions: Vec, + columns: Vec, + generation: u64, + ) -> Result { + let columns: Vec<_> = columns + .into_iter() + .map(|c| proto::TableColumn { + id: c.id.get(), + name: c.name.into(), + column_type: ColumnType::from(c.column_type).into(), + }) + .collect(); + + let partitions: Vec<_> = partitions + .into_iter() + .map(|p| proto::TablePartition { + id: p.id.get(), + key: p.partition_key.as_bytes().to_vec().into(), + }) + .collect(); + + Ok(Self { + table_id: table.id, + namespace_id: table.namespace_id, + table_name: table.name.into(), + partitions: MessageList::encode(&partitions).context(PartitionEncodeSnafu)?, + columns: MessageList::encode(&columns).context(ColumnEncodeSnafu)?, + partition_template: table.partition_template.as_proto().cloned(), + generation, + }) + } + + /// Create a new [`TableSnapshot`] from a `proto` and generation + pub fn decode(proto: proto::Table, generation: u64) -> Self { + Self { + generation, + table_id: TableId::new(proto.table_id), + namespace_id: NamespaceId::new(proto.namespace_id), + table_name: proto.table_name, + partitions: MessageList::from(proto.partitions.unwrap_or_default()), + columns: MessageList::from(proto.columns.unwrap_or_default()), + partition_template: proto.partition_template, + } + } + + /// Returns the [`Table`] for this snapshot + pub fn table(&self) -> Result { + let name = std::str::from_utf8(&self.table_name).context(TableNameSnafu)?; + let template = self + .partition_template + .clone() + .try_into() + .context(PartitionTemplateSnafu)?; + + Ok(Table { + id: self.table_id, + namespace_id: self.namespace_id, + name: name.into(), + 
partition_template: template, + }) + } + + /// Returns the column by index + pub fn column(&self, idx: usize) -> Result { + let column = self.columns.get(idx).context(ColumnDecodeSnafu)?; + let name = std::str::from_utf8(&column.name).context(ColumnNameSnafu)?; + + Ok(Column { + id: ColumnId::new(column.id), + table_id: self.table_id, + name: name.into(), + column_type: (column.column_type as i16).try_into()?, + }) + } + + /// Returns an iterator of the columns in this table + pub fn columns(&self) -> impl Iterator> + '_ { + (0..self.columns.len()).map(|idx| self.column(idx)) + } + + /// Returns an iterator of the [`PartitionId`] in this table + pub fn partitions(&self) -> impl Iterator> + '_ { + (0..self.partitions.len()).map(|idx| { + let p = self.partitions.get(idx).context(PartitionDecodeSnafu)?; + Ok(TableSnapshotPartition { + id: PartitionId::new(p.id), + key: p.key, + }) + }) + } + + /// Returns the generation of this snapshot + pub fn generation(&self) -> u64 { + self.generation + } +} + +/// Partition information stored within [`TableSnapshot`] +#[derive(Debug)] +pub struct TableSnapshotPartition { + id: PartitionId, + key: Bytes, +} + +impl TableSnapshotPartition { + /// Returns the [`PartitionId`] for this partition + pub fn id(&self) -> PartitionId { + self.id + } + + /// Returns the partition key for this partition + pub fn key(&self) -> &[u8] { + &self.key + } +} + +impl From for proto::Table { + fn from(value: TableSnapshot) -> Self { + Self { + partitions: Some(value.partitions.into()), + columns: Some(value.columns.into()), + partition_template: value.partition_template, + namespace_id: value.namespace_id.get(), + table_id: value.table_id.get(), + table_name: value.table_name, + } + } +} diff --git a/datafusion_util/Cargo.toml b/datafusion_util/Cargo.toml index 9c75525084d..1f5f5541153 100644 --- a/datafusion_util/Cargo.toml +++ b/datafusion_util/Cargo.toml @@ -6,6 +6,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] async-trait = "0.1" datafusion = { workspace = true } @@ -13,10 +16,8 @@ futures = "0.3" object_store = { workspace = true } observability_deps = { path = "../observability_deps" } pin-project = "1.1" -tokio = { version = "1.32", features = ["parking_lot", "sync"] } +schema = { path = "../schema" } +tokio = { version = "1.35", features = ["parking_lot", "sync"] } tokio-stream = "0.1" -url = "2.4" +url = "2.5" workspace-hack = { version = "0.1", path = "../workspace-hack" } - -[dev-dependencies] -schema = { path = "../schema" } diff --git a/datafusion_util/src/config.rs b/datafusion_util/src/config.rs index 1c1a975a4ff..ed41b197688 100644 --- a/datafusion_util/src/config.rs +++ b/datafusion_util/src/config.rs @@ -4,6 +4,7 @@ use datafusion::{ config::ConfigOptions, execution::runtime_env::RuntimeEnv, prelude::SessionConfig, }; use object_store::ObjectStore; +use schema::TIME_DATA_TIMEZONE; use url::Url; // The default catalog name - this impacts what SQL queries use if not specified @@ -20,6 +21,7 @@ pub fn iox_session_config() -> SessionConfig { let mut options = ConfigOptions::new(); options.execution.parquet.pushdown_filters = true; options.execution.parquet.reorder_filters = true; + options.execution.time_zone = TIME_DATA_TIMEZONE().map(|s| s.to_string()); options.optimizer.repartition_sorts = true; SessionConfig::from(options) @@ -27,6 +29,12 @@ pub fn iox_session_config() -> SessionConfig { .with_create_default_catalog_and_schema(true) .with_information_schema(true) 
.with_default_catalog_and_schema(DEFAULT_CATALOG, DEFAULT_SCHEMA) + // Tell the datafusion optimizer to avoid repartitioning sorted inputs + .with_prefer_existing_sort(true) + // Avoid repartitioning file scans as it destroys existing sort orders + // see https://github.com/influxdata/influxdb_iox/issues/9450 + // see https://github.com/apache/arrow-datafusion/issues/8451 + .with_repartition_file_scans(false) } /// Register the "IOx" object store provider for URLs of the form "iox://{id} diff --git a/datafusion_util/src/lib.rs b/datafusion_util/src/lib.rs index b62cf3b8007..6323f06278a 100644 --- a/datafusion_util/src/lib.rs +++ b/datafusion_util/src/lib.rs @@ -20,6 +20,7 @@ //! for expression manipulation functions. use datafusion::execution::memory_pool::{MemoryPool, UnboundedMemoryPool}; +use std::collections::HashSet; // Workaround for "unused crate" lint false positives. use workspace_hack as _; @@ -33,10 +34,11 @@ use std::task::{Context, Poll}; use datafusion::arrow::array::BooleanArray; use datafusion::arrow::compute::filter_record_batch; use datafusion::arrow::datatypes::{DataType, Fields}; +use datafusion::common::stats::Precision; use datafusion::common::{DataFusionError, ToDFSchema}; -use datafusion::datasource::MemTable; use datafusion::execution::context::TaskContext; use datafusion::logical_expr::expr::Sort; +use datafusion::logical_expr::utils::inspect_expr_pre; use datafusion::physical_expr::execution_props::ExecutionProps; use datafusion::physical_expr::{create_physical_expr, PhysicalExpr}; use datafusion::physical_optimizer::pruning::PruningPredicate; @@ -51,6 +53,7 @@ use datafusion::{ scalar::ScalarValue, }; use futures::{Stream, StreamExt}; +use schema::TIME_DATA_TIMEZONE; use tokio::sync::mpsc::{Receiver, UnboundedReceiver}; use tokio_stream::wrappers::{ReceiverStream, UnboundedReceiverStream}; use watch::WatchedTask; @@ -113,8 +116,8 @@ pub fn lit_dict(value: &str) -> Expr { pub fn make_range_expr(start: i64, end: i64, time: impl AsRef) -> Expr { // We need to cast the start and end values to timestamps // the equivalent of: - let ts_start = ScalarValue::TimestampNanosecond(Some(start), None); - let ts_end = ScalarValue::TimestampNanosecond(Some(end), None); + let ts_start = timestamptz_nano(start); + let ts_end = timestamptz_nano(end); let time_col = time.as_ref().as_expr(); let ts_low = lit(ts_start).lt_eq(time_col.clone()); @@ -123,6 +126,45 @@ pub fn make_range_expr(start: i64, end: i64, time: impl AsRef) -> Expr { ts_low.and(ts_high) } +/// Ensures all columns referred to in `filters` are in the `projection`, if +/// any, adding them if necessary. +pub fn extend_projection_for_filters( + schema: &Schema, + filters: &[Expr], + projection: Option<&Vec>, +) -> Result>, DataFusionError> { + let Some(mut projection) = projection.cloned() else { + return Ok(None); + }; + + let mut seen_cols: HashSet = projection.iter().cloned().collect(); + for filter in filters { + inspect_expr_pre(filter, |expr| { + if let Expr::Column(c) = expr { + let idx = schema.index_of(&c.name)?; + // if haven't seen this column before, add it to the list + if seen_cols.insert(idx) { + projection.push(idx); + } + } + Ok(()) as Result<(), DataFusionError> + })?; + } + Ok(Some(projection)) +} + +// TODO port this upstream to datafusion (maybe as From
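`extend_projection_for_filters` above only has to do bookkeeping over column indices; the DataFusion-specific part is walking the filter expressions and resolving `Expr::Column` names against the schema. The bookkeeping in isolation, with plain `usize` indices standing in for already-resolved columns (a simplified stand-in, not the real signature):

use std::collections::HashSet;

// Filter columns missing from the projection are appended; `None` means
// "all columns are scanned anyway", so there is nothing to extend.
fn extend_projection(
    projection: Option<Vec<usize>>,
    filter_columns: &[usize],
) -> Option<Vec<usize>> {
    let mut projection = projection?;
    let mut seen: HashSet<usize> = projection.iter().copied().collect();
    for &idx in filter_columns {
        if seen.insert(idx) {
            projection.push(idx);
        }
    }
    Some(projection)
}

fn main() {
    assert_eq!(extend_projection(Some(vec![0, 2]), &[2, 3]), Some(vec![0, 2, 3]));
    assert_eq!(extend_projection(None, &[1]), None);
}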
{ + self.backing + .repositories() + .tables() + .create(name, partition_template, namespace_id) + .await + } + + async fn get_by_id(&mut self, table_id: TableId) -> Result> { + self.backing + .repositories() + .tables() + .get_by_id(table_id) + .await + } + + async fn get_by_namespace_and_name( + &mut self, + namespace_id: NamespaceId, + name: &str, + ) -> Result> { + self.backing + .repositories() + .tables() + .get_by_namespace_and_name(namespace_id, name) + .await + } + + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { + self.backing + .repositories() + .tables() + .list_by_namespace_id(namespace_id) + .await + } + + async fn list(&mut self) -> Result> { + self.backing.repositories().tables().list().await + } + + async fn snapshot(&mut self, table_id: TableId) -> Result { + self.backing + .repositories() + .tables() + .snapshot(table_id) + .await + } +} + +#[async_trait] +impl ColumnRepo for Repos { + async fn create_or_get( + &mut self, + name: &str, + table_id: TableId, + column_type: ColumnType, + ) -> Result { + self.backing + .repositories() + .columns() + .create_or_get(name, table_id, column_type) + .await + } + + async fn create_or_get_many_unchecked( + &mut self, + table_id: TableId, + columns: HashMap<&str, ColumnType>, + ) -> Result> { + self.backing + .repositories() + .columns() + .create_or_get_many_unchecked(table_id, columns) + .await + } + + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { + self.backing + .repositories() + .columns() + .list_by_namespace_id(namespace_id) + .await + } + + async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { + self.backing + .repositories() + .columns() + .list_by_table_id(table_id) + .await + } + + async fn list(&mut self) -> Result> { + self.backing.repositories().columns().list().await + } +} + +#[async_trait] +impl PartitionRepo for Repos { + async fn create_or_get(&mut self, key: PartitionKey, table_id: TableId) -> Result { + // read-through: need to wire up table snapshots to look this up efficiently + self.backing + .repositories() + .partitions() + .create_or_get(key, table_id) + .await + } + + async fn get_by_id_batch(&mut self, partition_ids: &[PartitionId]) -> Result> { + futures::stream::iter(prepare_set(partition_ids.iter().cloned())) + .map(|p_id| { + let this = &self; + async move { + let snapshot = match this.get_partition(p_id).await { + Ok(s) => s, + Err(Error::NotFound { .. 
}) => { + return Ok(futures::stream::empty().boxed()); + } + Err(e) => { + return Err(e); + } + }; + + match snapshot.partition() { + Ok(p) => Ok(futures::stream::once(async move { Ok(p) }).boxed()), + Err(e) => Err(Error::from(e)), + } + } + }) + .buffer_unordered(self.quorum_fanout) + .try_flatten() + .try_collect::>() + .await + } + + async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { + // read-through: need to wire up table snapshots to look this up efficiently + self.backing + .repositories() + .partitions() + .list_by_table_id(table_id) + .await + } + + async fn list_ids(&mut self) -> Result> { + // read-through: only used for testing, we should eventually remove this interface + self.backing.repositories().partitions().list_ids().await + } + + async fn cas_sort_key( + &mut self, + partition_id: PartitionId, + old_sort_key_ids: Option<&SortKeyIds>, + new_sort_key_ids: &SortKeyIds, + ) -> Result> { + let res = self + .backing + .repositories() + .partitions() + .cas_sort_key(partition_id, old_sort_key_ids, new_sort_key_ids) + .await?; + + self.refresh_partition(partition_id) + .await + .map_err(CasFailure::QueryError)?; + + Ok(res) + } + + #[allow(clippy::too_many_arguments)] + async fn record_skipped_compaction( + &mut self, + partition_id: PartitionId, + reason: &str, + num_files: usize, + limit_num_files: usize, + limit_num_files_first_in_partition: usize, + estimated_bytes: u64, + limit_bytes: u64, + ) -> Result<()> { + self.backing + .repositories() + .partitions() + .record_skipped_compaction( + partition_id, + reason, + num_files, + limit_num_files, + limit_num_files_first_in_partition, + estimated_bytes, + limit_bytes, + ) + .await?; + + self.refresh_partition(partition_id).await?; + + Ok(()) + } + + async fn get_in_skipped_compactions( + &mut self, + partition_id: &[PartitionId], + ) -> Result> { + futures::stream::iter(prepare_set(partition_id.iter().cloned())) + .map(|p_id| { + let this = &self; + async move { + let snapshot = match this.get_partition(p_id).await { + Ok(s) => s, + Err(Error::NotFound { .. 
}) => { + return Ok(futures::stream::empty().boxed()); + } + Err(e) => { + return Err(e); + } + }; + + match snapshot.skipped_compaction() { + Some(sc) => Ok(futures::stream::once(async move { Ok(sc) }).boxed()), + None => Ok(futures::stream::empty().boxed()), + } + } + }) + .buffer_unordered(self.quorum_fanout) + .try_flatten() + .try_collect::>() + .await + } + + async fn list_skipped_compactions(&mut self) -> Result> { + // read-through: used for debugging, this should be replaced w/ proper hierarchy-traversal + self.backing + .repositories() + .partitions() + .list_skipped_compactions() + .await + } + + async fn delete_skipped_compactions( + &mut self, + partition_id: PartitionId, + ) -> Result> { + let res = self + .backing + .repositories() + .partitions() + .delete_skipped_compactions(partition_id) + .await?; + + self.refresh_partition(partition_id).await?; + + Ok(res) + } + + async fn most_recent_n(&mut self, n: usize) -> Result> { + // read-through: used for ingester warm-up at the moment + self.backing + .repositories() + .partitions() + .most_recent_n(n) + .await + } + + async fn partitions_new_file_between( + &mut self, + minimum_time: Timestamp, + maximum_time: Option, + ) -> Result> { + // read-through: used by the compactor for scheduling, we should eventually find a better interface + self.backing + .repositories() + .partitions() + .partitions_new_file_between(minimum_time, maximum_time) + .await + } + + async fn list_old_style(&mut self) -> Result> { + // read-through: used by the ingester due to hash-id stuff + self.backing + .repositories() + .partitions() + .list_old_style() + .await + } + + async fn snapshot(&mut self, partition_id: PartitionId) -> Result { + self.get_partition(partition_id).await + } +} + +#[async_trait] +impl ParquetFileRepo for Repos { + async fn flag_for_delete_by_retention(&mut self) -> Result> { + let res = self + .backing + .repositories() + .parquet_files() + .flag_for_delete_by_retention() + .await?; + + let affected_partitions = res + .iter() + .map(|(p_id, _os_id)| *p_id) + .collect::>(); + + // ensure deterministic order + let mut affected_partitions = affected_partitions.into_iter().collect::>(); + affected_partitions.sort_unstable(); + + // refresh ALL partitons that are affected, NOT just only the ones that were cached. This should avoid the + // following "lost update" race condition: + // + // This scenario assumes that the partition in question is NOT cached yet. + // + // | T | Thread 1 | Thread 2 | + // | - | ------------------------------------- | -------------------------------------------------- | + // | 1 | receive `create_update_delete` | | + // | 2 | execute change within backing catalog | | + // | 3 | takes snapshot from backing catalog | | + // | 4 | | receive `flag_for_delete_by_retention` | + // | 5 | | execute change within backing catalog | + // | 6 | | affected partition not cached => no snapshot taken | + // | 7 | | return | + // | 8 | quorum-write snapshot | | + // | 9 | return | | + // + // The partition is now cached by does NOT contain the `flag_for_delete_by_retention` change and will not + // automatically converge. 
+ futures::stream::iter(affected_partitions) + .map(|p_id| { + let this = &self; + async move { + this.refresh_partition(p_id).await?; + Ok::<(), Error>(()) + } + }) + .buffer_unordered(self.quorum_fanout) + .try_collect::<()>() + .await?; + + Ok(res) + } + + async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result> { + // deleted files are NOT part of the snapshot, so this bypasses the cache + self.backing + .repositories() + .parquet_files() + .delete_old_ids_only(older_than) + .await + } + + async fn list_by_partition_not_to_delete_batch( + &mut self, + partition_ids: Vec, + ) -> Result> { + futures::stream::iter(prepare_set(partition_ids)) + .map(|p_id| { + let this = &self; + async move { + let snapshot = match this.get_partition(p_id).await { + Ok(s) => s, + Err(Error::NotFound { .. }) => { + return Ok(futures::stream::empty().boxed()); + } + Err(e) => { + return Err(e); + } + }; + + // Decode files so we can drop the snapshot early. + // + // Need to collect the file results into a vec though because we cannot return borrowed data and + // "owned iterators" aren't a thing. + let files = snapshot + .files() + .map(|res| res.map_err(Error::from)) + .collect::>(); + Ok::<_, Error>(futures::stream::iter(files).boxed()) + } + }) + .buffer_unordered(self.quorum_fanout) + .try_flatten() + .try_collect::>() + .await + } + + async fn get_by_object_store_id( + &mut self, + object_store_id: ObjectStoreId, + ) -> Result> { + // read-through: see https://github.com/influxdata/influxdb_iox/issues/9719 + self.backing + .repositories() + .parquet_files() + .get_by_object_store_id(object_store_id) + .await + } + + async fn exists_by_object_store_id_batch( + &mut self, + object_store_ids: Vec, + ) -> Result> { + // read-through: this is used by the GC, so this is not overall latency-critical + self.backing + .repositories() + .parquet_files() + .exists_by_object_store_id_batch(object_store_ids) + .await + } + + async fn create_upgrade_delete( + &mut self, + partition_id: PartitionId, + delete: &[ObjectStoreId], + upgrade: &[ObjectStoreId], + create: &[ParquetFileParams], + target_level: CompactionLevel, + ) -> Result> { + let res = self + .backing + .repositories() + .parquet_files() + .create_upgrade_delete(partition_id, delete, upgrade, create, target_level) + .await?; + + self.refresh_partition(partition_id).await?; + + Ok(res) + } +} + +/// Prepare set of elements in deterministic order. 
+fn prepare_set(set: S) -> Vec +where + S: IntoIterator, + T: Eq + Ord, +{ + // ensure deterministic order (also required for de-dup) + let mut set = set.into_iter().collect::>(); + set.sort_unstable(); + + // de-dup + set.dedup(); + + set +} + +#[cfg(test)] +mod tests { + use catalog_cache::api::server::test_util::TestCacheServer; + use catalog_cache::local::CatalogCache; + use iox_time::SystemProvider; + + use crate::{interface_tests::TestCatalog, mem::MemCatalog}; + + use super::*; + use std::sync::Arc; + + #[tokio::test] + async fn test_catalog() { + crate::interface_tests::test_catalog(|| async { + let metrics = Arc::new(metric::Registry::default()); + let time_provider = Arc::new(SystemProvider::new()) as _; + let backing = Arc::new(MemCatalog::new(metrics, Arc::clone(&time_provider))); + + let peer0 = TestCacheServer::bind_ephemeral(); + let peer1 = TestCacheServer::bind_ephemeral(); + let cache = Arc::new(QuorumCatalogCache::new( + Arc::new(CatalogCache::default()), + Arc::new([peer0.client(), peer1.client()]), + )); + + // use new metrics registry so the two layers don't double-count + let metrics = Arc::new(metric::Registry::default()); + let caching_catalog = Arc::new(CachingCatalog::new( + cache, + backing, + metrics, + time_provider, + 10, + )); + + let test_catalog = TestCatalog::new(caching_catalog); + test_catalog.hold_onto(peer0); + test_catalog.hold_onto(peer1); + + Arc::new(test_catalog) as _ + }) + .await; + } +} diff --git a/iox_catalog/src/constants.rs b/iox_catalog/src/constants.rs new file mode 100644 index 00000000000..b6b88fbd21d --- /dev/null +++ b/iox_catalog/src/constants.rs @@ -0,0 +1,19 @@ +//! Constants that are hold for all catalog implementations. + +/// Time column. +pub const TIME_COLUMN: &str = "time"; + +/// Default retention period for data in the catalog. +pub const DEFAULT_RETENTION_PERIOD: Option = None; + +/// Maximum number of files touched by [`ParquetFileRepo::flag_for_delete_by_retention`] at a time. +/// +/// +/// [`ParquetFileRepo::flag_for_delete_by_retention`]: crate::interface::ParquetFileRepo::flag_for_delete_by_retention +pub const MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION: i64 = 1_000; + +/// Maximum number of files touched by [`ParquetFileRepo::delete_old_ids_only`] at a time. +/// +/// +/// [`ParquetFileRepo::delete_old_ids_only`]: crate::interface::ParquetFileRepo::delete_old_ids_only +pub const MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE: i64 = 10_000; diff --git a/iox_catalog/src/grpc/client.rs b/iox_catalog/src/grpc/client.rs new file mode 100644 index 00000000000..8edc05dd640 --- /dev/null +++ b/iox_catalog/src/grpc/client.rs @@ -0,0 +1,997 @@ +//! gRPC client implementation. 
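`prepare_set` above relies on `Vec::dedup` only collapsing adjacent duplicates, which is why the sort has to come first; the sort also gives the fan-out over partitions a deterministic order. A quick check of that behaviour:

fn main() {
    let mut ids = vec![7, 3, 7, 1, 3];
    ids.sort_unstable();
    ids.dedup();
    assert_eq!(ids, vec![1, 3, 7]);

    // Without the sort, only consecutive repeats would collapse:
    let mut unsorted = vec![7, 3, 7, 1, 3];
    unsorted.dedup();
    assert_eq!(unsorted, vec![7, 3, 7, 1, 3]);
}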
+use std::future::Future; +use std::ops::ControlFlow; +use std::{collections::HashMap, sync::Arc}; + +use async_trait::async_trait; +use futures::TryStreamExt; +use log::{debug, info, warn}; +use tonic::transport::{Channel, Uri}; + +use crate::{ + interface::{ + CasFailure, Catalog, ColumnRepo, Error, NamespaceRepo, ParquetFileRepo, PartitionRepo, + RepoCollection, Result, SoftDeletedRows, TableRepo, + }, + metrics::MetricDecorator, +}; +use backoff::{Backoff, BackoffError}; +use data_types::snapshot::partition::PartitionSnapshot; +use data_types::{ + partition_template::{NamespacePartitionTemplateOverride, TablePartitionTemplateOverride}, + snapshot::table::TableSnapshot, + Column, ColumnType, CompactionLevel, MaxColumnsPerTable, MaxTables, Namespace, NamespaceId, + NamespaceName, NamespaceServiceProtectionLimitsOverride, ObjectStoreId, ParquetFile, + ParquetFileId, ParquetFileParams, Partition, PartitionId, PartitionKey, SkippedCompaction, + SortKeyIds, Table, TableId, Timestamp, +}; +use generated_types::influxdata::iox::catalog::v2 as proto; +use iox_time::TimeProvider; +use trace_http::metrics::{MetricFamily, RequestMetrics}; +use trace_http::tower::TraceService; + +use super::serialization::{ + convert_status, deserialize_column, deserialize_namespace, deserialize_object_store_id, + deserialize_parquet_file, deserialize_partition, deserialize_skipped_compaction, + deserialize_sort_key_ids, deserialize_table, serialize_column_type, serialize_object_store_id, + serialize_parquet_file_params, serialize_soft_deleted_rows, serialize_sort_key_ids, ContextExt, + RequiredExt, +}; + +type InstrumentedChannel = TraceService; + +/// Catalog that goes through a gRPC interface. +#[derive(Debug)] +pub struct GrpcCatalogClient { + channel: InstrumentedChannel, + metrics: Arc, + time_provider: Arc, +} + +impl GrpcCatalogClient { + /// Create new client. 
+ pub fn new( + uri: Uri, + metrics: Arc, + time_provider: Arc, + ) -> Self { + let channel = TraceService::new_client( + Channel::builder(uri).connect_lazy(), + Arc::new(RequestMetrics::new( + Arc::clone(&metrics), + MetricFamily::GrpcClient, + )), + None, + "catalog", + ); + Self { + channel, + metrics, + time_provider, + } + } +} + +impl std::fmt::Display for GrpcCatalogClient { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "grpc") + } +} + +#[async_trait] +impl Catalog for GrpcCatalogClient { + async fn setup(&self) -> Result<(), Error> { + Ok(()) + } + + fn repositories(&self) -> Box { + Box::new(MetricDecorator::new( + GrpcCatalogClientRepos { + channel: self.channel.clone(), + }, + Arc::clone(&self.metrics), + Arc::clone(&self.time_provider), + )) + } + + #[cfg(test)] + fn metrics(&self) -> Arc { + Arc::clone(&self.metrics) + } + + fn time_provider(&self) -> Arc { + Arc::clone(&self.time_provider) + } +} + +#[derive(Debug)] +struct GrpcCatalogClientRepos { + channel: InstrumentedChannel, +} + +type ServiceClient = proto::catalog_service_client::CatalogServiceClient; + +fn is_upstream_error(e: &tonic::Status) -> bool { + matches!( + e.code(), + tonic::Code::Cancelled + | tonic::Code::DeadlineExceeded + | tonic::Code::FailedPrecondition + | tonic::Code::Aborted + | tonic::Code::Unavailable + ) +} + +impl GrpcCatalogClientRepos { + fn client(&self) -> ServiceClient { + proto::catalog_service_client::CatalogServiceClient::new(self.channel.clone()) + } + + async fn retry( + &self, + operation: &str, + upload: U, + fun_io: FunIo, + ) -> Result + where + U: Clone + std::fmt::Debug + Send + Sync, + FunIo: Fn(U, ServiceClient) -> Fut + Send + Sync, + Fut: Future, tonic::Status>> + Send, + D: std::fmt::Debug, + { + Backoff::new(&Default::default()) + .retry_with_backoff(operation, || async { + let res = fun_io(upload.clone(), self.client()).await; + match res { + Ok(r) => { + let r = r.into_inner(); + debug!("{} successfully received: {:?}", operation, &r); + ControlFlow::Break(Ok(r)) + } + Err(e) if is_upstream_error(&e) => { + info!("{} retriable error encountered: {:?}", operation, &e); + ControlFlow::Continue(e) + } + Err(e) => { + warn!( + "{operation} attempted {:?} and received error: {:?}", + upload, e + ); + ControlFlow::Break(Err(convert_status(e))) + } + } + }) + .await + .map_err(|be| { + let status = match be { + BackoffError::DeadlineExceeded { source, .. } => source, + }; + convert_status(status) + })? 
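The `retry` helper above separates error classification from the retry loop itself: the closure maps a response to `ControlFlow::Break` for terminal outcomes (success, or a non-retriable status) and `ControlFlow::Continue` for retriable ones, while sleeping and giving up are delegated to the `backoff` crate. The same pattern in a synchronous, dependency-free sketch; the names and the attempt-count policy are illustrative only:

use std::ops::ControlFlow;

// A terminal outcome breaks the loop; a retriable error continues it.
fn run_with_retry<T, E>(
    max_attempts: usize,
    mut attempt: impl FnMut() -> ControlFlow<Result<T, E>, E>,
) -> Result<T, E> {
    let mut last_retriable = None;
    for _ in 0..max_attempts {
        match attempt() {
            ControlFlow::Break(outcome) => return outcome,
            ControlFlow::Continue(e) => last_retriable = Some(e),
        }
    }
    Err(last_retriable.expect("max_attempts must be non-zero"))
}

fn main() {
    let mut calls = 0;
    let res = run_with_retry(3, || {
        calls += 1;
        if calls < 3 {
            ControlFlow::Continue("unavailable") // retriable, try again
        } else {
            ControlFlow::Break(Ok::<_, &str>("done")) // terminal success
        }
    });
    assert_eq!(res, Ok("done"));
    assert_eq!(calls, 3);
}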
+ } +} + +impl RepoCollection for GrpcCatalogClientRepos { + fn namespaces(&mut self) -> &mut dyn NamespaceRepo { + self + } + + fn tables(&mut self) -> &mut dyn TableRepo { + self + } + + fn columns(&mut self) -> &mut dyn ColumnRepo { + self + } + + fn partitions(&mut self) -> &mut dyn PartitionRepo { + self + } + + fn parquet_files(&mut self) -> &mut dyn ParquetFileRepo { + self + } +} + +#[async_trait] +impl NamespaceRepo for GrpcCatalogClientRepos { + async fn create( + &mut self, + name: &NamespaceName<'_>, + partition_template: Option, + retention_period_ns: Option, + service_protection_limits: Option, + ) -> Result { + let n = proto::NamespaceCreateRequest { + name: name.to_string(), + partition_template: partition_template.and_then(|t| t.as_proto().cloned()), + retention_period_ns, + service_protection_limits: service_protection_limits.map(|l| { + proto::ServiceProtectionLimits { + max_tables: l.max_tables.map(|x| x.get_i32()), + max_columns_per_table: l.max_columns_per_table.map(|x| x.get_i32()), + } + }), + }; + + let resp = self + .retry("namespace_create", n, |data, mut client| async move { + client.namespace_create(data).await + }) + .await?; + + Ok(deserialize_namespace( + resp.namespace.required().ctx("namespace")?, + )?) + } + + async fn update_retention_period( + &mut self, + name: &str, + retention_period_ns: Option, + ) -> Result { + let n = proto::NamespaceUpdateRetentionPeriodRequest { + name: name.to_owned(), + retention_period_ns, + }; + + let resp = self.retry( + "namespace_update_retention_period", + n, + |data, mut client| async move { client.namespace_update_retention_period(data).await }, + ) + .await?; + + Ok(deserialize_namespace( + resp.namespace.required().ctx("namespace")?, + )?) + } + + async fn list(&mut self, deleted: SoftDeletedRows) -> Result> { + let n = proto::NamespaceListRequest { + deleted: serialize_soft_deleted_rows(deleted), + }; + + self.retry("namespace_list", n, |data, mut client| async move { + client.namespace_list(data).await + }) + .await? + .map_err(convert_status) + .and_then(|res| async move { + deserialize_namespace(res.namespace.required().ctx("namespace")?).map_err(Error::from) + }) + .try_collect() + .await + } + + async fn get_by_id( + &mut self, + id: NamespaceId, + deleted: SoftDeletedRows, + ) -> Result> { + let n = proto::NamespaceGetByIdRequest { + id: id.get(), + deleted: serialize_soft_deleted_rows(deleted), + }; + + let resp = self + .retry("namespace_get_by_id", n, |data, mut client| async move { + client.namespace_get_by_id(data).await + }) + .await?; + Ok(resp.namespace.map(deserialize_namespace).transpose()?) + } + + async fn get_by_name( + &mut self, + name: &str, + deleted: SoftDeletedRows, + ) -> Result> { + let n = proto::NamespaceGetByNameRequest { + name: name.to_owned(), + deleted: serialize_soft_deleted_rows(deleted), + }; + + let resp = self + .retry("namespace_get_by_name", n, |data, mut client| async move { + client.namespace_get_by_name(data).await + }) + .await?; + Ok(resp.namespace.map(deserialize_namespace).transpose()?) 
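The `map(...).transpose()?` chain used for the optional lookups above is the usual way to combine an optional protobuf field with a fallible conversion: a missing record stays `None`, while a present-but-invalid one becomes an error instead of being silently dropped. In miniature:

// Option<raw> plus a fallible parse becomes Result<Option<parsed>>,
// keeping "absent" as Ok(None) and propagating conversion failures.
fn decode_optional(raw: Option<&str>) -> Result<Option<i64>, std::num::ParseIntError> {
    raw.map(str::parse::<i64>).transpose()
}

fn main() {
    assert_eq!(decode_optional(None), Ok(None));
    assert_eq!(decode_optional(Some("42")), Ok(Some(42)));
    assert!(decode_optional(Some("not a number")).is_err());
}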
+    }
+
+    async fn soft_delete(&mut self, name: &str) -> Result<()> {
+        let n = proto::NamespaceSoftDeleteRequest {
+            name: name.to_owned(),
+        };
+
+        self.retry("namespace_soft_delete", n, |data, mut client| async move {
+            client.namespace_soft_delete(data).await
+        })
+        .await?;
+        Ok(())
+    }
+
+    async fn update_table_limit(&mut self, name: &str, new_max: MaxTables) -> Result<Namespace> {
+        let n = proto::NamespaceUpdateTableLimitRequest {
+            name: name.to_owned(),
+            new_max: new_max.get_i32(),
+        };
+
+        let resp = self
+            .retry("namespace_update_table_limit", n, |data, mut client| async move {
+                client.namespace_update_table_limit(data).await
+            })
+            .await?;
+
+        Ok(deserialize_namespace(
+            resp.namespace.required().ctx("namespace")?,
+        )?)
+    }
+
+    async fn update_column_limit(
+        &mut self,
+        name: &str,
+        new_max: MaxColumnsPerTable,
+    ) -> Result<Namespace> {
+        let n = proto::NamespaceUpdateColumnLimitRequest {
+            name: name.to_owned(),
+            new_max: new_max.get_i32(),
+        };
+
+        let resp = self
+            .retry("namespace_update_column_limit", n, |data, mut client| async move {
+                client.namespace_update_column_limit(data).await
+            })
+            .await?;
+
+        Ok(deserialize_namespace(
+            resp.namespace.required().ctx("namespace")?,
+        )?)
+    }
+}
+
+#[async_trait]
+impl TableRepo for GrpcCatalogClientRepos {
+    async fn create(
+        &mut self,
+        name: &str,
+        partition_template: TablePartitionTemplateOverride,
+        namespace_id: NamespaceId,
+    ) -> Result<Table>
{ + let t = proto::TableCreateRequest { + name: name.to_owned(), + partition_template: partition_template.as_proto().cloned(), + namespace_id: namespace_id.get(), + }; + + let resp = self + .retry("table_create", t, |data, mut client| async move { + client.table_create(data).await + }) + .await?; + Ok(deserialize_table(resp.table.required().ctx("table")?)?) + } + + async fn get_by_id(&mut self, table_id: TableId) -> Result> { + let t = proto::TableGetByIdRequest { id: table_id.get() }; + + let resp = self + .retry("table_get_by_id", t, |data, mut client| async move { + client.table_get_by_id(data).await + }) + .await?; + Ok(resp.table.map(deserialize_table).transpose()?) + } + + async fn get_by_namespace_and_name( + &mut self, + namespace_id: NamespaceId, + name: &str, + ) -> Result> { + let t = proto::TableGetByNamespaceAndNameRequest { + namespace_id: namespace_id.get(), + name: name.to_owned(), + }; + + let resp = self.retry( + "table_get_by_namespace_and_name", + t, + |data, mut client| async move { client.table_get_by_namespace_and_name(data).await }, + ) + .await?; + Ok(resp.table.map(deserialize_table).transpose()?) + } + + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { + let t = proto::TableListByNamespaceIdRequest { + namespace_id: namespace_id.get(), + }; + + self.retry( + "table_list_by_namespace_id", + t, + |data, mut client| async move { client.table_list_by_namespace_id(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { Ok(deserialize_table(res.table.required().ctx("table")?)?) }) + .try_collect() + .await + } + + async fn list(&mut self) -> Result> { + let t = proto::TableListRequest {}; + + self.retry("table_list", t, |data, mut client| async move { + client.table_list(data).await + }) + .await? + .map_err(convert_status) + .and_then(|res| async move { Ok(deserialize_table(res.table.required().ctx("table")?)?) }) + .try_collect() + .await + } + + async fn snapshot(&mut self, table_id: TableId) -> Result { + let t = proto::TableSnapshotRequest { + table_id: table_id.get(), + }; + + let resp = self + .retry("table_snapshot", t, |data, mut client| async move { + client.table_snapshot(data).await + }) + .await?; + + let table = resp.table.required().ctx("table")?; + Ok(TableSnapshot::decode(table, resp.generation)) + } +} + +#[async_trait] +impl ColumnRepo for GrpcCatalogClientRepos { + async fn create_or_get( + &mut self, + name: &str, + table_id: TableId, + column_type: ColumnType, + ) -> Result { + let c = proto::ColumnCreateOrGetRequest { + name: name.to_owned(), + table_id: table_id.get(), + column_type: serialize_column_type(column_type), + }; + + let resp = self + .retry("column_create_or_get", c, |data, mut client| async move { + client.column_create_or_get(data).await + }) + .await?; + Ok(deserialize_column(resp.column.required().ctx("column")?)?) + } + + async fn create_or_get_many_unchecked( + &mut self, + table_id: TableId, + columns: HashMap<&str, ColumnType>, + ) -> Result> { + let c = proto::ColumnCreateOrGetManyUncheckedRequest { + table_id: table_id.get(), + columns: columns + .into_iter() + .map(|(name, t)| (name.to_owned(), serialize_column_type(t))) + .collect(), + }; + + self.retry( + "column_create_or_get_many_unchecked", + c, + |data, mut client| async move { client.column_create_or_get_many_unchecked(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_column(res.column.required().ctx("column")?)?) 
+ }) + .try_collect() + .await + } + + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { + let c = proto::ColumnListByNamespaceIdRequest { + namespace_id: namespace_id.get(), + }; + + self.retry( + "column_list_by_namespace_id", + c, + |data, mut client| async move { client.column_list_by_namespace_id(data).await }, + ) + .await? + .map_err(convert_status) + .and_then( + |res| async move { Ok(deserialize_column(res.column.required().ctx("column")?)?) }, + ) + .try_collect() + .await + } + + async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { + let c = proto::ColumnListByTableIdRequest { + table_id: table_id.get(), + }; + + self.retry( + "column_list_by_table_id", + c, + |data, mut client| async move { client.column_list_by_table_id(data).await }, + ) + .await? + .map_err(convert_status) + .and_then( + |res| async move { Ok(deserialize_column(res.column.required().ctx("column")?)?) }, + ) + .try_collect() + .await + } + + async fn list(&mut self) -> Result> { + let c = proto::ColumnListRequest {}; + + self.retry("column_list", c, |data, mut client| async move { + client.column_list(data).await + }) + .await? + .map_err(convert_status) + .and_then( + |res| async move { Ok(deserialize_column(res.column.required().ctx("column")?)?) }, + ) + .try_collect() + .await + } +} + +#[async_trait] +impl PartitionRepo for GrpcCatalogClientRepos { + async fn create_or_get(&mut self, key: PartitionKey, table_id: TableId) -> Result { + let p = proto::PartitionCreateOrGetRequest { + key: key.inner().to_owned(), + table_id: table_id.get(), + }; + + let resp = self + .retry( + "partition_create_or_get", + p, + |data, mut client| async move { client.partition_create_or_get(data).await }, + ) + .await?; + + Ok(deserialize_partition( + resp.partition.required().ctx("partition")?, + )?) + } + + async fn get_by_id_batch(&mut self, partition_ids: &[PartitionId]) -> Result> { + let p = proto::PartitionGetByIdBatchRequest { + partition_ids: partition_ids.iter().map(|id| id.get()).collect(), + }; + + self.retry( + "partition_get_by_id_batch", + p, + |data, mut client| async move { client.partition_get_by_id_batch(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_partition( + res.partition.required().ctx("partition")?, + )?) + }) + .try_collect() + .await + } + + async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { + let p = proto::PartitionListByTableIdRequest { + table_id: table_id.get(), + }; + + self.retry( + "partition_list_by_table_id", + p, + |data, mut client| async move { client.partition_list_by_table_id(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_partition( + res.partition.required().ctx("partition")?, + )?) + }) + .try_collect() + .await + } + + async fn list_ids(&mut self) -> Result> { + let p = proto::PartitionListIdsRequest {}; + + self.retry("partition_list_ids", p, |data, mut client| async move { + client.partition_list_ids(data).await + }) + .await? + .map_err(convert_status) + .map_ok(|res| PartitionId::new(res.partition_id)) + .try_collect() + .await + } + + async fn cas_sort_key( + &mut self, + partition_id: PartitionId, + old_sort_key_ids: Option<&SortKeyIds>, + new_sort_key_ids: &SortKeyIds, + ) -> Result> { + // This method does not use request/request_streaming_response + // because the error handling (converting to CasFailure) differs + // from how all the other methods handle errors. 
+ + let p = proto::PartitionCasSortKeyRequest { + partition_id: partition_id.get(), + old_sort_key_ids: old_sort_key_ids.map(serialize_sort_key_ids), + new_sort_key_ids: Some(serialize_sort_key_ids(new_sort_key_ids)), + }; + + let res = self + .retry("partition_cas_sort_key", p, |data, mut client| async move { + client.partition_cas_sort_key(data).await + }) + .await + .map_err(CasFailure::QueryError)?; + + let res = res + .res + .required() + .ctx("res") + .map_err(|e| CasFailure::QueryError(e.into()))?; + + match res { + proto::partition_cas_sort_key_response::Res::Partition(p) => { + let p = deserialize_partition(p).map_err(|e| CasFailure::QueryError(e.into()))?; + Ok(p) + } + proto::partition_cas_sort_key_response::Res::CurrentSortKey(k) => { + Err(CasFailure::ValueMismatch(deserialize_sort_key_ids(k))) + } + } + } + + #[allow(clippy::too_many_arguments)] + async fn record_skipped_compaction( + &mut self, + partition_id: PartitionId, + reason: &str, + num_files: usize, + limit_num_files: usize, + limit_num_files_first_in_partition: usize, + estimated_bytes: u64, + limit_bytes: u64, + ) -> Result<()> { + let p = proto::PartitionRecordSkippedCompactionRequest { + partition_id: partition_id.get(), + reason: reason.to_owned(), + num_files: num_files as u64, + limit_num_files: limit_num_files as u64, + limit_num_files_first_in_partition: limit_num_files_first_in_partition as u64, + estimated_bytes, + limit_bytes, + }; + + self.retry( + "partition_record_skipped_compaction", + p, + |data, mut client| async move { client.partition_record_skipped_compaction(data).await }, + ) + .await?; + Ok(()) + } + + async fn get_in_skipped_compactions( + &mut self, + partition_id: &[PartitionId], + ) -> Result> { + let p = proto::PartitionGetInSkippedCompactionsRequest { + partition_ids: partition_id.iter().map(|id| id.get()).collect(), + }; + + self.retry( + "partition_get_in_skipped_compactions", + p, + |data, mut client| async move { client.partition_get_in_skipped_compactions(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_skipped_compaction(res.skipped_compaction.required().ctx("skipped_compaction")?)) + }) + .try_collect() + .await + } + + async fn list_skipped_compactions(&mut self) -> Result> { + let p = proto::PartitionListSkippedCompactionsRequest {}; + + self.retry( + "partition_list_skipped_compactions", + p, + |data, mut client| async move { client.partition_list_skipped_compactions(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_skipped_compaction( + res.skipped_compaction + .required() + .ctx("skipped_compaction")?, + )) + }) + .try_collect() + .await + } + + async fn delete_skipped_compactions( + &mut self, + partition_id: PartitionId, + ) -> Result> { + let p = proto::PartitionDeleteSkippedCompactionsRequest { + partition_id: partition_id.get(), + }; + + let resp = self + .retry( + "partition_delete_skipped_compactions", + p, + |data, mut client| async move { + client.partition_delete_skipped_compactions(data).await + }, + ) + .await?; + + Ok(resp.skipped_compaction.map(deserialize_skipped_compaction)) + } + + async fn most_recent_n(&mut self, n: usize) -> Result> { + let p = proto::PartitionMostRecentNRequest { n: n as u64 }; + + self.retry( + "partition_most_recent_n", + p, + |data, mut client| async move { client.partition_most_recent_n(data).await }, + ) + .await? 
+ .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_partition( + res.partition.required().ctx("partition")?, + )?) + }) + .try_collect() + .await + } + + async fn partitions_new_file_between( + &mut self, + minimum_time: Timestamp, + maximum_time: Option, + ) -> Result> { + let p = proto::PartitionNewFileBetweenRequest { + minimum_time: minimum_time.get(), + maximum_time: maximum_time.map(|ts| ts.get()), + }; + + self.retry( + "partition_new_file_between", + p, + |data, mut client| async move { client.partition_new_file_between(data).await }, + ) + .await? + .map_err(convert_status) + .map_ok(|res| PartitionId::new(res.partition_id)) + .try_collect() + .await + } + + async fn list_old_style(&mut self) -> Result> { + let p = proto::PartitionListOldStyleRequest {}; + + self.retry( + "partition_list_old_style", + p, + |data, mut client| async move { client.partition_list_old_style(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_partition( + res.partition.required().ctx("partition")?, + )?) + }) + .try_collect() + .await + } + + async fn snapshot(&mut self, partition_id: PartitionId) -> Result { + let p = proto::PartitionSnapshotRequest { + partition_id: partition_id.get(), + }; + + let resp = self + .retry("partition_snapshot", p, |data, mut client| async move { + client.partition_snapshot(data).await + }) + .await?; + let partition = resp.partition.required().ctx("partition")?; + Ok(PartitionSnapshot::decode(partition, resp.generation)) + } +} + +#[async_trait] +impl ParquetFileRepo for GrpcCatalogClientRepos { + async fn flag_for_delete_by_retention(&mut self) -> Result> { + let p = proto::ParquetFileFlagForDeleteByRetentionRequest {}; + + self.retry( + "parquet_file_flag_for_delete_by_retention", + p, + |data, mut client| async move { + client.parquet_file_flag_for_delete_by_retention(data).await + }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(( + PartitionId::new(res.partition_id), + deserialize_object_store_id(res.object_store_id.required().ctx("object_store_id")?), + )) + }) + .try_collect() + .await + } + + async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result> { + let p = proto::ParquetFileDeleteOldIdsOnlyRequest { + older_than: older_than.get(), + }; + + self.retry( + "parquet_file_delete_old_ids_only", + p, + |data, mut client| async move { client.parquet_file_delete_old_ids_only(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_object_store_id( + res.object_store_id.required().ctx("object_store_id")?, + )) + }) + .try_collect() + .await + } + + async fn list_by_partition_not_to_delete_batch( + &mut self, + partition_ids: Vec, + ) -> Result> { + let p = proto::ParquetFileListByPartitionNotToDeleteBatchRequest { + partition_ids: partition_ids.into_iter().map(|p| p.get()).collect(), + }; + + self.retry( + "parquet_file_list_by_partition_not_to_delete_batch", + p, + |data, mut client| async move { + client + .parquet_file_list_by_partition_not_to_delete_batch(data) + .await + }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_parquet_file( + res.parquet_file.required().ctx("parquet_file")?, + )?) 
+ }) + .try_collect() + .await + } + + async fn get_by_object_store_id( + &mut self, + object_store_id: ObjectStoreId, + ) -> Result> { + let p = proto::ParquetFileGetByObjectStoreIdRequest { + object_store_id: Some(serialize_object_store_id(object_store_id)), + }; + + let maybe_file = self.retry( + "parquet_file_get_by_object_store_id", + p, + |data, mut client| async move { client.parquet_file_get_by_object_store_id(data).await }) + .await? + .parquet_file.map(deserialize_parquet_file).transpose()?; + Ok(maybe_file) + } + + async fn exists_by_object_store_id_batch( + &mut self, + object_store_ids: Vec, + ) -> Result> { + let p = futures::stream::iter(object_store_ids.into_iter().map(|id| { + proto::ParquetFileExistsByObjectStoreIdBatchRequest { + object_store_id: Some(serialize_object_store_id(id)), + } + })); + + self.retry( + "parquet_file_exists_by_object_store_id_batch", + p, + |data, mut client: ServiceClient| async move { + client + .parquet_file_exists_by_object_store_id_batch(data) + .await + }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_object_store_id( + res.object_store_id.required().ctx("object_store_id")?, + )) + }) + .try_collect() + .await + } + + async fn create_upgrade_delete( + &mut self, + partition_id: PartitionId, + delete: &[ObjectStoreId], + upgrade: &[ObjectStoreId], + create: &[ParquetFileParams], + target_level: CompactionLevel, + ) -> Result> { + let p = proto::ParquetFileCreateUpgradeDeleteRequest { + partition_id: partition_id.get(), + delete: delete + .iter() + .copied() + .map(serialize_object_store_id) + .collect(), + upgrade: upgrade + .iter() + .copied() + .map(serialize_object_store_id) + .collect(), + create: create.iter().map(serialize_parquet_file_params).collect(), + target_level: target_level as i32, + }; + + let resp = self.retry( + "parquet_file_create_upgrade_delete", + p, + |data, mut client| async move { client.parquet_file_create_upgrade_delete(data).await }, + ) + .await?; + + Ok(resp + .created_parquet_file_ids + .into_iter() + .map(ParquetFileId::new) + .collect()) + } +} diff --git a/iox_catalog/src/grpc/mod.rs b/iox_catalog/src/grpc/mod.rs new file mode 100644 index 00000000000..0374f575b85 --- /dev/null +++ b/iox_catalog/src/grpc/mod.rs @@ -0,0 +1,143 @@ +//! gRPC catalog tunnel. +//! +//! This tunnels catalog requests over gRPC. 
+ +pub mod client; +mod serialization; +pub mod server; + +#[cfg(test)] +mod tests { + use std::{net::SocketAddr, sync::Arc}; + + use data_types::NamespaceName; + use iox_time::SystemProvider; + use metric::{Attributes, Metric, U64Counter}; + use test_helpers::maybe_start_logging; + use tokio::{net::TcpListener, task::JoinSet}; + use tonic::transport::{server::TcpIncoming, Server, Uri}; + + use crate::{interface::Catalog, interface_tests::TestCatalog, mem::MemCatalog}; + + use super::*; + + #[tokio::test] + async fn test_catalog() { + maybe_start_logging(); + + crate::interface_tests::test_catalog(|| async { + let metrics = Arc::new(metric::Registry::default()); + let time_provider = Arc::new(SystemProvider::new()) as _; + let backing_catalog = Arc::new(MemCatalog::new(metrics, Arc::clone(&time_provider))); + let test_server = TestServer::new(backing_catalog).await; + let uri = test_server.uri(); + + // create new metrics for client so that they don't overlap w/ server + let metrics = Arc::new(metric::Registry::default()); + let client = Arc::new(client::GrpcCatalogClient::new( + uri, + metrics, + Arc::clone(&time_provider), + )); + + let test_catalog = TestCatalog::new(client); + test_catalog.hold_onto(test_server); + + Arc::new(test_catalog) as _ + }) + .await; + } + + #[tokio::test] + async fn test_catalog_metrics() { + maybe_start_logging(); + + let time_provider = Arc::new(SystemProvider::new()) as _; + let metrics = Arc::new(metric::Registry::default()); + let backing_catalog = Arc::new(MemCatalog::new(metrics, Arc::clone(&time_provider))); + let test_server = TestServer::new(backing_catalog).await; + let uri = test_server.uri(); + + // create new metrics for client so that they don't overlap w/ server + let metrics = Arc::new(metric::Registry::default()); + let client = Arc::new(client::GrpcCatalogClient::new( + uri, + Arc::clone(&metrics), + Arc::clone(&time_provider), + )); + + let ns = client + .repositories() + .namespaces() + .create(&NamespaceName::new("testns").unwrap(), None, None, None) + .await + .expect("namespace failed to create"); + + let _ = client + .repositories() + .tables() + .list_by_namespace_id(ns.id) + .await + .expect("failed to list namespaces"); + + let metric = metrics + .get_instrument::>("grpc_client_requests") + .expect("failed to get metric"); + + let count = metric + .get_observer(&Attributes::from(&[ + ( + "path", + "/influxdata.iox.catalog.v2.CatalogService/NamespaceCreate", + ), + ("status", "ok"), + ])) + .unwrap() + .fetch(); + + assert_eq!(count, 1); + + let count = metric + .get_observer(&Attributes::from(&[ + ( + "path", + "/influxdata.iox.catalog.v2.CatalogService/TableListByNamespaceId", + ), + ("status", "ok"), + ])) + .unwrap() + .fetch(); + + assert_eq!(count, 1); + } + + struct TestServer { + addr: SocketAddr, + #[allow(dead_code)] + task: JoinSet<()>, + } + + impl TestServer { + async fn new(catalog: Arc) -> Self { + let listener = TcpListener::bind("0.0.0.0:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + let incoming = TcpIncoming::from_listener(listener, true, None).unwrap(); + let mut task = JoinSet::new(); + task.spawn(async move { + Server::builder() + .add_service(server::GrpcCatalogServer::new(catalog).service()) + .serve_with_incoming(incoming) + .await + .unwrap(); + }); + + Self { addr, task } + } + + fn uri(&self) -> Uri { + format!("http://{}:{}", self.addr.ip(), self.addr.port()) + .parse() + .unwrap() + } + } +} diff --git a/iox_catalog/src/grpc/serialization.rs b/iox_catalog/src/grpc/serialization.rs 
new file mode 100644 index 00000000000..2698dc424f9 --- /dev/null +++ b/iox_catalog/src/grpc/serialization.rs @@ -0,0 +1,712 @@ +use data_types::{ + partition_template::NamespacePartitionTemplateOverride, Column, ColumnId, ColumnSet, + ColumnType, Namespace, NamespaceId, ObjectStoreId, ParquetFile, ParquetFileId, + ParquetFileParams, Partition, PartitionId, SkippedCompaction, SortKeyIds, Table, TableId, + Timestamp, +}; +use generated_types::influxdata::iox::catalog::v2 as proto; +use uuid::Uuid; + +use crate::interface::SoftDeletedRows; + +#[derive(Debug)] +pub struct Error { + msg: String, + path: Vec<&'static str>, +} + +impl Error { + fn new(e: E) -> Self + where + E: std::fmt::Display, + { + Self { + msg: e.to_string(), + path: vec![], + } + } + + fn ctx(self, arg: &'static str) -> Self { + let Self { msg, mut path } = self; + path.insert(0, arg); + Self { msg, path } + } +} + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if !self.path.is_empty() { + write!(f, "{}", self.path[0])?; + for p in self.path.iter().skip(1) { + write!(f, ".{}", p)?; + } + write!(f, ": ")?; + } + + write!(f, "{}", self.msg)?; + + Ok(()) + } +} + +impl std::error::Error for Error {} + +impl From for crate::interface::Error { + fn from(e: Error) -> Self { + Self::External { source: e.into() } + } +} + +impl From for tonic::Status { + fn from(e: Error) -> Self { + Self::invalid_argument(e.to_string()) + } +} + +pub(crate) trait ConvertExt { + fn convert(self) -> Result; +} + +impl ConvertExt for T +where + T: TryInto, + T::Error: std::fmt::Display, +{ + fn convert(self) -> Result { + self.try_into().map_err(Error::new) + } +} + +pub(crate) trait ConvertOptExt { + fn convert_opt(self) -> Result; +} + +impl ConvertOptExt> for Option +where + T: TryInto, + T::Error: std::fmt::Display, +{ + fn convert_opt(self) -> Result, Error> { + self.map(|x| x.convert()).transpose() + } +} + +pub(crate) trait RequiredExt { + fn required(self) -> Result; +} + +impl RequiredExt for Option { + fn required(self) -> Result { + self.ok_or_else(|| Error::new("required")) + } +} + +pub(crate) trait ContextExt { + fn ctx(self, path: &'static str) -> Result; +} + +impl ContextExt for Result { + fn ctx(self, path: &'static str) -> Self { + self.map_err(|e| e.ctx(path)) + } +} + +pub(crate) fn catalog_error_to_status(e: crate::interface::Error) -> tonic::Status { + use crate::interface::Error; + + match e { + Error::External { source } => tonic::Status::internal(source.to_string()), + Error::AlreadyExists { descr } => tonic::Status::already_exists(descr), + Error::LimitExceeded { descr } => tonic::Status::resource_exhausted(descr), + Error::NotFound { descr } => tonic::Status::not_found(descr), + } +} + +pub(crate) fn convert_status(status: tonic::Status) -> crate::interface::Error { + use crate::interface::Error; + + match status.code() { + tonic::Code::Internal => Error::External { + source: status.message().to_owned().into(), + }, + tonic::Code::AlreadyExists => Error::AlreadyExists { + descr: status.message().to_owned(), + }, + tonic::Code::ResourceExhausted => Error::LimitExceeded { + descr: status.message().to_owned(), + }, + tonic::Code::NotFound => Error::NotFound { + descr: status.message().to_owned(), + }, + _ => Error::External { + source: Box::new(status), + }, + } +} + +pub(crate) fn serialize_soft_deleted_rows(sdr: SoftDeletedRows) -> i32 { + let sdr = match sdr { + SoftDeletedRows::AllRows => proto::SoftDeletedRows::AllRows, + SoftDeletedRows::ExcludeDeleted => 
proto::SoftDeletedRows::ExcludeDeleted, + SoftDeletedRows::OnlyDeleted => proto::SoftDeletedRows::OnlyDeleted, + }; + + sdr.into() +} + +pub(crate) fn deserialize_soft_deleted_rows(sdr: i32) -> Result { + let sdr: proto::SoftDeletedRows = sdr.convert().ctx("soft deleted rows")?; + let sdr = match sdr { + proto::SoftDeletedRows::Unspecified => { + return Err(Error::new("unspecified soft deleted rows")); + } + proto::SoftDeletedRows::AllRows => SoftDeletedRows::AllRows, + proto::SoftDeletedRows::ExcludeDeleted => SoftDeletedRows::ExcludeDeleted, + proto::SoftDeletedRows::OnlyDeleted => SoftDeletedRows::OnlyDeleted, + }; + Ok(sdr) +} + +pub(crate) fn serialize_namespace(ns: Namespace) -> proto::Namespace { + proto::Namespace { + id: ns.id.get(), + name: ns.name, + retention_period_ns: ns.retention_period_ns, + max_tables: ns.max_tables.get_i32(), + max_columns_per_table: ns.max_columns_per_table.get_i32(), + deleted_at: ns.deleted_at.map(|ts| ts.get()), + partition_template: ns.partition_template.as_proto().cloned(), + } +} + +pub(crate) fn deserialize_namespace(ns: proto::Namespace) -> Result { + Ok(Namespace { + id: NamespaceId::new(ns.id), + name: ns.name, + retention_period_ns: ns.retention_period_ns, + max_tables: ns.max_tables.convert().ctx("max_tables")?, + max_columns_per_table: ns + .max_columns_per_table + .convert() + .ctx("max_columns_per_table")?, + deleted_at: ns.deleted_at.map(Timestamp::new), + partition_template: ns + .partition_template + .convert_opt() + .ctx("partition_template")? + .unwrap_or_else(NamespacePartitionTemplateOverride::const_default), + }) +} + +pub(crate) fn serialize_table(t: Table) -> proto::Table { + proto::Table { + id: t.id.get(), + namespace_id: t.namespace_id.get(), + name: t.name, + partition_template: t.partition_template.as_proto().cloned(), + } +} + +pub(crate) fn deserialize_table(t: proto::Table) -> Result { + Ok(Table { + id: TableId::new(t.id), + namespace_id: NamespaceId::new(t.namespace_id), + name: t.name, + partition_template: t.partition_template.convert().ctx("partition_template")?, + }) +} + +pub(crate) fn serialize_column_type(t: ColumnType) -> i32 { + use generated_types::influxdata::iox::column_type::v1 as proto; + proto::ColumnType::from(t).into() +} + +pub(crate) fn deserialize_column_type(t: i32) -> Result { + use generated_types::influxdata::iox::column_type::v1 as proto; + let t: proto::ColumnType = t.convert()?; + t.convert() +} + +pub(crate) fn serialize_column(column: Column) -> proto::Column { + proto::Column { + id: column.id.get(), + table_id: column.table_id.get(), + name: column.name, + column_type: serialize_column_type(column.column_type), + } +} + +pub(crate) fn deserialize_column(column: proto::Column) -> Result { + Ok(Column { + id: ColumnId::new(column.id), + table_id: TableId::new(column.table_id), + name: column.name, + column_type: deserialize_column_type(column.column_type)?, + }) +} + +pub(crate) fn serialize_sort_key_ids(sort_key_ids: &SortKeyIds) -> proto::SortKeyIds { + proto::SortKeyIds { + column_ids: sort_key_ids.iter().map(|c_id| c_id.get()).collect(), + } +} + +pub(crate) fn deserialize_sort_key_ids(sort_key_ids: proto::SortKeyIds) -> SortKeyIds { + SortKeyIds::new(sort_key_ids.column_ids.into_iter().map(ColumnId::new)) +} + +pub(crate) fn serialize_partition(partition: Partition) -> proto::Partition { + let empty_sk = SortKeyIds::new(std::iter::empty()); + + proto::Partition { + id: partition.id.get(), + hash_id: partition + .hash_id() + .map(|id| id.as_bytes().to_vec()) + .unwrap_or_default(), + 
partition_key: partition.partition_key.inner().to_owned(), + table_id: partition.table_id.get(), + sort_key_ids: Some(serialize_sort_key_ids( + partition.sort_key_ids().unwrap_or(&empty_sk), + )), + new_file_at: partition.new_file_at.map(|ts| ts.get()), + } +} + +pub(crate) fn deserialize_partition(partition: proto::Partition) -> Result { + Ok(Partition::new_catalog_only( + PartitionId::new(partition.id), + (!partition.hash_id.is_empty()) + .then_some(partition.hash_id.as_slice()) + .convert_opt() + .ctx("hash_id")?, + TableId::new(partition.table_id), + partition.partition_key.into(), + deserialize_sort_key_ids(partition.sort_key_ids.required().ctx("sort_key_ids")?), + partition.new_file_at.map(Timestamp::new), + )) +} + +pub(crate) fn serialize_skipped_compaction(sc: SkippedCompaction) -> proto::SkippedCompaction { + proto::SkippedCompaction { + partition_id: sc.partition_id.get(), + reason: sc.reason, + skipped_at: sc.skipped_at.get(), + estimated_bytes: sc.estimated_bytes, + limit_bytes: sc.limit_bytes, + num_files: sc.num_files, + limit_num_files: sc.limit_num_files, + limit_num_files_first_in_partition: sc.limit_num_files_first_in_partition, + } +} + +pub(crate) fn deserialize_skipped_compaction(sc: proto::SkippedCompaction) -> SkippedCompaction { + SkippedCompaction { + partition_id: PartitionId::new(sc.partition_id), + reason: sc.reason, + skipped_at: Timestamp::new(sc.skipped_at), + estimated_bytes: sc.estimated_bytes, + limit_bytes: sc.limit_bytes, + num_files: sc.num_files, + limit_num_files: sc.limit_num_files, + limit_num_files_first_in_partition: sc.limit_num_files_first_in_partition, + } +} + +pub(crate) fn serialize_object_store_id(id: ObjectStoreId) -> proto::ObjectStoreId { + let (high64, low64) = id.get_uuid().as_u64_pair(); + proto::ObjectStoreId { high64, low64 } +} + +pub(crate) fn deserialize_object_store_id(id: proto::ObjectStoreId) -> ObjectStoreId { + ObjectStoreId::from_uuid(Uuid::from_u64_pair(id.high64, id.low64)) +} + +pub(crate) fn serialize_column_set(set: &ColumnSet) -> proto::ColumnSet { + proto::ColumnSet { + column_ids: set.iter().map(|id| id.get()).collect(), + } +} + +pub(crate) fn deserialize_column_set(set: proto::ColumnSet) -> ColumnSet { + ColumnSet::new(set.column_ids.into_iter().map(ColumnId::new)) +} + +pub(crate) fn serialize_parquet_file_params( + params: &ParquetFileParams, +) -> proto::ParquetFileParams { + proto::ParquetFileParams { + namespace_id: params.namespace_id.get(), + table_id: params.table_id.get(), + partition_id: params.partition_id.get(), + partition_hash_id: params + .partition_hash_id + .as_ref() + .map(|id| id.as_bytes().to_vec()), + object_store_id: Some(serialize_object_store_id(params.object_store_id)), + min_time: params.min_time.get(), + max_time: params.max_time.get(), + file_size_bytes: params.file_size_bytes, + row_count: params.row_count, + compaction_level: params.compaction_level as i32, + created_at: params.created_at.get(), + column_set: Some(serialize_column_set(¶ms.column_set)), + max_l0_created_at: params.max_l0_created_at.get(), + } +} + +pub(crate) fn deserialize_parquet_file_params( + params: proto::ParquetFileParams, +) -> Result { + Ok(ParquetFileParams { + namespace_id: NamespaceId::new(params.namespace_id), + table_id: TableId::new(params.table_id), + partition_id: PartitionId::new(params.partition_id), + partition_hash_id: params + .partition_hash_id + .as_deref() + .convert_opt() + .ctx("partition_hash_id")?, + object_store_id: deserialize_object_store_id( + 
params.object_store_id.required().ctx("object_store_id")?, + ), + min_time: Timestamp::new(params.min_time), + max_time: Timestamp::new(params.max_time), + file_size_bytes: params.file_size_bytes, + row_count: params.row_count, + compaction_level: params.compaction_level.convert().ctx("compaction_level")?, + created_at: Timestamp::new(params.created_at), + column_set: deserialize_column_set(params.column_set.required().ctx("column_set")?), + max_l0_created_at: Timestamp::new(params.max_l0_created_at), + }) +} + +pub(crate) fn serialize_parquet_file(file: ParquetFile) -> proto::ParquetFile { + let partition_hash_id = file + .partition_hash_id + .map(|x| x.as_bytes().to_vec()) + .unwrap_or_default(); + + proto::ParquetFile { + id: file.id.get(), + namespace_id: file.namespace_id.get(), + table_id: file.table_id.get(), + partition_id: file.partition_id.get(), + partition_hash_id, + object_store_id: Some(serialize_object_store_id(file.object_store_id)), + min_time: file.min_time.get(), + max_time: file.max_time.get(), + to_delete: file.to_delete.map(|ts| ts.get()), + file_size_bytes: file.file_size_bytes, + row_count: file.row_count, + compaction_level: file.compaction_level as i32, + created_at: file.created_at.get(), + column_set: Some(serialize_column_set(&file.column_set)), + max_l0_created_at: file.max_l0_created_at.get(), + } +} + +pub(crate) fn deserialize_parquet_file(file: proto::ParquetFile) -> Result { + let partition_hash_id = match file.partition_hash_id.as_slice() { + b"" => None, + s => Some(s.convert().ctx("partition_hash_id")?), + }; + + Ok(ParquetFile { + id: ParquetFileId::new(file.id), + namespace_id: NamespaceId::new(file.namespace_id), + table_id: TableId::new(file.table_id), + partition_id: PartitionId::new(file.partition_id), + partition_hash_id, + object_store_id: deserialize_object_store_id( + file.object_store_id.required().ctx("object_store_id")?, + ), + min_time: Timestamp::new(file.min_time), + max_time: Timestamp::new(file.max_time), + to_delete: file.to_delete.map(Timestamp::new), + file_size_bytes: file.file_size_bytes, + row_count: file.row_count, + compaction_level: file.compaction_level.convert().ctx("compaction_level")?, + created_at: Timestamp::new(file.created_at), + column_set: deserialize_column_set(file.column_set.required().ctx("column_set")?), + max_l0_created_at: Timestamp::new(file.max_l0_created_at), + }) +} + +#[cfg(test)] +mod tests { + use data_types::{ + partition_template::TablePartitionTemplateOverride, CompactionLevel, PartitionHashId, + PartitionKey, + }; + + use super::*; + + #[test] + fn test_column_type_roundtrip() { + assert_column_type_roundtrip(ColumnType::Bool); + assert_column_type_roundtrip(ColumnType::I64); + assert_column_type_roundtrip(ColumnType::U64); + assert_column_type_roundtrip(ColumnType::F64); + assert_column_type_roundtrip(ColumnType::String); + assert_column_type_roundtrip(ColumnType::Tag); + assert_column_type_roundtrip(ColumnType::Time); + } + + #[track_caller] + fn assert_column_type_roundtrip(t: ColumnType) { + let protobuf = serialize_column_type(t); + let t2 = deserialize_column_type(protobuf).unwrap(); + assert_eq!(t, t2); + } + + #[test] + fn test_error_roundtrip() { + use crate::interface::Error; + + assert_error_roundtrip(Error::AlreadyExists { + descr: "foo".to_owned(), + }); + assert_error_roundtrip(Error::External { + source: "foo".to_owned().into(), + }); + assert_error_roundtrip(Error::LimitExceeded { + descr: "foo".to_owned(), + }); + assert_error_roundtrip(Error::NotFound { + descr: "foo".to_owned(), 
+ }); + } + + #[track_caller] + fn assert_error_roundtrip(e: crate::interface::Error) { + let msg_orig = e.to_string(); + + let status = catalog_error_to_status(e); + let e = convert_status(status); + let msg = e.to_string(); + assert_eq!(msg, msg_orig); + } + + #[test] + fn test_soft_deleted_rows_roundtrip() { + assert_soft_deleted_rows_roundtrip(SoftDeletedRows::AllRows); + assert_soft_deleted_rows_roundtrip(SoftDeletedRows::ExcludeDeleted); + assert_soft_deleted_rows_roundtrip(SoftDeletedRows::OnlyDeleted); + } + + #[track_caller] + fn assert_soft_deleted_rows_roundtrip(sdr: SoftDeletedRows) { + let protobuf = serialize_soft_deleted_rows(sdr); + let sdr2 = deserialize_soft_deleted_rows(protobuf).unwrap(); + assert_eq!(sdr, sdr2); + } + + #[test] + fn test_namespace_roundtrip() { + use generated_types::influxdata::iox::partition_template::v1 as proto; + + let ns = Namespace { + id: NamespaceId::new(1), + name: "ns".to_owned(), + retention_period_ns: Some(2), + max_tables: 3.try_into().unwrap(), + max_columns_per_table: 4.try_into().unwrap(), + deleted_at: Some(Timestamp::new(5)), + partition_template: NamespacePartitionTemplateOverride::try_from( + proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())), + }], + }, + ) + .unwrap(), + }; + let protobuf = serialize_namespace(ns.clone()); + let ns2 = deserialize_namespace(protobuf).unwrap(); + assert_eq!(ns, ns2); + } + + #[test] + fn test_table_roundtrip() { + use generated_types::influxdata::iox::partition_template::v1 as proto; + + let table = Table { + id: TableId::new(1), + namespace_id: NamespaceId::new(2), + name: "table".to_owned(), + partition_template: TablePartitionTemplateOverride::try_new( + Some(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())), + }], + }), + &NamespacePartitionTemplateOverride::const_default(), + ) + .unwrap(), + }; + let protobuf = serialize_table(table.clone()); + let table2 = deserialize_table(protobuf).unwrap(); + assert_eq!(table, table2); + } + + #[test] + fn test_column_roundtrip() { + let column = Column { + id: ColumnId::new(1), + table_id: TableId::new(2), + name: "col".to_owned(), + column_type: ColumnType::F64, + }; + let protobuf = serialize_column(column.clone()); + let column2 = deserialize_column(protobuf).unwrap(); + assert_eq!(column, column2); + } + + #[test] + fn test_sort_key_ids_roundtrip() { + assert_sort_key_ids_roundtrip(SortKeyIds::new(std::iter::empty())); + assert_sort_key_ids_roundtrip(SortKeyIds::new([ColumnId::new(1)])); + assert_sort_key_ids_roundtrip(SortKeyIds::new([ + ColumnId::new(1), + ColumnId::new(5), + ColumnId::new(20), + ])); + } + + #[track_caller] + fn assert_sort_key_ids_roundtrip(sort_key_ids: SortKeyIds) { + let protobuf = serialize_sort_key_ids(&sort_key_ids); + let sort_key_ids2 = deserialize_sort_key_ids(protobuf); + assert_eq!(sort_key_ids, sort_key_ids2); + } + + #[test] + fn test_partition_roundtrip() { + let table_id = TableId::new(1); + let partition_key = PartitionKey::from("key"); + let hash_id = PartitionHashId::new(table_id, &partition_key); + + assert_partition_roundtrip(Partition::new_catalog_only( + PartitionId::new(2), + Some(hash_id.clone()), + table_id, + partition_key.clone(), + SortKeyIds::new([ColumnId::new(3), ColumnId::new(4)]), + Some(Timestamp::new(5)), + )); + assert_partition_roundtrip(Partition::new_catalog_only( + PartitionId::new(2), + Some(hash_id), + table_id, + 
partition_key, + SortKeyIds::new(std::iter::empty()), + Some(Timestamp::new(5)), + )); + } + + #[track_caller] + fn assert_partition_roundtrip(partition: Partition) { + let protobuf = serialize_partition(partition.clone()); + let partition2 = deserialize_partition(protobuf).unwrap(); + assert_eq!(partition, partition2); + } + + #[test] + fn test_skipped_compaction_roundtrip() { + let sc = SkippedCompaction { + partition_id: PartitionId::new(1), + reason: "foo".to_owned(), + skipped_at: Timestamp::new(2), + estimated_bytes: 3, + limit_bytes: 4, + num_files: 5, + limit_num_files: 6, + limit_num_files_first_in_partition: 7, + }; + let protobuf = serialize_skipped_compaction(sc.clone()); + let sc2 = deserialize_skipped_compaction(protobuf); + assert_eq!(sc, sc2); + } + + #[test] + fn test_object_store_id_roundtrip() { + assert_object_store_id_roundtrip(ObjectStoreId::from_uuid(Uuid::nil())); + assert_object_store_id_roundtrip(ObjectStoreId::from_uuid(Uuid::from_u128(0))); + assert_object_store_id_roundtrip(ObjectStoreId::from_uuid(Uuid::from_u128(u128::MAX))); + assert_object_store_id_roundtrip(ObjectStoreId::from_uuid(Uuid::from_u128(1))); + assert_object_store_id_roundtrip(ObjectStoreId::from_uuid(Uuid::from_u128(u128::MAX - 1))); + } + + #[track_caller] + fn assert_object_store_id_roundtrip(id: ObjectStoreId) { + let protobuf = serialize_object_store_id(id); + let id2 = deserialize_object_store_id(protobuf); + assert_eq!(id, id2); + } + + #[test] + fn test_column_set_roundtrip() { + assert_column_set_roundtrip(ColumnSet::new([])); + assert_column_set_roundtrip(ColumnSet::new([ColumnId::new(1)])); + assert_column_set_roundtrip(ColumnSet::new([ColumnId::new(1), ColumnId::new(10)])); + assert_column_set_roundtrip(ColumnSet::new([ + ColumnId::new(3), + ColumnId::new(4), + ColumnId::new(10), + ])); + } + + #[track_caller] + fn assert_column_set_roundtrip(set: ColumnSet) { + let protobuf = serialize_column_set(&set); + let set2 = deserialize_column_set(protobuf); + assert_eq!(set, set2); + } + + #[test] + fn test_parquet_file_params_roundtrip() { + let params = ParquetFileParams { + namespace_id: NamespaceId::new(1), + table_id: TableId::new(2), + partition_id: PartitionId::new(3), + partition_hash_id: Some(PartitionHashId::arbitrary_for_testing()), + object_store_id: ObjectStoreId::from_uuid(Uuid::from_u128(1337)), + min_time: Timestamp::new(4), + max_time: Timestamp::new(5), + file_size_bytes: 6, + row_count: 7, + compaction_level: CompactionLevel::Final, + created_at: Timestamp::new(8), + column_set: ColumnSet::new([ColumnId::new(9), ColumnId::new(10)]), + max_l0_created_at: Timestamp::new(11), + }; + let protobuf = serialize_parquet_file_params(¶ms); + let params2 = deserialize_parquet_file_params(protobuf).unwrap(); + assert_eq!(params, params2); + } + + #[test] + fn test_parquet_file_roundtrip() { + let file = ParquetFile { + id: ParquetFileId::new(12), + namespace_id: NamespaceId::new(1), + table_id: TableId::new(2), + partition_id: PartitionId::new(3), + partition_hash_id: Some(PartitionHashId::arbitrary_for_testing()), + object_store_id: ObjectStoreId::from_uuid(Uuid::from_u128(1337)), + min_time: Timestamp::new(4), + max_time: Timestamp::new(5), + to_delete: Some(Timestamp::new(13)), + file_size_bytes: 6, + row_count: 7, + compaction_level: CompactionLevel::Final, + created_at: Timestamp::new(8), + column_set: ColumnSet::new([ColumnId::new(9), ColumnId::new(10)]), + max_l0_created_at: Timestamp::new(11), + }; + let protobuf = serialize_parquet_file(file.clone()); + let file2 = 
deserialize_parquet_file(protobuf).unwrap(); + assert_eq!(file, file2); + } +} diff --git a/iox_catalog/src/grpc/server.rs b/iox_catalog/src/grpc/server.rs new file mode 100644 index 00000000000..2105457f470 --- /dev/null +++ b/iox_catalog/src/grpc/server.rs @@ -0,0 +1,1032 @@ +//! gRPC server implementation. + +use std::{pin::Pin, sync::Arc}; + +use crate::{ + grpc::serialization::{ + catalog_error_to_status, deserialize_column_type, deserialize_object_store_id, + deserialize_parquet_file_params, deserialize_soft_deleted_rows, deserialize_sort_key_ids, + serialize_column, serialize_namespace, serialize_object_store_id, serialize_parquet_file, + serialize_partition, serialize_skipped_compaction, serialize_sort_key_ids, serialize_table, + ContextExt, ConvertExt, ConvertOptExt, RequiredExt, + }, + interface::{CasFailure, Catalog}, +}; +use async_trait::async_trait; +use data_types::{ + NamespaceId, NamespaceServiceProtectionLimitsOverride, PartitionId, PartitionKey, TableId, + Timestamp, +}; +use futures::{Stream, StreamExt, TryStreamExt}; +use generated_types::influxdata::iox::catalog::v2 as proto; +use generated_types::influxdata::iox::catalog::v2::{TableSnapshotRequest, TableSnapshotResponse}; +use tonic::{Request, Response, Status}; + +type TonicStream = Pin> + Send + 'static>>; + +/// gRPC server. +#[derive(Debug)] +pub struct GrpcCatalogServer { + catalog: Arc, +} + +impl GrpcCatalogServer { + /// Create a new [`GrpcCatalogServer`]. + pub fn new(catalog: Arc) -> Self { + Self { catalog } + } + + /// Get service for integration w/ tonic. + pub fn service(&self) -> proto::catalog_service_server::CatalogServiceServer { + let this = Self { + catalog: Arc::clone(&self.catalog), + }; + proto::catalog_service_server::CatalogServiceServer::new(this) + } +} + +#[async_trait] +impl proto::catalog_service_server::CatalogService for GrpcCatalogServer { + type NamespaceListStream = TonicStream; + + type TableListByNamespaceIdStream = TonicStream; + type TableListStream = TonicStream; + + type ColumnCreateOrGetManyUncheckedStream = + TonicStream; + type ColumnListByNamespaceIdStream = TonicStream; + type ColumnListByTableIdStream = TonicStream; + type ColumnListStream = TonicStream; + + type PartitionGetByIdBatchStream = TonicStream; + type PartitionListByTableIdStream = TonicStream; + type PartitionListIdsStream = TonicStream; + type PartitionGetInSkippedCompactionsStream = + TonicStream; + type PartitionListSkippedCompactionsStream = + TonicStream; + type PartitionMostRecentNStream = TonicStream; + type PartitionNewFileBetweenStream = TonicStream; + type PartitionListOldStyleStream = TonicStream; + + type ParquetFileFlagForDeleteByRetentionStream = + TonicStream; + type ParquetFileDeleteOldIdsOnlyStream = + TonicStream; + type ParquetFileListByPartitionNotToDeleteBatchStream = + TonicStream; + type ParquetFileExistsByObjectStoreIdBatchStream = + TonicStream; + + async fn namespace_create( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let ns = self + .catalog + .repositories() + .namespaces() + .create( + &req.name.convert().ctx("name")?, + req.partition_template + .convert_opt() + .ctx("partition_template")?, + req.retention_period_ns, + req.service_protection_limits + .map(|l| { + let l = NamespaceServiceProtectionLimitsOverride { + max_tables: l.max_tables.convert_opt().ctx("max_tables")?, + max_columns_per_table: l + .max_columns_per_table + .convert_opt() + .ctx("max_columns_per_table")?, + }; + Ok(l) as Result<_, tonic::Status> + }) + 
.transpose()?, + ) + .await + .map_err(catalog_error_to_status)?; + + let ns = serialize_namespace(ns); + + Ok(Response::new(proto::NamespaceCreateResponse { + namespace: Some(ns), + })) + } + + async fn namespace_update_retention_period( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let ns = self + .catalog + .repositories() + .namespaces() + .update_retention_period(&req.name, req.retention_period_ns) + .await + .map_err(catalog_error_to_status)?; + + let ns = serialize_namespace(ns); + + Ok(Response::new( + proto::NamespaceUpdateRetentionPeriodResponse { + namespace: Some(ns), + }, + )) + } + + async fn namespace_list( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let deleted = deserialize_soft_deleted_rows(req.deleted)?; + + let ns_list = self + .catalog + .repositories() + .namespaces() + .list(deleted) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(ns_list.into_iter().map(|ns| { + let ns = serialize_namespace(ns); + + Ok(proto::NamespaceListResponse { + namespace: Some(ns), + }) + })) + .boxed(), + )) + } + + async fn namespace_get_by_id( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let deleted = deserialize_soft_deleted_rows(req.deleted)?; + + let maybe_ns = self + .catalog + .repositories() + .namespaces() + .get_by_id(NamespaceId::new(req.id), deleted) + .await + .map_err(catalog_error_to_status)?; + + let maybe_ns = maybe_ns.map(serialize_namespace); + + Ok(Response::new(proto::NamespaceGetByIdResponse { + namespace: maybe_ns, + })) + } + + async fn namespace_get_by_name( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let deleted = deserialize_soft_deleted_rows(req.deleted)?; + + let maybe_ns = self + .catalog + .repositories() + .namespaces() + .get_by_name(&req.name, deleted) + .await + .map_err(catalog_error_to_status)?; + + let maybe_ns = maybe_ns.map(serialize_namespace); + + Ok(Response::new(proto::NamespaceGetByNameResponse { + namespace: maybe_ns, + })) + } + + async fn namespace_soft_delete( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + self.catalog + .repositories() + .namespaces() + .soft_delete(&req.name) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new(proto::NamespaceSoftDeleteResponse {})) + } + + async fn namespace_update_table_limit( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let ns = self + .catalog + .repositories() + .namespaces() + .update_table_limit(&req.name, req.new_max.convert().ctx("new_max")?) + .await + .map_err(catalog_error_to_status)?; + + let ns = serialize_namespace(ns); + + Ok(Response::new(proto::NamespaceUpdateTableLimitResponse { + namespace: Some(ns), + })) + } + + async fn namespace_update_column_limit( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let ns = self + .catalog + .repositories() + .namespaces() + .update_column_limit(&req.name, req.new_max.convert().ctx("new_max")?) 
+ .await + .map_err(catalog_error_to_status)?; + + let ns = serialize_namespace(ns); + + Ok(Response::new(proto::NamespaceUpdateColumnLimitResponse { + namespace: Some(ns), + })) + } + + async fn table_create( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let table = self + .catalog + .repositories() + .tables() + .create( + &req.name, + req.partition_template.convert().ctx("partition_template")?, + NamespaceId::new(req.namespace_id), + ) + .await + .map_err(catalog_error_to_status)?; + + let table = serialize_table(table); + + Ok(Response::new(proto::TableCreateResponse { + table: Some(table), + })) + } + + async fn table_get_by_id( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let maybe_table = self + .catalog + .repositories() + .tables() + .get_by_id(TableId::new(req.id)) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new(proto::TableGetByIdResponse { + table: maybe_table.map(serialize_table), + })) + } + + async fn table_get_by_namespace_and_name( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let maybe_table = self + .catalog + .repositories() + .tables() + .get_by_namespace_and_name(NamespaceId::new(req.namespace_id), &req.name) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new(proto::TableGetByNamespaceAndNameResponse { + table: maybe_table.map(serialize_table), + })) + } + + async fn table_list_by_namespace_id( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let table_list = self + .catalog + .repositories() + .tables() + .list_by_namespace_id(NamespaceId::new(req.namespace_id)) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(table_list.into_iter().map(|table| { + let table = serialize_table(table); + Ok(proto::TableListByNamespaceIdResponse { table: Some(table) }) + })) + .boxed(), + )) + } + + async fn table_list( + &self, + _request: Request, + ) -> Result, tonic::Status> { + let table_list = self + .catalog + .repositories() + .tables() + .list() + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(table_list.into_iter().map(|table| { + let table = serialize_table(table); + Ok(proto::TableListResponse { table: Some(table) }) + })) + .boxed(), + )) + } + + async fn table_snapshot( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let snapshot = self + .catalog + .repositories() + .tables() + .snapshot(TableId::new(req.table_id)) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new(TableSnapshotResponse { + generation: snapshot.generation(), + table: Some(snapshot.into()), + })) + } + + async fn column_create_or_get( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let column_type = deserialize_column_type(req.column_type)?; + + let column = self + .catalog + .repositories() + .columns() + .create_or_get(&req.name, TableId::new(req.table_id), column_type) + .await + .map_err(catalog_error_to_status)?; + + let column = serialize_column(column); + + Ok(Response::new(proto::ColumnCreateOrGetResponse { + column: Some(column), + })) + } + + async fn column_create_or_get_many_unchecked( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let columns = req + .columns + .iter() + .map(|(name, t)| { + let t = 
deserialize_column_type(*t)?; + Ok((name.as_str(), t)) + }) + .collect::>()?; + + let column_list = self + .catalog + .repositories() + .columns() + .create_or_get_many_unchecked(TableId::new(req.table_id), columns) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(column_list.into_iter().map(|column| { + let column = serialize_column(column); + Ok(proto::ColumnCreateOrGetManyUncheckedResponse { + column: Some(column), + }) + })) + .boxed(), + )) + } + + async fn column_list_by_namespace_id( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let column_list = self + .catalog + .repositories() + .columns() + .list_by_namespace_id(NamespaceId::new(req.namespace_id)) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(column_list.into_iter().map(|column| { + let column = serialize_column(column); + Ok(proto::ColumnListByNamespaceIdResponse { + column: Some(column), + }) + })) + .boxed(), + )) + } + + async fn column_list_by_table_id( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let column_list = self + .catalog + .repositories() + .columns() + .list_by_table_id(TableId::new(req.table_id)) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(column_list.into_iter().map(|column| { + let column = serialize_column(column); + Ok(proto::ColumnListByTableIdResponse { + column: Some(column), + }) + })) + .boxed(), + )) + } + + async fn column_list( + &self, + _request: Request, + ) -> Result, tonic::Status> { + let column_list = self + .catalog + .repositories() + .columns() + .list() + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(column_list.into_iter().map(|column| { + let column = serialize_column(column); + Ok(proto::ColumnListResponse { + column: Some(column), + }) + })) + .boxed(), + )) + } + + async fn partition_create_or_get( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let partition = self + .catalog + .repositories() + .partitions() + .create_or_get(PartitionKey::from(req.key), TableId::new(req.table_id)) + .await + .map_err(catalog_error_to_status)?; + + let partition = serialize_partition(partition); + + Ok(Response::new(proto::PartitionCreateOrGetResponse { + partition: Some(partition), + })) + } + + async fn partition_get_by_id_batch( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let partition_ids = req + .partition_ids + .into_iter() + .map(PartitionId::new) + .collect::>(); + + let partition_list = self + .catalog + .repositories() + .partitions() + .get_by_id_batch(&partition_ids) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(partition_list.into_iter().map(|partition| { + let partition = serialize_partition(partition); + Ok(proto::PartitionGetByIdBatchResponse { + partition: Some(partition), + }) + })) + .boxed(), + )) + } + + async fn partition_list_by_table_id( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let partition_list = self + .catalog + .repositories() + .partitions() + .list_by_table_id(TableId::new(req.table_id)) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(partition_list.into_iter().map(|partition| { + let partition = serialize_partition(partition); + 
Ok(proto::PartitionListByTableIdResponse { + partition: Some(partition), + }) + })) + .boxed(), + )) + } + + async fn partition_list_ids( + &self, + _request: Request, + ) -> Result, tonic::Status> { + let id_list = self + .catalog + .repositories() + .partitions() + .list_ids() + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(id_list.into_iter().map(|id| { + Ok(proto::PartitionListIdsResponse { + partition_id: id.get(), + }) + })) + .boxed(), + )) + } + + async fn partition_cas_sort_key( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let res = self + .catalog + .repositories() + .partitions() + .cas_sort_key( + PartitionId::new(req.partition_id), + req.old_sort_key_ids.map(deserialize_sort_key_ids).as_ref(), + &deserialize_sort_key_ids(req.new_sort_key_ids.required().ctx("new_sort_key_ids")?), + ) + .await; + + match res { + Ok(partition) => Ok(Response::new(proto::PartitionCasSortKeyResponse { + res: Some(proto::partition_cas_sort_key_response::Res::Partition( + serialize_partition(partition), + )), + })), + Err(CasFailure::ValueMismatch(sort_key_ids)) => { + Ok(Response::new(proto::PartitionCasSortKeyResponse { + res: Some(proto::partition_cas_sort_key_response::Res::CurrentSortKey( + serialize_sort_key_ids(&sort_key_ids), + )), + })) + } + Err(CasFailure::QueryError(e)) => Err(catalog_error_to_status(e)), + } + } + + async fn partition_record_skipped_compaction( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + self.catalog + .repositories() + .partitions() + .record_skipped_compaction( + PartitionId::new(req.partition_id), + &req.reason, + req.num_files as usize, + req.limit_num_files as usize, + req.limit_num_files_first_in_partition as usize, + req.estimated_bytes, + req.limit_bytes, + ) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + proto::PartitionRecordSkippedCompactionResponse {}, + )) + } + + async fn partition_get_in_skipped_compactions( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let partition_ids = req + .partition_ids + .into_iter() + .map(PartitionId::new) + .collect::>(); + + let skipped_compaction_list = self + .catalog + .repositories() + .partitions() + .get_in_skipped_compactions(&partition_ids) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(skipped_compaction_list.into_iter().map(|sc| { + let sc = serialize_skipped_compaction(sc); + Ok(proto::PartitionGetInSkippedCompactionsResponse { + skipped_compaction: Some(sc), + }) + })) + .boxed(), + )) + } + + async fn partition_list_skipped_compactions( + &self, + _request: Request, + ) -> Result, tonic::Status> { + let skipped_compaction_list = self + .catalog + .repositories() + .partitions() + .list_skipped_compactions() + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(skipped_compaction_list.into_iter().map(|sc| { + let sc = serialize_skipped_compaction(sc); + Ok(proto::PartitionListSkippedCompactionsResponse { + skipped_compaction: Some(sc), + }) + })) + .boxed(), + )) + } + + async fn partition_delete_skipped_compactions( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let maybe_skipped_compaction = self + .catalog + .repositories() + .partitions() + .delete_skipped_compactions(PartitionId::new(req.partition_id)) + .await + .map_err(catalog_error_to_status)?; + + 
let maybe_skipped_compaction = maybe_skipped_compaction.map(serialize_skipped_compaction); + + Ok(Response::new( + proto::PartitionDeleteSkippedCompactionsResponse { + skipped_compaction: maybe_skipped_compaction, + }, + )) + } + + async fn partition_most_recent_n( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let partition_list = self + .catalog + .repositories() + .partitions() + .most_recent_n(req.n as usize) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(partition_list.into_iter().map(|partition| { + let partition = serialize_partition(partition); + Ok(proto::PartitionMostRecentNResponse { + partition: Some(partition), + }) + })) + .boxed(), + )) + } + + async fn partition_new_file_between( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let id_list = self + .catalog + .repositories() + .partitions() + .partitions_new_file_between( + Timestamp::new(req.minimum_time), + req.maximum_time.map(Timestamp::new), + ) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(id_list.into_iter().map(|id| { + Ok(proto::PartitionNewFileBetweenResponse { + partition_id: id.get(), + }) + })) + .boxed(), + )) + } + + async fn partition_list_old_style( + &self, + _request: Request, + ) -> Result, tonic::Status> { + let partition_list = self + .catalog + .repositories() + .partitions() + .list_old_style() + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(partition_list.into_iter().map(|partition| { + let partition = serialize_partition(partition); + Ok(proto::PartitionListOldStyleResponse { + partition: Some(partition), + }) + })) + .boxed(), + )) + } + + async fn partition_snapshot( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let snapshot = self + .catalog + .repositories() + .partitions() + .snapshot(PartitionId::new(req.partition_id)) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new(proto::PartitionSnapshotResponse { + generation: snapshot.generation(), + partition: Some(snapshot.into()), + })) + } + + async fn parquet_file_flag_for_delete_by_retention( + &self, + _request: Request, + ) -> Result, tonic::Status> { + let id_list = self + .catalog + .repositories() + .parquet_files() + .flag_for_delete_by_retention() + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(id_list.into_iter().map(|(p_id, os_id)| { + let object_store_id = serialize_object_store_id(os_id); + Ok(proto::ParquetFileFlagForDeleteByRetentionResponse { + partition_id: p_id.get(), + object_store_id: Some(object_store_id), + }) + })) + .boxed(), + )) + } + + async fn parquet_file_delete_old_ids_only( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let id_list = self + .catalog + .repositories() + .parquet_files() + .delete_old_ids_only(Timestamp::new(req.older_than)) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(id_list.into_iter().map(|id| { + let object_store_id = serialize_object_store_id(id); + Ok(proto::ParquetFileDeleteOldIdsOnlyResponse { + object_store_id: Some(object_store_id), + }) + })) + .boxed(), + )) + } + + async fn parquet_file_list_by_partition_not_to_delete_batch( + &self, + request: Request, + ) -> Result, tonic::Status> + { + let req = request.into_inner(); + let partition_ids = req + 
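The `partition_snapshot` handler above returns the snapshot together with its `generation`. One plausible client-side use of that number (an assumption, not something shown in this patch) is stale-entry detection in a cache; a small standalone sketch:

```rust
// Hypothetical cache entry pairing a snapshot with the generation it was taken
// at; how generations are consumed is an assumption, not part of this patch.
struct CachedSnapshot<T> {
    generation: u64,
    snapshot: T,
}

impl<T> CachedSnapshot<T> {
    /// Replace the cached snapshot only if a newer generation is reported.
    fn maybe_replace(&mut self, generation: u64, snapshot: T) -> bool {
        if generation > self.generation {
            self.generation = generation;
            self.snapshot = snapshot;
            true
        } else {
            false
        }
    }
}

fn main() {
    let mut cached = CachedSnapshot { generation: 3, snapshot: "v3" };
    assert!(!cached.maybe_replace(3, "v3-again")); // same generation: keep what we have
    assert!(cached.maybe_replace(4, "v4")); // newer generation: replace
}
```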
.partition_ids + .into_iter() + .map(PartitionId::new) + .collect::>(); + + let file_list = self + .catalog + .repositories() + .parquet_files() + .list_by_partition_not_to_delete_batch(partition_ids) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(file_list.into_iter().map(|file| { + let file = serialize_parquet_file(file); + Ok(proto::ParquetFileListByPartitionNotToDeleteBatchResponse { + parquet_file: Some(file), + }) + })) + .boxed(), + )) + } + + async fn parquet_file_get_by_object_store_id( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let maybe_file = self + .catalog + .repositories() + .parquet_files() + .get_by_object_store_id(deserialize_object_store_id( + req.object_store_id.required().ctx("object_store_id")?, + )) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + proto::ParquetFileGetByObjectStoreIdResponse { + parquet_file: maybe_file.map(serialize_parquet_file), + }, + )) + } + + async fn parquet_file_exists_by_object_store_id_batch( + &self, + request: Request>, + ) -> Result, tonic::Status> { + let object_store_ids = request + .into_inner() + .map_err(|e| tonic::Status::invalid_argument(e.to_string())) + .and_then(|req| async move { + Ok(deserialize_object_store_id( + req.object_store_id.required().ctx("object_store_id")?, + )) + }) + .try_collect::>() + .await?; + + let id_list = self + .catalog + .repositories() + .parquet_files() + .exists_by_object_store_id_batch(object_store_ids) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(id_list.into_iter().map(|id| { + let object_store_id = serialize_object_store_id(id); + Ok(proto::ParquetFileExistsByObjectStoreIdBatchResponse { + object_store_id: Some(object_store_id), + }) + })) + .boxed(), + )) + } + + async fn parquet_file_create_upgrade_delete( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let delete = req + .delete + .into_iter() + .map(deserialize_object_store_id) + .collect::>(); + let upgrade = req + .upgrade + .into_iter() + .map(deserialize_object_store_id) + .collect::>(); + let create = req + .create + .into_iter() + .map(deserialize_parquet_file_params) + .collect::, _>>()?; + + let id_list = self + .catalog + .repositories() + .parquet_files() + .create_upgrade_delete( + PartitionId::new(req.partition_id), + &delete, + &upgrade, + &create, + req.target_level.convert().ctx("target_level")?, + ) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + proto::ParquetFileCreateUpgradeDeleteResponse { + created_parquet_file_ids: id_list.into_iter().map(|id| id.get()).collect(), + }, + )) + } +} diff --git a/iox_catalog/src/interface.rs b/iox_catalog/src/interface.rs index d06ef68967b..dae33a26a59 100644 --- a/iox_catalog/src/interface.rs +++ b/iox_catalog/src/interface.rs @@ -1,27 +1,22 @@ //! Traits and data types for the IOx Catalog API. 
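`parquet_file_exists_by_object_store_id_batch` above takes a client-streamed request and drains it into a `Vec` before querying the catalog. A hedged sketch of that collection step, with a hypothetical `IdRequest` message in place of the generated proto type:

```rust
use futures::{Stream, TryStreamExt};
use tonic::Status;

// Hypothetical request message carrying one optional id per streamed item.
struct IdRequest {
    id: Option<i64>,
}

// Collect the client stream up front, turning both transport errors and
// missing fields into `invalid_argument`, before touching the catalog.
async fn collect_ids(
    requests: impl Stream<Item = Result<IdRequest, Status>>,
) -> Result<Vec<i64>, Status> {
    requests
        .map_err(|e| Status::invalid_argument(e.to_string()))
        .and_then(|req| async move {
            req.id
                .ok_or_else(|| Status::invalid_argument("id is required"))
        })
        .try_collect()
        .await
}
```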
use async_trait::async_trait; +use data_types::snapshot::partition::PartitionSnapshot; +use data_types::snapshot::table::TableSnapshot; use data_types::{ partition_template::{NamespacePartitionTemplateOverride, TablePartitionTemplateOverride}, - Column, ColumnType, ColumnsByName, CompactionLevel, MaxColumnsPerTable, MaxTables, Namespace, - NamespaceId, NamespaceName, NamespaceSchema, NamespaceServiceProtectionLimitsOverride, - ParquetFile, ParquetFileId, ParquetFileParams, Partition, PartitionHashId, PartitionId, - PartitionKey, SkippedCompaction, SortedColumnSet, Table, TableId, TableSchema, Timestamp, - TransitionPartitionId, + Column, ColumnType, CompactionLevel, MaxColumnsPerTable, MaxTables, Namespace, NamespaceId, + NamespaceName, NamespaceServiceProtectionLimitsOverride, ObjectStoreId, ParquetFile, + ParquetFileId, ParquetFileParams, Partition, PartitionId, PartitionKey, SkippedCompaction, + SortKeyIds, Table, TableId, Timestamp, }; use iox_time::TimeProvider; -use snafu::{OptionExt, Snafu}; +use snafu::Snafu; use std::{ - collections::{BTreeMap, HashMap, HashSet}, + collections::HashMap, fmt::{Debug, Display}, sync::Arc, }; -use uuid::Uuid; - -/// Maximum number of files touched by [`ParquetFileRepo::flag_for_delete_by_retention`] at a time. -pub const MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION: i64 = 1_000; -/// Maximum number of files touched by [`ParquetFileRepo::delete_old_ids_only`] at a time. -pub const MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE: i64 = 10_000; /// An error wrapper detailing the reason for a compare-and-swap failure. #[derive(Debug)] @@ -36,122 +31,68 @@ pub enum CasFailure { } #[derive(Debug, Snafu)] -#[allow(missing_copy_implementations, missing_docs)] +#[allow(missing_docs)] #[snafu(visibility(pub(crate)))] pub enum Error { - #[snafu(display("invalid name: {}", name))] - InvalidName { name: String }, - - #[snafu(display("name {} already exists", name))] - NameExists { name: String }, - - #[snafu(display("A table named {name} already exists in namespace {namespace_id}"))] - TableNameExists { - name: String, - namespace_id: NamespaceId, - }, - - #[snafu(display("unhandled sqlx error: {}", source))] - SqlxError { source: sqlx::Error }, - - #[snafu(display("foreign key violation: {}", source))] - ForeignKeyViolation { source: sqlx::Error }, - - #[snafu(display("column {} is type {} but write has type {}", name, existing, new))] - ColumnTypeMismatch { - name: String, - existing: ColumnType, - new: ColumnType, - }, - - #[snafu(display( - "column type {} is in the db for column {}, which is unknown", - data_type, - name - ))] - UnknownColumnType { data_type: i16, name: String }, - - #[snafu(display("namespace {} not found", name))] - NamespaceNotFoundByName { name: String }, - - #[snafu(display("namespace {} not found", id))] - NamespaceNotFoundById { id: NamespaceId }, - - #[snafu(display("table {} not found", id))] - TableNotFound { id: TableId }, - - #[snafu(display("table {} not found", name))] - TableNotFoundByName { name: String }, - - #[snafu(display("partition {} not found", id))] - PartitionNotFound { id: TransitionPartitionId }, - - #[snafu(display( - "couldn't create column {} in table {}; limit reached on namespace", - column_name, - table_id, - ))] - ColumnCreateLimitError { - column_name: String, - table_id: TableId, - }, - - #[snafu(display( - "couldn't create table {}; limit reached on namespace {}", - table_name, - namespace_id - ))] - TableCreateLimitError { - table_name: String, - namespace_id: NamespaceId, - }, - - #[snafu(display("parquet 
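The rewritten `Error` above collapses the long list of backend-specific variants into a few broad ones plus a boxed catch-all, with `From` impls so `?` can lift sqlx, prost and quorum errors without a dedicated variant for each. A standalone sketch of the same pattern, using `std::io::Error` as a stand-in source:

```rust
use snafu::Snafu;

// The same shape as the enum above: a few broad variants plus a catch-all
// that boxes whatever backend error actually occurred.
#[derive(Debug, Snafu)]
enum CatalogError {
    #[snafu(display("unhandled external error: {source}"))]
    External {
        source: Box<dyn std::error::Error + Send + Sync>,
    },

    #[snafu(display("not found: {descr}"))]
    NotFound { descr: String },
}

// One `From` impl per backend error type keeps `?` ergonomic at call sites.
impl From<std::io::Error> for CatalogError {
    fn from(e: std::io::Error) -> Self {
        Self::External { source: Box::new(e) }
    }
}

fn main() {
    let io_err = std::io::Error::new(std::io::ErrorKind::Other, "connection reset");
    let err: CatalogError = io_err.into();
    println!("{err}"); // unhandled external error: connection reset
    println!("{}", CatalogError::NotFound { descr: "partition 42".into() });
}
```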
file with object_store_id {} already exists", object_store_id))] - FileExists { object_store_id: Uuid }, - - #[snafu(display("parquet file with id {} does not exist. Foreign key violation", id))] - FileNotFound { id: i64 }, - - #[snafu(display("parquet_file record {} not found", id))] - ParquetRecordNotFound { id: ParquetFileId }, - - #[snafu(display("cannot derive valid column schema from column {}: {}", name, source))] - InvalidColumn { + #[snafu(display("unhandled external error: {source}"))] + External { source: Box, - name: String, }, - #[snafu(display("cannot start a transaction: {}", source))] - StartTransaction { source: sqlx::Error }, + #[snafu(display("already exists: {descr}"))] + AlreadyExists { descr: String }, - #[snafu(display("no transaction provided"))] - NoTransaction, + #[snafu(display("limit exceeded: {descr}"))] + LimitExceeded { descr: String }, - #[snafu(display("transaction failed to commit: {}", source))] - FailedToCommit { source: sqlx::Error }, + #[snafu(display("not found: {descr}"))] + NotFound { descr: String }, +} - #[snafu(display("error while converting usize {} to i64", value))] - InvalidValue { value: usize }, +impl From for Error { + fn from(e: sqlx::Error) -> Self { + Self::External { + source: Box::new(e), + } + } +} - #[snafu(display("database setup error: {}", source))] - Setup { source: sqlx::Error }, +impl From for Error { + fn from(e: sqlx::migrate::MigrateError) -> Self { + Self::from(sqlx::Error::from(e)) + } +} - #[snafu(display( - "could not record a skipped compaction for partition {partition_id}: {source}" - ))] - CouldNotRecordSkippedCompaction { - source: sqlx::Error, - partition_id: PartitionId, - }, +impl From for Error { + fn from(e: data_types::snapshot::partition::Error) -> Self { + Self::External { + source: Box::new(e), + } + } +} - #[snafu(display("could not list skipped compactions: {source}"))] - CouldNotListSkippedCompactions { source: sqlx::Error }, +impl From for Error { + fn from(e: data_types::snapshot::table::Error) -> Self { + Self::External { + source: Box::new(e), + } + } +} - #[snafu(display("could not delete skipped compactions: {source}"))] - CouldNotDeleteSkippedCompactions { source: sqlx::Error }, +impl From for Error { + fn from(e: catalog_cache::api::quorum::Error) -> Self { + Self::External { + source: Box::new(e), + } + } +} - #[snafu(display("could not delete namespace: {source}"))] - CouldNotDeleteNamespace { source: sqlx::Error }, +impl From for Error { + fn from(e: generated_types::prost::DecodeError) -> Self { + Self::External { + source: Box::new(e), + } + } } /// A specialized `Error` for Catalog errors @@ -182,7 +123,7 @@ pub type Result = std::result::Result; /// AllRows /// /// ``` -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum SoftDeletedRows { /// Return all rows. AllRows, @@ -211,7 +152,7 @@ pub trait Catalog: Send + Sync + Debug + Display { async fn setup(&self) -> Result<(), Error>; /// Accesses the repositories without a transaction scope. - async fn repositories(&self) -> Box; + fn repositories(&self) -> Box; /// Gets metric registry associated with this catalog for testing purposes. #[cfg(test)] @@ -233,7 +174,6 @@ pub trait Catalog: Send + Sync + Debug + Display { /// A repository might internally map to a wide range of different storage abstractions, ranging /// from one or more SQL tables over key-value key spaces to simple in-memory vectors. The user /// should and must not care how these are implemented. 
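`Catalog::repositories()` is now synchronous, so call sites drop one `.await` while the per-repo methods stay async. A hedged call-site sketch, assuming the `Catalog`, `SoftDeletedRows` and `Result` types defined in this file (not a standalone program):

```rust
use std::sync::Arc;

async fn count_active_namespaces(catalog: Arc<dyn Catalog>) -> Result<usize> {
    // No `.await` here any more: obtaining the repositories is a plain call.
    let mut repos = catalog.repositories();
    // The repository methods themselves are still async.
    let namespaces = repos
        .namespaces()
        .list(SoftDeletedRows::ExcludeDeleted)
        .await?;
    Ok(namespaces.len())
}
```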
-#[async_trait] pub trait RepoCollection: Send + Sync + Debug { /// Repository for [namespaces](data_types::Namespace). fn namespaces(&mut self) -> &mut dyn NamespaceRepo; @@ -330,6 +270,9 @@ pub trait TableRepo: Send + Sync { /// List all tables. async fn list(&mut self) -> Result>; + + /// Obtain a table snapshot + async fn snapshot(&mut self, table_id: TableId) -> Result; } /// Functions for working with columns in the catalog @@ -370,6 +313,21 @@ pub trait ColumnRepo: Send + Sync { async fn list(&mut self) -> Result>; } +/// Extension trait for [`ParquetFileRepo`] +#[async_trait] +pub trait PartitionRepoExt { + /// create the parquet file + async fn get_by_id(self, partition_id: PartitionId) -> Result>; +} + +#[async_trait] +impl PartitionRepoExt for &mut dyn PartitionRepo { + async fn get_by_id(self, partition_id: PartitionId) -> Result> { + let iter = self.get_by_id_batch(&[partition_id]).await?; + Ok(iter.into_iter().next()) + } +} + /// Functions for working with IOx partitions in the catalog. These are how IOx splits up /// data within a namespace. #[async_trait] @@ -377,27 +335,10 @@ pub trait PartitionRepo: Send + Sync { /// create or get a partition record for the given partition key and table async fn create_or_get(&mut self, key: PartitionKey, table_id: TableId) -> Result; - /// get partition by ID - async fn get_by_id(&mut self, partition_id: PartitionId) -> Result>; - /// get multiple partitions by ID. /// /// the output order is undefined, non-existing partitions are not part of the output. - async fn get_by_id_batch(&mut self, partition_ids: Vec) -> Result>; - - /// get partition by deterministic hash ID - async fn get_by_hash_id( - &mut self, - partition_hash_id: &PartitionHashId, - ) -> Result>; - - /// get partition by deterministic hash ID - /// - /// the output order is undefined, non-existing partitions are not part of the output. - async fn get_by_hash_id_batch( - &mut self, - partition_hash_ids: &[&PartitionHashId], - ) -> Result>; + async fn get_by_id_batch(&mut self, partition_ids: &[PartitionId]) -> Result>; /// return the partitions by table id async fn list_by_table_id(&mut self, table_id: TableId) -> Result>; @@ -405,8 +346,8 @@ pub trait PartitionRepo: Send + Sync { /// return all partitions IDs async fn list_ids(&mut self) -> Result>; - /// Update the sort key for the partition, setting it to `new_sort_key` iff - /// the current value matches `old_sort_key`. + /// Update the sort key for the partition, setting it to `new_sort_key_ids` iff + /// the current value matches `old_sort_key_ids`. /// /// NOTE: it is expected that ONLY the ingesters update sort keys for /// existing partitions. @@ -416,18 +357,12 @@ pub trait PartitionRepo: Send + Sync { /// Implementations are allowed to spuriously return /// [`CasFailure::ValueMismatch`] for performance reasons in the presence of /// concurrent writers. 
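Since single-ID partition lookups moved into the `PartitionRepoExt` extension trait (which delegates to `get_by_id_batch`), callers need that trait in scope. A hedged usage sketch, assuming the types defined in this file:

```rust
// Bringing the extension trait into scope is what makes `get_by_id` available
// on the `&mut dyn PartitionRepo` trait object.
// use iox_catalog::interface::PartitionRepoExt;

async fn partition_exists(repos: &mut dyn RepoCollection, id: PartitionId) -> Result<bool> {
    // Provided by `PartitionRepoExt`; it forwards to `get_by_id_batch(&[id])`
    // and takes the first (and only possible) hit.
    Ok(repos.partitions().get_by_id(id).await?.is_some())
}
```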
- /// - // TODO: After the sort_key_ids field is converetd into NOT NULL, the implementation of this function - // must be changed to compare old_sort_key_ids with the existing sort_key_ids instead of - // comparing old_sort_key with existing sort_key async fn cas_sort_key( &mut self, - partition_id: &TransitionPartitionId, - old_sort_key: Option>, // todo: remove this old_sort_key - old_sort_key_ids: Option, - new_sort_key: &[&str], //todo: remove this new_sort_key - new_sort_key_ids: &SortedColumnSet, - ) -> Result, SortedColumnSet)>>; + partition_id: PartitionId, + old_sort_key_ids: Option<&SortKeyIds>, + new_sort_key_ids: &SortKeyIds, + ) -> Result>; /// Record an instance of a partition being selected for compaction but compaction was not /// completed for the specified reason. @@ -475,34 +410,41 @@ pub trait PartitionRepo: Send + Sync { /// Can be removed when all partitions have hash IDs and support for old-style partitions is no /// longer needed. async fn list_old_style(&mut self) -> Result>; + + /// Obtain a partition snapshot + async fn snapshot(&mut self, partition_id: PartitionId) -> Result; } -/// Functions for working with parquet file pointers in the catalog +/// Extension trait for [`ParquetFileRepo`] #[async_trait] -pub trait ParquetFileRepo: Send + Sync { +pub trait ParquetFileRepoExt { /// create the parquet file - async fn create(&mut self, parquet_file_params: ParquetFileParams) -> Result; + async fn create(self, parquet_file_params: ParquetFileParams) -> Result; +} - /// List all parquet files in implementation-defined, non-deterministic order. - /// - /// This includes files that were marked for deletion. - /// - /// This is mostly useful for testing and will likely not succeed in production. - async fn list_all(&mut self) -> Result>; +#[async_trait] +impl ParquetFileRepoExt for &mut dyn ParquetFileRepo { + /// create the parquet file + async fn create(self, params: ParquetFileParams) -> Result { + let files = self + .create_upgrade_delete( + params.partition_id, + &[], + &[], + &[params.clone()], + CompactionLevel::Initial, + ) + .await?; + let id = files.into_iter().next().unwrap(); + Ok(ParquetFile::from_params(params, id)) + } +} +/// Functions for working with parquet file pointers in the catalog +#[async_trait] +pub trait ParquetFileRepo: Send + Sync { /// Flag all parquet files for deletion that are older than their namespace's retention period. - async fn flag_for_delete_by_retention(&mut self) -> Result>; - - /// List all parquet files within a given namespace that are NOT marked as - /// [`to_delete`](ParquetFile::to_delete). - async fn list_by_namespace_not_to_delete( - &mut self, - namespace_id: NamespaceId, - ) -> Result>; - - /// List all parquet files within a given table that are NOT marked as - /// [`to_delete`](ParquetFile::to_delete). - async fn list_by_table_not_to_delete(&mut self, table_id: TableId) -> Result>; + async fn flag_for_delete_by_retention(&mut self) -> Result>; /// Delete parquet files that were marked to be deleted earlier than the specified time. /// @@ -510,2901 +452,39 @@ pub trait ParquetFileRepo: Send + Sync { /// /// This deletion is limited to a certain (backend-specific) number of files to avoid overlarge /// changes. The caller MAY call this method again if the result was NOT empty. 
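Because `cas_sort_key` may spuriously report `ValueMismatch` under concurrency, callers are expected to re-derive their input from the value handed back and retry. A hedged sketch of such a loop, assuming the signatures above; a real caller would typically recompute the desired sort key from the observed one rather than reuse it unchanged:

```rust
// A caller-side CAS retry sketch; invented for illustration, not part of this patch.
async fn set_sort_key_with_retry(
    partitions: &mut dyn PartitionRepo,
    partition_id: PartitionId,
    new_sort_key_ids: SortKeyIds,
) -> Result<Partition> {
    // Start by assuming the partition has no sort key yet.
    let mut observed: Option<SortKeyIds> = None;
    loop {
        match partitions
            .cas_sort_key(partition_id, observed.as_ref(), &new_sort_key_ids)
            .await
        {
            Ok(partition) => return Ok(partition),
            // Lost the race (or hit a spurious mismatch): adopt the value the
            // catalog reported and try again against that.
            Err(CasFailure::ValueMismatch(current)) => observed = Some(current),
            Err(CasFailure::QueryError(e)) => return Err(e),
        }
    }
}
```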
- async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result>; + async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result>; - /// List parquet files for a given partition that are NOT marked as + /// List parquet files for given partitions that are NOT marked as /// [`to_delete`](ParquetFile::to_delete). - async fn list_by_partition_not_to_delete( + /// + /// The output order is undefined, non-existing partitions are not part of the output. + async fn list_by_partition_not_to_delete_batch( &mut self, - partition_id: &TransitionPartitionId, + partition_ids: Vec, ) -> Result>; /// Return the parquet file with the given object store id // used heavily in tests for verification of catalog state. async fn get_by_object_store_id( &mut self, - object_store_id: Uuid, + object_store_id: ObjectStoreId, ) -> Result>; /// Test a batch of parquet files exist by object store ids async fn exists_by_object_store_id_batch( &mut self, - object_store_ids: Vec, - ) -> Result>; + object_store_ids: Vec, + ) -> Result>; /// Commit deletions, upgrades and creations in a single transaction. /// /// Returns IDs of created files. async fn create_upgrade_delete( &mut self, - delete: &[ParquetFileId], - upgrade: &[ParquetFileId], + partition_id: PartitionId, + delete: &[ObjectStoreId], + upgrade: &[ObjectStoreId], create: &[ParquetFileParams], target_level: CompactionLevel, ) -> Result>; } - -/// Gets the namespace schema including all tables and columns. -pub async fn get_schema_by_id( - id: NamespaceId, - repos: &mut R, - deleted: SoftDeletedRows, -) -> Result -where - R: RepoCollection + ?Sized, -{ - let namespace = repos - .namespaces() - .get_by_id(id, deleted) - .await? - .context(NamespaceNotFoundByIdSnafu { id })?; - - get_schema_internal(namespace, repos).await -} - -/// Gets the namespace schema including all tables and columns. -pub async fn get_schema_by_name( - name: &str, - repos: &mut R, - deleted: SoftDeletedRows, -) -> Result -where - R: RepoCollection + ?Sized, -{ - let namespace = repos - .namespaces() - .get_by_name(name, deleted) - .await? - .context(NamespaceNotFoundByNameSnafu { name })?; - - get_schema_internal(namespace, repos).await -} - -async fn get_schema_internal(namespace: Namespace, repos: &mut R) -> Result -where - R: RepoCollection + ?Sized, -{ - // get the columns first just in case someone else is creating schema while we're doing this. - let columns = repos.columns().list_by_namespace_id(namespace.id).await?; - let tables = repos.tables().list_by_namespace_id(namespace.id).await?; - - let mut namespace = NamespaceSchema::new_empty_from(&namespace); - - let mut table_id_to_schema = BTreeMap::new(); - for t in tables { - let table_schema = TableSchema::new_empty_from(&t); - table_id_to_schema.insert(t.id, (t.name, table_schema)); - } - - for c in columns { - let (_, t) = table_id_to_schema.get_mut(&c.table_id).unwrap(); - t.add_column(c); - } - - for (_, (table_name, schema)) in table_id_to_schema { - namespace.tables.insert(table_name, schema); - } - - Ok(namespace) -} - -/// Gets the schema for one particular table in a namespace. -pub async fn get_schema_by_namespace_and_table( - name: &str, - table_name: &str, - repos: &mut R, - deleted: SoftDeletedRows, -) -> Result -where - R: RepoCollection + ?Sized, -{ - let namespace = repos - .namespaces() - .get_by_name(name, deleted) - .await? - .context(NamespaceNotFoundByNameSnafu { name })?; - - let table = repos - .tables() - .get_by_namespace_and_name(namespace.id, table_name) - .await? 
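`create_upgrade_delete` is now the single write path for parquet file changes (the `ParquetFileRepoExt::create` helper above routes plain creates through it too). A hedged sketch of a compaction-style commit against that signature, with the surrounding plumbing invented for illustration:

```rust
// Deletes, upgrades and creations are committed in one transaction; this sketch
// deletes the compacted inputs and registers the newly written outputs.
async fn commit_compaction(
    files: &mut dyn ParquetFileRepo,
    partition_id: PartitionId,
    compacted_inputs: Vec<ObjectStoreId>,
    new_outputs: Vec<ParquetFileParams>,
    target_level: CompactionLevel,
) -> Result<Vec<ParquetFileId>> {
    files
        .create_upgrade_delete(
            partition_id,
            &compacted_inputs, // delete: files fully replaced by the outputs
            &[],               // upgrade: nothing merely re-levelled in this sketch
            &new_outputs,      // create: the freshly written files
            target_level,
        )
        .await
}
```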
- .context(TableNotFoundByNameSnafu { - name: table_name.to_string(), - })?; - let mut table_schema = TableSchema::new_empty_from(&table); - - let columns = repos.columns().list_by_table_id(table.id).await?; - for c in columns { - table_schema.add_column(c); - } - - let mut namespace = NamespaceSchema::new_empty_from(&namespace); - namespace - .tables - .insert(table_name.to_string(), table_schema); - - Ok(namespace) -} - -/// Gets all the table's columns. -pub async fn get_table_columns_by_id(id: TableId, repos: &mut R) -> Result -where - R: RepoCollection + ?Sized, -{ - let columns = repos.columns().list_by_table_id(id).await?; - - Ok(ColumnsByName::new(columns)) -} - -/// Fetch all [`NamespaceSchema`] in the catalog. -/// -/// This method performs the minimal number of queries needed to build the -/// result set. No table lock is obtained, nor are queries executed within a -/// transaction, but this method does return a point-in-time snapshot of the -/// catalog state. -/// -/// # Soft Deletion -/// -/// No schemas for soft-deleted namespaces are returned. -pub async fn list_schemas( - catalog: &dyn Catalog, -) -> Result> { - let mut repos = catalog.repositories().await; - - // In order to obtain a point-in-time snapshot, first fetch the columns, - // then the tables, and then resolve the namespace IDs to Namespace in order - // to construct the schemas. - // - // The set of columns returned forms the state snapshot, with the subsequent - // queries resolving only what is needed to construct schemas for the - // retrieved columns (ignoring any newly added tables/namespaces since the - // column snapshot was taken). - // - // This approach also tolerates concurrently deleted namespaces, which are - // simply ignored at the end when joining to the namespace query result. - - // First fetch all the columns - this is the state snapshot of the catalog - // schemas. - let columns = repos.columns().list().await?; - - // Construct the set of table IDs these columns belong to. - let retain_table_ids = columns.iter().map(|c| c.table_id).collect::>(); - - // Fetch all tables, and filter for those that are needed to construct - // schemas for "columns" only. - // - // Discard any tables that have no columns or have been created since - // the "columns" snapshot was retrieved, and construct a map of ID->Table. - let tables = repos - .tables() - .list() - .await? - .into_iter() - .filter_map(|t| { - if !retain_table_ids.contains(&t.id) { - return None; - } - - Some((t.id, t)) - }) - .collect::>(); - - // Drop the table ID set as it will not be referenced again. - drop(retain_table_ids); - - // Do all the I/O to fetch the namespaces in the background, while this - // thread constructs the NamespaceId->TableSchema map below. - let namespaces = tokio::spawn(async move { - repos - .namespaces() - .list(SoftDeletedRows::ExcludeDeleted) - .await - }); - - // A set of tables within a single namespace. - type NamespaceTables = BTreeMap; - - let mut joined = HashMap::::default(); - for column in columns { - // Resolve the table this column references - let table = tables.get(&column.table_id).expect("no table for column"); - - let table_schema = joined - // Find or create a record in the joined map - // for this namespace ID. - .entry(table.namespace_id) - .or_default() - // Fetch the schema record for this table, or create an empty one. 
- .entry(table.name.clone()) - .or_insert_with(|| TableSchema::new_empty_from(table)); - - table_schema.add_column(column); - } - - // The table map is no longer needed - immediately reclaim the memory. - drop(tables); - - // Convert the Namespace instances into NamespaceSchema instances. - let iter = namespaces - .await - .expect("namespace list task panicked")? - .into_iter() - // Ignore any namespaces that did not exist when the "columns" snapshot - // was created, or have no tables/columns (and therefore have no entry - // in "joined"). - .filter_map(move |v| { - // The catalog call explicitly asked for no soft deleted records. - assert!(v.deleted_at.is_none()); - - let mut ns = NamespaceSchema::new_empty_from(&v); - - ns.tables = joined.remove(&v.id)?; - Some((v, ns)) - }); - - Ok(iter) -} - -#[cfg(test)] -pub(crate) mod test_helpers { - use crate::{ - test_helpers::{arbitrary_namespace, arbitrary_parquet_file_params, arbitrary_table}, - validate_or_insert_schema, - }; - - use super::*; - use ::test_helpers::assert_error; - use assert_matches::assert_matches; - use data_types::{ColumnId, CompactionLevel, MaxColumnsPerTable, MaxTables}; - use futures::Future; - use generated_types::influxdata::iox::partition_template::v1 as proto; - use metric::{Attributes, DurationHistogram, Metric}; - use std::{collections::BTreeSet, ops::DerefMut, sync::Arc, time::Duration}; - - pub(crate) async fn test_catalog(clean_state: R) - where - R: Fn() -> F + Send + Sync, - F: Future> + Send, - { - test_setup(clean_state().await).await; - test_namespace_soft_deletion(clean_state().await).await; - test_partitions_new_file_between(clean_state().await).await; - test_column(clean_state().await).await; - test_partition(clean_state().await).await; - test_parquet_file(clean_state().await).await; - test_parquet_file_delete_broken(clean_state().await).await; - test_update_to_compaction_level_1(clean_state().await).await; - test_list_by_partiton_not_to_delete(clean_state().await).await; - test_list_schemas(clean_state().await).await; - test_list_schemas_soft_deleted_rows(clean_state().await).await; - test_delete_namespace(clean_state().await).await; - - let catalog = clean_state().await; - test_namespace(Arc::clone(&catalog)).await; - assert_metric_hit(&catalog.metrics(), "namespace_create"); - - let catalog = clean_state().await; - test_table(Arc::clone(&catalog)).await; - assert_metric_hit(&catalog.metrics(), "table_create"); - - let catalog = clean_state().await; - test_column(Arc::clone(&catalog)).await; - assert_metric_hit(&catalog.metrics(), "column_create_or_get"); - - let catalog = clean_state().await; - test_partition(Arc::clone(&catalog)).await; - assert_metric_hit(&catalog.metrics(), "partition_create_or_get"); - - let catalog = clean_state().await; - test_parquet_file(Arc::clone(&catalog)).await; - assert_metric_hit(&catalog.metrics(), "parquet_create"); - } - - async fn test_setup(catalog: Arc) { - catalog.setup().await.expect("first catalog setup"); - catalog.setup().await.expect("second catalog setup"); - } - - async fn test_namespace(catalog: Arc) { - let mut repos = catalog.repositories().await; - let namespace_name = NamespaceName::new("test_namespace").unwrap(); - let namespace = repos - .namespaces() - .create(&namespace_name, None, None, None) - .await - .unwrap(); - assert!(namespace.id > NamespaceId::new(0)); - assert_eq!(namespace.name, namespace_name.as_str()); - assert_eq!( - namespace.partition_template, - NamespacePartitionTemplateOverride::default() - ); - let lookup_namespace = repos - 
.namespaces() - .get_by_name(&namespace_name, SoftDeletedRows::ExcludeDeleted) - .await - .unwrap() - .unwrap(); - assert_eq!(namespace, lookup_namespace); - - // Assert default values for service protection limits. - assert_eq!(namespace.max_tables, MaxTables::default()); - assert_eq!( - namespace.max_columns_per_table, - MaxColumnsPerTable::default() - ); - - let conflict = repos - .namespaces() - .create(&namespace_name, None, None, None) - .await; - assert!(matches!( - conflict.unwrap_err(), - Error::NameExists { name: _ } - )); - - let found = repos - .namespaces() - .get_by_id(namespace.id, SoftDeletedRows::ExcludeDeleted) - .await - .unwrap() - .expect("namespace should be there"); - assert_eq!(namespace, found); - - let not_found = repos - .namespaces() - .get_by_id(NamespaceId::new(i64::MAX), SoftDeletedRows::ExcludeDeleted) - .await - .unwrap(); - assert!(not_found.is_none()); - - let found = repos - .namespaces() - .get_by_name(&namespace_name, SoftDeletedRows::ExcludeDeleted) - .await - .unwrap() - .expect("namespace should be there"); - assert_eq!(namespace, found); - - let not_found = repos - .namespaces() - .get_by_name("does_not_exist", SoftDeletedRows::ExcludeDeleted) - .await - .unwrap(); - assert!(not_found.is_none()); - - let namespace2 = arbitrary_namespace(&mut *repos, "test_namespace2").await; - let mut namespaces = repos - .namespaces() - .list(SoftDeletedRows::ExcludeDeleted) - .await - .unwrap(); - namespaces.sort_by_key(|ns| ns.name.clone()); - assert_eq!(namespaces, vec![namespace, namespace2]); - - let new_table_limit = MaxTables::new(15000); - let modified = repos - .namespaces() - .update_table_limit(namespace_name.as_str(), new_table_limit) - .await - .expect("namespace should be updateable"); - assert_eq!(new_table_limit, modified.max_tables); - - let new_column_limit = MaxColumnsPerTable::new(1500); - let modified = repos - .namespaces() - .update_column_limit(namespace_name.as_str(), new_column_limit) - .await - .expect("namespace should be updateable"); - assert_eq!(new_column_limit, modified.max_columns_per_table); - - const NEW_RETENTION_PERIOD_NS: i64 = 5 * 60 * 60 * 1000 * 1000 * 1000; - let modified = repos - .namespaces() - .update_retention_period(namespace_name.as_str(), Some(NEW_RETENTION_PERIOD_NS)) - .await - .expect("namespace should be updateable"); - assert_eq!( - NEW_RETENTION_PERIOD_NS, - modified.retention_period_ns.unwrap() - ); - - let modified = repos - .namespaces() - .update_retention_period(namespace_name.as_str(), None) - .await - .expect("namespace should be updateable"); - assert!(modified.retention_period_ns.is_none()); - - // create namespace with retention period NULL (the default) - let namespace3 = arbitrary_namespace(&mut *repos, "test_namespace3").await; - assert!(namespace3.retention_period_ns.is_none()); - - // create namespace with retention period - let namespace4_name = NamespaceName::new("test_namespace4").unwrap(); - let namespace4 = repos - .namespaces() - .create(&namespace4_name, None, Some(NEW_RETENTION_PERIOD_NS), None) - .await - .expect("namespace with 5-hour retention should be created"); - assert_eq!( - NEW_RETENTION_PERIOD_NS, - namespace4.retention_period_ns.unwrap() - ); - // reset retention period to NULL to avoid affecting later tests - repos - .namespaces() - .update_retention_period(&namespace4_name, None) - .await - .expect("namespace should be updateable"); - - // create a namespace with a PartitionTemplate other than the default - let tag_partition_template = - 
NamespacePartitionTemplateOverride::try_from(proto::PartitionTemplate { - parts: vec![proto::TemplatePart { - part: Some(proto::template_part::Part::TagValue("tag1".into())), - }], - }) - .unwrap(); - let namespace5_name = NamespaceName::new("test_namespace5").unwrap(); - let namespace5 = repos - .namespaces() - .create( - &namespace5_name, - Some(tag_partition_template.clone()), - None, - None, - ) - .await - .unwrap(); - assert_eq!(namespace5.partition_template, tag_partition_template); - let lookup_namespace5 = repos - .namespaces() - .get_by_name(&namespace5_name, SoftDeletedRows::ExcludeDeleted) - .await - .unwrap() - .unwrap(); - assert_eq!(namespace5, lookup_namespace5); - - // remove namespace to avoid it from affecting later tests - repos - .namespaces() - .soft_delete("test_namespace") - .await - .expect("delete namespace should succeed"); - repos - .namespaces() - .soft_delete("test_namespace2") - .await - .expect("delete namespace should succeed"); - repos - .namespaces() - .soft_delete("test_namespace3") - .await - .expect("delete namespace should succeed"); - repos - .namespaces() - .soft_delete("test_namespace4") - .await - .expect("delete namespace should succeed"); - } - - /// Construct a set of two namespaces: - /// - /// * deleted-ns: marked as soft-deleted - /// * active-ns: not marked as deleted - /// - /// And assert the expected "soft delete" semantics / correctly filter out - /// the expected rows for all three states of [`SoftDeletedRows`]. - async fn test_namespace_soft_deletion(catalog: Arc) { - let mut repos = catalog.repositories().await; - - let deleted_ns = arbitrary_namespace(&mut *repos, "deleted-ns").await; - let active_ns = arbitrary_namespace(&mut *repos, "active-ns").await; - - // Mark "deleted-ns" as soft-deleted. - repos.namespaces().soft_delete("deleted-ns").await.unwrap(); - - // Which should be idempotent (ignoring the timestamp change - when - // changing this to "soft delete" it was idempotent, so I am preserving - // that). - repos.namespaces().soft_delete("deleted-ns").await.unwrap(); - - // Listing should respect soft deletion. 
- let got = repos - .namespaces() - .list(SoftDeletedRows::AllRows) - .await - .unwrap() - .into_iter() - .map(|v| v.name); - assert_string_set_eq(got, ["deleted-ns", "active-ns"]); - - let got = repos - .namespaces() - .list(SoftDeletedRows::OnlyDeleted) - .await - .unwrap() - .into_iter() - .map(|v| v.name); - assert_string_set_eq(got, ["deleted-ns"]); - - let got = repos - .namespaces() - .list(SoftDeletedRows::ExcludeDeleted) - .await - .unwrap() - .into_iter() - .map(|v| v.name); - assert_string_set_eq(got, ["active-ns"]); - - // As should get by ID - let got = repos - .namespaces() - .get_by_id(deleted_ns.id, SoftDeletedRows::AllRows) - .await - .unwrap() - .into_iter() - .map(|v| v.name); - assert_string_set_eq(got, ["deleted-ns"]); - let got = repos - .namespaces() - .get_by_id(deleted_ns.id, SoftDeletedRows::OnlyDeleted) - .await - .unwrap() - .into_iter() - .map(|v| { - assert!(v.deleted_at.is_some()); - v.name - }); - assert_string_set_eq(got, ["deleted-ns"]); - let got = repos - .namespaces() - .get_by_id(deleted_ns.id, SoftDeletedRows::ExcludeDeleted) - .await - .unwrap(); - assert!(got.is_none()); - let got = repos - .namespaces() - .get_by_id(active_ns.id, SoftDeletedRows::AllRows) - .await - .unwrap() - .into_iter() - .map(|v| v.name); - assert_string_set_eq(got, ["active-ns"]); - let got = repos - .namespaces() - .get_by_id(active_ns.id, SoftDeletedRows::OnlyDeleted) - .await - .unwrap(); - assert!(got.is_none()); - let got = repos - .namespaces() - .get_by_id(active_ns.id, SoftDeletedRows::ExcludeDeleted) - .await - .unwrap() - .into_iter() - .map(|v| v.name); - assert_string_set_eq(got, ["active-ns"]); - - // And get by name - let got = repos - .namespaces() - .get_by_name(&deleted_ns.name, SoftDeletedRows::AllRows) - .await - .unwrap() - .into_iter() - .map(|v| v.name); - assert_string_set_eq(got, ["deleted-ns"]); - let got = repos - .namespaces() - .get_by_name(&deleted_ns.name, SoftDeletedRows::OnlyDeleted) - .await - .unwrap() - .into_iter() - .map(|v| { - assert!(v.deleted_at.is_some()); - v.name - }); - assert_string_set_eq(got, ["deleted-ns"]); - let got = repos - .namespaces() - .get_by_name(&deleted_ns.name, SoftDeletedRows::ExcludeDeleted) - .await - .unwrap(); - assert!(got.is_none()); - let got = repos - .namespaces() - .get_by_name(&active_ns.name, SoftDeletedRows::AllRows) - .await - .unwrap() - .into_iter() - .map(|v| v.name); - assert_string_set_eq(got, ["active-ns"]); - let got = repos - .namespaces() - .get_by_name(&active_ns.name, SoftDeletedRows::OnlyDeleted) - .await - .unwrap(); - assert!(got.is_none()); - let got = repos - .namespaces() - .get_by_name(&active_ns.name, SoftDeletedRows::ExcludeDeleted) - .await - .unwrap() - .into_iter() - .map(|v| v.name); - assert_string_set_eq(got, ["active-ns"]); - } - - // Assert the set of strings "a" is equal to the set "b", tolerating - // duplicates. 
- #[track_caller] - fn assert_string_set_eq(a: impl IntoIterator, b: impl IntoIterator) - where - T: Into, - U: Into, - { - let mut a = a.into_iter().map(Into::into).collect::>(); - a.sort_unstable(); - let mut b = b.into_iter().map(Into::into).collect::>(); - b.sort_unstable(); - assert_eq!(a, b); - } - - async fn test_table(catalog: Arc) { - let mut repos = catalog.repositories().await; - let namespace = arbitrary_namespace(&mut *repos, "namespace_table_test").await; - - // test we can create a table - let t = arbitrary_table(&mut *repos, "test_table", &namespace).await; - assert!(t.id > TableId::new(0)); - assert_eq!( - t.partition_template, - TablePartitionTemplateOverride::default() - ); - - // The default template doesn't use any tag values, so no columns need to be created. - let table_columns = repos.columns().list_by_table_id(t.id).await.unwrap(); - assert!(table_columns.is_empty()); - - // test we get an error if we try to create it again - let err = repos - .tables() - .create( - "test_table", - TablePartitionTemplateOverride::try_new(None, &namespace.partition_template) - .unwrap(), - namespace.id, - ) - .await; - assert_error!( - err, - Error::TableNameExists { ref name, namespace_id } - if name == "test_table" && namespace_id == namespace.id - ); - - // get by id - assert_eq!(t, repos.tables().get_by_id(t.id).await.unwrap().unwrap()); - assert!(repos - .tables() - .get_by_id(TableId::new(i64::MAX)) - .await - .unwrap() - .is_none()); - - let tables = repos - .tables() - .list_by_namespace_id(namespace.id) - .await - .unwrap(); - assert_eq!(vec![t.clone()], tables); - - // test we can create a table of the same name in a different namespace - let namespace2 = arbitrary_namespace(&mut *repos, "two").await; - assert_ne!(namespace, namespace2); - let test_table = arbitrary_table(&mut *repos, "test_table", &namespace2).await; - assert_ne!(t.id, test_table.id); - assert_eq!(test_table.namespace_id, namespace2.id); - - // test get by namespace and name - let foo_table = arbitrary_table(&mut *repos, "foo", &namespace2).await; - assert_eq!( - repos - .tables() - .get_by_namespace_and_name(NamespaceId::new(i64::MAX), "test_table") - .await - .unwrap(), - None - ); - assert_eq!( - repos - .tables() - .get_by_namespace_and_name(namespace.id, "not_existing") - .await - .unwrap(), - None - ); - assert_eq!( - repos - .tables() - .get_by_namespace_and_name(namespace.id, "test_table") - .await - .unwrap(), - Some(t.clone()) - ); - assert_eq!( - repos - .tables() - .get_by_namespace_and_name(namespace2.id, "test_table") - .await - .unwrap() - .as_ref(), - Some(&test_table) - ); - assert_eq!( - repos - .tables() - .get_by_namespace_and_name(namespace2.id, "foo") - .await - .unwrap() - .as_ref(), - Some(&foo_table) - ); - - // All tables should be returned by list(), regardless of namespace - let mut list = repos.tables().list().await.unwrap(); - list.sort_by_key(|t| t.id); - let mut expected = [t, test_table, foo_table]; - expected.sort_by_key(|t| t.id); - assert_eq!(&list, &expected); - - // test per-namespace table limits - let latest = repos - .namespaces() - .update_table_limit("namespace_table_test", MaxTables::new(1)) - .await - .expect("namespace should be updateable"); - let err = repos - .tables() - .create( - "definitely_unique", - TablePartitionTemplateOverride::try_new(None, &latest.partition_template).unwrap(), - latest.id, - ) - .await - .expect_err("should error with table create limit error"); - assert!(matches!( - err, - Error::TableCreateLimitError { - table_name: _, - 
namespace_id: _ - } - )); - - // Create a table with a partition template other than the default - let custom_table_template = TablePartitionTemplateOverride::try_new( - Some(proto::PartitionTemplate { - parts: vec![ - proto::TemplatePart { - part: Some(proto::template_part::Part::TagValue("tag1".into())), - }, - proto::TemplatePart { - part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())), - }, - proto::TemplatePart { - part: Some(proto::template_part::Part::TagValue("tag2".into())), - }, - ], - }), - &namespace2.partition_template, - ) - .unwrap(); - let templated = repos - .tables() - .create( - "use_a_template", - custom_table_template.clone(), - namespace2.id, - ) - .await - .unwrap(); - assert_eq!(templated.partition_template, custom_table_template); - - // Tag columns should be created for tags used in the template - let table_columns = repos - .columns() - .list_by_table_id(templated.id) - .await - .unwrap(); - assert_eq!(table_columns.len(), 2); - assert!(table_columns.iter().all(|c| c.is_tag())); - let mut column_names: Vec<_> = table_columns.iter().map(|c| &c.name).collect(); - column_names.sort(); - assert_eq!(column_names, &["tag1", "tag2"]); - - let lookup_templated = repos - .tables() - .get_by_namespace_and_name(namespace2.id, "use_a_template") - .await - .unwrap() - .unwrap(); - assert_eq!(templated, lookup_templated); - - // Create a namespace with a partition template other than the default - let custom_namespace_template = - NamespacePartitionTemplateOverride::try_from(proto::PartitionTemplate { - parts: vec![ - proto::TemplatePart { - part: Some(proto::template_part::Part::TagValue("zzz".into())), - }, - proto::TemplatePart { - part: Some(proto::template_part::Part::TagValue("aaa".into())), - }, - proto::TemplatePart { - part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())), - }, - ], - }) - .unwrap(); - let custom_namespace_name = NamespaceName::new("custom_namespace").unwrap(); - let custom_namespace = repos - .namespaces() - .create( - &custom_namespace_name, - Some(custom_namespace_template.clone()), - None, - None, - ) - .await - .unwrap(); - // Create a table without specifying the partition template - let custom_table_template = - TablePartitionTemplateOverride::try_new(None, &custom_namespace.partition_template) - .unwrap(); - let table_templated_by_namespace = repos - .tables() - .create( - "use_namespace_template", - custom_table_template, - custom_namespace.id, - ) - .await - .unwrap(); - assert_eq!( - table_templated_by_namespace.partition_template, - TablePartitionTemplateOverride::try_new(None, &custom_namespace_template).unwrap() - ); - - // Tag columns should be created for tags used in the template - let table_columns = repos - .columns() - .list_by_table_id(table_templated_by_namespace.id) - .await - .unwrap(); - assert_eq!(table_columns.len(), 2); - assert!(table_columns.iter().all(|c| c.is_tag())); - let mut column_names: Vec<_> = table_columns.iter().map(|c| &c.name).collect(); - column_names.sort(); - assert_eq!(column_names, &["aaa", "zzz"]); - - repos - .namespaces() - .soft_delete("namespace_table_test") - .await - .expect("delete namespace should succeed"); - repos - .namespaces() - .soft_delete("two") - .await - .expect("delete namespace should succeed"); - } - - async fn test_column(catalog: Arc) { - let mut repos = catalog.repositories().await; - let namespace = arbitrary_namespace(&mut *repos, "namespace_column_test").await; - let table = arbitrary_table(&mut *repos, "test_table", &namespace).await; - 
assert_eq!(table.namespace_id, namespace.id); - - // test we can create or get a column - let c = repos - .columns() - .create_or_get("column_test", table.id, ColumnType::Tag) - .await - .unwrap(); - let cc = repos - .columns() - .create_or_get("column_test", table.id, ColumnType::Tag) - .await - .unwrap(); - assert!(c.id > ColumnId::new(0)); - assert_eq!(c, cc); - - // test that attempting to create an already defined column of a different type returns - // error - let err = repos - .columns() - .create_or_get("column_test", table.id, ColumnType::U64) - .await - .expect_err("should error with wrong column type"); - assert!(matches!(err, Error::ColumnTypeMismatch { .. })); - - // test that we can create a column of the same name under a different table - let table2 = arbitrary_table(&mut *repos, "test_table_2", &namespace).await; - let ccc = repos - .columns() - .create_or_get("column_test", table2.id, ColumnType::U64) - .await - .unwrap(); - assert_ne!(c, ccc); - - let columns = repos - .columns() - .list_by_namespace_id(namespace.id) - .await - .unwrap(); - - let mut want = vec![c.clone(), ccc]; - assert_eq!(want, columns); - - let columns = repos.columns().list_by_table_id(table.id).await.unwrap(); - - let want2 = vec![c]; - assert_eq!(want2, columns); - - // Add another tag column into table2 - let c3 = repos - .columns() - .create_or_get("b", table2.id, ColumnType::Tag) - .await - .unwrap(); - - // Listing columns should return all columns in the catalog - let list = repos.columns().list().await.unwrap(); - want.extend([c3]); - assert_eq!(list, want); - - // test create_or_get_many_unchecked, below column limit - let mut columns = HashMap::new(); - columns.insert("column_test", ColumnType::Tag); - columns.insert("new_column", ColumnType::Tag); - let table1_columns = repos - .columns() - .create_or_get_many_unchecked(table.id, columns) - .await - .unwrap(); - let mut table1_column_names: Vec<_> = table1_columns.iter().map(|c| &c.name).collect(); - table1_column_names.sort(); - assert_eq!(table1_column_names, vec!["column_test", "new_column"]); - - // test per-namespace column limits - repos - .namespaces() - .update_column_limit("namespace_column_test", MaxColumnsPerTable::new(1)) - .await - .expect("namespace should be updateable"); - let err = repos - .columns() - .create_or_get("definitely unique", table.id, ColumnType::Tag) - .await - .expect_err("should error with table create limit error"); - assert!(matches!( - err, - Error::ColumnCreateLimitError { - column_name: _, - table_id: _, - } - )); - - // test per-namespace column limits are NOT enforced with create_or_get_many_unchecked - let table3 = arbitrary_table(&mut *repos, "test_table_3", &namespace).await; - let mut columns = HashMap::new(); - columns.insert("apples", ColumnType::Tag); - columns.insert("oranges", ColumnType::Tag); - let table3_columns = repos - .columns() - .create_or_get_many_unchecked(table3.id, columns) - .await - .unwrap(); - let mut table3_column_names: Vec<_> = table3_columns.iter().map(|c| &c.name).collect(); - table3_column_names.sort(); - assert_eq!(table3_column_names, vec!["apples", "oranges"]); - - repos - .namespaces() - .soft_delete("namespace_column_test") - .await - .expect("delete namespace should succeed"); - } - - async fn test_partition(catalog: Arc) { - let mut repos = catalog.repositories().await; - let namespace = arbitrary_namespace(&mut *repos, "namespace_partition_test").await; - let table = arbitrary_table(&mut *repos, "test_table", &namespace).await; - - let mut created = 
BTreeMap::new(); - // partition to use - let partition = repos - .partitions() - .create_or_get("foo".into(), table.id) - .await - .expect("failed to create partition"); - // Test: sort_key_ids from create_or_get - assert!(partition.sort_key_ids().is_empty()); - created.insert(partition.id, partition.clone()); - // partition to use - let partition_bar = repos - .partitions() - .create_or_get("bar".into(), table.id) - .await - .expect("failed to create partition"); - created.insert(partition_bar.id, partition_bar); - // partition to be skipped later - let to_skip_partition = repos - .partitions() - .create_or_get("asdf".into(), table.id) - .await - .unwrap(); - created.insert(to_skip_partition.id, to_skip_partition.clone()); - // partition to be skipped later - let to_skip_partition_too = repos - .partitions() - .create_or_get("asdf too".into(), table.id) - .await - .unwrap(); - created.insert(to_skip_partition_too.id, to_skip_partition_too.clone()); - - // partitions can be retrieved easily - let mut created_sorted = created.values().cloned().collect::>(); - created_sorted.sort_by_key(|p| p.id); - assert_eq!( - to_skip_partition, - repos - .partitions() - .get_by_id(to_skip_partition.id) - .await - .unwrap() - .unwrap() - ); - assert_eq!( - to_skip_partition, - repos - .partitions() - .get_by_hash_id(to_skip_partition.hash_id().unwrap()) - .await - .unwrap() - .unwrap() - ); - let non_existing_partition_id = PartitionId::new(i64::MAX); - let non_existing_partition_hash_id = - PartitionHashId::new(TableId::new(i64::MAX), &PartitionKey::from("arbitrary")); - assert!(repos - .partitions() - .get_by_id(non_existing_partition_id) - .await - .unwrap() - .is_none()); - assert!(repos - .partitions() - .get_by_hash_id(&non_existing_partition_hash_id) - .await - .unwrap() - .is_none()); - let mut batch = repos - .partitions() - .get_by_id_batch( - created - .keys() - .cloned() - .chain([non_existing_partition_id]) - .collect(), - ) - .await - .unwrap(); - batch.sort_by_key(|p| p.id); - assert_eq!(created_sorted, batch); - // Test: sort_key_ids from get_by_id_batch - assert!(batch.iter().all(|p| p.sort_key_ids().is_empty())); - let mut batch = repos - .partitions() - .get_by_hash_id_batch( - &created - .values() - .map(|p| p.hash_id().unwrap()) - .chain([&non_existing_partition_hash_id]) - .collect::>(), - ) - .await - .unwrap(); - batch.sort_by_key(|p| p.id); - // Test: sort_key_ids from get_by_hash_id_batch - assert!(batch.iter().all(|p| p.sort_key_ids().is_empty())); - assert_eq!(created_sorted, batch); - - let listed = repos - .partitions() - .list_by_table_id(table.id) - .await - .expect("failed to list partitions") - .into_iter() - .map(|v| (v.id, v)) - .collect::>(); - // Test: sort_key_ids from list_by_table_id - assert!(listed.values().all(|p| p.sort_key_ids().is_empty())); - - assert_eq!(created, listed); - - let listed = repos - .partitions() - .list_ids() - .await - .expect("failed to list partitions") - .into_iter() - .collect::>(); - - assert_eq!(created.keys().copied().collect::>(), listed); - - // The code no longer supports creating old-style partitions, so this list is always empty - // in these tests. See each catalog implementation for tests that insert old-style - // partitions directly and verify they're returned. 
- let old_style = repos.partitions().list_old_style().await.unwrap(); - assert!( - old_style.is_empty(), - "Expected no old-style partitions, got {old_style:?}" - ); - - // sort_key should be empty on creation - assert!(to_skip_partition.sort_key.is_empty()); - assert!(to_skip_partition.sort_key_ids.as_ref().is_empty()); - - // test that updates sort_key and sort_key_ids from None to Some - let updated_partition = repos - .partitions() - .cas_sort_key( - &to_skip_partition.transition_partition_id(), - None, - None, - &["tag2", "tag1", "time"], - &SortedColumnSet::from([2, 1, 3]), - ) - .await - .unwrap(); - - // verify sort_key and sort_key_ids are updated correctly - assert_eq!(updated_partition.sort_key, &["tag2", "tag1", "time"]); - assert_eq!( - updated_partition.sort_key_ids, - SortedColumnSet::from([2, 1, 3]) - ); - - // test that provides values of both old_sort_key and old_sort_key_ids but they do not match the existing ones - // --> the new sort key will not be updated - let err = repos - .partitions() - .cas_sort_key( - &to_skip_partition.transition_partition_id(), - Some(["bananas".to_string()].to_vec()), - Some(SortedColumnSet::from([1])), - &["tag2", "tag1", "tag3 , with comma", "time"], - &SortedColumnSet::from([1, 2, 3, 4]), - ) - .await - .expect_err("CAS with incorrect value should fail"); - // verify the sort key is not updated - assert_matches!(err, CasFailure::ValueMismatch((old_sort_key, old_sort_key_ids)) => { - assert_eq!(old_sort_key, &["tag2", "tag1", "time"]); - assert_eq!(old_sort_key_ids, SortedColumnSet::from([2, 1, 3])); - }); - - // test that provides matched old_sort_key but not-matched old_sort_key_ids - // --> the new sort key will not be updated - let err = repos - .partitions() - .cas_sort_key( - &to_skip_partition.transition_partition_id(), - Some(["tag2".to_string(), "tag1".to_string(), "time".to_string()].to_vec()), - Some(SortedColumnSet::from([1, 5, 10])), - &["tag2", "tag1", "tag3 , with comma", "time"], - &SortedColumnSet::from([1, 2, 3, 4]), - ) - .await - .expect_err("CAS with incorrect value should fail"); - // verify the sort key is not updated - assert_matches!(err, CasFailure::ValueMismatch((old_sort_key, old_sort_key_ids)) => { - assert_eq!(old_sort_key, &["tag2", "tag1", "time"]); - assert_eq!(old_sort_key_ids, SortedColumnSet::from([2, 1, 3])); - }); - - // test that provide None sort_key and None sort_key_ids that do not match with existing values that are not None - // --> the new sort key will not be updated - let err = repos - .partitions() - .cas_sort_key( - &to_skip_partition.transition_partition_id(), - None, - None, - &["tag2", "tag1", "tag3 , with comma", "time"], - &SortedColumnSet::from([1, 2, 3, 4]), - ) - .await - .expect_err("CAS with incorrect value should fail"); - assert_matches!(err, CasFailure::ValueMismatch((old_sort_key, old_sort_key_ids)) => { - assert_eq!(old_sort_key, &["tag2", "tag1", "time"]); - assert_eq!(old_sort_key_ids, SortedColumnSet::from([2, 1, 3])); - }); - - // test getting partition from partition id and verify values of sort_key and sort_key_ids - let updated_other_partition = repos - .partitions() - .get_by_id(to_skip_partition.id) - .await - .unwrap() - .unwrap(); - // still has the old sort key - assert_eq!( - updated_other_partition.sort_key, - vec!["tag2", "tag1", "time"] - ); - assert_eq!( - updated_other_partition.sort_key_ids, - SortedColumnSet::from([2, 1, 3]) - ); - - // test getting partition from hash_id and verify values of sort_key and sort_key_ids - let updated_other_partition = repos - 
.partitions() - .get_by_hash_id(to_skip_partition.hash_id().unwrap()) - .await - .unwrap() - .unwrap(); - // still has the old sort key - assert_eq!( - updated_other_partition.sort_key, - vec!["tag2", "tag1", "time"] - ); - assert_eq!( - updated_other_partition.sort_key_ids, - SortedColumnSet::from([2, 1, 3]) - ); - - // test that updates sort_key and sort_key_ids from Some matching values to Some other values - let updated_partition = repos - .partitions() - .cas_sort_key( - &to_skip_partition.transition_partition_id(), - Some( - ["tag2", "tag1", "time"] - .into_iter() - .map(ToString::to_string) - .collect(), - ), - Some(SortedColumnSet::from([2, 1, 3])), - &["tag2", "tag1", "tag3 , with comma", "time"], - &SortedColumnSet::from([2, 1, 4, 3]), - ) - .await - .unwrap(); - // verify the new values are updated - assert_eq!( - updated_partition.sort_key, - vec!["tag2", "tag1", "tag3 , with comma", "time"] - ); - assert_eq!( - updated_partition.sort_key_ids, - SortedColumnSet::from([2, 1, 4, 3]) - ); - - // test getting the new sort key from partition id - let updated_partition = repos - .partitions() - .get_by_id(to_skip_partition.id) - .await - .unwrap() - .unwrap(); - assert_eq!( - updated_partition.sort_key, - vec!["tag2", "tag1", "tag3 , with comma", "time"] - ); - assert_eq!( - updated_partition.sort_key_ids, - SortedColumnSet::from([2, 1, 4, 3]) - ); - - // test getting the new sort key from partition hash_id - let updated_partition = repos - .partitions() - .get_by_hash_id(to_skip_partition.hash_id().unwrap()) - .await - .unwrap() - .unwrap(); - assert_eq!( - updated_partition.sort_key, - vec!["tag2", "tag1", "tag3 , with comma", "time"] - ); - assert_eq!( - updated_partition.sort_key_ids, - SortedColumnSet::from([2, 1, 4, 3]) - ); - - // use to_skip_partition_too to update sort key from empty old values - // first make sure the old values are empty - assert!(to_skip_partition_too.sort_key.is_empty()); - assert!(to_skip_partition_too.sort_key_ids.as_ref().is_empty()); - - // test that provides empty old_sort_key and empty old_sort_key_ids - // --> the new sort key will be updated - let updated_to_skip_partition_too = repos - .partitions() - .cas_sort_key( - &to_skip_partition_too.transition_partition_id(), - Some(vec![]), - Some(SortedColumnSet::from([])), - &["tag3", "time"], - &SortedColumnSet::from([3, 4]), - ) - .await - .unwrap(); - // verify the new values are updated - assert_eq!(updated_to_skip_partition_too.sort_key, vec!["tag3", "time"]); - assert_eq!( - updated_to_skip_partition_too.sort_key_ids, - SortedColumnSet::from([3, 4]) - ); - - // The compactor can log why compaction was skipped - let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap(); - assert!( - skipped_compactions.is_empty(), - "Expected no skipped compactions, got: {skipped_compactions:?}" - ); - repos - .partitions() - .record_skipped_compaction(to_skip_partition.id, "I am le tired", 1, 2, 4, 10, 20) - .await - .unwrap(); - let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap(); - assert_eq!(skipped_compactions.len(), 1); - assert_eq!(skipped_compactions[0].partition_id, to_skip_partition.id); - assert_eq!(skipped_compactions[0].reason, "I am le tired"); - assert_eq!(skipped_compactions[0].num_files, 1); - assert_eq!(skipped_compactions[0].limit_num_files, 2); - assert_eq!(skipped_compactions[0].estimated_bytes, 10); - assert_eq!(skipped_compactions[0].limit_bytes, 20); - // - let skipped_partition_records = repos - .partitions() - 
.get_in_skipped_compactions(&[to_skip_partition.id]) - .await - .unwrap(); - assert_eq!( - skipped_partition_records[0].partition_id, - to_skip_partition.id - ); - assert_eq!(skipped_partition_records[0].reason, "I am le tired"); - - // Only save the last reason that any particular partition was skipped (really if the - // partition appears in the skipped compactions, it shouldn't become a compaction candidate - // again, but race conditions and all that) - repos - .partitions() - .record_skipped_compaction(to_skip_partition.id, "I'm on fire", 11, 12, 24, 110, 120) - .await - .unwrap(); - let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap(); - assert_eq!(skipped_compactions.len(), 1); - assert_eq!(skipped_compactions[0].partition_id, to_skip_partition.id); - assert_eq!(skipped_compactions[0].reason, "I'm on fire"); - assert_eq!(skipped_compactions[0].num_files, 11); - assert_eq!(skipped_compactions[0].limit_num_files, 12); - assert_eq!(skipped_compactions[0].estimated_bytes, 110); - assert_eq!(skipped_compactions[0].limit_bytes, 120); - // - let skipped_partition_records = repos - .partitions() - .get_in_skipped_compactions(&[to_skip_partition.id]) - .await - .unwrap(); - assert_eq!( - skipped_partition_records[0].partition_id, - to_skip_partition.id - ); - assert_eq!(skipped_partition_records[0].reason, "I'm on fire"); - - // Can receive multiple skipped compactions for different partitions - repos - .partitions() - .record_skipped_compaction( - to_skip_partition_too.id, - "I am le tired too", - 1, - 2, - 4, - 10, - 20, - ) - .await - .unwrap(); - let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap(); - assert_eq!(skipped_compactions.len(), 2); - assert_eq!(skipped_compactions[0].partition_id, to_skip_partition.id); - assert_eq!( - skipped_compactions[1].partition_id, - to_skip_partition_too.id - ); - // confirm can fetch subset of skipped compactions (a.k.a. 
have two, only fetch 1) - let skipped_partition_records = repos - .partitions() - .get_in_skipped_compactions(&[to_skip_partition.id]) - .await - .unwrap(); - assert_eq!(skipped_partition_records.len(), 1); - assert_eq!(skipped_compactions[0].partition_id, to_skip_partition.id); - let skipped_partition_records = repos - .partitions() - .get_in_skipped_compactions(&[to_skip_partition_too.id]) - .await - .unwrap(); - assert_eq!(skipped_partition_records.len(), 1); - assert_eq!( - skipped_partition_records[0].partition_id, - to_skip_partition_too.id - ); - // confirm can fetch both skipped compactions, and not the unskipped one - // also confirm will not error on non-existing partition - let non_existing_partition_id = PartitionId::new(9999); - let skipped_partition_records = repos - .partitions() - .get_in_skipped_compactions(&[ - partition.id, - to_skip_partition.id, - to_skip_partition_too.id, - non_existing_partition_id, - ]) - .await - .unwrap(); - assert_eq!(skipped_partition_records.len(), 2); - assert_eq!( - skipped_partition_records[0].partition_id, - to_skip_partition.id - ); - assert_eq!( - skipped_partition_records[1].partition_id, - to_skip_partition_too.id - ); - - // Delete the skipped compactions - let deleted_skipped_compaction = repos - .partitions() - .delete_skipped_compactions(to_skip_partition.id) - .await - .unwrap() - .expect("The skipped compaction should have been returned"); - assert_eq!( - deleted_skipped_compaction.partition_id, - to_skip_partition.id - ); - assert_eq!(deleted_skipped_compaction.reason, "I'm on fire"); - assert_eq!(deleted_skipped_compaction.num_files, 11); - assert_eq!(deleted_skipped_compaction.limit_num_files, 12); - assert_eq!(deleted_skipped_compaction.estimated_bytes, 110); - assert_eq!(deleted_skipped_compaction.limit_bytes, 120); - // - let deleted_skipped_compaction = repos - .partitions() - .delete_skipped_compactions(to_skip_partition_too.id) - .await - .unwrap() - .expect("The skipped compaction should have been returned"); - assert_eq!( - deleted_skipped_compaction.partition_id, - to_skip_partition_too.id - ); - assert_eq!(deleted_skipped_compaction.reason, "I am le tired too"); - // - let skipped_partition_records = repos - .partitions() - .get_in_skipped_compactions(&[to_skip_partition.id]) - .await - .unwrap(); - assert!(skipped_partition_records.is_empty()); - - let not_deleted_skipped_compaction = repos - .partitions() - .delete_skipped_compactions(to_skip_partition.id) - .await - .unwrap(); - - assert!( - not_deleted_skipped_compaction.is_none(), - "There should be no skipped compation", - ); - - let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap(); - assert!( - skipped_compactions.is_empty(), - "Expected no skipped compactions, got: {skipped_compactions:?}" - ); - - let recent = repos - .partitions() - .most_recent_n(10) - .await - .expect("should list most recent"); - assert_eq!(recent.len(), 4); - - // Test: sort_key_ids from most_recent_n - // Only the first two partitions (represent to_skip_partition_too and to_skip_partition) have vallues, the others are empty - let empty_vec_string: Vec = vec![]; - - assert_eq!( - recent[0].sort_key, - vec!["tag3".to_string(), "time".to_string(),] - ); - assert_eq!(recent[0].sort_key_ids, SortedColumnSet::from(vec![3, 4])); - - assert_eq!( - recent[1].sort_key, - vec![ - "tag2".to_string(), - "tag1".to_string(), - "tag3 , with comma".to_string(), - "time".to_string() - ] - ); - assert_eq!( - recent[1].sort_key_ids, - SortedColumnSet::from(vec![2, 1, 4, 
3]) - ); - - assert_eq!(recent[2].sort_key, empty_vec_string); - assert_eq!(recent[2].sort_key_ids, SortedColumnSet::from(vec![])); - - assert_eq!(recent[3].sort_key, empty_vec_string); - assert_eq!(recent[3].sort_key_ids, SortedColumnSet::from(vec![])); - - let recent = repos - .partitions() - .most_recent_n(4) - .await - .expect("should list most recent"); - assert_eq!(recent.len(), 4); // no off by one error - - let recent = repos - .partitions() - .most_recent_n(2) - .await - .expect("should list most recent"); - assert_eq!(recent.len(), 2); - - repos - .namespaces() - .soft_delete("namespace_partition_test") - .await - .expect("delete namespace should succeed"); - } - - /// tests many interactions with the catalog and parquet files. See the individual conditions - /// herein - async fn test_parquet_file(catalog: Arc) { - let mut repos = catalog.repositories().await; - let namespace = arbitrary_namespace(&mut *repos, "namespace_parquet_file_test").await; - let table = arbitrary_table(&mut *repos, "test_table", &namespace).await; - let other_table = arbitrary_table(&mut *repos, "other", &namespace).await; - let partition = repos - .partitions() - .create_or_get("one".into(), table.id) - .await - .unwrap(); - let other_partition = repos - .partitions() - .create_or_get("one".into(), other_table.id) - .await - .unwrap(); - - let parquet_file_params = arbitrary_parquet_file_params(&namespace, &table, &partition); - let parquet_file = repos - .parquet_files() - .create(parquet_file_params.clone()) - .await - .unwrap(); - - // verify we can get it by its object store id - let pfg = repos - .parquet_files() - .get_by_object_store_id(parquet_file.object_store_id) - .await - .unwrap(); - assert_eq!(parquet_file, pfg.unwrap()); - - // verify that trying to create a file with the same UUID throws an error - let err = repos - .parquet_files() - .create(parquet_file_params.clone()) - .await - .unwrap_err(); - assert!(matches!(err, Error::FileExists { object_store_id: _ })); - - let other_params = ParquetFileParams { - table_id: other_partition.table_id, - partition_id: other_partition.transition_partition_id(), - object_store_id: Uuid::new_v4(), - min_time: Timestamp::new(50), - max_time: Timestamp::new(60), - ..parquet_file_params.clone() - }; - let other_file = repos.parquet_files().create(other_params).await.unwrap(); - - let exist_id = parquet_file.id; - let non_exist_id = ParquetFileId::new(other_file.id.get() + 10); - // make sure exists_id != non_exist_id - assert_ne!(exist_id, non_exist_id); - - // verify that to_delete is initially set to null and the file does not get deleted - assert!(parquet_file.to_delete.is_none()); - let older_than = Timestamp::new( - (catalog.time_provider().now() + Duration::from_secs(100)).timestamp_nanos(), - ); - let deleted = repos - .parquet_files() - .delete_old_ids_only(older_than) - .await - .unwrap(); - assert!(deleted.is_empty()); - - // test list_all that includes soft-deleted file - // at this time the file is not soft-deleted yet and will be included in the returned list - let files = repos.parquet_files().list_all().await.unwrap(); - assert_eq!(files.len(), 2); - - // verify to_delete can be updated to a timestamp - repos - .parquet_files() - .create_upgrade_delete(&[parquet_file.id], &[], &[], CompactionLevel::Initial) - .await - .unwrap(); - - // test list_all that includes soft-deleted file - // at this time the file is soft-deleted and will be included in the returned list - let files = repos.parquet_files().list_all().await.unwrap(); - 
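A minimal sketch of the two-step parquet file removal that the assertions above exercise, written against the pre-change interface used in these tests; the wrapper name, the `older_than` parameter, and the exact `Result` shape are assumptions for illustration only.

use data_types::{CompactionLevel, ParquetFileId, Timestamp};

use crate::interface::{Error, RepoCollection};

/// Sketch: soft-delete one file, then hard-delete every row whose `to_delete`
/// timestamp is older than `older_than`, returning the removed ids.
async fn retire_file<R>(
    repos: &mut R,
    id: ParquetFileId,
    older_than: Timestamp,
) -> Result<Vec<ParquetFileId>, Error>
where
    R: RepoCollection + ?Sized,
{
    // Step 1: mark the file as deleted (sets `to_delete`) without creating or
    // upgrading anything else; the level argument only matters for upgrades.
    repos
        .parquet_files()
        .create_upgrade_delete(&[id], &[], &[], CompactionLevel::Initial)
        .await?;

    // Step 2: hard-delete rows marked before the cutoff. Rows with
    // `to_delete = NULL` are never removed by this call.
    repos.parquet_files().delete_old_ids_only(older_than).await
}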
assert_eq!(files.len(), 2); - let marked_deleted = files - .iter() - .find(|f| f.to_delete.is_some()) - .cloned() - .unwrap(); - - // File is not deleted if it was marked to be deleted after the specified time - let before_deleted = Timestamp::new( - (catalog.time_provider().now() - Duration::from_secs(100)).timestamp_nanos(), - ); - let deleted = repos - .parquet_files() - .delete_old_ids_only(before_deleted) - .await - .unwrap(); - assert!(deleted.is_empty()); - - // test list_all that includes soft-deleted file - // at this time the file is not actually hard deleted yet and stay as soft deleted - // and will be returned in the list - let files = repos.parquet_files().list_all().await.unwrap(); - assert_eq!(files.len(), 2); - - // File is deleted if it was marked to be deleted before the specified time - let deleted = repos - .parquet_files() - .delete_old_ids_only(older_than) - .await - .unwrap(); - assert_eq!(deleted.len(), 1); - assert_eq!(marked_deleted.id, deleted[0]); - - // test list_all that includes soft-deleted file - // at this time the file is hard deleted -> the returned list is empty - let files = repos.parquet_files().list_all().await.unwrap(); - assert_eq!(files.len(), 1); - - // test list_by_table_not_to_delete - let files = repos - .parquet_files() - .list_by_table_not_to_delete(table.id) - .await - .unwrap(); - assert_eq!(files, vec![]); - let files = repos - .parquet_files() - .list_by_table_not_to_delete(other_table.id) - .await - .unwrap(); - assert_eq!(files, vec![other_file.clone()]); - - // test list_all - let files = repos.parquet_files().list_all().await.unwrap(); - assert_eq!(vec![other_file.clone()], files); - - // test list_by_namespace_not_to_delete - let namespace2 = arbitrary_namespace(&mut *repos, "namespace_parquet_file_test1").await; - let table2 = arbitrary_table(&mut *repos, "test_table2", &namespace2).await; - let partition2 = repos - .partitions() - .create_or_get("foo".into(), table2.id) - .await - .unwrap(); - let files = repos - .parquet_files() - .list_by_namespace_not_to_delete(namespace2.id) - .await - .unwrap(); - assert!(files.is_empty()); - - let f1_params = ParquetFileParams { - table_id: partition2.table_id, - partition_id: partition2.transition_partition_id(), - object_store_id: Uuid::new_v4(), - min_time: Timestamp::new(1), - max_time: Timestamp::new(10), - ..parquet_file_params - }; - let f1 = repos - .parquet_files() - .create(f1_params.clone()) - .await - .unwrap(); - - let f2_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - min_time: Timestamp::new(50), - max_time: Timestamp::new(60), - ..f1_params.clone() - }; - let f2 = repos - .parquet_files() - .create(f2_params.clone()) - .await - .unwrap(); - let files = repos - .parquet_files() - .list_by_namespace_not_to_delete(namespace2.id) - .await - .unwrap(); - assert_eq!(vec![f1.clone(), f2.clone()], files); - - let f3_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - min_time: Timestamp::new(50), - max_time: Timestamp::new(60), - ..f2_params - }; - let f3 = repos - .parquet_files() - .create(f3_params.clone()) - .await - .unwrap(); - let files = repos - .parquet_files() - .list_by_namespace_not_to_delete(namespace2.id) - .await - .unwrap(); - assert_eq!(vec![f1.clone(), f2.clone(), f3.clone()], files); - - repos - .parquet_files() - .create_upgrade_delete(&[f2.id], &[], &[], CompactionLevel::Initial) - .await - .unwrap(); - let files = repos - .parquet_files() - .list_by_namespace_not_to_delete(namespace2.id) - .await - .unwrap(); - 
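The `create_upgrade_delete` call used above (and again below for the f5/f1/f6 case) is effectively the compactor's commit point: it soft-deletes the inputs, bumps the level of kept files, and registers the new outputs in one shot. A hedged sketch of that usage, following the argument order shown in these tests; the wrapper name and the `Result<(), Error>` return are illustrative.

use data_types::{CompactionLevel, ParquetFileId, ParquetFileParams};

use crate::interface::{Error, RepoCollection};

/// Sketch: commit one compaction round in a single catalog call.
async fn commit_compaction<R>(
    repos: &mut R,
    delete: &[ParquetFileId],
    upgrade: &[ParquetFileId],
    create: &[ParquetFileParams],
    target_level: CompactionLevel,
) -> Result<(), Error>
where
    R: RepoCollection + ?Sized,
{
    // All three lists are applied together; if a new file collides on its object
    // store id the call fails (surfaced as a file-exists error in this interface)
    // and none of the deletes or upgrades take effect.
    let _created = repos
        .parquet_files()
        .create_upgrade_delete(delete, upgrade, create, target_level)
        .await?;
    Ok(())
}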
assert_eq!(vec![f1.clone(), f3.clone()], files); - - let files = repos - .parquet_files() - .list_by_namespace_not_to_delete(NamespaceId::new(i64::MAX)) - .await - .unwrap(); - assert!(files.is_empty()); - - // test delete_old_ids_only - let older_than = Timestamp::new( - (catalog.time_provider().now() + Duration::from_secs(100)).timestamp_nanos(), - ); - let ids = repos - .parquet_files() - .delete_old_ids_only(older_than) - .await - .unwrap(); - assert_eq!(ids.len(), 1); - - // test retention-based flagging for deletion - // Since mem catalog has default retention 1 hour, let us first set it to 0 means infinite - let namespaces = repos - .namespaces() - .list(SoftDeletedRows::AllRows) - .await - .expect("listing namespaces"); - for namespace in namespaces { - repos - .namespaces() - .update_retention_period(&namespace.name, None) // infinite - .await - .unwrap(); - } - - // 1. with no retention period set on the ns, nothing should get flagged - let ids = repos - .parquet_files() - .flag_for_delete_by_retention() - .await - .unwrap(); - assert!(ids.is_empty()); - // 2. set ns retention period to one hour then create some files before and after and - // ensure correct files get deleted - repos - .namespaces() - .update_retention_period(&namespace.name, Some(60 * 60 * 1_000_000_000)) // 1 hour - .await - .unwrap(); - let f4_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - max_time: Timestamp::new( - // a bit over an hour ago - (catalog.time_provider().now() - Duration::from_secs(60 * 65)).timestamp_nanos(), - ), - ..f3_params - }; - let f4 = repos - .parquet_files() - .create(f4_params.clone()) - .await - .unwrap(); - let f5_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - max_time: Timestamp::new( - // a bit under an hour ago - (catalog.time_provider().now() - Duration::from_secs(60 * 55)).timestamp_nanos(), - ), - ..f4_params - }; - let f5 = repos - .parquet_files() - .create(f5_params.clone()) - .await - .unwrap(); - let ids = repos - .parquet_files() - .flag_for_delete_by_retention() - .await - .unwrap(); - assert!(ids.len() > 1); // it's also going to flag f1, f2 & f3 because they have low max - // timestamps but i don't want this test to be brittle if those - // values change so i'm not asserting len == 4 - let f4 = repos - .parquet_files() - .get_by_object_store_id(f4.object_store_id) - .await - .unwrap() - .unwrap(); - assert_matches!(f4.to_delete, Some(_)); // f4 is > 1hr old - let f5 = repos - .parquet_files() - .get_by_object_store_id(f5.object_store_id) - .await - .unwrap() - .unwrap(); - assert_matches!(f5.to_delete, None); // f5 is < 1hr old - - // call flag_for_delete_by_retention() again and nothing should be flagged because they've - // already been flagged - let ids = repos - .parquet_files() - .flag_for_delete_by_retention() - .await - .unwrap(); - assert!(ids.is_empty()); - - // test that flag_for_delete_by_retention respects UPDATE LIMIT - // create limit + the meaning of life parquet files that are all older than the retention (>1hr) - const LIMIT: usize = 1000; - const MOL: usize = 42; - for _ in 0..LIMIT + MOL { - let params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - max_time: Timestamp::new( - // a bit over an hour ago - (catalog.time_provider().now() - Duration::from_secs(60 * 65)) - .timestamp_nanos(), - ), - ..f1_params.clone() - }; - repos.parquet_files().create(params.clone()).await.unwrap(); - } - let ids = repos - .parquet_files() - .flag_for_delete_by_retention() - .await - .unwrap(); - assert_eq!(ids.len(), 
LIMIT); - let ids = repos - .parquet_files() - .flag_for_delete_by_retention() - .await - .unwrap(); - assert_eq!(ids.len(), MOL); // second call took remainder - let ids = repos - .parquet_files() - .flag_for_delete_by_retention() - .await - .unwrap(); - assert_eq!(ids.len(), 0); // none left - - // test create_update_delete - let f6_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - ..f5_params - }; - let f1_uuid = f1.object_store_id; - let f5_uuid = f5.object_store_id; - let cud = repos - .parquet_files() - .create_upgrade_delete( - &[f5.id], - &[f1.id], - &[f6_params.clone()], - CompactionLevel::Final, - ) - .await - .unwrap(); - - assert_eq!(cud.len(), 1); - let f5_delete = repos - .parquet_files() - .get_by_object_store_id(f5_uuid) - .await - .unwrap() - .unwrap(); - assert_matches!(f5_delete.to_delete, Some(_)); - - let f1_compaction_level = repos - .parquet_files() - .get_by_object_store_id(f1_uuid) - .await - .unwrap() - .unwrap(); - assert_matches!(f1_compaction_level.compaction_level, CompactionLevel::Final); - - let f6 = repos - .parquet_files() - .get_by_object_store_id(f6_params.object_store_id) - .await - .unwrap() - .unwrap(); - - let f6_uuid = f6.object_store_id; - - // test create_update_delete transaction (rollsback because f6 already exists) - let cud = repos - .parquet_files() - .create_upgrade_delete( - &[f5.id], - &[f2.id], - &[f6_params.clone()], - CompactionLevel::Final, - ) - .await; - - assert_matches!( - cud, - Err(Error::FileExists { - object_store_id - }) if object_store_id == f6_params.object_store_id - ); - - let f6_not_delete = repos - .parquet_files() - .get_by_object_store_id(f6_uuid) - .await - .unwrap() - .unwrap(); - assert_matches!(f6_not_delete.to_delete, None); - - // test exists_by_object_store_id_batch returns parquet files by object store id - let does_not_exist = Uuid::new_v4(); - let mut present = repos - .parquet_files() - .exists_by_object_store_id_batch(vec![f6_uuid, f1_uuid, does_not_exist]) - .await - .unwrap(); - assert_eq!(present.len(), 2); - let mut expected = vec![f6_uuid, f1_uuid]; - present.sort(); - expected.sort(); - assert_eq!(present, expected); - } - - async fn test_parquet_file_delete_broken(catalog: Arc) { - let mut repos = catalog.repositories().await; - let namespace_1 = arbitrary_namespace(&mut *repos, "retention_broken_1").await; - let namespace_2 = repos - .namespaces() - .create( - &NamespaceName::new("retention_broken_2").unwrap(), - None, - Some(1), - None, - ) - .await - .unwrap(); - let table_1 = arbitrary_table(&mut *repos, "test_table", &namespace_1).await; - let table_2 = arbitrary_table(&mut *repos, "test_table", &namespace_2).await; - let partition_1 = repos - .partitions() - .create_or_get("one".into(), table_1.id) - .await - .unwrap(); - let partition_2 = repos - .partitions() - .create_or_get("one".into(), table_2.id) - .await - .unwrap(); - - let parquet_file_params_1 = - arbitrary_parquet_file_params(&namespace_1, &table_1, &partition_1); - let parquet_file_params_2 = - arbitrary_parquet_file_params(&namespace_2, &table_2, &partition_2); - let _parquet_file_1 = repos - .parquet_files() - .create(parquet_file_params_1) - .await - .unwrap(); - let parquet_file_2 = repos - .parquet_files() - .create(parquet_file_params_2) - .await - .unwrap(); - - let ids = repos - .parquet_files() - .flag_for_delete_by_retention() - .await - .unwrap(); - assert_eq!(ids, vec![parquet_file_2.id]); - } - - async fn test_partitions_new_file_between(catalog: Arc) { - let mut repos = catalog.repositories().await; - 
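Before the partition-window tests below, a note on the batching behaviour exercised just above: `flag_for_delete_by_retention` flags at most one batch per call (1000 rows in the test), so a caller that wants everything flagged loops until a call returns nothing. A sketch of that drain loop, with the function name and return shape assumed.

use data_types::ParquetFileId;

use crate::interface::{Error, RepoCollection};

/// Sketch: repeatedly flag retention-expired files until no more are eligible.
async fn flag_all_expired<R>(repos: &mut R) -> Result<Vec<ParquetFileId>, Error>
where
    R: RepoCollection + ?Sized,
{
    let mut all = Vec::new();
    loop {
        // Each call flags at most one batch and returns the ids it touched.
        let batch = repos.parquet_files().flag_for_delete_by_retention().await?;
        if batch.is_empty() {
            break;
        }
        all.extend(batch);
    }
    Ok(all)
}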
let namespace = arbitrary_namespace(&mut *repos, "test_partitions_new_file_between").await; - let table = - arbitrary_table(&mut *repos, "test_table_for_new_file_between", &namespace).await; - - // param for the tests - let time_now = Timestamp::from(catalog.time_provider().now()); - let time_one_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(1)); - let time_two_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(2)); - let time_three_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(3)); - let time_five_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(5)); - let time_six_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(6)); - - // Db has no partitions - let partitions = repos - .partitions() - .partitions_new_file_between(time_two_hour_ago, None) - .await - .unwrap(); - assert!(partitions.is_empty()); - - // ----------------- - // PARTITION one - // The DB has 1 partition but it does not have any file - let partition1 = repos - .partitions() - .create_or_get("one".into(), table.id) - .await - .unwrap(); - let partitions = repos - .partitions() - .partitions_new_file_between(time_two_hour_ago, None) - .await - .unwrap(); - assert!(partitions.is_empty()); - - // create files for partition one - let parquet_file_params = arbitrary_parquet_file_params(&namespace, &table, &partition1); - - // create a deleted L0 file that was created 3 hours ago - let delete_l0_file = repos - .parquet_files() - .create(parquet_file_params.clone()) - .await - .unwrap(); - repos - .parquet_files() - .create_upgrade_delete(&[delete_l0_file.id], &[], &[], CompactionLevel::Initial) - .await - .unwrap(); - let partitions = repos - .partitions() - .partitions_new_file_between(time_two_hour_ago, None) - .await - .unwrap(); - assert!(partitions.is_empty()); - let partitions = repos - .partitions() - .partitions_new_file_between(time_two_hour_ago, Some(time_one_hour_ago)) - .await - .unwrap(); - assert!(partitions.is_empty()); - let partitions = repos - .partitions() - .partitions_new_file_between(time_three_hour_ago, Some(time_one_hour_ago)) - .await - .unwrap(); - assert!(partitions.is_empty()); - - // create a deleted L0 file that was created 1 hour ago - let l0_one_hour_ago_file_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - created_at: time_one_hour_ago, - ..parquet_file_params.clone() - }; - repos - .parquet_files() - .create(l0_one_hour_ago_file_params.clone()) - .await - .unwrap(); - // partition one should be returned - let partitions = repos - .partitions() - .partitions_new_file_between(time_two_hour_ago, None) - .await - .unwrap(); - assert_eq!(partitions.len(), 1); - assert_eq!(partitions[0], partition1.id); - let partitions = repos - .partitions() - .partitions_new_file_between(time_two_hour_ago, Some(time_now)) - .await - .unwrap(); - assert_eq!(partitions.len(), 1); - assert_eq!(partitions[0], partition1.id); - let partitions = repos - .partitions() - .partitions_new_file_between(time_three_hour_ago, Some(time_now)) - .await - .unwrap(); - assert_eq!(partitions.len(), 1); - assert_eq!(partitions[0], partition1.id); - let partitions = repos - .partitions() - .partitions_new_file_between(time_three_hour_ago, Some(time_two_hour_ago)) - .await - .unwrap(); - assert!(partitions.is_empty()); - - // ----------------- - // PARTITION two - // Partition two without any file - let partition2 = repos - .partitions() - .create_or_get("two".into(), table.id) - .await - .unwrap(); - // should return partition one only - let partitions = repos 
- .partitions() - .partitions_new_file_between(time_two_hour_ago, None) - .await - .unwrap(); - assert_eq!(partitions.len(), 1); - assert_eq!(partitions[0], partition1.id); - let partitions = repos - .partitions() - .partitions_new_file_between(time_three_hour_ago, Some(time_now)) - .await - .unwrap(); - assert_eq!(partitions.len(), 1); - assert_eq!(partitions[0], partition1.id); - - // Add a L0 file created 5 hours ago - let l0_five_hour_ago_file_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - created_at: time_five_hour_ago, - partition_id: partition2.transition_partition_id(), - ..parquet_file_params.clone() - }; - repos - .parquet_files() - .create(l0_five_hour_ago_file_params.clone()) - .await - .unwrap(); - // still return partition one only - let partitions = repos - .partitions() - .partitions_new_file_between(time_two_hour_ago, None) - .await - .unwrap(); - assert_eq!(partitions.len(), 1); - assert_eq!(partitions[0], partition1.id); - let partitions = repos - .partitions() - .partitions_new_file_between(time_three_hour_ago, Some(time_now)) - .await - .unwrap(); - assert_eq!(partitions.len(), 1); - assert_eq!(partitions[0], partition1.id); - let partitions = repos - .partitions() - .partitions_new_file_between(time_three_hour_ago, Some(time_now)) - .await - .unwrap(); - assert_eq!(partitions.len(), 1); - assert_eq!(partitions[0], partition1.id); - // Between six and three hours ago, return only partition 2 - let partitions = repos - .partitions() - .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) - .await - .unwrap(); - assert_eq!(partitions.len(), 1); - assert_eq!(partitions[0], partition2.id); - - // Add an L1 file created just now - let l1_file_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - created_at: time_now, - partition_id: partition2.transition_partition_id(), - compaction_level: CompactionLevel::FileNonOverlapped, - ..parquet_file_params.clone() - }; - repos - .parquet_files() - .create(l1_file_params.clone()) - .await - .unwrap(); - // should return both partitions - let mut partitions = repos - .partitions() - .partitions_new_file_between(time_two_hour_ago, None) - .await - .unwrap(); - assert_eq!(partitions.len(), 2); - partitions.sort(); - assert_eq!(partitions[0], partition1.id); - assert_eq!(partitions[1], partition2.id); - // Only return partition1: the creation time must be strictly less than the maximum time, - // not equal - let mut partitions = repos - .partitions() - .partitions_new_file_between(time_three_hour_ago, Some(time_now)) - .await - .unwrap(); - assert_eq!(partitions.len(), 1); - partitions.sort(); - assert_eq!(partitions[0], partition1.id); - // Between six and three hours ago, return none - let partitions = repos - .partitions() - .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) - .await - .unwrap(); - assert!(partitions.is_empty()); - - // ----------------- - // PARTITION three - // Partition three without any file - let partition3 = repos - .partitions() - .create_or_get("three".into(), table.id) - .await - .unwrap(); - // should return partition one and two only - let mut partitions = repos - .partitions() - .partitions_new_file_between(time_two_hour_ago, None) - .await - .unwrap(); - assert_eq!(partitions.len(), 2); - partitions.sort(); - assert_eq!(partitions[0], partition1.id); - assert_eq!(partitions[1], partition2.id); - // Only return partition1: the creation time must be strictly less than the maximum time, - // not equal - let partitions = repos - 
.partitions() - .partitions_new_file_between(time_three_hour_ago, Some(time_now)) - .await - .unwrap(); - assert_eq!(partitions.len(), 1); - assert_eq!(partitions[0], partition1.id); - // When the maximum time is greater than the creation time of partition2, return it - let mut partitions = repos - .partitions() - .partitions_new_file_between(time_three_hour_ago, Some(time_now + 1)) - .await - .unwrap(); - assert_eq!(partitions.len(), 2); - partitions.sort(); - assert_eq!(partitions[0], partition1.id); - assert_eq!(partitions[1], partition2.id); - // Between six and three hours ago, return none - let partitions = repos - .partitions() - .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) - .await - .unwrap(); - assert!(partitions.is_empty()); - - // Add an L2 file created just now for partition three - let l2_file_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - created_at: time_now, - partition_id: partition3.transition_partition_id(), - compaction_level: CompactionLevel::Final, - ..parquet_file_params.clone() - }; - repos - .parquet_files() - .create(l2_file_params.clone()) - .await - .unwrap(); - // now should return partition one two and three - let mut partitions = repos - .partitions() - .partitions_new_file_between(time_two_hour_ago, None) - .await - .unwrap(); - assert_eq!(partitions.len(), 3); - partitions.sort(); - assert_eq!(partitions[0], partition1.id); - assert_eq!(partitions[1], partition2.id); - assert_eq!(partitions[2], partition3.id); - // Only return partition1: the creation time must be strictly less than the maximum time, - // not equal - let partitions = repos - .partitions() - .partitions_new_file_between(time_three_hour_ago, Some(time_now)) - .await - .unwrap(); - assert_eq!(partitions.len(), 1); - assert_eq!(partitions[0], partition1.id); - // Between six and three hours ago, return none - let partitions = repos - .partitions() - .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) - .await - .unwrap(); - assert!(partitions.is_empty()); - - // add an L0 file created one hour ago for partition three - let l0_one_hour_ago_file_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - created_at: time_one_hour_ago, - partition_id: partition3.transition_partition_id(), - ..parquet_file_params.clone() - }; - repos - .parquet_files() - .create(l0_one_hour_ago_file_params.clone()) - .await - .unwrap(); - // should return all partitions - let mut partitions = repos - .partitions() - .partitions_new_file_between(time_two_hour_ago, None) - .await - .unwrap(); - assert_eq!(partitions.len(), 3); - partitions.sort(); - assert_eq!(partitions[0], partition1.id); - assert_eq!(partitions[1], partition2.id); - assert_eq!(partitions[2], partition3.id); - // Only return partitions 1 and 3; 2 was created just now - let mut partitions = repos - .partitions() - .partitions_new_file_between(time_three_hour_ago, Some(time_now)) - .await - .unwrap(); - assert_eq!(partitions.len(), 2); - partitions.sort(); - assert_eq!(partitions[0], partition1.id); - assert_eq!(partitions[1], partition3.id); - // Between six and three hours ago, return none - let partitions = repos - .partitions() - .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) - .await - .unwrap(); - assert!(partitions.is_empty()); - } - - async fn test_list_by_partiton_not_to_delete(catalog: Arc) { - let mut repos = catalog.repositories().await; - let namespace = arbitrary_namespace( - &mut *repos, - 
"namespace_parquet_file_test_list_by_partiton_not_to_delete", - ) - .await; - let table = arbitrary_table(&mut *repos, "test_table", &namespace).await; - - let partition = repos - .partitions() - .create_or_get("test_list_by_partiton_not_to_delete_one".into(), table.id) - .await - .unwrap(); - let partition2 = repos - .partitions() - .create_or_get("test_list_by_partiton_not_to_delete_two".into(), table.id) - .await - .unwrap(); - - let parquet_file_params = arbitrary_parquet_file_params(&namespace, &table, &partition); - - let parquet_file = repos - .parquet_files() - .create(parquet_file_params.clone()) - .await - .unwrap(); - let delete_file_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - ..parquet_file_params.clone() - }; - let delete_file = repos - .parquet_files() - .create(delete_file_params) - .await - .unwrap(); - repos - .parquet_files() - .create_upgrade_delete(&[delete_file.id], &[], &[], CompactionLevel::Initial) - .await - .unwrap(); - let level1_file_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - ..parquet_file_params.clone() - }; - let mut level1_file = repos - .parquet_files() - .create(level1_file_params) - .await - .unwrap(); - repos - .parquet_files() - .create_upgrade_delete( - &[], - &[level1_file.id], - &[], - CompactionLevel::FileNonOverlapped, - ) - .await - .unwrap(); - level1_file.compaction_level = CompactionLevel::FileNonOverlapped; - - let other_partition_params = ParquetFileParams { - partition_id: partition2.transition_partition_id(), - object_store_id: Uuid::new_v4(), - ..parquet_file_params.clone() - }; - let _partition2_file = repos - .parquet_files() - .create(other_partition_params) - .await - .unwrap(); - - let files = repos - .parquet_files() - .list_by_partition_not_to_delete(&partition.transition_partition_id()) - .await - .unwrap(); - assert_eq!(files.len(), 2); - - let mut file_ids: Vec<_> = files.into_iter().map(|f| f.id).collect(); - file_ids.sort(); - let mut expected_ids = vec![parquet_file.id, level1_file.id]; - expected_ids.sort(); - assert_eq!(file_ids, expected_ids); - - // Using the catalog partition ID should return the same files, even if the Parquet file - // records don't have the partition ID on them (which is the default now) - let files = repos - .parquet_files() - .list_by_partition_not_to_delete(&TransitionPartitionId::Deprecated(partition.id)) - .await - .unwrap(); - assert_eq!(files.len(), 2); - - let mut file_ids: Vec<_> = files.into_iter().map(|f| f.id).collect(); - file_ids.sort(); - let mut expected_ids = vec![parquet_file.id, level1_file.id]; - expected_ids.sort(); - assert_eq!(file_ids, expected_ids); - } - - async fn test_update_to_compaction_level_1(catalog: Arc) { - let mut repos = catalog.repositories().await; - let namespace = - arbitrary_namespace(&mut *repos, "namespace_update_to_compaction_level_1_test").await; - let table = arbitrary_table(&mut *repos, "update_table", &namespace).await; - let partition = repos - .partitions() - .create_or_get("test_update_to_compaction_level_1_one".into(), table.id) - .await - .unwrap(); - - // Set up the window of times we're interested in level 1 files for - let query_min_time = Timestamp::new(5); - let query_max_time = Timestamp::new(10); - - // Create a file with times entirely within the window - let mut parquet_file_params = arbitrary_parquet_file_params(&namespace, &table, &partition); - parquet_file_params.min_time = query_min_time + 1; - parquet_file_params.max_time = query_max_time - 1; - let parquet_file = repos - .parquet_files() - 
.create(parquet_file_params.clone()) - .await - .unwrap(); - - // Create a file that will remain as level 0 - let level_0_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - ..parquet_file_params.clone() - }; - let level_0_file = repos.parquet_files().create(level_0_params).await.unwrap(); - - // Create a ParquetFileId that doesn't actually exist in the catalog - let nonexistent_parquet_file_id = ParquetFileId::new(level_0_file.id.get() + 1); - - // Make parquet_file compaction level 1, attempt to mark the nonexistent file; operation - // should succeed - let created = repos - .parquet_files() - .create_upgrade_delete( - &[], - &[parquet_file.id, nonexistent_parquet_file_id], - &[], - CompactionLevel::FileNonOverlapped, - ) - .await - .unwrap(); - assert_eq!(created, vec![]); - - // remove namespace to avoid it from affecting later tests - repos - .namespaces() - .soft_delete("namespace_update_to_compaction_level_1_test") - .await - .expect("delete namespace should succeed"); - } - - /// Assert that a namespace deletion does NOT cascade to the tables/schema - /// items/parquet files/etc. - /// - /// Removal of this entities breaks the invariant that once created, a row - /// always exists for the lifetime of an IOx process, and causes the system - /// to panic in multiple components. It's also ineffective, because most - /// components maintain a cache of at least one of these entities. - /// - /// Instead soft deleted namespaces should have their files GC'd like a - /// normal parquet file deletion, removing the rows once they're no longer - /// being actively used by the system. This is done by waiting a long time - /// before deleting records, and whilst isn't perfect, it is largely - /// effective. - async fn test_delete_namespace(catalog: Arc) { - let mut repos = catalog.repositories().await; - let namespace_1 = - arbitrary_namespace(&mut *repos, "namespace_test_delete_namespace_1").await; - let table_1 = arbitrary_table(&mut *repos, "test_table_1", &namespace_1).await; - let _c = repos - .columns() - .create_or_get("column_test_1", table_1.id, ColumnType::Tag) - .await - .unwrap(); - let partition_1 = repos - .partitions() - .create_or_get("test_delete_namespace_one".into(), table_1.id) - .await - .unwrap(); - - // parquet files - let parquet_file_params = - arbitrary_parquet_file_params(&namespace_1, &table_1, &partition_1); - repos - .parquet_files() - .create(parquet_file_params.clone()) - .await - .unwrap(); - let parquet_file_params_2 = ParquetFileParams { - object_store_id: Uuid::new_v4(), - min_time: Timestamp::new(200), - max_time: Timestamp::new(300), - ..parquet_file_params - }; - repos - .parquet_files() - .create(parquet_file_params_2.clone()) - .await - .unwrap(); - - // we've now created a namespace with a table and parquet files. before we test deleting - // it, let's create another so we can ensure that doesn't get deleted. 
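As the doc comment above spells out, and as the rest of this test verifies, namespace deletion is a soft delete that must not cascade to tables, columns, partitions, or parquet files. A small sketch of those semantics against the same repo traits; only the helper name is an assumption.

use crate::interface::{Error, RepoCollection, SoftDeletedRows};

/// Sketch: soft-deleting a namespace only sets `deleted_at`; the row (and all of
/// its children) remains in the catalog and stays readable via `AllRows`.
async fn soft_delete_namespace<R>(repos: &mut R, name: &str) -> Result<(), Error>
where
    R: RepoCollection + ?Sized,
{
    repos.namespaces().soft_delete(name).await?;

    // Hidden from the default view...
    let hidden = repos
        .namespaces()
        .get_by_name(name, SoftDeletedRows::ExcludeDeleted)
        .await?;
    assert!(hidden.is_none());

    // ...but still present, with the deletion timestamp set.
    let still_there = repos
        .namespaces()
        .get_by_name(name, SoftDeletedRows::AllRows)
        .await?;
    assert!(still_there.is_some_and(|ns| ns.deleted_at.is_some()));
    Ok(())
}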
- let namespace_2 = - arbitrary_namespace(&mut *repos, "namespace_test_delete_namespace_2").await; - let table_2 = arbitrary_table(&mut *repos, "test_table_2", &namespace_2).await; - let _c = repos - .columns() - .create_or_get("column_test_2", table_2.id, ColumnType::Tag) - .await - .unwrap(); - let partition_2 = repos - .partitions() - .create_or_get("test_delete_namespace_two".into(), table_2.id) - .await - .unwrap(); - - // parquet files - let parquet_file_params = - arbitrary_parquet_file_params(&namespace_2, &table_2, &partition_2); - repos - .parquet_files() - .create(parquet_file_params.clone()) - .await - .unwrap(); - let parquet_file_params_2 = ParquetFileParams { - object_store_id: Uuid::new_v4(), - min_time: Timestamp::new(200), - max_time: Timestamp::new(300), - ..parquet_file_params - }; - repos - .parquet_files() - .create(parquet_file_params_2.clone()) - .await - .unwrap(); - - // now delete namespace_1 and assert it's all gone and none of - // namespace_2 is gone - repos - .namespaces() - .soft_delete("namespace_test_delete_namespace_1") - .await - .expect("delete namespace should succeed"); - // assert that namespace is soft-deleted, but the table, column, and parquet files are all - // still there. - assert!(repos - .namespaces() - .get_by_id(namespace_1.id, SoftDeletedRows::ExcludeDeleted) - .await - .expect("get namespace should succeed") - .is_none()); - assert_eq!( - repos - .namespaces() - .get_by_id(namespace_1.id, SoftDeletedRows::AllRows) - .await - .expect("get namespace should succeed") - .map(|mut v| { - // The only change after soft-deletion should be the deleted_at - // field being set - this block normalises that field, so that - // the before/after can be asserted as equal. - v.deleted_at = None; - v - }) - .expect("should see soft-deleted row"), - namespace_1 - ); - assert_eq!( - repos - .tables() - .get_by_id(table_1.id) - .await - .expect("get table should succeed") - .expect("should return row"), - table_1 - ); - assert_eq!( - repos - .columns() - .list_by_namespace_id(namespace_1.id) - .await - .expect("listing columns should succeed") - .len(), - 1 - ); - assert_eq!( - repos - .columns() - .list_by_table_id(table_1.id) - .await - .expect("listing columns should succeed") - .len(), - 1 - ); - - // partition's get_by_id should succeed - repos - .partitions() - .get_by_id(partition_1.id) - .await - .unwrap() - .unwrap(); - - // assert that the namespace, table, column, and parquet files for namespace_2 are still - // there - assert!(repos - .namespaces() - .get_by_id(namespace_2.id, SoftDeletedRows::ExcludeDeleted) - .await - .expect("get namespace should succeed") - .is_some()); - - assert!(repos - .tables() - .get_by_id(table_2.id) - .await - .expect("get table should succeed") - .is_some()); - assert_eq!( - repos - .columns() - .list_by_namespace_id(namespace_2.id) - .await - .expect("listing columns should succeed") - .len(), - 1 - ); - assert_eq!( - repos - .columns() - .list_by_table_id(table_2.id) - .await - .expect("listing columns should succeed") - .len(), - 1 - ); - - // partition's get_by_id should succeed - repos - .partitions() - .get_by_id(partition_2.id) - .await - .unwrap() - .unwrap(); - } - - /// Upsert a namespace called `namespace_name` and write `lines` to it. 
- async fn populate_namespace( - repos: &mut R, - namespace_name: &str, - lines: &str, - ) -> (Namespace, NamespaceSchema) - where - R: RepoCollection + ?Sized, - { - let namespace = repos - .namespaces() - .create( - &NamespaceName::new(namespace_name).unwrap(), - None, - None, - None, - ) - .await; - - let namespace = match namespace { - Ok(v) => v, - Err(Error::NameExists { .. }) => repos - .namespaces() - .get_by_name(namespace_name, SoftDeletedRows::AllRows) - .await - .unwrap() - .unwrap(), - e @ Err(_) => e.unwrap(), - }; - - let batches = mutable_batch_lp::lines_to_batches(lines, 42).unwrap(); - let batches = batches.iter().map(|(table, batch)| (table.as_str(), batch)); - let ns = NamespaceSchema::new_empty_from(&namespace); - - let schema = validate_or_insert_schema(batches, &ns, repos) - .await - .expect("validate schema failed") - .unwrap_or(ns); - - (namespace, schema) - } - - async fn test_list_schemas(catalog: Arc) { - let mut repos = catalog.repositories().await; - - let ns1 = populate_namespace( - repos.deref_mut(), - "ns1", - "cpu,tag=1 field=1i\nanother,tag=1 field=1.0", - ) - .await; - let ns2 = populate_namespace( - repos.deref_mut(), - "ns2", - "cpu,tag=1 field=1i\nsomethingelse field=1u", - ) - .await; - - // Otherwise the in-mem catalog deadlocks.... (but not postgres) - drop(repos); - - let got = list_schemas(&*catalog) - .await - .expect("should be able to list the schemas") - .collect::>(); - - assert!(got.contains(&ns1), "{:#?}\n\nwant{:#?}", got, &ns1); - assert!(got.contains(&ns2), "{:#?}\n\nwant{:#?}", got, &ns2); - } - - async fn test_list_schemas_soft_deleted_rows(catalog: Arc) { - let mut repos = catalog.repositories().await; - - let ns1 = populate_namespace( - repos.deref_mut(), - "ns1", - "cpu,tag=1 field=1i\nanother,tag=1 field=1.0", - ) - .await; - let ns2 = populate_namespace( - repos.deref_mut(), - "ns2", - "cpu,tag=1 field=1i\nsomethingelse field=1u", - ) - .await; - - repos - .namespaces() - .soft_delete(&ns2.0.name) - .await - .expect("failed to soft delete namespace"); - - // Otherwise the in-mem catalog deadlocks.... (but not postgres) - drop(repos); - - let got = list_schemas(&*catalog) - .await - .expect("should be able to list the schemas") - .collect::>(); - - assert!(got.contains(&ns1), "{:#?}\n\nwant{:#?}", got, &ns1); - assert!(!got.contains(&ns2), "{:#?}\n\n do not want{:#?}", got, &ns2); - } - - fn assert_metric_hit(metrics: &metric::Registry, name: &'static str) { - let histogram = metrics - .get_instrument::>("catalog_op_duration") - .expect("failed to read metric") - .get_observer(&Attributes::from(&[("op", name), ("result", "success")])) - .expect("failed to get observer") - .fetch(); - - let hit_count = histogram.sample_count(); - assert!(hit_count > 1, "metric did not record any calls"); - } -} diff --git a/iox_catalog/src/interface_tests.rs b/iox_catalog/src/interface_tests.rs new file mode 100644 index 00000000000..4635483c37a --- /dev/null +++ b/iox_catalog/src/interface_tests.rs @@ -0,0 +1,3203 @@ +//! Abstract tests of the catalog interface w/o relying on the actual implementation. 
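For reference, the metric assertion these tests lean on boils down to reading the "success" histogram for one catalog operation out of the registry. A self-contained sketch of that lookup, assuming the `Metric<DurationHistogram>` instrument type that the new module imports from the `metric` crate.

use metric::{Attributes, DurationHistogram, Metric};

/// Sketch: assert that the success histogram for a catalog op recorded calls.
fn assert_catalog_op_recorded(metrics: &metric::Registry, op: &'static str) {
    let histogram = metrics
        .get_instrument::<Metric<DurationHistogram>>("catalog_op_duration")
        .expect("failed to read metric")
        .get_observer(&Attributes::from(&[("op", op), ("result", "success")]))
        .expect("failed to get observer")
        .fetch();

    // The suite drives each op more than once, so expect multiple samples.
    let hit_count = histogram.sample_count();
    assert!(hit_count > 1, "expected more than one recorded call");
}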
+use crate::{ + interface::{ + CasFailure, Catalog, Error, ParquetFileRepoExt, PartitionRepoExt, RepoCollection, + SoftDeletedRows, + }, + test_helpers::{arbitrary_namespace, arbitrary_parquet_file_params, arbitrary_table}, + util::{list_schemas, validate_or_insert_schema}, +}; + +use ::test_helpers::assert_error; +use assert_matches::assert_matches; +use async_trait::async_trait; +use data_types::snapshot::table::TableSnapshot; +use data_types::{ + partition_template::{NamespacePartitionTemplateOverride, TablePartitionTemplateOverride}, + ColumnId, ColumnType, CompactionLevel, MaxColumnsPerTable, MaxTables, Namespace, NamespaceId, + NamespaceName, NamespaceSchema, ObjectStoreId, ParquetFile, ParquetFileId, ParquetFileParams, + PartitionId, SortKeyIds, TableId, Timestamp, +}; +use data_types::{snapshot::partition::PartitionSnapshot, Column, PartitionHashId, PartitionKey}; +use futures::{Future, StreamExt}; +use generated_types::influxdata::iox::partition_template::v1 as proto; +use iox_time::TimeProvider; +use metric::{Attributes, DurationHistogram, Metric}; +use parking_lot::Mutex; +use std::{any::Any, fmt::Display}; +use std::{ + collections::{BTreeMap, BTreeSet, HashMap}, + ops::DerefMut, + sync::Arc, + time::Duration, +}; + +pub(crate) async fn test_catalog(clean_state: R) +where + R: Fn() -> F + Send + Sync, + F: Future> + Send, +{ + test_setup(clean_state().await).await; + test_namespace_soft_deletion(clean_state().await).await; + test_partitions_new_file_between(clean_state().await).await; + test_column(clean_state().await).await; + test_partition(clean_state().await).await; + test_parquet_file(clean_state().await).await; + test_parquet_file_delete_broken(clean_state().await).await; + test_update_to_compaction_level_1(clean_state().await).await; + test_list_by_partiton_not_to_delete(clean_state().await).await; + test_list_schemas(clean_state().await).await; + test_list_schemas_soft_deleted_rows(clean_state().await).await; + test_delete_namespace(clean_state().await).await; + + let catalog = clean_state().await; + test_namespace(Arc::clone(&catalog)).await; + assert_metric_hit(&catalog.metrics(), "namespace_create"); + + let catalog = clean_state().await; + test_table(Arc::clone(&catalog)).await; + assert_metric_hit(&catalog.metrics(), "table_create"); + + let catalog = clean_state().await; + test_column(Arc::clone(&catalog)).await; + assert_metric_hit(&catalog.metrics(), "column_create_or_get"); + + let catalog = clean_state().await; + test_partition(Arc::clone(&catalog)).await; + assert_metric_hit(&catalog.metrics(), "partition_create_or_get"); + + let catalog = clean_state().await; + test_parquet_file(Arc::clone(&catalog)).await; + assert_metric_hit(&catalog.metrics(), "parquet_create_upgrade_delete"); + + test_two_repos(clean_state().await).await; + test_partition_create_or_get_idempotent(clean_state().await).await; + test_column_create_or_get_many_unchecked(clean_state).await; +} + +async fn test_setup(catalog: Arc) { + catalog.setup().await.expect("first catalog setup"); + catalog.setup().await.expect("second catalog setup"); +} + +async fn test_namespace(catalog: Arc) { + let mut repos = catalog.repositories(); + let namespace_name = NamespaceName::new("test_namespace").unwrap(); + let namespace = repos + .namespaces() + .create(&namespace_name, None, None, None) + .await + .unwrap(); + assert!(namespace.id > NamespaceId::new(0)); + assert_eq!(namespace.name, namespace_name.as_str()); + assert_eq!( + namespace.partition_template, + NamespacePartitionTemplateOverride::default() 
+ ); + let lookup_namespace = repos + .namespaces() + .get_by_name(&namespace_name, SoftDeletedRows::ExcludeDeleted) + .await + .unwrap() + .unwrap(); + assert_eq!(namespace, lookup_namespace); + + // Assert default values for service protection limits. + assert_eq!(namespace.max_tables, MaxTables::default()); + assert_eq!( + namespace.max_columns_per_table, + MaxColumnsPerTable::default() + ); + + let conflict = repos + .namespaces() + .create(&namespace_name, None, None, None) + .await; + assert!(matches!(conflict.unwrap_err(), Error::AlreadyExists { .. })); + + let found = repos + .namespaces() + .get_by_id(namespace.id, SoftDeletedRows::ExcludeDeleted) + .await + .unwrap() + .expect("namespace should be there"); + assert_eq!(namespace, found); + + let not_found = repos + .namespaces() + .get_by_id(NamespaceId::new(i64::MAX), SoftDeletedRows::ExcludeDeleted) + .await + .unwrap(); + assert!(not_found.is_none()); + + let found = repos + .namespaces() + .get_by_name(&namespace_name, SoftDeletedRows::ExcludeDeleted) + .await + .unwrap() + .expect("namespace should be there"); + assert_eq!(namespace, found); + + let not_found = repos + .namespaces() + .get_by_name("does_not_exist", SoftDeletedRows::ExcludeDeleted) + .await + .unwrap(); + assert!(not_found.is_none()); + + let namespace2 = arbitrary_namespace(&mut *repos, "test_namespace2").await; + let mut namespaces = repos + .namespaces() + .list(SoftDeletedRows::ExcludeDeleted) + .await + .unwrap(); + namespaces.sort_by_key(|ns| ns.name.clone()); + assert_eq!(namespaces, vec![namespace, namespace2]); + + let new_table_limit = MaxTables::try_from(15_000).unwrap(); + let modified = repos + .namespaces() + .update_table_limit(namespace_name.as_str(), new_table_limit) + .await + .expect("namespace should be updateable"); + assert_eq!(new_table_limit, modified.max_tables); + + let new_column_limit = MaxColumnsPerTable::try_from(1_500).unwrap(); + let modified = repos + .namespaces() + .update_column_limit(namespace_name.as_str(), new_column_limit) + .await + .expect("namespace should be updateable"); + assert_eq!(new_column_limit, modified.max_columns_per_table); + + const NEW_RETENTION_PERIOD_NS: i64 = 5 * 60 * 60 * 1000 * 1000 * 1000; + let modified = repos + .namespaces() + .update_retention_period(namespace_name.as_str(), Some(NEW_RETENTION_PERIOD_NS)) + .await + .expect("namespace should be updateable"); + assert_eq!( + NEW_RETENTION_PERIOD_NS, + modified.retention_period_ns.unwrap() + ); + + let modified = repos + .namespaces() + .update_retention_period(namespace_name.as_str(), None) + .await + .expect("namespace should be updateable"); + assert!(modified.retention_period_ns.is_none()); + + // create namespace with retention period NULL (the default) + let namespace3 = arbitrary_namespace(&mut *repos, "test_namespace3").await; + assert!(namespace3.retention_period_ns.is_none()); + + // create namespace with retention period + let namespace4_name = NamespaceName::new("test_namespace4").unwrap(); + let namespace4 = repos + .namespaces() + .create(&namespace4_name, None, Some(NEW_RETENTION_PERIOD_NS), None) + .await + .expect("namespace with 5-hour retention should be created"); + assert_eq!( + NEW_RETENTION_PERIOD_NS, + namespace4.retention_period_ns.unwrap() + ); + // reset retention period to NULL to avoid affecting later tests + repos + .namespaces() + .update_retention_period(&namespace4_name, None) + .await + .expect("namespace should be updateable"); + + // create a namespace with a PartitionTemplate other than the default + let 
tag_partition_template = + NamespacePartitionTemplateOverride::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("tag1".into())), + }], + }) + .unwrap(); + let namespace5_name = NamespaceName::new("test_namespace5").unwrap(); + let namespace5 = repos + .namespaces() + .create( + &namespace5_name, + Some(tag_partition_template.clone()), + None, + None, + ) + .await + .unwrap(); + assert_eq!(namespace5.partition_template, tag_partition_template); + let lookup_namespace5 = repos + .namespaces() + .get_by_name(&namespace5_name, SoftDeletedRows::ExcludeDeleted) + .await + .unwrap() + .unwrap(); + assert_eq!(namespace5, lookup_namespace5); + + // remove namespace to avoid it from affecting later tests + repos + .namespaces() + .soft_delete("test_namespace") + .await + .expect("delete namespace should succeed"); + repos + .namespaces() + .soft_delete("test_namespace2") + .await + .expect("delete namespace should succeed"); + repos + .namespaces() + .soft_delete("test_namespace3") + .await + .expect("delete namespace should succeed"); + repos + .namespaces() + .soft_delete("test_namespace4") + .await + .expect("delete namespace should succeed"); +} + +/// Construct a set of two namespaces: +/// +/// * deleted-ns: marked as soft-deleted +/// * active-ns: not marked as deleted +/// +/// And assert the expected "soft delete" semantics / correctly filter out +/// the expected rows for all three states of [`SoftDeletedRows`]. +async fn test_namespace_soft_deletion(catalog: Arc) { + let mut repos = catalog.repositories(); + + let deleted_ns = arbitrary_namespace(&mut *repos, "deleted-ns").await; + let active_ns = arbitrary_namespace(&mut *repos, "active-ns").await; + + // Mark "deleted-ns" as soft-deleted. + repos.namespaces().soft_delete("deleted-ns").await.unwrap(); + + // Which should be idempotent (ignoring the timestamp change - when + // changing this to "soft delete" it was idempotent, so I am preserving + // that). + repos.namespaces().soft_delete("deleted-ns").await.unwrap(); + + // Listing should respect soft deletion. 
+ let got = repos + .namespaces() + .list(SoftDeletedRows::AllRows) + .await + .unwrap() + .into_iter() + .map(|v| v.name); + assert_string_set_eq(got, ["deleted-ns", "active-ns"]); + + let got = repos + .namespaces() + .list(SoftDeletedRows::OnlyDeleted) + .await + .unwrap() + .into_iter() + .map(|v| v.name); + assert_string_set_eq(got, ["deleted-ns"]); + + let got = repos + .namespaces() + .list(SoftDeletedRows::ExcludeDeleted) + .await + .unwrap() + .into_iter() + .map(|v| v.name); + assert_string_set_eq(got, ["active-ns"]); + + // As should get by ID + let got = repos + .namespaces() + .get_by_id(deleted_ns.id, SoftDeletedRows::AllRows) + .await + .unwrap() + .into_iter() + .map(|v| v.name); + assert_string_set_eq(got, ["deleted-ns"]); + let got = repos + .namespaces() + .get_by_id(deleted_ns.id, SoftDeletedRows::OnlyDeleted) + .await + .unwrap() + .into_iter() + .map(|v| { + assert!(v.deleted_at.is_some()); + v.name + }); + assert_string_set_eq(got, ["deleted-ns"]); + let got = repos + .namespaces() + .get_by_id(deleted_ns.id, SoftDeletedRows::ExcludeDeleted) + .await + .unwrap(); + assert!(got.is_none()); + let got = repos + .namespaces() + .get_by_id(active_ns.id, SoftDeletedRows::AllRows) + .await + .unwrap() + .into_iter() + .map(|v| v.name); + assert_string_set_eq(got, ["active-ns"]); + let got = repos + .namespaces() + .get_by_id(active_ns.id, SoftDeletedRows::OnlyDeleted) + .await + .unwrap(); + assert!(got.is_none()); + let got = repos + .namespaces() + .get_by_id(active_ns.id, SoftDeletedRows::ExcludeDeleted) + .await + .unwrap() + .into_iter() + .map(|v| v.name); + assert_string_set_eq(got, ["active-ns"]); + + // And get by name + let got = repos + .namespaces() + .get_by_name(&deleted_ns.name, SoftDeletedRows::AllRows) + .await + .unwrap() + .into_iter() + .map(|v| v.name); + assert_string_set_eq(got, ["deleted-ns"]); + let got = repos + .namespaces() + .get_by_name(&deleted_ns.name, SoftDeletedRows::OnlyDeleted) + .await + .unwrap() + .into_iter() + .map(|v| { + assert!(v.deleted_at.is_some()); + v.name + }); + assert_string_set_eq(got, ["deleted-ns"]); + let got = repos + .namespaces() + .get_by_name(&deleted_ns.name, SoftDeletedRows::ExcludeDeleted) + .await + .unwrap(); + assert!(got.is_none()); + let got = repos + .namespaces() + .get_by_name(&active_ns.name, SoftDeletedRows::AllRows) + .await + .unwrap() + .into_iter() + .map(|v| v.name); + assert_string_set_eq(got, ["active-ns"]); + let got = repos + .namespaces() + .get_by_name(&active_ns.name, SoftDeletedRows::OnlyDeleted) + .await + .unwrap(); + assert!(got.is_none()); + let got = repos + .namespaces() + .get_by_name(&active_ns.name, SoftDeletedRows::ExcludeDeleted) + .await + .unwrap() + .into_iter() + .map(|v| v.name); + assert_string_set_eq(got, ["active-ns"]); +} + +// Assert the set of strings "a" is equal to the set "b", tolerating +// duplicates. 
+#[track_caller] +fn assert_string_set_eq(a: impl IntoIterator, b: impl IntoIterator) +where + T: Into, + U: Into, +{ + let mut a = a.into_iter().map(Into::into).collect::>(); + a.sort_unstable(); + let mut b = b.into_iter().map(Into::into).collect::>(); + b.sort_unstable(); + assert_eq!(a, b); +} + +async fn test_table(catalog: Arc) { + let mut repos = catalog.repositories(); + let namespace = arbitrary_namespace(&mut *repos, "namespace_table_test").await; + + // test we can create a table + let t = arbitrary_table(&mut *repos, "test_table", &namespace).await; + assert!(t.id > TableId::new(0)); + assert_eq!( + t.partition_template, + TablePartitionTemplateOverride::default() + ); + + // The default template doesn't use any tag values, so no columns need to be created. + let table_columns = repos.columns().list_by_table_id(t.id).await.unwrap(); + assert!(table_columns.is_empty()); + + // test we get an error if we try to create it again + let err = repos + .tables() + .create( + "test_table", + TablePartitionTemplateOverride::try_new(None, &namespace.partition_template).unwrap(), + namespace.id, + ) + .await; + assert_error!( + err, + Error::AlreadyExists { ref descr } + if descr == &format!("table 'test_table' in namespace {}", namespace.id) + ); + + // get by id + assert_eq!(t, repos.tables().get_by_id(t.id).await.unwrap().unwrap()); + assert!(repos + .tables() + .get_by_id(TableId::new(i64::MAX)) + .await + .unwrap() + .is_none()); + + let tables = repos + .tables() + .list_by_namespace_id(namespace.id) + .await + .unwrap(); + assert_eq!(vec![t.clone()], tables); + + // test we can create a table of the same name in a different namespace + let namespace2 = arbitrary_namespace(&mut *repos, "two").await; + assert_ne!(namespace, namespace2); + let test_table = arbitrary_table(&mut *repos, "test_table", &namespace2).await; + assert_ne!(t.id, test_table.id); + assert_eq!(test_table.namespace_id, namespace2.id); + + // test get by namespace and name + let foo_table = arbitrary_table(&mut *repos, "foo", &namespace2).await; + assert_eq!( + repos + .tables() + .get_by_namespace_and_name(NamespaceId::new(i64::MAX), "test_table") + .await + .unwrap(), + None + ); + assert_eq!( + repos + .tables() + .get_by_namespace_and_name(namespace.id, "not_existing") + .await + .unwrap(), + None + ); + assert_eq!( + repos + .tables() + .get_by_namespace_and_name(namespace.id, "test_table") + .await + .unwrap(), + Some(t.clone()) + ); + assert_eq!( + repos + .tables() + .get_by_namespace_and_name(namespace2.id, "test_table") + .await + .unwrap() + .as_ref(), + Some(&test_table) + ); + assert_eq!( + repos + .tables() + .get_by_namespace_and_name(namespace2.id, "foo") + .await + .unwrap() + .as_ref(), + Some(&foo_table) + ); + + // All tables should be returned by list(), regardless of namespace + let mut list = repos.tables().list().await.unwrap(); + list.sort_by_key(|t| t.id); + let mut expected = [t, test_table, foo_table]; + expected.sort_by_key(|t| t.id); + assert_eq!(&list, &expected); + + // test per-namespace table limits + let latest = repos + .namespaces() + .update_table_limit("namespace_table_test", MaxTables::try_from(1).unwrap()) + .await + .expect("namespace should be updateable"); + let err = repos + .tables() + .create( + "definitely_unique", + TablePartitionTemplateOverride::try_new(None, &latest.partition_template).unwrap(), + latest.id, + ) + .await + .expect_err("should error with table create limit error"); + assert!(matches!(err, Error::LimitExceeded { .. 
})); + + // Create a table with a partition template other than the default + let custom_table_template = TablePartitionTemplateOverride::try_new( + Some(proto::PartitionTemplate { + parts: vec![ + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("tag1".into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("tag2".into())), + }, + ], + }), + &namespace2.partition_template, + ) + .unwrap(); + let templated = repos + .tables() + .create( + "use_a_template", + custom_table_template.clone(), + namespace2.id, + ) + .await + .unwrap(); + assert_eq!(templated.partition_template, custom_table_template); + + // Tag columns should be created for tags used in the template + let table_columns = repos + .columns() + .list_by_table_id(templated.id) + .await + .unwrap(); + assert_eq!(table_columns.len(), 2); + assert!(table_columns.iter().all(|c| c.is_tag())); + let mut column_names: Vec<_> = table_columns.iter().map(|c| &c.name).collect(); + column_names.sort(); + assert_eq!(column_names, &["tag1", "tag2"]); + + let lookup_templated = repos + .tables() + .get_by_namespace_and_name(namespace2.id, "use_a_template") + .await + .unwrap() + .unwrap(); + assert_eq!(templated, lookup_templated); + + // Create a namespace with a partition template other than the default + let custom_namespace_template = + NamespacePartitionTemplateOverride::try_from(proto::PartitionTemplate { + parts: vec![ + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("zzz".into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("aaa".into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())), + }, + ], + }) + .unwrap(); + let custom_namespace_name = NamespaceName::new("custom_namespace").unwrap(); + let custom_namespace = repos + .namespaces() + .create( + &custom_namespace_name, + Some(custom_namespace_template.clone()), + None, + None, + ) + .await + .unwrap(); + // Create a table without specifying the partition template + let custom_table_template = + TablePartitionTemplateOverride::try_new(None, &custom_namespace.partition_template) + .unwrap(); + let table_templated_by_namespace = repos + .tables() + .create( + "use_namespace_template", + custom_table_template, + custom_namespace.id, + ) + .await + .unwrap(); + assert_eq!( + table_templated_by_namespace.partition_template, + TablePartitionTemplateOverride::try_new(None, &custom_namespace_template).unwrap() + ); + + // Tag columns should be created for tags used in the template + let table_columns = repos + .columns() + .list_by_table_id(table_templated_by_namespace.id) + .await + .unwrap(); + assert_eq!(table_columns.len(), 2); + assert!(table_columns.iter().all(|c| c.is_tag())); + let mut column_names: Vec<_> = table_columns.iter().map(|c| &c.name).collect(); + column_names.sort(); + assert_eq!(column_names, &["aaa", "zzz"]); + + repos + .namespaces() + .soft_delete("namespace_table_test") + .await + .expect("delete namespace should succeed"); + repos + .namespaces() + .soft_delete("two") + .await + .expect("delete namespace should succeed"); +} + +async fn test_column(catalog: Arc) { + let mut repos = catalog.repositories(); + let namespace = arbitrary_namespace(&mut *repos, "namespace_column_test").await; + let table = arbitrary_table(&mut *repos, "test_table", &namespace).await; + assert_eq!(table.namespace_id, 
namespace.id); + + // test we can create or get a column + let c = repos + .columns() + .create_or_get("column_test", table.id, ColumnType::Tag) + .await + .unwrap(); + + let ts1 = repos.tables().snapshot(table.id).await.unwrap(); + validate_table_snapshot(repos.as_mut(), &ts1).await; + + let cc = repos + .columns() + .create_or_get("column_test", table.id, ColumnType::Tag) + .await + .unwrap(); + assert!(c.id > ColumnId::new(0)); + assert_eq!(c, cc); + + let ts2 = repos.tables().snapshot(table.id).await.unwrap(); + validate_table_snapshot(repos.as_mut(), &ts2).await; + + assert_gt(ts2.generation(), ts1.generation()); + + // test that attempting to create an already defined column of a different type returns + // error + let err = repos + .columns() + .create_or_get("column_test", table.id, ColumnType::U64) + .await + .expect_err("should error with wrong column type"); + assert!(matches!(err, Error::AlreadyExists { .. })); + + // test that we can create a column of the same name under a different table + let table2 = arbitrary_table(&mut *repos, "test_table_2", &namespace).await; + let ccc = repos + .columns() + .create_or_get("column_test", table2.id, ColumnType::U64) + .await + .unwrap(); + assert_ne!(c, ccc); + + let columns = repos + .columns() + .list_by_namespace_id(namespace.id) + .await + .unwrap(); + + let ts3 = repos.tables().snapshot(table2.id).await.unwrap(); + validate_table_snapshot(repos.as_mut(), &ts3).await; + + let mut want = vec![c.clone(), ccc]; + assert_eq!(want, columns); + + let columns = repos.columns().list_by_table_id(table.id).await.unwrap(); + + let want2 = vec![c]; + assert_eq!(want2, columns); + + // Add another tag column into table2 + let c3 = repos + .columns() + .create_or_get("b", table2.id, ColumnType::Tag) + .await + .unwrap(); + + let ts4 = repos.tables().snapshot(table2.id).await.unwrap(); + validate_table_snapshot(repos.as_mut(), &ts4).await; + + assert_gt(ts4.generation(), ts3.generation()); + + // Listing columns should return all columns in the catalog + let list = repos.columns().list().await.unwrap(); + want.extend([c3]); + assert_eq!(list, want); + + // test create_or_get_many_unchecked, below column limit + let mut columns = HashMap::new(); + columns.insert("column_test", ColumnType::Tag); + columns.insert("new_column", ColumnType::Tag); + let table1_columns = repos + .columns() + .create_or_get_many_unchecked(table.id, columns) + .await + .unwrap(); + let mut table1_column_names: Vec<_> = table1_columns.iter().map(|c| &c.name).collect(); + table1_column_names.sort(); + assert_eq!(table1_column_names, vec!["column_test", "new_column"]); + + // test per-namespace column limits + repos + .namespaces() + .update_column_limit( + "namespace_column_test", + MaxColumnsPerTable::try_from(1).unwrap(), + ) + .await + .expect("namespace should be updateable"); + let err = repos + .columns() + .create_or_get("definitely unique", table.id, ColumnType::Tag) + .await + .expect_err("should error with table create limit error"); + assert!(matches!(err, Error::LimitExceeded { .. 
}));
+
+    // test per-namespace column limits are NOT enforced with create_or_get_many_unchecked
+    let table3 = arbitrary_table(&mut *repos, "test_table_3", &namespace).await;
+    let mut columns = HashMap::new();
+    columns.insert("apples", ColumnType::Tag);
+    columns.insert("oranges", ColumnType::Tag);
+    let table3_columns = repos
+        .columns()
+        .create_or_get_many_unchecked(table3.id, columns)
+        .await
+        .unwrap();
+    let mut table3_column_names: Vec<_> = table3_columns.iter().map(|c| &c.name).collect();
+    table3_column_names.sort();
+    assert_eq!(table3_column_names, vec!["apples", "oranges"]);
+
+    repos
+        .namespaces()
+        .soft_delete("namespace_column_test")
+        .await
+        .expect("delete namespace should succeed");
+}
+
+async fn test_partition(catalog: Arc<dyn Catalog>) {
+    let mut repos = catalog.repositories();
+    let namespace = arbitrary_namespace(&mut *repos, "namespace_partition_test").await;
+    let table = arbitrary_table(&mut *repos, "test_table", &namespace).await;
+
+    let mut created = BTreeMap::new();
+    // partition to use
+    let partition = repos
+        .partitions()
+        .create_or_get("foo".into(), table.id)
+        .await
+        .expect("failed to create partition");
+    // Test: sort_key_ids from create_or_get
+    assert!(partition.sort_key_ids().is_none());
+    created.insert(partition.id, partition.clone());
+    // partition to use
+    let partition_bar = repos
+        .partitions()
+        .create_or_get("bar".into(), table.id)
+        .await
+        .expect("failed to create partition");
+    created.insert(partition_bar.id, partition_bar);
+    // partition to be skipped later
+    let to_skip_partition = repos
+        .partitions()
+        .create_or_get("asdf".into(), table.id)
+        .await
+        .unwrap();
+    created.insert(to_skip_partition.id, to_skip_partition.clone());
+    // partition to be skipped later
+    let to_skip_partition_too = repos
+        .partitions()
+        .create_or_get("asdf too".into(), table.id)
+        .await
+        .unwrap();
+    created.insert(to_skip_partition_too.id, to_skip_partition_too.clone());
+
+    // partitions can be retrieved easily
+    let mut created_sorted = created.values().cloned().collect::<Vec<_>>();
+    created_sorted.sort_by_key(|p| p.id);
+    assert_eq!(
+        to_skip_partition,
+        repos
+            .partitions()
+            .get_by_id_batch(&[to_skip_partition.id])
+            .await
+            .unwrap()
+            .into_iter()
+            .next()
+            .unwrap()
+    );
+    let non_existing_partition_id = PartitionId::new(i64::MAX);
+    assert!(repos
+        .partitions()
+        .get_by_id_batch(&[non_existing_partition_id])
+        .await
+        .unwrap()
+        .is_empty());
+    let mut batch = repos
+        .partitions()
+        .get_by_id_batch(
+            &created
+                .keys()
+                .cloned()
+                // non-existing entries are ignored
+                .chain([non_existing_partition_id])
+                // duplicates are ignored
+                .chain(created.keys().cloned())
+                .collect::<Vec<_>>(),
+        )
+        .await
+        .unwrap();
+    batch.sort_by_key(|p| p.id);
+    assert_eq!(created_sorted, batch);
+    // Test: sort_key_ids from get_by_id_batch
+    assert!(batch.iter().all(|p| p.sort_key_ids().is_none()));
+
+    assert_eq!(created_sorted, batch);
+
+    let s1 = repos.tables().snapshot(table.id).await.unwrap();
+    validate_table_snapshot(repos.as_mut(), &s1).await;
+
+    let listed = repos
+        .partitions()
+        .list_by_table_id(table.id)
+        .await
+        .expect("failed to list partitions")
+        .into_iter()
+        .map(|v| (v.id, v))
+        .collect::<BTreeMap<_, _>>();
+    // Test: sort_key_ids from list_by_table_id
+    assert!(listed.values().all(|p| p.sort_key_ids().is_none()));
+
+    assert_eq!(created, listed);
+
+    let listed = repos
+        .partitions()
+        .list_ids()
+        .await
+        .expect("failed to list partitions")
+        .into_iter()
+        .collect::<BTreeSet<_>>();
+
+    assert_eq!(created.keys().copied().collect::<BTreeSet<_>>(),
listed); + + // The code no longer supports creating old-style partitions, so this list is always empty + // in these tests. See each catalog implementation for tests that insert old-style + // partitions directly and verify they're returned. + let old_style = repos.partitions().list_old_style().await.unwrap(); + assert!( + old_style.is_empty(), + "Expected no old-style partitions, got {old_style:?}" + ); + + // sort key should be unset on creation + assert!(to_skip_partition.sort_key_ids().is_none()); + + let s1 = repos + .partitions() + .snapshot(to_skip_partition.id) + .await + .unwrap(); + validate_partition_snapshot(repos.as_mut(), &s1).await; + + // test that updates sort key from None to Some + let updated_partition = repos + .partitions() + .cas_sort_key(to_skip_partition.id, None, &SortKeyIds::from([2, 1, 3])) + .await + .unwrap(); + + // verify sort key is updated correctly + assert_eq!( + updated_partition.sort_key_ids().unwrap(), + &SortKeyIds::from([2, 1, 3]) + ); + + let s2 = repos + .partitions() + .snapshot(to_skip_partition.id) + .await + .unwrap(); + assert_gt(s2.generation(), s1.generation()); + validate_partition_snapshot(repos.as_mut(), &s2).await; + + // test that provides value of old_sort_key_ids but it do not match the existing one + // --> the new sort key will not be updated + let err = repos + .partitions() + .cas_sort_key( + to_skip_partition.id, + Some(&SortKeyIds::from([1])), + &SortKeyIds::from([1, 2, 3, 4]), + ) + .await + .expect_err("CAS with incorrect value should fail"); + // verify the sort key is not updated + assert_matches!(err, CasFailure::ValueMismatch(old_sort_key_ids) => { + assert_eq!(old_sort_key_ids, SortKeyIds::from([2, 1, 3])); + }); + + // test that provides same length but not-matched old_sort_key_ids + // --> the new sort key will not be updated + let err = repos + .partitions() + .cas_sort_key( + to_skip_partition.id, + Some(&SortKeyIds::from([1, 5, 10])), + &SortKeyIds::from([1, 2, 3, 4]), + ) + .await + .expect_err("CAS with incorrect value should fail"); + // verify the sort key is not updated + assert_matches!(err, CasFailure::ValueMismatch(old_sort_key_ids) => { + assert_eq!(old_sort_key_ids, SortKeyIds::from([2, 1, 3])); + }); + + // test that provide None sort_key_ids that do not match with existing values that are not None + // --> the new sort key will not be updated + let err = repos + .partitions() + .cas_sort_key(to_skip_partition.id, None, &SortKeyIds::from([1, 2, 3, 4])) + .await + .expect_err("CAS with incorrect value should fail"); + assert_matches!(err, CasFailure::ValueMismatch(old_sort_key_ids) => { + assert_eq!(old_sort_key_ids, SortKeyIds::from([2, 1, 3])); + }); + + // test getting partition from partition id and verify values of sort_key and sort_key_ids + let updated_other_partition = repos + .partitions() + .get_by_id_batch(&[to_skip_partition.id]) + .await + .unwrap() + .into_iter() + .next() + .unwrap(); + // still has the old sort key + assert_eq!( + updated_other_partition.sort_key_ids().unwrap(), + &SortKeyIds::from([2, 1, 3]) + ); + + // test that updates sort_key_ids from Some matching value to Some other value + let updated_partition = repos + .partitions() + .cas_sort_key( + to_skip_partition.id, + Some(&SortKeyIds::from([2, 1, 3])), + &SortKeyIds::from([2, 1, 4, 3]), + ) + .await + .unwrap(); + // verify the new values are updated + assert_eq!( + updated_partition.sort_key_ids().unwrap(), + &SortKeyIds::from([2, 1, 4, 3]) + ); + + // test getting the new sort key from partition id + let 
updated_partition = repos + .partitions() + .get_by_id_batch(&[to_skip_partition.id]) + .await + .unwrap() + .into_iter() + .next() + .unwrap(); + assert_eq!( + updated_partition.sort_key_ids().unwrap(), + &SortKeyIds::from([2, 1, 4, 3]) + ); + + // use to_skip_partition_too to update sort key from empty old values + // first make sure the old sort key is unset + assert!(to_skip_partition_too.sort_key_ids().is_none()); + + // test that provides empty old_sort_key_ids + // --> the new sort key will be updated + let updated_to_skip_partition_too = repos + .partitions() + .cas_sort_key(to_skip_partition_too.id, None, &SortKeyIds::from([3, 4])) + .await + .unwrap(); + // verify the new values are updated + assert_eq!( + updated_to_skip_partition_too.sort_key_ids().unwrap(), + &SortKeyIds::from([3, 4]) + ); + + let s3 = repos + .partitions() + .snapshot(to_skip_partition.id) + .await + .unwrap(); + assert_gt(s3.generation(), s2.generation()); + validate_partition_snapshot(repos.as_mut(), &s3).await; + + // The compactor can log why compaction was skipped + let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap(); + assert!( + skipped_compactions.is_empty(), + "Expected no skipped compactions, got: {skipped_compactions:?}" + ); + repos + .partitions() + .record_skipped_compaction(to_skip_partition.id, "I am le tired", 1, 2, 4, 10, 20) + .await + .unwrap(); + let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap(); + assert_eq!(skipped_compactions.len(), 1); + assert_eq!(skipped_compactions[0].partition_id, to_skip_partition.id); + assert_eq!(skipped_compactions[0].reason, "I am le tired"); + assert_eq!(skipped_compactions[0].num_files, 1); + assert_eq!(skipped_compactions[0].limit_num_files, 2); + assert_eq!(skipped_compactions[0].estimated_bytes, 10); + assert_eq!(skipped_compactions[0].limit_bytes, 20); + // + let skipped_partition_records = repos + .partitions() + .get_in_skipped_compactions(&[ + to_skip_partition.id, + PartitionId::new(i64::MAX), + to_skip_partition.id, + ]) + .await + .unwrap(); + assert_eq!( + skipped_partition_records[0].partition_id, + to_skip_partition.id + ); + assert_eq!(skipped_partition_records[0].reason, "I am le tired"); + + let s4 = repos + .partitions() + .snapshot(to_skip_partition.id) + .await + .unwrap(); + assert_gt(s4.generation(), s3.generation()); + validate_partition_snapshot(repos.as_mut(), &s4).await; + + // Only save the last reason that any particular partition was skipped (really if the + // partition appears in the skipped compactions, it shouldn't become a compaction candidate + // again, but race conditions and all that) + repos + .partitions() + .record_skipped_compaction(to_skip_partition.id, "I'm on fire", 11, 12, 24, 110, 120) + .await + .unwrap(); + let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap(); + assert_eq!(skipped_compactions.len(), 1); + assert_eq!(skipped_compactions[0].partition_id, to_skip_partition.id); + assert_eq!(skipped_compactions[0].reason, "I'm on fire"); + assert_eq!(skipped_compactions[0].num_files, 11); + assert_eq!(skipped_compactions[0].limit_num_files, 12); + assert_eq!(skipped_compactions[0].estimated_bytes, 110); + assert_eq!(skipped_compactions[0].limit_bytes, 120); + // + let skipped_partition_records = repos + .partitions() + .get_in_skipped_compactions(&[to_skip_partition.id]) + .await + .unwrap(); + assert_eq!( + skipped_partition_records[0].partition_id, + to_skip_partition.id + ); + 
assert_eq!(skipped_partition_records[0].reason, "I'm on fire"); + + // Can receive multiple skipped compactions for different partitions + repos + .partitions() + .record_skipped_compaction( + to_skip_partition_too.id, + "I am le tired too", + 1, + 2, + 4, + 10, + 20, + ) + .await + .unwrap(); + let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap(); + assert_eq!(skipped_compactions.len(), 2); + assert_eq!(skipped_compactions[0].partition_id, to_skip_partition.id); + assert_eq!( + skipped_compactions[1].partition_id, + to_skip_partition_too.id + ); + // confirm can fetch subset of skipped compactions (a.k.a. have two, only fetch 1) + let skipped_partition_records = repos + .partitions() + .get_in_skipped_compactions(&[to_skip_partition.id]) + .await + .unwrap(); + assert_eq!(skipped_partition_records.len(), 1); + assert_eq!(skipped_compactions[0].partition_id, to_skip_partition.id); + let skipped_partition_records = repos + .partitions() + .get_in_skipped_compactions(&[to_skip_partition_too.id]) + .await + .unwrap(); + assert_eq!(skipped_partition_records.len(), 1); + assert_eq!( + skipped_partition_records[0].partition_id, + to_skip_partition_too.id + ); + // confirm can fetch both skipped compactions, and not the unskipped one + // also confirm will not error on non-existing partition + let non_existing_partition_id = PartitionId::new(9999); + let skipped_partition_records = repos + .partitions() + .get_in_skipped_compactions(&[ + partition.id, + to_skip_partition.id, + to_skip_partition_too.id, + non_existing_partition_id, + ]) + .await + .unwrap(); + assert_eq!(skipped_partition_records.len(), 2); + assert_eq!( + skipped_partition_records[0].partition_id, + to_skip_partition.id + ); + assert_eq!( + skipped_partition_records[1].partition_id, + to_skip_partition_too.id + ); + + // Delete the skipped compactions + let deleted_skipped_compaction = repos + .partitions() + .delete_skipped_compactions(to_skip_partition.id) + .await + .unwrap() + .expect("The skipped compaction should have been returned"); + assert_eq!( + deleted_skipped_compaction.partition_id, + to_skip_partition.id + ); + assert_eq!(deleted_skipped_compaction.reason, "I'm on fire"); + assert_eq!(deleted_skipped_compaction.num_files, 11); + assert_eq!(deleted_skipped_compaction.limit_num_files, 12); + assert_eq!(deleted_skipped_compaction.estimated_bytes, 110); + assert_eq!(deleted_skipped_compaction.limit_bytes, 120); + // + let deleted_skipped_compaction = repos + .partitions() + .delete_skipped_compactions(to_skip_partition_too.id) + .await + .unwrap() + .expect("The skipped compaction should have been returned"); + assert_eq!( + deleted_skipped_compaction.partition_id, + to_skip_partition_too.id + ); + assert_eq!(deleted_skipped_compaction.reason, "I am le tired too"); + // + let skipped_partition_records = repos + .partitions() + .get_in_skipped_compactions(&[to_skip_partition.id]) + .await + .unwrap(); + assert!(skipped_partition_records.is_empty()); + + let not_deleted_skipped_compaction = repos + .partitions() + .delete_skipped_compactions(to_skip_partition.id) + .await + .unwrap(); + + assert!( + not_deleted_skipped_compaction.is_none(), + "There should be no skipped compation", + ); + + let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap(); + assert!( + skipped_compactions.is_empty(), + "Expected no skipped compactions, got: {skipped_compactions:?}" + ); + + let recent = repos + .partitions() + .most_recent_n(10) + .await + .expect("should list most 
recent");
+    assert_eq!(recent.len(), 4);
+
+    // Test: sort_key_ids from most_recent_n
+    // Only the first two partitions (to_skip_partition_too and to_skip_partition) have values; the others are unset
+    assert_eq!(
+        recent[0].sort_key_ids().unwrap(),
+        &SortKeyIds::from(vec![3, 4])
+    );
+    assert_eq!(
+        recent[1].sort_key_ids().unwrap(),
+        &SortKeyIds::from(vec![2, 1, 4, 3])
+    );
+    assert!(recent[2].sort_key_ids().is_none());
+    assert!(recent[3].sort_key_ids().is_none());
+
+    let recent = repos
+        .partitions()
+        .most_recent_n(4)
+        .await
+        .expect("should list most recent");
+    assert_eq!(recent.len(), 4); // no off by one error
+
+    let recent = repos
+        .partitions()
+        .most_recent_n(2)
+        .await
+        .expect("should list most recent");
+    assert_eq!(recent.len(), 2);
+
+    repos
+        .namespaces()
+        .soft_delete("namespace_partition_test")
+        .await
+        .expect("delete namespace should succeed");
+}
+
+async fn validate_partition_snapshot(repos: &mut dyn RepoCollection, snapshot: &PartitionSnapshot) {
+    // compare files
+    let mut expected = repos
+        .parquet_files()
+        .list_by_partition_not_to_delete_batch(vec![snapshot.partition_id()])
+        .await
+        .unwrap();
+    expected.sort_unstable_by_key(|x| x.id);
+    let mut actual = snapshot.files().collect::<Result<Vec<_>, _>>().unwrap();
+    actual.sort_unstable_by_key(|x| x.id);
+    assert_eq!(expected, actual);
+
+    // compare skipped compaction record
+    let expected = repos
+        .partitions()
+        .get_in_skipped_compactions(&[snapshot.partition_id()])
+        .await
+        .unwrap()
+        .into_iter()
+        .next();
+    let actual = snapshot.skipped_compaction();
+    assert_eq!(actual, expected);
+
+    // compare partition itself
+    let actual = snapshot.partition().unwrap();
+    let expected = repos
+        .partitions()
+        .get_by_id(snapshot.partition_id())
+        .await
+        .unwrap()
+        .unwrap();
+    assert_eq!(actual, expected);
+}
+
+async fn validate_table_snapshot(repos: &mut dyn RepoCollection, snapshot: &TableSnapshot) {
+    let table = snapshot.table().unwrap();
+
+    let expected = repos.tables().get_by_id(table.id).await.unwrap().unwrap();
+    assert_eq!(table, expected);
+
+    // compare columns
+    let mut expected = repos.columns().list_by_table_id(table.id).await.unwrap();
+    expected.sort_unstable_by_key(|x| x.id);
+    let mut actual = snapshot.columns().collect::<Result<Vec<_>, _>>().unwrap();
+    actual.sort_unstable_by_key(|x| x.id);
+    assert_eq!(expected, actual);
+
+    // compare partitions
+    let mut expected = repos.partitions().list_by_table_id(table.id).await.unwrap();
+    expected.sort_unstable_by_key(|x| x.id);
+    let mut actual = snapshot
+        .partitions()
+        .collect::<Result<Vec<_>, _>>()
+        .unwrap();
+    actual.sort_unstable_by_key(|x| x.id());
+    assert_eq!(expected.len(), actual.len());
+
+    let eq = expected
+        .iter()
+        .zip(&actual)
+        .all(|(l, r)| l.id == r.id() && l.partition_key.as_bytes() == r.key());
+    assert!(eq, "expected {expected:?} got {actual:?}");
+}
+
+/// List all parquet files in the given namespace.
+async fn list_parquet_files_by_namespace_not_to_delete(
+    catalog: Arc<dyn Catalog>,
+    namespace_id: NamespaceId,
+) -> Vec<ParquetFile> {
+    let partitions = futures::stream::iter(
+        catalog
+            .repositories()
+            .tables()
+            .list_by_namespace_id(namespace_id)
+            .await
+            .unwrap(),
+    )
+    .then(|t| {
+        let catalog = Arc::clone(&catalog);
+        async move {
+            futures::stream::iter(
+                catalog
+                    .repositories()
+                    .partitions()
+                    .list_by_table_id(t.id)
+                    .await
+                    .unwrap(),
+            )
+        }
+    })
+    .flatten()
+    .map(|p| p.id)
+    .collect::<Vec<_>>()
+    .await;
+
+    catalog
+        .repositories()
+        .parquet_files()
+        .list_by_partition_not_to_delete_batch(partitions)
+        .await
+        .unwrap()
+}
+
+/// Tests many interactions with the catalog and parquet files. See the individual conditions
+/// herein.
+async fn test_parquet_file(catalog: Arc<dyn Catalog>) {
+    let mut repos = catalog.repositories();
+    let namespace = arbitrary_namespace(&mut *repos, "namespace_parquet_file_test").await;
+    let table = arbitrary_table(&mut *repos, "test_table", &namespace).await;
+    let other_table = arbitrary_table(&mut *repos, "other", &namespace).await;
+    let partition = repos
+        .partitions()
+        .create_or_get("one".into(), table.id)
+        .await
+        .unwrap();
+    let other_partition = repos
+        .partitions()
+        .create_or_get("one".into(), other_table.id)
+        .await
+        .unwrap();
+
+    let ts1 = repos.tables().snapshot(table.id).await.unwrap();
+    validate_table_snapshot(repos.as_mut(), &ts1).await;
+
+    let ts2 = repos.tables().snapshot(other_table.id).await.unwrap();
+    validate_table_snapshot(repos.as_mut(), &ts2).await;
+
+    let parquet_file_params = arbitrary_parquet_file_params(&namespace, &table, &partition);
+    let parquet_file = repos
+        .parquet_files()
+        .create(parquet_file_params.clone())
+        .await
+        .unwrap();
+
+    // verify we can get it by its object store id
+    let pfg = repos
+        .parquet_files()
+        .get_by_object_store_id(parquet_file.object_store_id)
+        .await
+        .unwrap();
+    assert_eq!(parquet_file, pfg.unwrap());
+
+    // verify that trying to create a file with the same UUID throws an error
+    let err = repos
+        .parquet_files()
+        .create(parquet_file_params.clone())
+        .await
+        .unwrap_err();
+    assert!(matches!(err, Error::AlreadyExists { ..
})); + + let other_params = ParquetFileParams { + table_id: other_partition.table_id, + partition_id: other_partition.id, + partition_hash_id: other_partition.hash_id().cloned(), + object_store_id: ObjectStoreId::new(), + min_time: Timestamp::new(50), + max_time: Timestamp::new(60), + ..parquet_file_params.clone() + }; + let other_file = repos.parquet_files().create(other_params).await.unwrap(); + + let exist_id = parquet_file.id; + let non_exist_id = ParquetFileId::new(other_file.id.get() + 10); + // make sure exists_id != non_exist_id + assert_ne!(exist_id, non_exist_id); + + // verify that to_delete is initially set to null and the file does not get deleted + assert!(parquet_file.to_delete.is_none()); + let older_than = Timestamp::new( + (catalog.time_provider().now() + Duration::from_secs(100)).timestamp_nanos(), + ); + let deleted = repos + .parquet_files() + .delete_old_ids_only(older_than) + .await + .unwrap(); + assert!(deleted.is_empty()); + + // test list_all that includes soft-deleted file + // at this time the file is not soft-deleted yet and will be included in the returned list + let files = + list_parquet_files_by_namespace_not_to_delete(Arc::clone(&catalog), namespace.id).await; + assert_eq!(files.len(), 2); + + // verify to_delete can be updated to a timestamp + repos + .parquet_files() + .create_upgrade_delete( + parquet_file.partition_id, + &[parquet_file.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) + .await + .unwrap(); + + // test list_all that includes soft-deleted file + // at this time the file is soft-deleted and will be NOT included in the returned list + let files = + list_parquet_files_by_namespace_not_to_delete(Arc::clone(&catalog), namespace.id).await; + assert_eq!(files.len(), 1); + + // the deleted file can still be retrieved by UUID though + repos + .parquet_files() + .get_by_object_store_id(parquet_file.object_store_id) + .await + .unwrap() + .unwrap(); + + // File is not deleted if it was marked to be deleted after the specified time + let before_deleted = Timestamp::new( + (catalog.time_provider().now() - Duration::from_secs(100)).timestamp_nanos(), + ); + let deleted = repos + .parquet_files() + .delete_old_ids_only(before_deleted) + .await + .unwrap(); + assert!(deleted.is_empty()); + + // not hard-deleted yet + repos + .parquet_files() + .get_by_object_store_id(parquet_file.object_store_id) + .await + .unwrap() + .unwrap(); + + // File is deleted if it was marked to be deleted before the specified time + let deleted = repos + .parquet_files() + .delete_old_ids_only(older_than) + .await + .unwrap(); + assert_eq!(deleted.len(), 1); + assert_eq!(parquet_file.object_store_id, deleted[0]); + + // test list_all that includes soft-deleted file + // at this time the file is hard deleted -> the returned list is empty + assert!(repos + .parquet_files() + .get_by_object_store_id(parquet_file.object_store_id) + .await + .unwrap() + .is_none()); + + // test list + let files = + list_parquet_files_by_namespace_not_to_delete(Arc::clone(&catalog), namespace.id).await; + assert_eq!(vec![other_file.clone()], files); + + // test list_by_namespace_not_to_delete + let namespace2 = arbitrary_namespace(&mut *repos, "namespace_parquet_file_test1").await; + let table2 = arbitrary_table(&mut *repos, "test_table2", &namespace2).await; + let partition2 = repos + .partitions() + .create_or_get("foo".into(), table2.id) + .await + .unwrap(); + let files = + list_parquet_files_by_namespace_not_to_delete(Arc::clone(&catalog), namespace2.id).await; + 
assert!(files.is_empty()); + + let ts3 = repos.tables().snapshot(table2.id).await.unwrap(); + validate_table_snapshot(repos.as_mut(), &ts3).await; + + let f1_params = ParquetFileParams { + table_id: partition2.table_id, + partition_id: partition2.id, + partition_hash_id: partition2.hash_id().cloned(), + namespace_id: namespace2.id, + object_store_id: ObjectStoreId::new(), + min_time: Timestamp::new(1), + max_time: Timestamp::new(10), + ..parquet_file_params + }; + let f1 = repos + .parquet_files() + .create(f1_params.clone()) + .await + .unwrap(); + + let f2_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + min_time: Timestamp::new(50), + max_time: Timestamp::new(60), + ..f1_params.clone() + }; + let f2 = repos + .parquet_files() + .create(f2_params.clone()) + .await + .unwrap(); + let files = + list_parquet_files_by_namespace_not_to_delete(Arc::clone(&catalog), namespace2.id).await; + assert_eq!(vec![f1.clone(), f2.clone()], files); + + let f3_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + min_time: Timestamp::new(50), + max_time: Timestamp::new(60), + ..f2_params + }; + let f3 = repos + .parquet_files() + .create(f3_params.clone()) + .await + .unwrap(); + let files = + list_parquet_files_by_namespace_not_to_delete(Arc::clone(&catalog), namespace2.id).await; + assert_eq!(vec![f1.clone(), f2.clone(), f3.clone()], files); + + let s1 = repos.partitions().snapshot(partition2.id).await.unwrap(); + validate_partition_snapshot(repos.as_mut(), &s1).await; + + repos + .parquet_files() + .create_upgrade_delete( + f2.partition_id, + &[f2.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) + .await + .unwrap(); + let files = + list_parquet_files_by_namespace_not_to_delete(Arc::clone(&catalog), namespace2.id).await; + assert_eq!(vec![f1.clone(), f3.clone()], files); + + // Cannot delete file twice + let err = repos + .parquet_files() + .create_upgrade_delete( + partition2.id, + &[f2.object_store_id, f3.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) + .await + .unwrap_err(); + assert_matches!(err, Error::NotFound { .. }); + + let err = repos + .parquet_files() + .create_upgrade_delete( + partition2.id, + &[f2.object_store_id], + &[f3.object_store_id], + &[], + CompactionLevel::Initial, + ) + .await + .unwrap_err(); + assert_matches!(err, Error::NotFound { .. }); + + // Cannot upgrade deleted file + let err = repos + .parquet_files() + .create_upgrade_delete( + partition2.id, + &[f3.object_store_id], + &[f2.object_store_id], + &[], + CompactionLevel::Initial, + ) + .await + .unwrap_err(); + assert_matches!(err, Error::NotFound { .. 
}); + + // Failed transactions don't modify + let files = + list_parquet_files_by_namespace_not_to_delete(Arc::clone(&catalog), namespace2.id).await; + assert_eq!(vec![f1.clone(), f3.clone()], files); + + let s2 = repos.partitions().snapshot(partition2.id).await.unwrap(); + assert_gt(s2.generation(), s1.generation()); + validate_partition_snapshot(repos.as_mut(), &s2).await; + + let files = list_parquet_files_by_namespace_not_to_delete( + Arc::clone(&catalog), + NamespaceId::new(i64::MAX), + ) + .await; + assert!(files.is_empty()); + + // test delete_old_ids_only + let older_than = Timestamp::new( + (catalog.time_provider().now() + Duration::from_secs(100)).timestamp_nanos(), + ); + let ids = repos + .parquet_files() + .delete_old_ids_only(older_than) + .await + .unwrap(); + assert_eq!(ids.len(), 1); + + let s3 = repos.partitions().snapshot(partition2.id).await.unwrap(); + assert_ge(s3.generation(), s2.generation()); // no new snapshot required, but some backends will generate a new one + validate_partition_snapshot(repos.as_mut(), &s3).await; + + // test retention-based flagging for deletion + // Since mem catalog has default retention 1 hour, let us first set it to 0 means infinite + let namespaces = repos + .namespaces() + .list(SoftDeletedRows::AllRows) + .await + .expect("listing namespaces"); + for namespace in namespaces { + repos + .namespaces() + .update_retention_period(&namespace.name, None) // infinite + .await + .unwrap(); + } + + // 1. with no retention period set on the ns, nothing should get flagged + let ids = repos + .parquet_files() + .flag_for_delete_by_retention() + .await + .unwrap(); + assert!(ids.is_empty()); + // 2. set ns retention period to one hour then create some files before and after and + // ensure correct files get deleted + repos + .namespaces() + .update_retention_period(&namespace2.name, Some(60 * 60 * 1_000_000_000)) // 1 hour + .await + .unwrap(); + let f4_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + max_time: Timestamp::new( + // a bit over an hour ago + (catalog.time_provider().now() - Duration::from_secs(60 * 65)).timestamp_nanos(), + ), + ..f3_params + }; + let f4 = repos + .parquet_files() + .create(f4_params.clone()) + .await + .unwrap(); + let f5_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + max_time: Timestamp::new( + // a bit under an hour ago + (catalog.time_provider().now() - Duration::from_secs(60 * 55)).timestamp_nanos(), + ), + ..f4_params + }; + let f5 = repos + .parquet_files() + .create(f5_params.clone()) + .await + .unwrap(); + let ids = repos + .parquet_files() + .flag_for_delete_by_retention() + .await + .unwrap(); + assert!(ids.len() > 1); // it's also going to flag f1, f2 & f3 because they have low max + // timestamps but i don't want this test to be brittle if those + // values change so i'm not asserting len == 4 + let f4 = repos + .parquet_files() + .get_by_object_store_id(f4.object_store_id) + .await + .unwrap() + .unwrap(); + assert_matches!(f4.to_delete, Some(_)); // f4 is > 1hr old + let f5 = repos + .parquet_files() + .get_by_object_store_id(f5.object_store_id) + .await + .unwrap() + .unwrap(); + assert_matches!(f5.to_delete, None); // f5 is < 1hr old + + let s4 = repos.partitions().snapshot(partition2.id).await.unwrap(); + assert_gt(s4.generation(), s3.generation()); + validate_partition_snapshot(repos.as_mut(), &s4).await; + + // call flag_for_delete_by_retention() again and nothing should be flagged because they've + // already been flagged + let ids = repos + 
.parquet_files() + .flag_for_delete_by_retention() + .await + .unwrap(); + assert!(ids.is_empty()); + + // test that flag_for_delete_by_retention respects UPDATE LIMIT + // create limit + the meaning of life parquet files that are all older than the retention (>1hr) + const LIMIT: usize = 1000; + const MOL: usize = 42; + let now = catalog.time_provider().now(); + let params = (0..LIMIT + MOL) + .map(|_| { + ParquetFileParams { + object_store_id: ObjectStoreId::new(), + max_time: Timestamp::new( + // a bit over an hour ago + (now - Duration::from_secs(60 * 65)).timestamp_nanos(), + ), + ..f1_params.clone() + } + }) + .collect::>(); + repos + .parquet_files() + .create_upgrade_delete( + f1_params.partition_id, + &[], + &[], + ¶ms, + CompactionLevel::Initial, + ) + .await + .unwrap(); + let ids = repos + .parquet_files() + .flag_for_delete_by_retention() + .await + .unwrap(); + assert_eq!(ids.len(), LIMIT); + let ids = repos + .parquet_files() + .flag_for_delete_by_retention() + .await + .unwrap(); + assert_eq!(ids.len(), MOL); // second call took remainder + let ids = repos + .parquet_files() + .flag_for_delete_by_retention() + .await + .unwrap(); + assert_eq!(ids.len(), 0); // none left + + // test create_update_delete + let f6_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + ..f5_params + }; + let f6 = repos + .parquet_files() + .create(f6_params.clone()) + .await + .unwrap(); + + let f7_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + ..f6_params + }; + let f1_uuid = f1.object_store_id; + let f6_uuid = f6.object_store_id; + let f5_uuid = f5.object_store_id; + let cud = repos + .parquet_files() + .create_upgrade_delete( + f5.partition_id, + &[f5.object_store_id], + &[f6.object_store_id], + &[f7_params.clone()], + CompactionLevel::Final, + ) + .await + .unwrap(); + + assert_eq!(cud.len(), 1); + let f5_delete = repos + .parquet_files() + .get_by_object_store_id(f5_uuid) + .await + .unwrap() + .unwrap(); + assert_matches!(f5_delete.to_delete, Some(_)); + + let f6_compaction_level = repos + .parquet_files() + .get_by_object_store_id(f6_uuid) + .await + .unwrap() + .unwrap(); + + assert_matches!(f6_compaction_level.compaction_level, CompactionLevel::Final); + + let f7 = repos + .parquet_files() + .get_by_object_store_id(f7_params.object_store_id) + .await + .unwrap() + .unwrap(); + + let f7_uuid = f7.object_store_id; + + // test create_update_delete transaction (rollback because f7 already exists) + let cud = repos + .parquet_files() + .create_upgrade_delete( + partition2.id, + &[], + &[], + &[f7_params.clone()], + CompactionLevel::Final, + ) + .await; + + assert_matches!( + cud, + Err(Error::AlreadyExists { + descr + }) if descr == f7_params.object_store_id.to_string() + ); + + let f1_to_delete = repos + .parquet_files() + .get_by_object_store_id(f1_uuid) + .await + .unwrap() + .unwrap(); + assert_matches!(f1_to_delete.to_delete, Some(_)); + + let f7_not_delete = repos + .parquet_files() + .get_by_object_store_id(f7_uuid) + .await + .unwrap() + .unwrap(); + assert_matches!(f7_not_delete.to_delete, None); + + // test exists_by_object_store_id_batch returns parquet files by object store id + let does_not_exist = ObjectStoreId::new(); + let mut present = repos + .parquet_files() + .exists_by_object_store_id_batch(vec![f1_uuid, f7_uuid, does_not_exist]) + .await + .unwrap(); + let mut expected = vec![f1_uuid, f7_uuid]; + present.sort(); + expected.sort(); + assert_eq!(present, expected); + + let s5 = 
repos.partitions().snapshot(partition2.id).await.unwrap(); + assert_gt(s5.generation(), s4.generation()); + validate_partition_snapshot(repos.as_mut(), &s5).await; + + // Cannot mix partition IDs + let partition3 = repos + .partitions() + .create_or_get("three".into(), table.id) + .await + .unwrap(); + + let ts4 = repos.tables().snapshot(table.id).await.unwrap(); + validate_table_snapshot(repos.as_mut(), &ts4).await; + assert_gt(ts4.generation(), ts1.generation()); + + let f8_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + partition_id: partition3.id, + ..f7_params + }; + let err = repos + .parquet_files() + .create_upgrade_delete( + partition2.id, + &[f7_uuid], + &[], + &[f8_params.clone()], + CompactionLevel::Final, + ) + .await + .unwrap_err() + .to_string(); + + assert!( + err.contains("Inconsistent ParquetFileParams, expected PartitionId"), + "{err}" + ); + + let list = repos + .parquet_files() + .list_by_partition_not_to_delete_batch(vec![partition2.id]) + .await + .unwrap(); + assert_eq!(list.len(), 2); + + repos + .parquet_files() + .create_upgrade_delete( + partition3.id, + &[], + &[], + &[f8_params.clone()], + CompactionLevel::Final, + ) + .await + .unwrap(); + + let files = repos + .parquet_files() + .list_by_partition_not_to_delete_batch(vec![partition3.id]) + .await + .unwrap(); + assert_eq!(files.len(), 1); + let f8_uuid = files[0].object_store_id; + + let files = repos + .parquet_files() + .list_by_partition_not_to_delete_batch(vec![]) + .await + .unwrap(); + assert_eq!(files.len(), 0); + let files = repos + .parquet_files() + .list_by_partition_not_to_delete_batch(vec![partition2.id, partition3.id]) + .await + .unwrap(); + assert_eq!(files.len(), 3); + let files = repos + .parquet_files() + .list_by_partition_not_to_delete_batch(vec![ + partition2.id, + PartitionId::new(i64::MAX), + partition3.id, + partition2.id, + ]) + .await + .unwrap(); + assert_eq!(files.len(), 3); + + let err = repos + .parquet_files() + .create_upgrade_delete(partition2.id, &[f8_uuid], &[], &[], CompactionLevel::Final) + .await + .unwrap_err(); + + assert_matches!(err, Error::NotFound { .. }); + + let err = repos + .parquet_files() + .create_upgrade_delete(partition2.id, &[], &[f8_uuid], &[], CompactionLevel::Final) + .await + .unwrap_err(); + + assert_matches!(err, Error::NotFound { .. }); + + repos + .parquet_files() + .create_upgrade_delete(partition3.id, &[f8_uuid], &[], &[], CompactionLevel::Final) + .await + .unwrap(); + + // take snapshot of unknown partition + let err = repos + .partitions() + .snapshot(PartitionId::new(i64::MAX)) + .await + .unwrap_err(); + assert_matches!(err, Error::NotFound { .. 
}); +} + +async fn test_parquet_file_delete_broken(catalog: Arc) { + let mut repos = catalog.repositories(); + let namespace_1 = arbitrary_namespace(&mut *repos, "retention_broken_1").await; + let namespace_2 = repos + .namespaces() + .create( + &NamespaceName::new("retention_broken_2").unwrap(), + None, + Some(1), + None, + ) + .await + .unwrap(); + let table_1 = arbitrary_table(&mut *repos, "test_table", &namespace_1).await; + let table_2 = arbitrary_table(&mut *repos, "test_table", &namespace_2).await; + let partition_1 = repos + .partitions() + .create_or_get("one".into(), table_1.id) + .await + .unwrap(); + let partition_2 = repos + .partitions() + .create_or_get("one".into(), table_2.id) + .await + .unwrap(); + + let parquet_file_params_1 = arbitrary_parquet_file_params(&namespace_1, &table_1, &partition_1); + let parquet_file_params_2 = arbitrary_parquet_file_params(&namespace_2, &table_2, &partition_2); + let _parquet_file_1 = repos + .parquet_files() + .create(parquet_file_params_1) + .await + .unwrap(); + let parquet_file_2 = repos + .parquet_files() + .create(parquet_file_params_2) + .await + .unwrap(); + + let ids = repos + .parquet_files() + .flag_for_delete_by_retention() + .await + .unwrap(); + assert_eq!( + ids, + vec![(parquet_file_2.partition_id, parquet_file_2.object_store_id)] + ); +} + +async fn test_partitions_new_file_between(catalog: Arc) { + let mut repos = catalog.repositories(); + let namespace = arbitrary_namespace(&mut *repos, "test_partitions_new_file_between").await; + let table = arbitrary_table(&mut *repos, "test_table_for_new_file_between", &namespace).await; + + // param for the tests + let time_now = Timestamp::from(catalog.time_provider().now()); + let time_one_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(1)); + let time_two_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(2)); + let time_three_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(3)); + let time_five_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(5)); + let time_six_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(6)); + + // Db has no partitions + let partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert!(partitions.is_empty()); + + // ----------------- + // PARTITION one + // The DB has 1 partition but it does not have any file + let partition1 = repos + .partitions() + .create_or_get("one".into(), table.id) + .await + .unwrap(); + let partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert!(partitions.is_empty()); + + // create files for partition one + let parquet_file_params = arbitrary_parquet_file_params(&namespace, &table, &partition1); + + // create a deleted L0 file that was created 3 hours ago + let delete_l0_file = repos + .parquet_files() + .create(parquet_file_params.clone()) + .await + .unwrap(); + repos + .parquet_files() + .create_upgrade_delete( + delete_l0_file.partition_id, + &[delete_l0_file.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) + .await + .unwrap(); + let partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert!(partitions.is_empty()); + let partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, Some(time_one_hour_ago)) + .await + .unwrap(); + assert!(partitions.is_empty()); + let partitions = repos + .partitions() + 
.partitions_new_file_between(time_three_hour_ago, Some(time_one_hour_ago)) + .await + .unwrap(); + assert!(partitions.is_empty()); + + // create a deleted L0 file that was created 1 hour ago + let l0_one_hour_ago_file_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + created_at: time_one_hour_ago, + ..parquet_file_params.clone() + }; + repos + .parquet_files() + .create(l0_one_hour_ago_file_params.clone()) + .await + .unwrap(); + // partition one should be returned + let partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + let partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + let partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + let partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_two_hour_ago)) + .await + .unwrap(); + assert!(partitions.is_empty()); + + // ----------------- + // PARTITION two + // Partition two without any file + let partition2 = repos + .partitions() + .create_or_get("two".into(), table.id) + .await + .unwrap(); + // should return partition one only + let partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + let partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + + // Add a L0 file created 5 hours ago + let l0_five_hour_ago_file_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + created_at: time_five_hour_ago, + partition_id: partition2.id, + partition_hash_id: partition2.hash_id().cloned(), + ..parquet_file_params.clone() + }; + repos + .parquet_files() + .create(l0_five_hour_ago_file_params.clone()) + .await + .unwrap(); + // still return partition one only + let partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + let partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + let partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + // Between six and three hours ago, return only partition 2 + let partitions = repos + .partitions() + .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition2.id); + + // Add an L1 file created just now + let l1_file_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + created_at: time_now, + partition_id: partition2.id, + partition_hash_id: partition2.hash_id().cloned(), + compaction_level: CompactionLevel::FileNonOverlapped, + 
..parquet_file_params.clone() + }; + repos + .parquet_files() + .create(l1_file_params.clone()) + .await + .unwrap(); + // should return both partitions + let mut partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert_eq!(partitions.len(), 2); + partitions.sort(); + assert_eq!(partitions[0], partition1.id); + assert_eq!(partitions[1], partition2.id); + // Only return partition1: the creation time must be strictly less than the maximum time, + // not equal + let mut partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + partitions.sort(); + assert_eq!(partitions[0], partition1.id); + // Between six and three hours ago, return none + let partitions = repos + .partitions() + .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) + .await + .unwrap(); + assert!(partitions.is_empty()); + + // ----------------- + // PARTITION three + // Partition three without any file + let partition3 = repos + .partitions() + .create_or_get("three".into(), table.id) + .await + .unwrap(); + // should return partition one and two only + let mut partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert_eq!(partitions.len(), 2); + partitions.sort(); + assert_eq!(partitions[0], partition1.id); + assert_eq!(partitions[1], partition2.id); + // Only return partition1: the creation time must be strictly less than the maximum time, + // not equal + let partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + // When the maximum time is greater than the creation time of partition2, return it + let mut partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now + 1)) + .await + .unwrap(); + assert_eq!(partitions.len(), 2); + partitions.sort(); + assert_eq!(partitions[0], partition1.id); + assert_eq!(partitions[1], partition2.id); + // Between six and three hours ago, return none + let partitions = repos + .partitions() + .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) + .await + .unwrap(); + assert!(partitions.is_empty()); + + // Add an L2 file created just now for partition three + let l2_file_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + created_at: time_now, + partition_id: partition3.id, + partition_hash_id: partition3.hash_id().cloned(), + compaction_level: CompactionLevel::Final, + ..parquet_file_params.clone() + }; + repos + .parquet_files() + .create(l2_file_params.clone()) + .await + .unwrap(); + // now should return partition one two and three + let mut partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert_eq!(partitions.len(), 3); + partitions.sort(); + assert_eq!(partitions[0], partition1.id); + assert_eq!(partitions[1], partition2.id); + assert_eq!(partitions[2], partition3.id); + // Only return partition1: the creation time must be strictly less than the maximum time, + // not equal + let partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + // Between six and three hours ago, return none + let 
partitions = repos + .partitions() + .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) + .await + .unwrap(); + assert!(partitions.is_empty()); + + // add an L0 file created one hour ago for partition three + let l0_one_hour_ago_file_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + created_at: time_one_hour_ago, + partition_id: partition3.id, + partition_hash_id: partition3.hash_id().cloned(), + ..parquet_file_params.clone() + }; + repos + .parquet_files() + .create(l0_one_hour_ago_file_params.clone()) + .await + .unwrap(); + // should return all partitions + let mut partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert_eq!(partitions.len(), 3); + partitions.sort(); + assert_eq!(partitions[0], partition1.id); + assert_eq!(partitions[1], partition2.id); + assert_eq!(partitions[2], partition3.id); + // Only return partitions 1 and 3; 2 was created just now + let mut partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 2); + partitions.sort(); + assert_eq!(partitions[0], partition1.id); + assert_eq!(partitions[1], partition3.id); + // Between six and three hours ago, return none + let partitions = repos + .partitions() + .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) + .await + .unwrap(); + assert!(partitions.is_empty()); +} + +async fn test_list_by_partiton_not_to_delete(catalog: Arc) { + let mut repos = catalog.repositories(); + let namespace = arbitrary_namespace( + &mut *repos, + "namespace_parquet_file_test_list_by_partiton_not_to_delete", + ) + .await; + let table = arbitrary_table(&mut *repos, "test_table", &namespace).await; + + let partition = repos + .partitions() + .create_or_get("test_list_by_partiton_not_to_delete_one".into(), table.id) + .await + .unwrap(); + let partition2 = repos + .partitions() + .create_or_get("test_list_by_partiton_not_to_delete_two".into(), table.id) + .await + .unwrap(); + + let parquet_file_params = arbitrary_parquet_file_params(&namespace, &table, &partition); + + let parquet_file = repos + .parquet_files() + .create(parquet_file_params.clone()) + .await + .unwrap(); + let delete_file_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + ..parquet_file_params.clone() + }; + let delete_file = repos + .parquet_files() + .create(delete_file_params) + .await + .unwrap(); + repos + .parquet_files() + .create_upgrade_delete( + partition.id, + &[delete_file.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) + .await + .unwrap(); + let level1_file_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + ..parquet_file_params.clone() + }; + let mut level1_file = repos + .parquet_files() + .create(level1_file_params) + .await + .unwrap(); + repos + .parquet_files() + .create_upgrade_delete( + partition.id, + &[], + &[level1_file.object_store_id], + &[], + CompactionLevel::FileNonOverlapped, + ) + .await + .unwrap(); + level1_file.compaction_level = CompactionLevel::FileNonOverlapped; + + let other_partition_params = ParquetFileParams { + partition_id: partition2.id, + partition_hash_id: partition2.hash_id().cloned(), + object_store_id: ObjectStoreId::new(), + ..parquet_file_params.clone() + }; + let _partition2_file = repos + .parquet_files() + .create(other_partition_params) + .await + .unwrap(); + + let files = repos + .parquet_files() + 
.list_by_partition_not_to_delete_batch(vec![partition.id])
+        .await
+        .unwrap();
+    assert_eq!(files.len(), 2);
+
+    let mut file_ids: Vec<_> = files.into_iter().map(|f| f.id).collect();
+    file_ids.sort();
+    let mut expected_ids = vec![parquet_file.id, level1_file.id];
+    expected_ids.sort();
+    assert_eq!(file_ids, expected_ids);
+
+    // Using the catalog partition ID should return the same files, even if the Parquet file
+    // records don't have the partition ID on them (which is the default now)
+    let files = repos
+        .parquet_files()
+        .list_by_partition_not_to_delete_batch(vec![partition.id])
+        .await
+        .unwrap();
+    assert_eq!(files.len(), 2);
+
+    let mut file_ids: Vec<_> = files.into_iter().map(|f| f.id).collect();
+    file_ids.sort();
+    let mut expected_ids = vec![parquet_file.id, level1_file.id];
+    expected_ids.sort();
+    assert_eq!(file_ids, expected_ids);
+}
+
+async fn test_update_to_compaction_level_1(catalog: Arc<dyn Catalog>) {
+    let mut repos = catalog.repositories();
+    let namespace =
+        arbitrary_namespace(&mut *repos, "namespace_update_to_compaction_level_1_test").await;
+    let table = arbitrary_table(&mut *repos, "update_table", &namespace).await;
+    let partition = repos
+        .partitions()
+        .create_or_get("test_update_to_compaction_level_1_one".into(), table.id)
+        .await
+        .unwrap();
+
+    // Set up the window of times we're interested in level 1 files for
+    let query_min_time = Timestamp::new(5);
+    let query_max_time = Timestamp::new(10);
+
+    // Create a file with times entirely within the window
+    let mut parquet_file_params = arbitrary_parquet_file_params(&namespace, &table, &partition);
+    parquet_file_params.min_time = query_min_time + 1;
+    parquet_file_params.max_time = query_max_time - 1;
+    let parquet_file = repos
+        .parquet_files()
+        .create(parquet_file_params.clone())
+        .await
+        .unwrap();
+
+    // Create a file that will remain as level 0
+    let level_0_params = ParquetFileParams {
+        object_store_id: ObjectStoreId::new(),
+        ..parquet_file_params.clone()
+    };
+    repos.parquet_files().create(level_0_params).await.unwrap();
+
+    // Make parquet_file compaction level 1
+    let created = repos
+        .parquet_files()
+        .create_upgrade_delete(
+            parquet_file.partition_id,
+            &[],
+            &[parquet_file.object_store_id],
+            &[],
+            CompactionLevel::FileNonOverlapped,
+        )
+        .await
+        .unwrap();
+    assert_eq!(created, vec![]);
+
+    // remove namespace to avoid it affecting later tests
+    repos
+        .namespaces()
+        .soft_delete("namespace_update_to_compaction_level_1_test")
+        .await
+        .expect("delete namespace should succeed");
+}
+
+/// Assert that a namespace deletion does NOT cascade to the tables/schema
+/// items/parquet files/etc.
+///
+/// Removal of these entities breaks the invariant that, once created, a row
+/// always exists for the lifetime of an IOx process, and causes the system
+/// to panic in multiple components. It's also ineffective, because most
+/// components maintain a cache of at least one of these entities.
+///
+/// Instead, soft-deleted namespaces should have their files GC'd like a
+/// normal parquet file deletion, removing the rows once they're no longer
+/// being actively used by the system. This is done by waiting a long time
+/// before deleting records, and whilst it isn't perfect, it is largely
+/// effective.
+async fn test_delete_namespace(catalog: Arc) { + let mut repos = catalog.repositories(); + let namespace_1 = arbitrary_namespace(&mut *repos, "namespace_test_delete_namespace_1").await; + let table_1 = arbitrary_table(&mut *repos, "test_table_1", &namespace_1).await; + let _c = repos + .columns() + .create_or_get("column_test_1", table_1.id, ColumnType::Tag) + .await + .unwrap(); + let partition_1 = repos + .partitions() + .create_or_get("test_delete_namespace_one".into(), table_1.id) + .await + .unwrap(); + + // parquet files + let parquet_file_params = arbitrary_parquet_file_params(&namespace_1, &table_1, &partition_1); + repos + .parquet_files() + .create(parquet_file_params.clone()) + .await + .unwrap(); + let parquet_file_params_2 = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + min_time: Timestamp::new(200), + max_time: Timestamp::new(300), + ..parquet_file_params + }; + repos + .parquet_files() + .create(parquet_file_params_2.clone()) + .await + .unwrap(); + + // we've now created a namespace with a table and parquet files. before we test deleting + // it, let's create another so we can ensure that doesn't get deleted. + let namespace_2 = arbitrary_namespace(&mut *repos, "namespace_test_delete_namespace_2").await; + let table_2 = arbitrary_table(&mut *repos, "test_table_2", &namespace_2).await; + let _c = repos + .columns() + .create_or_get("column_test_2", table_2.id, ColumnType::Tag) + .await + .unwrap(); + let partition_2 = repos + .partitions() + .create_or_get("test_delete_namespace_two".into(), table_2.id) + .await + .unwrap(); + + // parquet files + let parquet_file_params = arbitrary_parquet_file_params(&namespace_2, &table_2, &partition_2); + repos + .parquet_files() + .create(parquet_file_params.clone()) + .await + .unwrap(); + let parquet_file_params_2 = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + min_time: Timestamp::new(200), + max_time: Timestamp::new(300), + ..parquet_file_params + }; + repos + .parquet_files() + .create(parquet_file_params_2.clone()) + .await + .unwrap(); + + // now delete namespace_1 and assert it's all gone and none of + // namespace_2 is gone + repos + .namespaces() + .soft_delete("namespace_test_delete_namespace_1") + .await + .expect("delete namespace should succeed"); + // assert that namespace is soft-deleted, but the table, column, and parquet files are all + // still there. + assert!(repos + .namespaces() + .get_by_id(namespace_1.id, SoftDeletedRows::ExcludeDeleted) + .await + .expect("get namespace should succeed") + .is_none()); + assert_eq!( + repos + .namespaces() + .get_by_id(namespace_1.id, SoftDeletedRows::AllRows) + .await + .expect("get namespace should succeed") + .map(|mut v| { + // The only change after soft-deletion should be the deleted_at + // field being set - this block normalises that field, so that + // the before/after can be asserted as equal. 
+ v.deleted_at = None; + v + }) + .expect("should see soft-deleted row"), + namespace_1 + ); + assert_eq!( + repos + .tables() + .get_by_id(table_1.id) + .await + .expect("get table should succeed") + .expect("should return row"), + table_1 + ); + assert_eq!( + repos + .columns() + .list_by_namespace_id(namespace_1.id) + .await + .expect("listing columns should succeed") + .len(), + 1 + ); + assert_eq!( + repos + .columns() + .list_by_table_id(table_1.id) + .await + .expect("listing columns should succeed") + .len(), + 1 + ); + + // partition's get_by_id should succeed + repos + .partitions() + .get_by_id_batch(&[partition_1.id]) + .await + .unwrap() + .into_iter() + .next() + .unwrap(); + + // assert that the namespace, table, column, and parquet files for namespace_2 are still + // there + assert!(repos + .namespaces() + .get_by_id(namespace_2.id, SoftDeletedRows::ExcludeDeleted) + .await + .expect("get namespace should succeed") + .is_some()); + + assert!(repos + .tables() + .get_by_id(table_2.id) + .await + .expect("get table should succeed") + .is_some()); + assert_eq!( + repos + .columns() + .list_by_namespace_id(namespace_2.id) + .await + .expect("listing columns should succeed") + .len(), + 1 + ); + assert_eq!( + repos + .columns() + .list_by_table_id(table_2.id) + .await + .expect("listing columns should succeed") + .len(), + 1 + ); + + // partition's get_by_id should succeed + repos + .partitions() + .get_by_id_batch(&[partition_2.id]) + .await + .unwrap() + .into_iter() + .next() + .unwrap(); +} + +/// Upsert a namespace called `namespace_name` and write `lines` to it. +async fn populate_namespace( + repos: &mut R, + namespace_name: &str, + lines: &str, +) -> (Namespace, NamespaceSchema) +where + R: RepoCollection + ?Sized, +{ + let namespace = repos + .namespaces() + .create( + &NamespaceName::new(namespace_name).unwrap(), + None, + None, + None, + ) + .await; + + let namespace = match namespace { + Ok(v) => v, + Err(Error::AlreadyExists { .. }) => repos + .namespaces() + .get_by_name(namespace_name, SoftDeletedRows::AllRows) + .await + .unwrap() + .unwrap(), + e @ Err(_) => e.unwrap(), + }; + + let batches = mutable_batch_lp::lines_to_batches(lines, 42).unwrap(); + let batches = batches.iter().map(|(table, batch)| (table.as_str(), batch)); + let ns = NamespaceSchema::new_empty_from(&namespace); + + let schema = validate_or_insert_schema(batches, &ns, repos) + .await + .expect("validate schema failed") + .unwrap_or(ns); + + (namespace, schema) +} + +async fn test_list_schemas(catalog: Arc) { + let mut repos = catalog.repositories(); + + let ns1 = populate_namespace( + repos.deref_mut(), + "ns1", + "cpu,tag=1 field=1i\nanother,tag=1 field=1.0", + ) + .await; + let ns2 = populate_namespace( + repos.deref_mut(), + "ns2", + "cpu,tag=1 field=1i\nsomethingelse field=1u", + ) + .await; + + // Otherwise the in-mem catalog deadlocks.... 
(but not postgres) + drop(repos); + + let got = list_schemas(&*catalog) + .await + .expect("should be able to list the schemas") + .collect::>(); + + assert!(got.contains(&ns1), "{:#?}\n\nwant{:#?}", got, &ns1); + assert!(got.contains(&ns2), "{:#?}\n\nwant{:#?}", got, &ns2); +} + +async fn test_list_schemas_soft_deleted_rows(catalog: Arc) { + let mut repos = catalog.repositories(); + + let ns1 = populate_namespace( + repos.deref_mut(), + "ns1", + "cpu,tag=1 field=1i\nanother,tag=1 field=1.0", + ) + .await; + let ns2 = populate_namespace( + repos.deref_mut(), + "ns2", + "cpu,tag=1 field=1i\nsomethingelse field=1u", + ) + .await; + + repos + .namespaces() + .soft_delete(&ns2.0.name) + .await + .expect("failed to soft delete namespace"); + + // Otherwise the in-mem catalog deadlocks.... (but not postgres) + drop(repos); + + let got = list_schemas(&*catalog) + .await + .expect("should be able to list the schemas") + .collect::>(); + + assert!(got.contains(&ns1), "{:#?}\n\nwant{:#?}", got, &ns1); + assert!(!got.contains(&ns2), "{:#?}\n\n do not want{:#?}", got, &ns2); +} + +/// Ensure that we can create two repo objects and that they instantly share their state. +/// +/// This is a regression test for . +async fn test_two_repos(catalog: Arc) { + let mut repos_1 = catalog.repositories(); + let mut repos_2 = catalog.repositories(); + let repo_1 = repos_1.namespaces(); + let repo_2 = repos_2.namespaces(); + + let namespace_name = NamespaceName::new("test_namespace").unwrap(); + repo_1 + .create(&namespace_name, None, None, None) + .await + .unwrap(); + + repo_2 + .get_by_name(&namespace_name, SoftDeletedRows::AllRows) + .await + .unwrap() + .unwrap(); +} + +async fn test_partition_create_or_get_idempotent(catalog: Arc) { + let mut repos = catalog.repositories(); + + let namespace = arbitrary_namespace(&mut *repos, "ns4").await; + let table_id = arbitrary_table(&mut *repos, "table", &namespace).await.id; + + let key = PartitionKey::from("bananas"); + + let hash_id = PartitionHashId::new(table_id, &key); + + let a = repos + .partitions() + .create_or_get(key.clone(), table_id) + .await + .expect("should create OK"); + + assert_eq!(a.hash_id().unwrap(), &hash_id); + // Test: sort_key_ids from partition_create_or_get_idempotent + assert!(a.sort_key_ids().is_none()); + + // Call create_or_get for the same (key, table_id) pair, to ensure the write is idempotent. + let b = repos + .partitions() + .create_or_get(key.clone(), table_id) + .await + .expect("idempotent write should succeed"); + + assert_eq!(a, b); + + // Check that the hash_id is saved in the database and is returned when queried. 
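// The assertion just below confirms the hash ID round-trips through the
// database. As a sketch of the upsert-with-deterministic-ID pattern this test
// exercises: `create_or_get` is keyed on `(table_id, partition_key)`, so a
// second call returns the already-stored row, and the ID is derived purely
// from that pair. `DefaultHasher` keeps the example dependency-free; the real
// `PartitionHashId` is presumably built from a different, stable hash.

use std::collections::hash_map::{DefaultHasher, Entry, HashMap};
use std::hash::{Hash, Hasher};

#[derive(Debug, Clone, PartialEq, Eq)]
struct PartitionRow {
    hash_id: u64,
    table_id: i64,
    key: String,
}

#[derive(Default)]
struct PartitionStore {
    rows: HashMap<(i64, String), PartitionRow>,
}

impl PartitionStore {
    /// Idempotent upsert: calling twice with the same pair returns equal rows.
    fn create_or_get(&mut self, key: &str, table_id: i64) -> PartitionRow {
        match self.rows.entry((table_id, key.to_string())) {
            Entry::Occupied(e) => e.get().clone(),
            Entry::Vacant(v) => {
                let mut h = DefaultHasher::new();
                (table_id, key).hash(&mut h);
                v.insert(PartitionRow {
                    hash_id: h.finish(),
                    table_id,
                    key: key.to_string(),
                })
                .clone()
            }
        }
    }
}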
+ let table_partitions = repos.partitions().list_by_table_id(table_id).await.unwrap(); + assert_eq!(table_partitions.len(), 1); + assert_eq!(table_partitions[0].hash_id().unwrap(), &hash_id); + + // Test: sort_key_ids from partition_create_or_get_idempotent + assert!(table_partitions[0].sort_key_ids().is_none()); +} + +#[track_caller] +fn assert_metric_hit(metrics: &metric::Registry, name: &'static str) { + let histogram = metrics + .get_instrument::>("catalog_op_duration") + .expect("failed to read metric") + .get_observer(&Attributes::from(&[("op", name), ("result", "success")])) + .expect("failed to get observer") + .fetch(); + + let hit_count = histogram.sample_count(); + assert!(hit_count > 0, "metric did not record any calls"); +} + +async fn test_column_create_or_get_many_unchecked(clean_state: R) +where + R: Fn() -> F + Send + Sync, + F: Future> + Send, +{ + // Issue a few calls to create_or_get_many that contain distinct columns and + // covers the full set of column types. + test_column_create_or_get_many_unchecked_sub( + clean_state().await, + &[ + &[ + ("test1", ColumnType::I64), + ("test2", ColumnType::U64), + ("test3", ColumnType::F64), + ("test4", ColumnType::Bool), + ("test5", ColumnType::String), + ("test6", ColumnType::Time), + ("test7", ColumnType::Tag), + ], + &[("test8", ColumnType::String), ("test9", ColumnType::Bool)], + ], + |res| assert_matches!(res, Ok(_)), + ) + .await; + + // Issue two calls with overlapping columns - request should succeed (upsert + // semantics). + test_column_create_or_get_many_unchecked_sub( + clean_state().await, + &[ + &[ + ("test1", ColumnType::I64), + ("test2", ColumnType::U64), + ("test3", ColumnType::F64), + ("test4", ColumnType::Bool), + ], + &[ + ("test1", ColumnType::I64), + ("test2", ColumnType::U64), + ("test3", ColumnType::F64), + ("test4", ColumnType::Bool), + ("test5", ColumnType::String), + ("test6", ColumnType::Time), + ("test7", ColumnType::Tag), + ("test8", ColumnType::String), + ], + ], + |res| assert_matches!(res, Ok(_)), + ) + .await; + + // Issue two calls with the same columns and types. + test_column_create_or_get_many_unchecked_sub( + clean_state().await, + &[ + &[ + ("test1", ColumnType::I64), + ("test2", ColumnType::U64), + ("test3", ColumnType::F64), + ("test4", ColumnType::Bool), + ], + &[ + ("test1", ColumnType::I64), + ("test2", ColumnType::U64), + ("test3", ColumnType::F64), + ("test4", ColumnType::Bool), + ], + ], + |res| assert_matches!(res, Ok(_)), + ) + .await; + + // Issue two calls with overlapping columns with conflicting types and + // observe a correctly populated ColumnTypeMismatch error. + test_column_create_or_get_many_unchecked_sub( + clean_state().await, + &[ + &[ + ("test1", ColumnType::String), + ("test2", ColumnType::String), + ("test3", ColumnType::String), + ("test4", ColumnType::String), + ], + &[ + ("test1", ColumnType::String), + ("test2", ColumnType::Bool), // This one differs + ("test3", ColumnType::String), + // 4 is missing. 
+ ("test5", ColumnType::String), + ("test6", ColumnType::Time), + ("test7", ColumnType::Tag), + ("test8", ColumnType::String), + ], + ], + |res| assert_matches!(res, Err(e) => { + assert_matches!(e, Error::AlreadyExists { descr } => { + assert_eq!(descr, "column test2 is type string but schema update has type bool"); + }) + }), + ).await; +} + +async fn test_column_create_or_get_many_unchecked_sub( + catalog: Arc, + calls: &[&[(&'static str, ColumnType)]], + want: F, +) where + F: FnOnce(Result, Error>) + Send, +{ + let mut repos = catalog.repositories(); + + let namespace = arbitrary_namespace(&mut *repos, "ns4").await; + let table_id = arbitrary_table(&mut *repos, "table", &namespace).await.id; + + let mut last_got = None; + for insert in calls { + let insert = insert + .iter() + .map(|(n, t)| (*n, *t)) + .collect::>(); + + let got = repos + .columns() + .create_or_get_many_unchecked(table_id, insert.clone()) + .await; + + // The returned columns MUST always match the requested + // column values if successful. + if let Ok(got) = &got { + assert_eq!(insert.len(), got.len()); + + for got in got { + assert_eq!(table_id, got.table_id); + let requested_column_type = insert + .get(got.name.as_str()) + .expect("Should have gotten back a column that was inserted"); + assert_eq!(*requested_column_type, got.column_type,); + } + + assert_metric_hit(&catalog.metrics(), "column_create_or_get_many_unchecked"); + } + + last_got = Some(got); + } + + want(last_got.unwrap()); +} + +/// [`Catalog`] wrapper that is helpful for testing. +#[derive(Debug)] +pub(crate) struct TestCatalog { + hold_onto: Mutex>>, + inner: Arc, +} + +impl TestCatalog { + /// Create new test catalog. + pub(crate) fn new(inner: Arc) -> Self { + Self { + hold_onto: Mutex::new(vec![]), + inner, + } + } + + /// Hold onto given value til dropped. + pub(crate) fn hold_onto(&self, o: T) + where + T: Send + 'static, + { + self.hold_onto.lock().push(Box::new(o) as _) + } +} + +impl Display for TestCatalog { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "test({})", self.inner) + } +} + +#[async_trait] +impl Catalog for TestCatalog { + async fn setup(&self) -> Result<(), Error> { + self.inner.setup().await + } + + fn repositories(&self) -> Box { + self.inner.repositories() + } + + fn metrics(&self) -> Arc { + self.inner.metrics() + } + + fn time_provider(&self) -> Arc { + self.inner.time_provider() + } +} + +#[track_caller] +fn assert_gt(a: T, b: T) +where + T: Display + PartialOrd, +{ + assert!(a > b, "failed: {a} > {b}",); +} + +#[track_caller] +fn assert_ge(a: T, b: T) +where + T: Display + PartialOrd, +{ + assert!(a >= b, "failed: {a} >= {b}",); +} diff --git a/iox_catalog/src/lib.rs b/iox_catalog/src/lib.rs index d7d56113b9d..17fa14f836e 100644 --- a/iox_catalog/src/lib.rs +++ b/iox_catalog/src/lib.rs @@ -19,700 +19,17 @@ // Workaround for "unused crate" lint false positives. use workspace_hack as _; -use crate::interface::{ColumnTypeMismatchSnafu, Error, RepoCollection, Result}; -use data_types::{ - partition_template::{NamespacePartitionTemplateOverride, TablePartitionTemplateOverride}, - ColumnType, NamespaceId, NamespaceSchema, Partition, TableSchema, TransitionPartitionId, -}; -use mutable_batch::MutableBatch; -use std::{borrow::Cow, collections::HashMap}; -use thiserror::Error; - -/// Column name for built in time column on every table. -pub const TIME_COLUMN: &str = "time"; - -/// Default retention period for data in the catalog. 
-pub const DEFAULT_RETENTION_PERIOD: Option = None; - +pub mod cache; +pub mod constants; +pub mod grpc; pub mod interface; -pub(crate) mod kafkaless_transition; pub mod mem; pub mod metrics; pub mod migrate; pub mod postgres; pub mod sqlite; - -/// An [`crate::interface::Error`] scoped to a single table for schema validation errors. -#[derive(Debug, Error)] -#[error("table {}, {}", .0, .1)] -pub struct TableScopedError(String, Error); - -impl TableScopedError { - /// Return the table name for this error. - pub fn table(&self) -> &str { - &self.0 - } - - /// Return a reference to the error. - pub fn err(&self) -> &Error { - &self.1 - } - - /// Return ownership of the error, discarding the table name. - pub fn into_err(self) -> Error { - self.1 - } -} - -/// Look up a partition in the catalog by either database-assigned ID or deterministic hash ID. -/// -/// The existence of this function should be temporary; it can be removed once all partition lookup -/// is happening with only the deterministic hash ID. -pub async fn partition_lookup( - repos: &mut R, - id: &TransitionPartitionId, -) -> Result, Error> -where - R: RepoCollection + ?Sized, -{ - match id { - TransitionPartitionId::Deprecated(partition_id) => { - repos.partitions().get_by_id(*partition_id).await - } - TransitionPartitionId::Deterministic(partition_hash_id) => { - repos.partitions().get_by_hash_id(partition_hash_id).await - } - } -} - -/// Look up multiple partitions in the catalog by either database-assigned ID or deterministic hash ID. -/// -/// The output only contains existing partitions, the order is undefined. -/// -/// The existence of this function should be temporary; it can be removed once all partition lookup -/// is happening with only the deterministic hash ID. -pub async fn partition_lookup_batch( - repos: &mut R, - ids: &[&TransitionPartitionId], -) -> Result, Error> -where - R: RepoCollection + ?Sized, -{ - let mut partition_ids = Vec::with_capacity(ids.len()); - let mut partition_hash_ids = Vec::with_capacity(ids.len()); - - for id in ids { - match id { - TransitionPartitionId::Deprecated(partition_id) => { - partition_ids.push(*partition_id); - } - TransitionPartitionId::Deterministic(partition_hash_id) => { - partition_hash_ids.push(partition_hash_id); - } - } - } - - let mut out = Vec::with_capacity(partition_ids.len() + partition_hash_ids.len()); - if !partition_ids.is_empty() { - let mut partitions = repos.partitions().get_by_id_batch(partition_ids).await?; - out.append(&mut partitions); - } - if !partition_hash_ids.is_empty() { - let mut partitions = repos - .partitions() - .get_by_hash_id_batch(&partition_hash_ids) - .await?; - out.append(&mut partitions); - } - Ok(out) -} - -/// Given an iterator of `(table_name, batch)` to validate, this function -/// ensures all the columns within `batch` match the existing schema for -/// `table_name` in `schema`. If the column does not already exist in `schema`, -/// it is created and an updated [`NamespaceSchema`] is returned. -/// -/// This function pushes schema additions through to the backend catalog, and -/// relies on the catalog to serialize concurrent additions of a given column, -/// ensuring only one type is ever accepted per column. 
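// The `validate_or_insert_schema` implementation that follows is built around
// this copy-on-write idea: the cached schema is held in a `Cow`, `to_mut()` is
// called only when something is genuinely missing, and the caller receives
// `Some(updated)` only if the schema actually changed. A minimal sketch with
// an illustrative `Schema` type (type-conflict handling omitted):

use std::borrow::Cow;
use std::collections::BTreeMap;

#[derive(Debug, Clone, PartialEq)]
struct Schema {
    // column name -> column type
    columns: BTreeMap<String, String>,
}

/// Ensure every requested column exists; return `Some(updated)` only if the
/// cached schema had to be extended.
fn validate_or_insert(requested: &[(&str, &str)], cached: &Schema) -> Option<Schema> {
    let mut schema = Cow::Borrowed(cached);

    for (name, ty) in requested {
        if !schema.columns.contains_key(*name) {
            // The first mutation clones the cached schema; later ones reuse it.
            schema
                .to_mut()
                .columns
                .insert(name.to_string(), ty.to_string());
        }
    }

    match schema {
        Cow::Owned(updated) => Some(updated),
        Cow::Borrowed(_) => None,
    }
}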
-pub async fn validate_or_insert_schema<'a, T, U, R>( - tables: T, - schema: &NamespaceSchema, - repos: &mut R, -) -> Result, TableScopedError> -where - T: IntoIterator + Send + Sync, - U: Iterator + Send, - R: RepoCollection + ?Sized, -{ - let tables = tables.into_iter(); - - // The (potentially updated) NamespaceSchema to return to the caller. - let mut schema = Cow::Borrowed(schema); - - for (table_name, batch) in tables { - validate_mutable_batch(batch, table_name, &mut schema, repos) - .await - .map_err(|e| TableScopedError(table_name.to_string(), e))?; - } - - match schema { - Cow::Owned(v) => Ok(Some(v)), - Cow::Borrowed(_) => Ok(None), - } -} - -// &mut Cow is used to avoid a copy, so allow it -#[allow(clippy::ptr_arg)] -async fn validate_mutable_batch( - mb: &MutableBatch, - table_name: &str, - schema: &mut Cow<'_, NamespaceSchema>, - repos: &mut R, -) -> Result<()> -where - R: RepoCollection + ?Sized, -{ - // Check if the table exists in the schema. - // - // Because the entry API requires &mut it is not used to avoid a premature - // clone of the Cow. - let mut table = match schema.tables.get(table_name) { - Some(t) => Cow::Borrowed(t), - None => { - // The table does not exist in the cached schema. - // - // Attempt to load an existing table from the catalog or create a new table in the - // catalog to populate the cache. - let table = - table_load_or_create(repos, schema.id, &schema.partition_template, table_name) - .await?; - - assert!(schema - .to_mut() - .tables - .insert(table_name.to_string(), table) - .is_none()); - - Cow::Borrowed(schema.tables.get(table_name).unwrap()) - } - }; - - // The table is now in the schema (either by virtue of it already existing, - // or through adding it above). - // - // If the table itself needs to be updated during column validation it - // becomes a Cow::owned() copy and the modified copy should be inserted into - // the schema before returning. - let mut column_batch: HashMap<&str, ColumnType> = HashMap::new(); - - for (name, col) in mb.columns() { - // Check if the column exists in the cached schema. - // - // If it does, validate it. If it does not exist, create it and insert - // it into the cached schema. - - match table.columns.get(name.as_str()) { - Some(existing) if existing.matches_type(col.influx_type()) => { - // No action is needed as the column matches the existing column - // schema. - } - Some(existing) => { - // The column schema, and the column in the mutable batch are of - // different types. - return ColumnTypeMismatchSnafu { - name, - existing: existing.column_type, - new: col.influx_type(), - } - .fail(); - } - None => { - // The column does not exist in the cache, add it to the column - // batch to be bulk inserted later. - let old = column_batch.insert(name.as_str(), ColumnType::from(col.influx_type())); - assert!( - old.is_none(), - "duplicate column name `{name}` in new column batch shouldn't be possible" - ); - } - } - } - - if !column_batch.is_empty() { - repos - .columns() - .create_or_get_many_unchecked(table.id, column_batch) - .await? - .into_iter() - .for_each(|c| table.to_mut().add_column(c)); - } - - if let Cow::Owned(table) = table { - // The table schema was mutated and needs inserting into the namespace - // schema to make the changes visible to the caller. 
- assert!(schema - .to_mut() - .tables - .insert(table_name.to_string(), table) - .is_some()); - } - - Ok(()) -} - -/// load the table or create a new one -pub async fn table_load_or_create( - repos: &mut R, - namespace_id: NamespaceId, - namespace_partition_template: &NamespacePartitionTemplateOverride, - table_name: &str, -) -> Result -where - R: RepoCollection + ?Sized, -{ - let table = match repos - .tables() - .get_by_namespace_and_name(namespace_id, table_name) - .await? - { - Some(table) => table, - None => { - // There is a possibility of a race condition here, if another request has also - // created this table after the `get_by_namespace_and_name` call but before - // this `create` call. In that (hopefully) rare case, do an additional fetch - // from the catalog for the record that should now exist. - let create_result = repos - .tables() - .create( - table_name, - // This table is being created implicitly by this write, so there's no - // possibility of a user-supplied partition template here, which is why there's - // a hardcoded `None`. If there is a namespace template, it must be valid because - // validity was checked during its creation, so that's why there's an `expect`. - TablePartitionTemplateOverride::try_new(None, namespace_partition_template) - .expect("no table partition template; namespace partition template has been validated"), - namespace_id, - ) - .await; - if let Err(Error::TableNameExists { .. }) = create_result { - repos - .tables() - .get_by_namespace_and_name(namespace_id, table_name) - // Propagate any `Err` returned by the catalog - .await? - // Getting `Ok(None)` should be impossible if we're in this code path because - // the `create` request just said the table exists - .expect( - "Table creation failed because the table exists, so looking up the table \ - should return `Some(table)`, but it returned `None`", - ) - } else { - create_result? - } - } - }; - - let mut table = TableSchema::new_empty_from(&table); - - // Always add a time column to all new tables. - let time_col = repos - .columns() - .create_or_get(TIME_COLUMN, table.id, ColumnType::Time) - .await?; - - table.add_column(time_col); - - Ok(table) -} - -/// Catalog helper functions for creation of catalog objects -pub mod test_helpers { - use crate::RepoCollection; - use data_types::{ - partition_template::TablePartitionTemplateOverride, ColumnId, ColumnSet, CompactionLevel, - Namespace, NamespaceName, ParquetFileParams, Partition, Table, Timestamp, - }; - use uuid::Uuid; - - /// When the details of the namespace don't matter; the test just needs *a* catalog namespace - /// with a particular name. - /// - /// Use [`NamespaceRepo::create`] directly if: - /// - /// - The values of the parameters to `create` need to be different than what's here - /// - The values of the parameters to `create` are relevant to the behavior under test - /// - You expect namespace creation to fail in the test - /// - /// [`NamespaceRepo::create`]: crate::interface::NamespaceRepo::create - pub async fn arbitrary_namespace( - repos: &mut R, - name: &str, - ) -> Namespace { - let namespace_name = NamespaceName::new(name).unwrap(); - repos - .namespaces() - .create(&namespace_name, None, None, None) - .await - .unwrap() - } - - /// When the details of the table don't matter; the test just needs *a* catalog table - /// with a particular name in a particular namespace. 
- /// - /// Use [`TableRepo::create`] directly if: - /// - /// - The values of the parameters to `create_or_get` need to be different than what's here - /// - The values of the parameters to `create_or_get` are relevant to the behavior under test - /// - You expect table creation to fail in the test - /// - /// [`TableRepo::create`]: crate::interface::TableRepo::create - pub async fn arbitrary_table( - repos: &mut R, - name: &str, - namespace: &Namespace, - ) -> Table { - repos - .tables() - .create( - name, - TablePartitionTemplateOverride::try_new(None, &namespace.partition_template) - .unwrap(), - namespace.id, - ) - .await - .unwrap() - } - - /// When the details of a Parquet file record don't matter, the test just needs *a* Parquet - /// file record in a particular namespace+table+partition. - pub fn arbitrary_parquet_file_params( - namespace: &Namespace, - table: &Table, - partition: &Partition, - ) -> ParquetFileParams { - ParquetFileParams { - namespace_id: namespace.id, - table_id: table.id, - partition_id: partition.transition_partition_id(), - object_store_id: Uuid::new_v4(), - min_time: Timestamp::new(1), - max_time: Timestamp::new(10), - file_size_bytes: 1337, - row_count: 0, - compaction_level: CompactionLevel::Initial, - created_at: Timestamp::new(1), - column_set: ColumnSet::new([ColumnId::new(1), ColumnId::new(2)]), - max_l0_created_at: Timestamp::new(1), - } - } -} +pub mod test_helpers; +pub mod util; #[cfg(test)] -mod tests { - use std::{collections::BTreeMap, sync::Arc}; - - use super::*; - use crate::{ - interface::{get_schema_by_name, SoftDeletedRows}, - mem::MemCatalog, - }; - - // Generate a test that simulates multiple, sequential writes in `lp` and - // asserts the resulting schema. - // - // This test asserts the cached schema and the database entry are always in - // sync. - macro_rules! test_validate_schema { - ( - $name:ident, - lp = [$($lp:literal,)+], // An array of multi-line LP writes - want_observe_conflict = $want_observe_conflict:literal, // true if a schema validation error should be observed at some point - want_schema = {$($want_schema:tt) +} // The expected resulting schema after all writes complete. - ) => { - paste::paste! { - #[allow(clippy::bool_assert_comparison)] - #[tokio::test] - async fn []() { - use crate::{interface::Catalog, test_helpers::arbitrary_namespace}; - use std::ops::DerefMut; - use pretty_assertions::assert_eq; - const NAMESPACE_NAME: &str = "bananas"; - - let metrics = Arc::new(metric::Registry::default()); - let repo = MemCatalog::new(metrics); - let mut txn = repo.repositories().await; - - let namespace = arbitrary_namespace(&mut *txn, NAMESPACE_NAME) - .await; - let schema = NamespaceSchema::new_empty_from(&namespace); - - // Apply all the lp literals as individual writes, feeding - // the result of one validation into the next to drive - // incremental construction of the schemas. - let mut observed_conflict = false; - $( - let schema = { - let lp: String = $lp.to_string(); - - let writes = mutable_batch_lp::lines_to_batches(lp.as_str(), 42) - .expect("failed to build test writes from LP"); - - let got = validate_or_insert_schema(writes.iter().map(|(k, v)| (k.as_str(), v)), &schema, txn.deref_mut()) - .await; - - match got { - Err(TableScopedError(_, Error::ColumnTypeMismatch{ .. 
})) => { - observed_conflict = true; - schema - }, - Err(e) => panic!("unexpected error: {}", e), - Ok(Some(new_schema)) => new_schema, - Ok(None) => schema, - } - }; - )+ - - assert_eq!($want_observe_conflict, observed_conflict, "should error mismatch"); - - // Invariant: in absence of concurrency, the schema within - // the database must always match the incrementally built - // cached schema. - let db_schema = get_schema_by_name(NAMESPACE_NAME, txn.deref_mut(), SoftDeletedRows::ExcludeDeleted) - .await - .expect("database failed to query for namespace schema"); - assert_eq!(schema, db_schema, "schema in DB and cached schema differ"); - - // Generate the map of tables => desired column types - let want_tables: BTreeMap> = test_validate_schema!(@table, $($want_schema)+); - - // Generate a similarly structured map from the actual - // schema - let actual_tables: BTreeMap> = schema - .tables - .iter() - .map(|(table, table_schema)| { - let desired_cols = table_schema - .columns - .iter() - .map(|(column, column_schema)| (column.clone(), column_schema.column_type)) - .collect::>(); - - (table.clone(), desired_cols) - }) - .collect(); - - // Assert the actual namespace contents matches the desired - // table schemas in the test args. - assert_eq!(want_tables, actual_tables, "cached schema and desired schema differ"); - } - } - }; - // Generate a map of table names => column map (below) - // - // out: BTreeMap> - (@table, $($table_name:literal: [$($columns:tt) +],)*) => {{ - let mut tables = BTreeMap::new(); - $( - let want_cols = test_validate_schema!(@column, $($columns)+); - assert!(tables.insert($table_name.to_string(), want_cols).is_none()); - )* - tables - }}; - // Generate a map of column names => ColumnType - // - // out: BTreeMap - (@column, $($col_name:literal => $col_type:expr,)+) => {{ - let mut cols = BTreeMap::new(); - $( - assert!(cols.insert($col_name.to_string(), $col_type).is_none()); - )* - cols - }}; - } - - test_validate_schema!( - one_write_multiple_tables, - lp = [ - " - m1,t1=a,t2=b f1=2i,f2=2.0 1\n\ - m1,t1=a f1=3i 2\n\ - m2,t3=b f1=true 1\n\ - ", - ], - want_observe_conflict = false, - want_schema = { - "m1": [ - "t1" => ColumnType::Tag, - "t2" => ColumnType::Tag, - "f1" => ColumnType::I64, - "f2" => ColumnType::F64, - "time" => ColumnType::Time, - ], - "m2": [ - "f1" => ColumnType::Bool, - "t3" => ColumnType::Tag, - "time" => ColumnType::Time, - ], - } - ); - - // test that a new table will be created - test_validate_schema!( - two_writes_incremental_new_table, - lp = [ - " - m1,t1=a,t2=b f1=2i,f2=2.0 1\n\ - m1,t1=a f1=3i 2\n\ - m2,t3=b f1=true 1\n\ - ", - " - m1,t1=c f1=1i 2\n\ - new_measurement,t9=a f10=true 1\n\ - ", - ], - want_observe_conflict = false, - want_schema = { - "m1": [ - "t1" => ColumnType::Tag, - "t2" => ColumnType::Tag, - "f1" => ColumnType::I64, - "f2" => ColumnType::F64, - "time" => ColumnType::Time, - ], - "m2": [ - "f1" => ColumnType::Bool, - "t3" => ColumnType::Tag, - "time" => ColumnType::Time, - ], - "new_measurement": [ - "t9" => ColumnType::Tag, - "f10" => ColumnType::Bool, - "time" => ColumnType::Time, - ], - } - ); - - // test that a new column for an existing table will be created - test_validate_schema!( - two_writes_incremental_new_column, - lp = [ - " - m1,t1=a,t2=b f1=2i,f2=2.0 1\n\ - m1,t1=a f1=3i 2\n\ - m2,t3=b f1=true 1\n\ - ", - "m1,new_tag=c new_field=1i 2", - ], - want_observe_conflict = false, - want_schema = { - "m1": [ - "t1" => ColumnType::Tag, - "t2" => ColumnType::Tag, - "f1" => ColumnType::I64, - "f2" => ColumnType::F64, 
- "time" => ColumnType::Time, - // These are the incremental additions: - "new_tag" => ColumnType::Tag, - "new_field" => ColumnType::I64, - ], - "m2": [ - "f1" => ColumnType::Bool, - "t3" => ColumnType::Tag, - "time" => ColumnType::Time, - ], - } - ); - - test_validate_schema!( - table_always_has_time_column, - lp = [ - "m1,t1=a f1=2i", - ], - want_observe_conflict = false, - want_schema = { - "m1": [ - "t1" => ColumnType::Tag, - "f1" => ColumnType::I64, - "time" => ColumnType::Time, - ], - } - ); - - test_validate_schema!( - two_writes_conflicting_column_types, - lp = [ - "m1,t1=a f1=2i", - // Second write has conflicting type for f1. - "m1,t1=a f1=2.0", - ], - want_observe_conflict = true, - want_schema = { - "m1": [ - "t1" => ColumnType::Tag, - "f1" => ColumnType::I64, - "time" => ColumnType::Time, - ], - } - ); - - test_validate_schema!( - two_writes_tag_field_transposition, - lp = [ - // x is a tag - "m1,t1=a,x=t f1=2i", - // x is a field - "m1,t1=a x=t,f1=2i", - ], - want_observe_conflict = true, - want_schema = { - "m1": [ - "t1" => ColumnType::Tag, - "x" => ColumnType::Tag, - "f1" => ColumnType::I64, - "time" => ColumnType::Time, - ], - } - ); - - #[tokio::test] - async fn validate_table_create_race_doesnt_get_all_columns() { - use crate::{interface::Catalog, test_helpers::arbitrary_namespace}; - use std::{collections::BTreeSet, ops::DerefMut}; - const NAMESPACE_NAME: &str = "bananas"; - - let repo = MemCatalog::new(Default::default()); - let mut txn = repo.repositories().await; - let namespace = arbitrary_namespace(&mut *txn, NAMESPACE_NAME).await; - - // One cached schema has no tables. - let empty_schema = NamespaceSchema::new_empty_from(&namespace); - - // Another cached schema gets a write that creates a table with some columns. - let schema_with_table = empty_schema.clone(); - let writes = mutable_batch_lp::lines_to_batches("m1,t1=a f1=2i", 42).unwrap(); - validate_or_insert_schema( - writes.iter().map(|(k, v)| (k.as_str(), v)), - &schema_with_table, - txn.deref_mut(), - ) - .await - .unwrap(); - - // then the empty schema adds the same table with some different columns - let other_writes = mutable_batch_lp::lines_to_batches("m1,t2=a f2=2i", 43).unwrap(); - let formerly_empty_schema = validate_or_insert_schema( - other_writes.iter().map(|(k, v)| (k.as_str(), v)), - &empty_schema, - txn.deref_mut(), - ) - .await - .unwrap() - .unwrap(); - - // the formerly-empty schema should NOT have all the columns; schema convergence is handled - // at a higher level by the namespace cache/gossip system - let table = formerly_empty_schema.tables.get("m1").unwrap(); - assert_eq!(table.columns.names(), BTreeSet::from(["t2", "f2", "time"])); - } -} +pub(crate) mod interface_tests; diff --git a/iox_catalog/src/mem.rs b/iox_catalog/src/mem.rs index a779a5567e6..0d810fd0ca8 100644 --- a/iox_catalog/src/mem.rs +++ b/iox_catalog/src/mem.rs @@ -1,35 +1,38 @@ //! This module implements an in-memory implementation of the iox_catalog interface. It can be //! used for testing or for an IOx designed to run without catalog persistence. 
-use crate::interface::MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE; use crate::{ + constants::{ + MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE, MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION, + }, interface::{ - CasFailure, Catalog, ColumnRepo, ColumnTypeMismatchSnafu, Error, NamespaceRepo, - ParquetFileRepo, PartitionRepo, RepoCollection, Result, SoftDeletedRows, TableRepo, - MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION, + AlreadyExistsSnafu, CasFailure, Catalog, ColumnRepo, Error, NamespaceRepo, ParquetFileRepo, + PartitionRepo, RepoCollection, Result, SoftDeletedRows, TableRepo, }, metrics::MetricDecorator, }; use async_trait::async_trait; -use data_types::SortedColumnSet; +use data_types::snapshot::partition::PartitionSnapshot; +use data_types::snapshot::table::TableSnapshot; use data_types::{ partition_template::{ NamespacePartitionTemplateOverride, TablePartitionTemplateOverride, TemplatePart, }, Column, ColumnId, ColumnType, CompactionLevel, MaxColumnsPerTable, MaxTables, Namespace, - NamespaceId, NamespaceName, NamespaceServiceProtectionLimitsOverride, ParquetFile, - ParquetFileId, ParquetFileParams, Partition, PartitionHashId, PartitionId, PartitionKey, - SkippedCompaction, Table, TableId, Timestamp, TransitionPartitionId, + NamespaceId, NamespaceName, NamespaceServiceProtectionLimitsOverride, ObjectStoreId, + ParquetFile, ParquetFileId, ParquetFileParams, Partition, PartitionHashId, PartitionId, + PartitionKey, SkippedCompaction, SortKeyIds, Table, TableId, Timestamp, }; -use iox_time::{SystemProvider, TimeProvider}; +use iox_time::TimeProvider; +use parking_lot::Mutex; use snafu::ensure; -use sqlx::types::Uuid; +use std::ops::Deref; use std::{ collections::{HashMap, HashSet}, fmt::{Display, Formatter}, + ops::DerefMut, sync::Arc, }; -use tokio::sync::{Mutex, OwnedMutexGuard}; /// In-memory catalog that implements the `RepoCollection` and individual repo traits from /// the catalog interface. @@ -40,20 +43,20 @@ pub struct MemCatalog { } impl MemCatalog { - /// return new initialized `MemCatalog` - pub fn new(metrics: Arc) -> Self { + /// return new initialized [`MemCatalog`] + pub fn new(metrics: Arc, time_provider: Arc) -> Self { Self { metrics, collections: Default::default(), - time_provider: Arc::new(SystemProvider::new()), + time_provider, } } /// Add partition directly, for testing purposes only as it does not do any consistency or /// uniqueness checks - pub async fn add_partition(&self, partition: Partition) { - let mut collections = Arc::clone(&self.collections).lock_owned().await; - collections.partitions.push(partition); + pub fn add_partition(&self, partition: Partition) { + let mut stage = self.collections.lock(); + stage.partitions.push(partition.into()); } } @@ -63,12 +66,42 @@ impl std::fmt::Debug for MemCatalog { } } +/// A wrapper around `T` adding a generation number +#[derive(Debug, Clone)] +struct Versioned { + generation: u64, + value: T, +} + +impl Deref for Versioned { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.value + } +} + +impl DerefMut for Versioned { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.value + } +} + +impl From for Versioned { + fn from(value: T) -> Self { + Self { + generation: 0, + value, + } + } +} + #[derive(Default, Debug, Clone)] struct MemCollections { namespaces: Vec, - tables: Vec
, + tables: Vec>, columns: Vec, - partitions: Vec, + partitions: Vec>, skipped_compactions: Vec, parquet_files: Vec, } @@ -76,16 +109,10 @@ struct MemCollections { /// transaction bound to an in-memory catalog. #[derive(Debug)] pub struct MemTxn { - inner: OwnedMutexGuard, + collections: Arc>, time_provider: Arc, } -impl MemTxn { - fn stage(&mut self) -> &mut MemCollections { - &mut self.inner - } -} - impl Display for MemCatalog { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "Memory") @@ -98,14 +125,15 @@ impl Catalog for MemCatalog { Ok(()) } - async fn repositories(&self) -> Box { - let collections = Arc::clone(&self.collections).lock_owned().await; + fn repositories(&self) -> Box { + let collections = Arc::clone(&self.collections); Box::new(MetricDecorator::new( MemTxn { - inner: collections, + collections, time_provider: self.time_provider(), }, Arc::clone(&self.metrics), + self.time_provider(), )) } @@ -119,7 +147,6 @@ impl Catalog for MemCatalog { } } -#[async_trait] impl RepoCollection for MemTxn { fn namespaces(&mut self) -> &mut dyn NamespaceRepo { self @@ -151,11 +178,11 @@ impl NamespaceRepo for MemTxn { retention_period_ns: Option, service_protection_limits: Option, ) -> Result { - let stage = self.stage(); + let mut stage = self.collections.lock(); if stage.namespaces.iter().any(|n| n.name == name.as_str()) { - return Err(Error::NameExists { - name: name.to_string(), + return Err(Error::AlreadyExists { + descr: name.to_string(), }); } @@ -180,7 +207,7 @@ impl NamespaceRepo for MemTxn { } async fn list(&mut self, deleted: SoftDeletedRows) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); Ok(filter_namespace_soft_delete(&stage.namespaces, deleted) .cloned() @@ -192,11 +219,13 @@ impl NamespaceRepo for MemTxn { id: NamespaceId, deleted: SoftDeletedRows, ) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); - Ok(filter_namespace_soft_delete(&stage.namespaces, deleted) + let res = filter_namespace_soft_delete(&stage.namespaces, deleted) .find(|n| n.id == id) - .cloned()) + .cloned(); + + Ok(res) } async fn get_by_name( @@ -204,39 +233,41 @@ impl NamespaceRepo for MemTxn { name: &str, deleted: SoftDeletedRows, ) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); - Ok(filter_namespace_soft_delete(&stage.namespaces, deleted) + let res = filter_namespace_soft_delete(&stage.namespaces, deleted) .find(|n| n.name == name) - .cloned()) + .cloned(); + + Ok(res) } // performs a cascading delete of all things attached to the namespace, then deletes the // namespace async fn soft_delete(&mut self, name: &str) -> Result<()> { + let mut stage = self.collections.lock(); let timestamp = self.time_provider.now(); - let stage = self.stage(); // get namespace by name match stage.namespaces.iter_mut().find(|n| n.name == name) { Some(n) => { n.deleted_at = Some(Timestamp::from(timestamp)); Ok(()) } - None => Err(Error::NamespaceNotFoundByName { - name: name.to_string(), + None => Err(Error::NotFound { + descr: name.to_string(), }), } } async fn update_table_limit(&mut self, name: &str, new_max: MaxTables) -> Result { - let stage = self.stage(); + let mut stage = self.collections.lock(); match stage.namespaces.iter_mut().find(|n| n.name == name) { Some(n) => { n.max_tables = new_max; Ok(n.clone()) } - None => Err(Error::NamespaceNotFoundByName { - name: name.to_string(), + None => Err(Error::NotFound { + descr: name.to_string(), }), } } @@ -246,14 +277,14 @@ impl NamespaceRepo for MemTxn { 
name: &str, new_max: MaxColumnsPerTable, ) -> Result { - let stage = self.stage(); + let mut stage = self.collections.lock(); match stage.namespaces.iter_mut().find(|n| n.name == name) { Some(n) => { n.max_columns_per_table = new_max; Ok(n.clone()) } - None => Err(Error::NamespaceNotFoundByName { - name: name.to_string(), + None => Err(Error::NotFound { + descr: name.to_string(), }), } } @@ -263,14 +294,14 @@ impl NamespaceRepo for MemTxn { name: &str, retention_period_ns: Option, ) -> Result { - let stage = self.stage(); + let mut stage = self.collections.lock(); match stage.namespaces.iter_mut().find(|n| n.name == name) { Some(n) => { n.retention_period_ns = retention_period_ns; Ok(n.clone()) } - None => Err(Error::NamespaceNotFoundByName { - name: name.to_string(), + None => Err(Error::NotFound { + descr: name.to_string(), }), } } @@ -284,9 +315,9 @@ impl TableRepo for MemTxn { partition_template: TablePartitionTemplateOverride, namespace_id: NamespaceId, ) -> Result
{ - let table = { - let stage = self.stage(); + let mut stage = self.collections.lock(); + let table = { // this block is just to ensure the mem impl correctly creates TableCreateLimitError in // tests, we don't care about any of the errors it is discarding stage @@ -294,10 +325,10 @@ impl TableRepo for MemTxn { .iter() .find(|n| n.id == namespace_id) .cloned() - .ok_or_else(|| Error::NamespaceNotFoundByName { + .ok_or_else(|| Error::NotFound { // we're never going to use this error, this is just for flow control, // so it doesn't matter that we only have the ID, not the name - name: "".to_string(), + descr: "".to_string(), }) .and_then(|n| { let max_tables = n.max_tables; @@ -306,10 +337,12 @@ impl TableRepo for MemTxn { .iter() .filter(|t| t.namespace_id == namespace_id) .count(); - if tables_count >= max_tables.get().try_into().unwrap() { - return Err(Error::TableCreateLimitError { - table_name: name.to_string(), - namespace_id, + if tables_count >= max_tables.get() { + return Err(Error::LimitExceeded { + descr: format!( + "couldn't create table {}; limit reached on namespace {}", + name, namespace_id + ), }); } Ok(()) @@ -321,9 +354,8 @@ impl TableRepo for MemTxn { .find(|t| t.name == name && t.namespace_id == namespace_id) { Some(_t) => { - return Err(Error::TableNameExists { - name: name.to_string(), - namespace_id, + return Err(Error::AlreadyExists { + descr: format!("table '{name}' in namespace {namespace_id}"), }) } None => { @@ -333,23 +365,19 @@ impl TableRepo for MemTxn { name: name.to_string(), partition_template, }; - stage.tables.push(table); - stage.tables.last().unwrap() + stage.tables.push(table.into()); + stage.tables.last().unwrap().value.clone() } } }; - let table = table.clone(); - // Partitioning is only supported for tags, so create tag columns for all `TagValue` // partition template parts. It's important this happens within the table creation // transaction so that there isn't a possibility of a concurrent write creating these // columns with an unsupported type. 
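// The loop that follows performs this step: for every tag referenced by the
// table's partition template, make sure a Tag column exists, inside the same
// logical transaction as the table creation. A compact sketch with an
// illustrative `TemplatePart` enum and column representation:

#[derive(Debug, Clone)]
enum TemplatePart {
    TimeFormat(String),
    TagValue(String),
}

// (column name, column type) pairs stand in for the catalog's column rows.
fn ensure_tag_column(columns: &mut Vec<(String, &'static str)>, name: &str) {
    if !columns.iter().any(|(n, _)| n == name) {
        columns.push((name.to_string(), "tag"));
    }
}

fn create_template_tag_columns(
    template: &[TemplatePart],
    columns: &mut Vec<(String, &'static str)>,
) {
    for part in template {
        if let TemplatePart::TagValue(tag_name) = part {
            ensure_tag_column(columns, tag_name);
        }
    }
}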
for template_part in table.partition_template.parts() { if let TemplatePart::TagValue(tag_name) = template_part { - self.columns() - .create_or_get(tag_name, table.id, ColumnType::Tag) - .await?; + create_or_get_column(&mut stage, tag_name, table.id, ColumnType::Tag)?; } } @@ -357,9 +385,10 @@ impl TableRepo for MemTxn { } async fn get_by_id(&mut self, table_id: TableId) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); - Ok(stage.tables.iter().find(|t| t.id == table_id).cloned()) + let mut tables = stage.tables.iter(); + Ok(tables.find(|t| t.id == table_id).map(|v| v.value.clone())) } async fn get_by_namespace_and_name( @@ -367,30 +396,59 @@ impl TableRepo for MemTxn { namespace_id: NamespaceId, name: &str, ) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); - Ok(stage - .tables - .iter() - .find(|t| t.namespace_id == namespace_id && t.name == name) - .cloned()) + let mut tables = stage.tables.iter(); + let search = tables.find(|t| t.namespace_id == namespace_id && t.name == name); + Ok(search.map(|v| v.value.clone())) } async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); - let tables: Vec<_> = stage - .tables - .iter() - .filter(|t| t.namespace_id == namespace_id) - .cloned() - .collect(); + let tables = stage.tables.iter(); + let filtered = tables.filter(|t| t.namespace_id == namespace_id); + let tables: Vec<_> = filtered.map(|v| v.value.clone()).collect(); Ok(tables) } async fn list(&mut self) -> Result> { - let stage = self.stage(); - Ok(stage.tables.clone()) + let stage = self.collections.lock(); + Ok(stage.tables.iter().map(|v| v.value.clone()).collect()) + } + + async fn snapshot(&mut self, table_id: TableId) -> Result { + let mut guard = self.collections.lock(); + + let (table, generation) = { + let mut tables = guard.tables.iter_mut(); + let search = tables.find(|x| x.id == table_id); + let table = search.ok_or_else(|| Error::NotFound { + descr: table_id.to_string(), + })?; + + let generation = table.generation; + table.generation += 1; + (table.value.clone(), generation) + }; + + let columns = guard + .columns + .iter() + .filter(|x| x.table_id == table_id) + .cloned() + .collect(); + + let partitions = guard + .partitions + .iter() + .filter(|x| x.table_id == table_id) + .map(|v| v.value.clone()) + .collect(); + + Ok(TableSnapshot::encode( + table, partitions, columns, generation, + )?) 
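// The table `snapshot()` above follows this shape: while one lock is held,
// read and bump the table's generation, then gather the child rows (columns,
// partitions) that reference it, so the returned view is internally
// consistent. A simplified sketch; the real code encodes the result into a
// `TableSnapshot` rather than a plain struct:

use std::sync::Mutex;

#[derive(Debug, Clone)]
struct TableRow {
    id: i64,
    generation: u64,
}

#[derive(Debug, Clone)]
struct ColumnRow {
    table_id: i64,
    name: String,
}

#[derive(Default)]
struct State {
    tables: Vec<TableRow>,
    columns: Vec<ColumnRow>,
}

#[derive(Debug)]
struct TableView {
    table: TableRow,
    columns: Vec<ColumnRow>,
    generation: u64,
}

fn snapshot_table(state: &Mutex<State>, table_id: i64) -> Option<TableView> {
    // One lock guards the whole read-modify-collect sequence.
    let mut guard = state.lock().unwrap();

    let (table, generation) = {
        let t = guard.tables.iter_mut().find(|t| t.id == table_id)?;
        let generation = t.generation;
        t.generation += 1;
        (t.clone(), generation)
    };

    let columns = guard
        .columns
        .iter()
        .filter(|c| c.table_id == table_id)
        .cloned()
        .collect();

    Some(TableView { table, columns, generation })
}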
} } @@ -402,74 +460,8 @@ impl ColumnRepo for MemTxn { table_id: TableId, column_type: ColumnType, ) -> Result { - let stage = self.stage(); - - // this block is just to ensure the mem impl correctly creates ColumnCreateLimitError in - // tests, we don't care about any of the errors it is discarding - stage - .tables - .iter() - .find(|t| t.id == table_id) - .cloned() - .ok_or(Error::TableNotFound { id: table_id }) // error never used, this is just for flow control - .and_then(|t| { - stage - .namespaces - .iter() - .find(|n| n.id == t.namespace_id) - .cloned() - .ok_or_else(|| Error::NamespaceNotFoundByName { - // we're never going to use this error, this is just for flow control, - // so it doesn't matter that we only have the ID, not the name - name: "".to_string(), - }) - .and_then(|n| { - let max_columns_per_table = n.max_columns_per_table; - let columns_count = stage - .columns - .iter() - .filter(|t| t.table_id == table_id) - .count(); - if columns_count >= max_columns_per_table.get().try_into().unwrap() { - return Err(Error::ColumnCreateLimitError { - column_name: name.to_string(), - table_id, - }); - } - Ok(()) - })?; - Ok(()) - })?; - - let column = match stage - .columns - .iter() - .find(|t| t.name == name && t.table_id == table_id) - { - Some(c) => { - ensure!( - column_type == c.column_type, - ColumnTypeMismatchSnafu { - name, - existing: c.column_type, - new: column_type - } - ); - c - } - None => { - let column = Column { - id: ColumnId::new(stage.columns.len() as i64 + 1), - table_id, - name: name.to_string(), - column_type, - }; - stage.columns.push(column); - stage.columns.last().unwrap() - } - }; - - Ok(column.clone()) + let mut stage = self.collections.lock(); + create_or_get_column(&mut stage, name, table_id, column_type) } async fn create_or_get_many_unchecked( @@ -481,7 +473,7 @@ impl ColumnRepo for MemTxn { // check column limits when inserting many columns because it's complicated and expensive, // and for testing purposes the in-memory catalog needs to match its functionality. 
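// The bulk upsert below follows this logic: for each requested (name, type)
// pair, return the existing column when the type matches, fail with a
// descriptive message when it conflicts (similar in spirit to the message the
// tests above assert), and insert the column otherwise. A dependency-free
// sketch with illustrative types; the real repo reports a structured
// `AlreadyExists` error rather than a `String`:

use std::collections::HashMap;

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ColumnType {
    I64,
    F64,
    Bool,
    String,
    Time,
    Tag,
}

#[derive(Debug, Clone)]
struct ColumnRow {
    table_id: i64,
    name: String,
    column_type: ColumnType,
}

fn create_or_get_many_unchecked(
    existing: &mut Vec<ColumnRow>,
    table_id: i64,
    request: &HashMap<&str, ColumnType>,
) -> Result<Vec<ColumnRow>, String> {
    request
        .iter()
        .map(|(name, want)| {
            match existing
                .iter()
                .find(|c| c.table_id == table_id && c.name == *name)
            {
                Some(c) if c.column_type == *want => Ok(c.clone()),
                Some(c) => Err(format!(
                    "column {} is type {:?} but schema update has type {:?}",
                    name, c.column_type, want
                )),
                None => {
                    let col = ColumnRow {
                        table_id,
                        name: name.to_string(),
                        column_type: *want,
                    };
                    existing.push(col.clone());
                    Ok(col)
                }
            }
        })
        .collect()
}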
- let stage = self.stage(); + let mut stage = self.collections.lock(); let out: Vec<_> = columns .iter() @@ -494,10 +486,11 @@ impl ColumnRepo for MemTxn { Some(c) => { ensure!( column_type == c.column_type, - ColumnTypeMismatchSnafu { - name: column_name, - existing: c.column_type, - new: column_type + AlreadyExistsSnafu { + descr: format!( + "column {} is type {} but schema update has type {}", + column_name, c.column_type, column_type + ), } ); Ok(c.clone()) @@ -520,7 +513,7 @@ impl ColumnRepo for MemTxn { } async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); let table_ids: Vec<_> = stage .tables @@ -539,7 +532,7 @@ impl ColumnRepo for MemTxn { } async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); let columns: Vec<_> = stage .columns @@ -552,7 +545,7 @@ impl ColumnRepo for MemTxn { } async fn list(&mut self) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); Ok(stage.columns.clone()) } } @@ -560,7 +553,7 @@ impl ColumnRepo for MemTxn { #[async_trait] impl PartitionRepo for MemTxn { async fn create_or_get(&mut self, key: PartitionKey, table_id: TableId) -> Result { - let stage = self.stage(); + let mut stage = self.collections.lock(); let partition = match stage .partitions @@ -569,96 +562,50 @@ impl PartitionRepo for MemTxn { { Some(p) => p, None => { - let p = Partition::new_in_memory_only( + let hash_id = PartitionHashId::new(table_id, &key); + let p = Partition::new_catalog_only( PartitionId::new(stage.partitions.len() as i64 + 1), + Some(hash_id), table_id, key, - vec![], - SortedColumnSet::new(vec![]), + SortKeyIds::default(), None, ); - stage.partitions.push(p); + stage.partitions.push(p.into()); stage.partitions.last().unwrap() } }; - Ok(partition.clone()) + Ok(partition.value.clone()) } - async fn get_by_id(&mut self, partition_id: PartitionId) -> Result> { - let stage = self.stage(); + async fn get_by_id_batch(&mut self, partition_ids: &[PartitionId]) -> Result> { + let lookup = partition_ids.iter().collect::>(); - Ok(stage - .partitions - .iter() - .find(|p| p.id == partition_id) - .cloned()) - } - - async fn get_by_id_batch(&mut self, partition_ids: Vec) -> Result> { - let lookup = partition_ids.into_iter().collect::>(); - - let stage = self.stage(); + let stage = self.collections.lock(); Ok(stage .partitions .iter() .filter(|p| lookup.contains(&p.id)) - .cloned() - .collect()) - } - - async fn get_by_hash_id( - &mut self, - partition_hash_id: &PartitionHashId, - ) -> Result> { - let stage = self.stage(); - - Ok(stage - .partitions - .iter() - .find(|p| { - p.hash_id() - .map(|hash_id| hash_id == partition_hash_id) - .unwrap_or_default() - }) - .cloned()) - } - - async fn get_by_hash_id_batch( - &mut self, - partition_hash_ids: &[&PartitionHashId], - ) -> Result> { - let lookup = partition_hash_ids.iter().copied().collect::>(); - - let stage = self.stage(); - - Ok(stage - .partitions - .iter() - .filter(|p| { - p.hash_id() - .map(|hash_id| lookup.contains(hash_id)) - .unwrap_or_default() - }) - .cloned() + .map(|x| x.value.clone()) .collect()) } async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); let partitions: Vec<_> = stage .partitions .iter() .filter(|p| p.table_id == table_id) - .cloned() + .map(|x| x.value.clone()) .collect(); Ok(partitions) } async fn list_ids(&mut self) -> Result> 
{ - let stage = self.stage(); + let stage = self.collections.lock(); let partitions: Vec<_> = stage.partitions.iter().map(|p| p.id).collect(); @@ -667,45 +614,24 @@ impl PartitionRepo for MemTxn { async fn cas_sort_key( &mut self, - partition_id: &TransitionPartitionId, - old_sort_key: Option>, - old_sort_key_ids: Option, - new_sort_key: &[&str], - new_sort_key_ids: &SortedColumnSet, - ) -> Result, SortedColumnSet)>> { - // These asserts are here to cacth bugs. They will be removed when we remove the sort_key - // field from the Partition - assert_eq!( - old_sort_key.as_ref().map(|v| v.len()), - old_sort_key_ids.as_ref().map(|v| v.len()) - ); - assert_eq!(new_sort_key.len(), new_sort_key_ids.len()); - - let stage = self.stage(); - let old_sort_key = old_sort_key.unwrap_or_default(); - let old_sort_key_ids = old_sort_key_ids.unwrap_or_default(); - - match stage.partitions.iter_mut().find(|p| match partition_id { - TransitionPartitionId::Deterministic(hash_id) => { - p.hash_id().map_or(false, |h| h == hash_id) - } - TransitionPartitionId::Deprecated(id) => p.id == *id, - }) { - Some(p) if p.sort_key_ids == old_sort_key_ids => { - // This is here to catch bugs. It will be removed when we remove the sort_key - assert_eq!(p.sort_key, old_sort_key); - p.sort_key = new_sort_key.iter().map(|s| s.to_string()).collect(); - p.sort_key_ids = new_sort_key_ids.clone(); - Ok(p.clone()) + partition_id: PartitionId, + old_sort_key_ids: Option<&SortKeyIds>, + new_sort_key_ids: &SortKeyIds, + ) -> Result> { + let mut stage = self.collections.lock(); + + match stage.partitions.iter_mut().find(|p| p.id == partition_id) { + Some(p) if p.sort_key_ids() == old_sort_key_ids => { + p.set_sort_key_ids(new_sort_key_ids); + Ok(p.value.clone()) } Some(p) => { - return Err(CasFailure::ValueMismatch(( - p.sort_key.clone(), - p.sort_key_ids.clone(), - ))); + return Err(CasFailure::ValueMismatch( + p.sort_key_ids().cloned().unwrap_or_default(), + )); } - None => Err(CasFailure::QueryError(Error::PartitionNotFound { - id: partition_id.clone(), + None => Err(CasFailure::QueryError(Error::NotFound { + descr: partition_id.to_string(), })), } } @@ -720,34 +646,31 @@ impl PartitionRepo for MemTxn { estimated_bytes: u64, limit_bytes: u64, ) -> Result<()> { + let mut stage = self.collections.lock(); + let reason = reason.to_string(); let skipped_at = Timestamp::from(self.time_provider.now()); - let stage = self.stage(); + let sc = SkippedCompaction { + partition_id, + reason, + skipped_at, + num_files: num_files as i64, + limit_num_files: limit_num_files as i64, + limit_num_files_first_in_partition: limit_num_files_first_in_partition as i64, + estimated_bytes: estimated_bytes as i64, + limit_bytes: limit_bytes as i64, + }; + match stage .skipped_compactions .iter_mut() .find(|s| s.partition_id == partition_id) { Some(s) => { - s.reason = reason; - s.skipped_at = skipped_at; - s.num_files = num_files as i64; - s.limit_num_files = limit_num_files as i64; - s.limit_num_files_first_in_partition = limit_num_files_first_in_partition as i64; - s.estimated_bytes = estimated_bytes as i64; - s.limit_bytes = limit_bytes as i64; + *s = sc; } - None => stage.skipped_compactions.push(SkippedCompaction { - partition_id, - reason, - skipped_at, - num_files: num_files as i64, - limit_num_files: limit_num_files as i64, - limit_num_files_first_in_partition: limit_num_files_first_in_partition as i64, - estimated_bytes: estimated_bytes as i64, - limit_bytes: limit_bytes as i64, - }), + None => stage.skipped_compactions.push(sc), } Ok(()) } @@ -756,7 
+679,7 @@ impl PartitionRepo for MemTxn { &mut self, partition_ids: &[PartitionId], ) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); let find: HashSet<&PartitionId> = partition_ids.iter().collect(); Ok(stage .skipped_compactions @@ -767,7 +690,7 @@ impl PartitionRepo for MemTxn { } async fn list_skipped_compactions(&mut self) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); Ok(stage.skipped_compactions.clone()) } @@ -777,7 +700,7 @@ impl PartitionRepo for MemTxn { ) -> Result> { use std::mem; - let stage = self.stage(); + let mut stage = self.collections.lock(); let skipped_compactions = mem::take(&mut stage.skipped_compactions); let (mut removed, remaining) = skipped_compactions .into_iter() @@ -792,8 +715,9 @@ impl PartitionRepo for MemTxn { } async fn most_recent_n(&mut self, n: usize) -> Result> { - let stage = self.stage(); - Ok(stage.partitions.iter().rev().take(n).cloned().collect()) + let stage = self.collections.lock(); + let iter = stage.partitions.iter().rev().take(n); + Ok(iter.map(|x| x.value.clone()).collect()) } async fn partitions_new_file_between( @@ -801,7 +725,7 @@ impl PartitionRepo for MemTxn { minimum_time: Timestamp, maximum_time: Option, ) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); let partitions: Vec<_> = stage .partitions @@ -819,34 +743,65 @@ impl PartitionRepo for MemTxn { } async fn list_old_style(&mut self) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); let old_style: Vec<_> = stage .partitions .iter() .filter(|p| p.hash_id().is_none()) - .cloned() + .map(|x| x.value.clone()) .collect(); Ok(old_style) } -} -#[async_trait] -impl ParquetFileRepo for MemTxn { - async fn create(&mut self, parquet_file_params: ParquetFileParams) -> Result { - create_parquet_file(self.stage(), parquet_file_params).await - } + async fn snapshot(&mut self, partition_id: PartitionId) -> Result { + let mut guard = self.collections.lock(); + let (partition, generation) = { + let search = guard.partitions.iter_mut().find(|x| x.id == partition_id); + let partition = search.ok_or_else(|| Error::NotFound { + descr: format!("Partition {partition_id} not found"), + })?; + + let generation = partition.generation; + partition.generation += 1; + (partition.value.clone(), generation) + }; - async fn list_all(&mut self) -> Result> { - let stage = self.stage(); + let files = guard + .parquet_files + .iter() + .filter(|x| x.partition_id == partition_id && x.to_delete.is_none()) + .cloned() + .collect(); + + let search = guard.tables.iter().find(|x| x.id == partition.table_id); + let table = search.ok_or_else(|| Error::NotFound { + descr: format!("Table {} not found", partition.table_id), + })?; - Ok(stage.parquet_files.clone()) + let sc = guard + .skipped_compactions + .iter() + .find(|sc| sc.partition_id == partition_id) + .cloned(); + + Ok(PartitionSnapshot::encode( + table.namespace_id, + partition, + files, + sc, + generation, + )?) 
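// `flag_for_delete_by_retention` just below applies this rule: for every live
// file whose namespace has a retention period, mark the file as soft-deleted
// once its `max_time` falls outside the retention window, and report which
// files were flagged. A simplified sketch; `FileRow` and the plain i64
// nanosecond timestamps are illustrative stand-ins:

#[derive(Debug, Clone)]
struct FileRow {
    object_store_id: u64,
    namespace_retention_ns: Option<i64>,
    max_time: i64,
    to_delete: Option<i64>,
}

/// Returns the object store IDs of the files flagged in this pass.
fn flag_for_delete_by_retention(files: &mut [FileRow], now_ns: i64) -> Vec<u64> {
    files
        .iter_mut()
        .filter(|f| f.to_delete.is_none())
        .filter_map(|f| match f.namespace_retention_ns {
            Some(retention) if f.max_time < now_ns - retention => {
                f.to_delete = Some(now_ns);
                Some(f.object_store_id)
            }
            _ => None,
        })
        .collect()
}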
} +} - async fn flag_for_delete_by_retention(&mut self) -> Result> { +#[async_trait] +impl ParquetFileRepo for MemTxn { + async fn flag_for_delete_by_retention(&mut self) -> Result> { + let mut stage = self.collections.lock(); let now = Timestamp::from(self.time_provider.now()); - let stage = self.stage(); + let stage = stage.deref_mut(); Ok(stage .parquet_files @@ -864,7 +819,7 @@ impl ParquetFileRepo for MemTxn { ns.retention_period_ns.and_then(|rp| { if f.max_time < now - rp { f.to_delete = Some(now); - Some(f.id) + Some((f.partition_id, f.object_store_id)) } else { None } @@ -875,40 +830,8 @@ impl ParquetFileRepo for MemTxn { .collect()) } - async fn list_by_namespace_not_to_delete( - &mut self, - namespace_id: NamespaceId, - ) -> Result> { - let stage = self.stage(); - - let table_ids: HashSet<_> = stage - .tables - .iter() - .filter_map(|table| (table.namespace_id == namespace_id).then_some(table.id)) - .collect(); - let parquet_files: Vec<_> = stage - .parquet_files - .iter() - .filter(|f| table_ids.contains(&f.table_id) && f.to_delete.is_none()) - .cloned() - .collect(); - Ok(parquet_files) - } - - async fn list_by_table_not_to_delete(&mut self, table_id: TableId) -> Result> { - let stage = self.stage(); - - let parquet_files: Vec<_> = stage - .parquet_files - .iter() - .filter(|f| table_id == f.table_id && f.to_delete.is_none()) - .cloned() - .collect(); - Ok(parquet_files) - } - - async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result> { - let stage = self.stage(); + async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result> { + let mut stage = self.collections.lock(); let (delete, keep): (Vec<_>, Vec<_>) = stage.parquet_files.iter().cloned().partition( |f| matches!(f.to_delete, Some(marked_deleted) if marked_deleted < older_than), @@ -919,50 +842,31 @@ impl ParquetFileRepo for MemTxn { let delete = delete .into_iter() .take(MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE as usize) - .map(|f| f.id) + .map(|f| f.object_store_id) .collect(); Ok(delete) } - async fn list_by_partition_not_to_delete( + async fn list_by_partition_not_to_delete_batch( &mut self, - partition_id: &TransitionPartitionId, + partition_ids: Vec, ) -> Result> { - let stage = self.stage(); - - let partition = stage - .partitions - .iter() - .find(|p| match partition_id { - TransitionPartitionId::Deterministic(hash_id) => p - .hash_id() - .map(|p_hash_id| p_hash_id == hash_id) - .unwrap_or(false), - TransitionPartitionId::Deprecated(id) => id == &p.id, - }) - .unwrap() - .clone(); + let partition_ids = partition_ids.into_iter().collect::>(); + let stage = self.collections.lock(); Ok(stage .parquet_files .iter() - .filter(|f| match &f.partition_id { - TransitionPartitionId::Deterministic(hash_id) => partition - .hash_id() - .map(|p_hash_id| p_hash_id == hash_id) - .unwrap_or(false), - TransitionPartitionId::Deprecated(id) => id == &partition.id, - }) - .filter(|f| f.to_delete.is_none()) + .filter(|f| partition_ids.contains(&f.partition_id) && f.to_delete.is_none()) .cloned() .collect()) } async fn get_by_object_store_id( &mut self, - object_store_id: Uuid, + object_store_id: ObjectStoreId, ) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); Ok(stage .parquet_files @@ -973,9 +877,9 @@ impl ParquetFileRepo for MemTxn { async fn exists_by_object_store_id_batch( &mut self, - object_store_ids: Vec, - ) -> Result> { - let stage = self.stage(); + object_store_ids: Vec, + ) -> Result> { + let stage = self.collections.lock(); Ok(stage .parquet_files @@ -987,8 
+891,9 @@ impl ParquetFileRepo for MemTxn { async fn create_upgrade_delete( &mut self, - delete: &[ParquetFileId], - upgrade: &[ParquetFileId], + partition_id: PartitionId, + delete: &[ObjectStoreId], + upgrade: &[ObjectStoreId], create: &[ParquetFileParams], target_level: CompactionLevel, ) -> Result> { @@ -1000,22 +905,28 @@ impl ParquetFileRepo for MemTxn { "attempted to upgrade a file scheduled for delete" ); - let mut stage = self.inner.clone(); + let mut collections = self.collections.lock(); + let mut stage = collections.clone(); for id in delete { let marked_at = Timestamp::from(self.time_provider.now()); - flag_for_delete(&mut stage, *id, marked_at).await?; + flag_for_delete(&mut stage, partition_id, *id, marked_at)?; } - update_compaction_level(&mut stage, upgrade, target_level).await?; + update_compaction_level(&mut stage, partition_id, upgrade, target_level)?; let mut ids = Vec::with_capacity(create.len()); for file in create { - let res = create_parquet_file(&mut stage, file.clone()).await?; + if file.partition_id != partition_id { + return Err(Error::External { + source: format!("Inconsistent ParquetFileParams, expected PartitionId({partition_id}) got PartitionId({})", file.partition_id).into(), + }); + } + let res = create_parquet_file(&mut stage, file.clone())?; ids.push(res.id); } - *self.inner = stage; + *collections = stage; Ok(ids) } @@ -1032,9 +943,88 @@ fn filter_namespace_soft_delete<'a>( }) } +fn create_or_get_column( + stage: &mut MemCollections, + name: &str, + table_id: TableId, + column_type: ColumnType, +) -> Result { + // this block is just to ensure the mem impl correctly creates ColumnCreateLimitError in + // tests, we don't care about any of the errors it is discarding + stage + .tables + .iter() + .find(|t| t.id == table_id) + .cloned() + .ok_or(Error::NotFound { + descr: format!("table: {}", table_id), + }) // error never used, this is just for flow control + .and_then(|t| { + stage + .namespaces + .iter() + .find(|n| n.id == t.namespace_id) + .cloned() + .ok_or_else(|| Error::NotFound { + // we're never going to use this error, this is just for flow control, + // so it doesn't matter that we only have the ID, not the name + descr: "".to_string(), + }) + .and_then(|n| { + let max_columns_per_table = n.max_columns_per_table; + let columns_count = stage + .columns + .iter() + .filter(|t| t.table_id == table_id) + .count(); + if columns_count >= max_columns_per_table.get() { + return Err(Error::LimitExceeded { + descr: format!( + "couldn't create column {} in table {}; limit reached on namespace", + name, table_id + ), + }); + } + Ok(()) + })?; + Ok(()) + })?; + + let column = match stage + .columns + .iter() + .find(|t| t.name == name && t.table_id == table_id) + { + Some(c) => { + ensure!( + column_type == c.column_type, + AlreadyExistsSnafu { + descr: format!( + "column {} is type {} but schema update has type {}", + name, c.column_type, column_type + ), + } + ); + c + } + None => { + let column = Column { + id: ColumnId::new(stage.columns.len() as i64 + 1), + table_id, + name: name.to_string(), + column_type, + }; + stage.columns.push(column); + stage.columns.last().unwrap() + } + }; + + Ok(column.clone()) +} + // The following three functions are helpers to the create_upgrade_delete method. // They are also used by the respective create/flag_for_delete/update_compaction_level methods. 
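(Editorial sketch, not part of the diff.) `create_upgrade_delete` in the mem catalog gets its all-or-nothing behaviour from a copy-on-success pattern: clone the collections under the lock, apply every delete/upgrade/create to the clone, and only write the clone back once nothing has failed, so the partition-consistency check or any helper error leaves the original state untouched. Reduced to that pattern alone, with hypothetical names:

use std::sync::Mutex;

/// Apply `mutate` to a clone of the shared state and commit it only on success.
fn with_pseudo_txn<S, T, E>(
    state: &Mutex<S>,
    mutate: impl FnOnce(&mut S) -> Result<T, E>,
) -> Result<T, E>
where
    S: Clone,
{
    let mut guard = state.lock().unwrap();
    let mut staged = (*guard).clone(); // work on a private copy
    let out = mutate(&mut staged)?;    // any error discards `staged`
    *guard = staged;                   // commit the staged changes in one step
    Ok(out)
}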
-async fn create_parquet_file( +fn create_parquet_file( stage: &mut MemCollections, parquet_file_params: ParquetFileParams, ) -> Result { @@ -1043,8 +1033,8 @@ async fn create_parquet_file( .iter() .any(|f| f.object_store_id == parquet_file_params.object_store_id) { - return Err(Error::FileExists { - object_store_id: parquet_file_params.object_store_id, + return Err(Error::AlreadyExists { + descr: parquet_file_params.object_store_id.to_string(), }); } @@ -1053,47 +1043,73 @@ async fn create_parquet_file( ParquetFileId::new(stage.parquet_files.len() as i64 + 1), ); let created_at = parquet_file.created_at; - let partition_id = parquet_file.partition_id.clone(); + let partition_id = parquet_file.partition_id; stage.parquet_files.push(parquet_file); // Update the new_file_at field its partition to the time of created_at let partition = stage .partitions .iter_mut() - .find(|p| p.transition_partition_id() == partition_id) - .ok_or(Error::PartitionNotFound { id: partition_id })?; + .find(|p| p.id == partition_id) + .ok_or(Error::NotFound { + descr: partition_id.to_string(), + })?; partition.new_file_at = Some(created_at); Ok(stage.parquet_files.last().unwrap().clone()) } -async fn flag_for_delete( +fn flag_for_delete( stage: &mut MemCollections, - id: ParquetFileId, + partition_id: PartitionId, + id: ObjectStoreId, marked_at: Timestamp, ) -> Result<()> { - match stage.parquet_files.iter_mut().find(|p| p.id == id) { - Some(f) => f.to_delete = Some(marked_at), - None => return Err(Error::ParquetRecordNotFound { id }), + match stage + .parquet_files + .iter_mut() + .find(|p| p.object_store_id == id && p.partition_id == partition_id) + { + Some(f) if f.to_delete.is_none() => f.to_delete = Some(marked_at), + _ => { + return Err(Error::NotFound { + descr: format!("parquet file {id} not found for delete"), + }) + } } Ok(()) } -async fn update_compaction_level( +fn update_compaction_level( stage: &mut MemCollections, - parquet_file_ids: &[ParquetFileId], + partition_id: PartitionId, + object_store_ids: &[ObjectStoreId], compaction_level: CompactionLevel, -) -> Result> { - let mut updated = Vec::with_capacity(parquet_file_ids.len()); +) -> Result> { + let all_ids = stage + .parquet_files + .iter() + .filter(|f| f.partition_id == partition_id && f.to_delete.is_none()) + .map(|f| f.object_store_id) + .collect::>(); + for id in object_store_ids { + if !all_ids.contains(id) { + return Err(Error::NotFound { + descr: format!("parquet file {id} not found for upgrade"), + }); + } + } + let update_ids = object_store_ids.iter().copied().collect::>(); + let mut updated = Vec::with_capacity(object_store_ids.len()); for f in stage .parquet_files .iter_mut() - .filter(|p| parquet_file_ids.contains(&p.id)) + .filter(|p| update_ids.contains(&p.object_store_id) && p.partition_id == partition_id) { f.compaction_level = compaction_level; - updated.push(f.id); + updated.push(f.object_store_id); } Ok(updated) @@ -1101,14 +1117,17 @@ async fn update_compaction_level( #[cfg(test)] mod tests { + use iox_time::SystemProvider; + use super::*; use std::sync::Arc; #[tokio::test] async fn test_catalog() { - crate::interface::test_helpers::test_catalog(|| async { + crate::interface_tests::test_catalog(|| async { let metrics = Arc::new(metric::Registry::default()); - let x: Arc = Arc::new(MemCatalog::new(metrics)); + let time_provider = Arc::new(SystemProvider::new()); + let x: Arc = Arc::new(MemCatalog::new(metrics, time_provider)); x }) .await; diff --git a/iox_catalog/src/metrics.rs b/iox_catalog/src/metrics.rs index 
e02646705fa..b179fd3cb24 100644 --- a/iox_catalog/src/metrics.rs +++ b/iox_catalog/src/metrics.rs @@ -5,17 +5,18 @@ use crate::interface::{ SoftDeletedRows, TableRepo, }; use async_trait::async_trait; +use data_types::snapshot::table::TableSnapshot; use data_types::{ partition_template::{NamespacePartitionTemplateOverride, TablePartitionTemplateOverride}, + snapshot::partition::PartitionSnapshot, Column, ColumnType, CompactionLevel, MaxColumnsPerTable, MaxTables, Namespace, NamespaceId, - NamespaceName, NamespaceServiceProtectionLimitsOverride, ParquetFile, ParquetFileId, - ParquetFileParams, Partition, PartitionHashId, PartitionId, PartitionKey, SkippedCompaction, - SortedColumnSet, Table, TableId, Timestamp, TransitionPartitionId, + NamespaceName, NamespaceServiceProtectionLimitsOverride, ObjectStoreId, ParquetFile, + ParquetFileId, ParquetFileParams, Partition, PartitionId, PartitionKey, SkippedCompaction, + SortKeyIds, Table, TableId, Timestamp, }; -use iox_time::{SystemProvider, TimeProvider}; +use iox_time::TimeProvider; use metric::{DurationHistogram, Metric}; use std::{collections::HashMap, fmt::Debug, sync::Arc}; -use uuid::Uuid; /// Decorates a implementation of the catalog's [`RepoCollection`] (and the /// transactional variant) with instrumentation that emits latency histograms @@ -24,27 +25,30 @@ use uuid::Uuid; /// Values are recorded under the `catalog_op_duration` metric, labelled by /// operation name and result (success/error). #[derive(Debug)] -pub struct MetricDecorator { +pub struct MetricDecorator { inner: T, - time_provider: P, + time_provider: Arc, metrics: Arc, } impl MetricDecorator { /// Wrap `T` with instrumentation recording operation latency in `metrics`. - pub fn new(inner: T, metrics: Arc) -> Self { + pub fn new( + inner: T, + metrics: Arc, + time_provider: Arc, + ) -> Self { Self { inner, - time_provider: Default::default(), + time_provider, metrics, } } } -impl RepoCollection for MetricDecorator +impl RepoCollection for MetricDecorator where T: NamespaceRepo + TableRepo + ColumnRepo + PartitionRepo + ParquetFileRepo + Debug, - P: TimeProvider, { fn namespaces(&mut self) -> &mut dyn NamespaceRepo { self @@ -97,7 +101,7 @@ macro_rules! 
decorate { )+] ) => { #[async_trait] - impl $trait for MetricDecorator { + impl $trait for MetricDecorator { /// NOTE: if you're seeing an error here about "not all trait items /// implemented" or something similar, one or more methods are /// missing from / incorrectly defined in the decorate!() blocks @@ -152,6 +156,7 @@ decorate!( "table_get_by_namespace_and_name" = get_by_namespace_and_name(&mut self, namespace_id: NamespaceId, name: &str) -> Result>; "table_list_by_namespace_id" = list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result>; "table_list" = list(&mut self) -> Result>; + "table_snapshot" = snapshot(&mut self, table_id: TableId) -> Result; ] ); @@ -170,13 +175,10 @@ decorate!( impl_trait = PartitionRepo, methods = [ "partition_create_or_get" = create_or_get(&mut self, key: PartitionKey, table_id: TableId) -> Result; - "partition_get_by_id" = get_by_id(&mut self, partition_id: PartitionId) -> Result>; - "partition_get_by_id_batch" = get_by_id_batch(&mut self, partition_ids: Vec) -> Result>; - "partition_get_by_hash_id" = get_by_hash_id(&mut self, partition_hash_id: &PartitionHashId) -> Result>; - "partition_get_by_hash_id_batch" = get_by_hash_id_batch(&mut self, partition_hash_ids: &[&PartitionHashId]) -> Result>; + "partition_get_by_id_batch" = get_by_id_batch(&mut self, partition_ids: &[PartitionId]) -> Result>; "partition_list_by_table_id" = list_by_table_id(&mut self, table_id: TableId) -> Result>; "partition_list_ids" = list_ids(&mut self) -> Result>; - "partition_update_sort_key" = cas_sort_key(&mut self, partition_id: &TransitionPartitionId, old_sort_key: Option>, old_sort_key_ids: Option, new_sort_key: &[&str], new_sort_key_ids: &SortedColumnSet) -> Result, SortedColumnSet)>>; + "partition_update_sort_key" = cas_sort_key(&mut self, partition_id: PartitionId, old_sort_key_ids: Option<&SortKeyIds>, new_sort_key_ids: &SortKeyIds) -> Result>; "partition_record_skipped_compaction" = record_skipped_compaction(&mut self, partition_id: PartitionId, reason: &str, num_files: usize, limit_num_files: usize, limit_num_files_first_in_partition: usize, estimated_bytes: u64, limit_bytes: u64) -> Result<()>; "partition_list_skipped_compactions" = list_skipped_compactions(&mut self) -> Result>; "partition_delete_skipped_compactions" = delete_skipped_compactions(&mut self, partition_id: PartitionId) -> Result>; @@ -184,21 +186,18 @@ decorate!( "partition_partitions_new_file_between" = partitions_new_file_between(&mut self, minimum_time: Timestamp, maximum_time: Option) -> Result>; "partition_get_in_skipped_compactions" = get_in_skipped_compactions(&mut self, partition_ids: &[PartitionId]) -> Result>; "partition_list_old_style" = list_old_style(&mut self) -> Result>; + "partition_snapshot" = snapshot(&mut self, partition_id: PartitionId) -> Result; ] ); decorate!( impl_trait = ParquetFileRepo, methods = [ - "parquet_create" = create(&mut self, parquet_file_params: ParquetFileParams) -> Result; - "parquet_list_all" = list_all(&mut self) -> Result>; - "parquet_flag_for_delete_by_retention" = flag_for_delete_by_retention(&mut self) -> Result>; - "parquet_list_by_namespace_not_to_delete" = list_by_namespace_not_to_delete(&mut self, namespace_id: NamespaceId) -> Result>; - "parquet_list_by_table_not_to_delete" = list_by_table_not_to_delete(&mut self, table_id: TableId) -> Result>; - "parquet_delete_old_ids_only" = delete_old_ids_only(&mut self, older_than: Timestamp) -> Result>; - "parquet_list_by_partition_not_to_delete" = list_by_partition_not_to_delete(&mut self, partition_id: 
&TransitionPartitionId) -> Result>; - "parquet_get_by_object_store_id" = get_by_object_store_id(&mut self, object_store_id: Uuid) -> Result>; - "parquet_exists_by_object_store_id_batch" = exists_by_object_store_id_batch(&mut self, object_store_ids: Vec) -> Result>; - "parquet_create_upgrade_delete" = create_upgrade_delete(&mut self, delete: &[ParquetFileId], upgrade: &[ParquetFileId], create: &[ParquetFileParams], target_level: CompactionLevel) -> Result>; + "parquet_flag_for_delete_by_retention" = flag_for_delete_by_retention(&mut self) -> Result>; + "parquet_delete_old_ids_only" = delete_old_ids_only(&mut self, older_than: Timestamp) -> Result>; + "parquet_list_by_partition_not_to_delete_batch" = list_by_partition_not_to_delete_batch(&mut self, partition_ids: Vec) -> Result>; + "parquet_get_by_object_store_id" = get_by_object_store_id(&mut self, object_store_id: ObjectStoreId) -> Result>; + "parquet_exists_by_object_store_id_batch" = exists_by_object_store_id_batch(&mut self, object_store_ids: Vec) -> Result>; + "parquet_create_upgrade_delete" = create_upgrade_delete(&mut self, partition_id: PartitionId, delete: &[ObjectStoreId], upgrade: &[ObjectStoreId], create: &[ParquetFileParams], target_level: CompactionLevel) -> Result>; ] ); diff --git a/iox_catalog/src/migrate.rs b/iox_catalog/src/migrate.rs index ac38fb6dbdf..5bbf9635206 100644 --- a/iox_catalog/src/migrate.rs +++ b/iox_catalog/src/migrate.rs @@ -537,11 +537,11 @@ impl TryFrom<&Migrator> for IOxMigrator { } } -/// Validate an already-applied migration. +/// Validate already-applied migrations /// /// Checks that: /// -/// - applied migration is known +/// - all applied migrations are known or all known migrations are applied /// - checksum of applied migration and known migration match /// - new migrations are newer than both the successfully applied and the dirty version /// - there is at most one dirty migration (bug check) @@ -552,9 +552,18 @@ fn validate_applied_migrations( ) -> Result<(), MigrateError> { let migrations: HashMap<_, _> = migrator.migrations.iter().map(|m| (m.version, m)).collect(); - for applied_migration in applied_migrations { + let mut dirty_version = None; + for (idx, applied_migration) in applied_migrations.iter().enumerate() { match migrations.get(&applied_migration.version) { None => { + if idx == migrations.len() && dirty_version.is_none() { + // All migrations in `migrator` have been applied + // We therefore continue as this should not prevent startup + // if there are no local migrations to apply + warn!("found applied migrations not present locally, but all local migrations applied - continuing"); + return Ok(()); + } + return Err(MigrateError::VersionMissing(applied_migration.version)); } Some(migration) => { @@ -564,7 +573,15 @@ fn validate_applied_migrations( { return Err(MigrateError::VersionMismatch(migration.version)); } + if applied_migration.dirty { + if let Some(first) = dirty_version { + return Err(MigrateError::Source(format!( + "there are multiple dirty versions, this should not happen and is considered a bug: {:?}", + &[first, migration.version], + ).into())); + } + dirty_version = Some(migration.version); warn!( version = migration.version, "found dirty migration, trying to recover" @@ -574,19 +591,6 @@ fn validate_applied_migrations( } } - let dirty_versions = applied_migrations - .iter() - .filter(|m| m.dirty) - .map(|m| m.version) - .collect::>(); - if dirty_versions.len() > 1 { - return Err(MigrateError::Source(format!( - "there are multiple dirty versions, this should not 
happen and is considered a bug: {:?}", - dirty_versions, - ).into())); - } - let dirty_version = dirty_versions.into_iter().next(); - let applied_last = applied_migrations .iter() .filter(|m| Some(m.version) != dirty_version) @@ -2273,6 +2277,42 @@ mod tests { ); } + #[tokio::test] + async fn test_migrator_allows_unknown_migrations_if_they_are_clean() { + maybe_skip_integration!(); + let mut conn = setup().await; + let conn = &mut *conn; + + let migrator_1 = IOxMigrator::try_new([ + IOxMigration { + version: 1, + description: "".into(), + steps: [].into(), + checksum: [1, 2, 3].into(), + other_compatible_checksums: [].into(), + }, + IOxMigration { + version: 2, + description: "".into(), + steps: [].into(), + checksum: [4, 5, 6].into(), + other_compatible_checksums: [].into(), + }, + ]) + .unwrap(); + let migrator_2 = IOxMigrator::try_new([IOxMigration { + version: 1, + description: "".into(), + steps: [].into(), + checksum: [1, 2, 3].into(), + other_compatible_checksums: [].into(), + }]) + .unwrap(); + + migrator_1.run_direct(conn).await.unwrap(); + migrator_2.run_direct(conn).await.unwrap(); + } + #[tokio::test] async fn test_tester_finds_invalid_migration() { maybe_skip_integration!(); diff --git a/iox_catalog/src/postgres.rs b/iox_catalog/src/postgres.rs index 1744a532ffe..ef9c5d28070 100644 --- a/iox_catalog/src/postgres.rs +++ b/iox_catalog/src/postgres.rs @@ -1,29 +1,28 @@ //! A Postgres backed implementation of the Catalog -use crate::interface::MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE; +use crate::interface::PartitionRepoExt; use crate::{ - interface::{ - self, CasFailure, Catalog, ColumnRepo, ColumnTypeMismatchSnafu, Error, NamespaceRepo, - ParquetFileRepo, PartitionRepo, RepoCollection, Result, SoftDeletedRows, TableRepo, - MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION, + constants::{ + MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE, MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION, }, - kafkaless_transition::{ - SHARED_QUERY_POOL, SHARED_QUERY_POOL_ID, SHARED_TOPIC_ID, SHARED_TOPIC_NAME, - TRANSITION_SHARD_ID, TRANSITION_SHARD_INDEX, + interface::{ + AlreadyExistsSnafu, CasFailure, Catalog, ColumnRepo, Error, NamespaceRepo, ParquetFileRepo, + PartitionRepo, RepoCollection, Result, SoftDeletedRows, TableRepo, }, metrics::MetricDecorator, migrate::IOxMigrator, }; use async_trait::async_trait; -use data_types::SortedColumnSet; +use data_types::snapshot::partition::PartitionSnapshot; +use data_types::snapshot::table::TableSnapshot; use data_types::{ partition_template::{ NamespacePartitionTemplateOverride, TablePartitionTemplateOverride, TemplatePart, }, Column, ColumnType, CompactionLevel, MaxColumnsPerTable, MaxTables, Namespace, NamespaceId, - NamespaceName, NamespaceServiceProtectionLimitsOverride, ParquetFile, ParquetFileId, - ParquetFileParams, Partition, PartitionHashId, PartitionId, PartitionKey, SkippedCompaction, - Table, TableId, Timestamp, TransitionPartitionId, + NamespaceName, NamespaceServiceProtectionLimitsOverride, ObjectStoreId, ParquetFile, + ParquetFileId, ParquetFileParams, Partition, PartitionHashId, PartitionId, PartitionKey, + SkippedCompaction, SortKeyIds, Table, TableId, Timestamp, }; use iox_time::{SystemProvider, TimeProvider}; use metric::{Attributes, Instrument, MetricKind}; @@ -33,14 +32,21 @@ use parking_lot::{RwLock, RwLockWriteGuard}; use snafu::prelude::*; use sqlx::{ postgres::{PgConnectOptions, PgPoolOptions}, - types::Uuid, Acquire, ConnectOptions, Executor, Postgres, Row, }; use sqlx_hotswap_pool::HotSwapPool; -use std::borrow::Cow; -use 
std::collections::HashSet; -use std::sync::atomic::{AtomicU64, Ordering}; -use std::{collections::HashMap, fmt::Display, str::FromStr, sync::Arc, time::Duration}; +use std::{ + borrow::Cow, + collections::{HashMap, HashSet}, + env, + fmt::Display, + str::FromStr, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + time::Duration, +}; static MIGRATOR: Lazy = Lazy::new(|| IOxMigrator::try_from(&sqlx::migrate!()).expect("valid migration")); @@ -122,9 +128,7 @@ impl PostgresCatalog { options: PostgresConnectionOptions, metrics: Arc, ) -> Result { - let pool = new_pool(&options, Arc::clone(&metrics)) - .await - .map_err(|e| Error::SqlxError { source: e })?; + let pool = new_pool(&options, Arc::clone(&metrics)).await?; Ok(Self { pool, @@ -243,67 +247,14 @@ impl Catalog for PostgresCatalog { // This makes the migrations/20210217134322_create_schema.sql step unnecessary; we need to // keep that file because migration files are immutable. let create_schema_query = format!("CREATE SCHEMA IF NOT EXISTS {};", self.schema_name()); - self.pool - .execute(sqlx::query(&create_schema_query)) - .await - .map_err(|e| Error::Setup { source: e })?; - - MIGRATOR - .run(&self.pool) - .await - .map_err(|e| Error::Setup { source: e.into() })?; - - // We need to manually insert the topic here so that we can create the transition shard - // below. - sqlx::query( - r#" -INSERT INTO topic (name) -VALUES ($1) -ON CONFLICT ON CONSTRAINT topic_name_unique -DO NOTHING; - "#, - ) - .bind(SHARED_TOPIC_NAME) - .execute(&self.pool) - .await - .map_err(|e| Error::Setup { source: e })?; - - // The transition shard must exist and must have magic ID and INDEX. - sqlx::query( - r#" -INSERT INTO shard (id, topic_id, shard_index, min_unpersisted_sequence_number) -OVERRIDING SYSTEM VALUE -VALUES ($1, $2, $3, 0) -ON CONFLICT ON CONSTRAINT shard_unique -DO NOTHING; - "#, - ) - .bind(TRANSITION_SHARD_ID) - .bind(SHARED_TOPIC_ID) - .bind(TRANSITION_SHARD_INDEX) - .execute(&self.pool) - .await - .map_err(|e| Error::Setup { source: e })?; + self.pool.execute(sqlx::query(&create_schema_query)).await?; - // We need to manually insert the query pool here so that we can create namespaces that - // reference it. - sqlx::query( - r#" -INSERT INTO query_pool (name) -VALUES ($1) -ON CONFLICT ON CONSTRAINT query_pool_name_unique -DO NOTHING; - "#, - ) - .bind(SHARED_QUERY_POOL) - .execute(&self.pool) - .await - .map_err(|e| Error::Setup { source: e })?; + MIGRATOR.run(&self.pool).await?; Ok(()) } - async fn repositories(&self) -> Box { + fn repositories(&self) -> Box { Box::new(MetricDecorator::new( PostgresTxn { inner: PostgresTxnInner { @@ -312,6 +263,7 @@ DO NOTHING; time_provider: Arc::clone(&self.time_provider), }, Arc::clone(&self.metrics), + Arc::clone(&self.time_provider), )) } @@ -453,10 +405,17 @@ async fn new_raw_pool( metrics: PoolMetrics, ) -> Result, sqlx::Error> { // sqlx exposes some options as pool options, while other options are available as connection options. - let connect_options = PgConnectOptions::from_str(parsed_dsn)? + let mut connect_options = PgConnectOptions::from_str(parsed_dsn)? // the default is INFO, which is frankly surprising. .log_statements(log::LevelFilter::Trace); + // Workaround sqlx ignoring the SSL_CERT_FILE environment variable. + // Remove workaround when upstream sqlx handles SSL_CERT_FILE properly (#8994). 
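(Editorial sketch, not part of the diff.) The many `.map_err(|e| Error::SqlxError { source: e })` calls deleted above are replaced with bare `?` and `Error::from`, which relies on a blanket `From<sqlx::Error>` conversion on the reworked error type. That impl lives outside this section; as an assumption, it presumably looks roughly like the following, with row-not-found cases still handled explicitly at the call sites:

// Hypothetical stand-in for the catalog error type; the real definition is in
// iox_catalog's interface module and is not shown in this diff.
#[derive(Debug)]
enum Error {
    External { source: Box<dyn std::error::Error + Send + Sync> },
}

impl From<sqlx::Error> for Error {
    fn from(e: sqlx::Error) -> Self {
        // The call sites above map sqlx::Error::RowNotFound themselves (to
        // Ok(None) or a not-found error), so the blanket conversion only needs
        // to wrap everything else as an external error.
        Self::External { source: Box::new(e) }
    }
}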
+ let cert_file = env::var("SSL_CERT_FILE").unwrap_or_default(); + if !cert_file.is_empty() { + connect_options = connect_options.ssl_root_cert(cert_file); + } + let app_name = options.app_name.clone(); let app_name2 = options.app_name.clone(); // just to log below let schema_name = options.schema_name.clone(); @@ -610,7 +569,6 @@ fn get_dsn_file_path(dsn: &str) -> Option { .then(|| dsn[DSN_SCHEME.len()..].to_owned()) } -#[async_trait] impl RepoCollection for PostgresTxn { fn namespaces(&mut self) -> &mut dyn NamespaceRepo { self @@ -663,24 +621,24 @@ RETURNING *; .fetch_one(executor) .await .map_err(|e| match e { - sqlx::Error::RowNotFound => Error::ColumnCreateLimitError { - column_name: name.to_string(), - table_id, + sqlx::Error::RowNotFound => Error::LimitExceeded { + descr: format!("couldn't create column {} in table {}; limit reached on namespace", name, table_id) }, _ => { if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } + Error::NotFound { descr: e.to_string() } } else { - Error::SqlxError { source: e } + Error::External { source: Box::new(e) } } }})?; ensure!( rec.column_type == column_type, - ColumnTypeMismatchSnafu { - name, - existing: rec.column_type, - new: column_type, + AlreadyExistsSnafu { + descr: format!( + "column {} is type {} but schema update has type {}", + name, rec.column_type, column_type + ), } ); @@ -706,30 +664,32 @@ impl NamespaceRepo for PostgresTxn { let rec = sqlx::query_as::<_, Namespace>( r#" INSERT INTO namespace ( - name, topic_id, query_pool_id, retention_period_ns, max_tables, max_columns_per_table, partition_template + name, retention_period_ns, max_tables, max_columns_per_table, partition_template ) -VALUES ( $1, $2, $3, $4, $5, $6, $7 ) +VALUES ( $1, $2, $3, $4, $5 ) RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, deleted_at, partition_template; "#, ) .bind(name.as_str()) // $1 - .bind(SHARED_TOPIC_ID) // $2 - .bind(SHARED_QUERY_POOL_ID) // $3 - .bind(retention_period_ns) // $4 - .bind(max_tables) // $5 - .bind(max_columns_per_table) // $6 - .bind(partition_template); // $7 + .bind(retention_period_ns) // $2 + .bind(max_tables) // $3 + .bind(max_columns_per_table) // $4 + .bind(partition_template); // $5 let rec = rec.fetch_one(&mut self.inner).await.map_err(|e| { if is_unique_violation(&e) { - Error::NameExists { - name: name.to_string(), + Error::AlreadyExists { + descr: name.to_string(), } } else if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } + Error::NotFound { + descr: e.to_string(), + } } else { - Error::SqlxError { source: e } + Error::External { + source: Box::new(e), + } } })?; @@ -750,8 +710,7 @@ WHERE {v}; .as_str(), ) .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; Ok(rec) } @@ -781,7 +740,7 @@ WHERE id=$1 AND {v}; return Ok(None); } - let namespace = rec.map_err(|e| Error::SqlxError { source: e })?; + let namespace = rec?; Ok(Some(namespace)) } @@ -811,7 +770,7 @@ WHERE name=$1 AND {v}; return Ok(None); } - let namespace = rec.map_err(|e| Error::SqlxError { source: e })?; + let namespace = rec?; Ok(Some(namespace)) } @@ -825,7 +784,7 @@ WHERE name=$1 AND {v}; .bind(name) // $2 .execute(&mut self.inner) .await - .context(interface::CouldNotDeleteNamespaceSnafu) + .map_err(Error::from) .map(|_| ()) } @@ -845,10 +804,12 @@ RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, dele .await; let namespace = rec.map_err(|e| match e { - sqlx::Error::RowNotFound => Error::NamespaceNotFoundByName { - name: 
name.to_string(), + sqlx::Error::RowNotFound => Error::NotFound { + descr: name.to_string(), + }, + _ => Error::External { + source: Box::new(e), }, - _ => Error::SqlxError { source: e }, })?; Ok(namespace) @@ -874,10 +835,12 @@ RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, dele .await; let namespace = rec.map_err(|e| match e { - sqlx::Error::RowNotFound => Error::NamespaceNotFoundByName { - name: name.to_string(), + sqlx::Error::RowNotFound => Error::NotFound { + descr: name.to_string(), + }, + _ => Error::External { + source: Box::new(e), }, - _ => Error::SqlxError { source: e }, })?; Ok(namespace) @@ -903,10 +866,12 @@ RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, dele .await; let namespace = rec.map_err(|e| match e { - sqlx::Error::RowNotFound => Error::NamespaceNotFoundByName { - name: name.to_string(), + sqlx::Error::RowNotFound => Error::NotFound { + descr: name.to_string(), + }, + _ => Error::External { + source: Box::new(e), }, - _ => Error::SqlxError { source: e }, })?; Ok(namespace) @@ -921,12 +886,7 @@ impl TableRepo for PostgresTxn { partition_template: TablePartitionTemplateOverride, namespace_id: NamespaceId, ) -> Result
{ - let mut tx = self - .inner - .pool - .begin() - .await - .map_err(|e| Error::StartTransaction { source: e })?; + let mut tx = self.inner.pool.begin().await?; // A simple insert statement becomes quite complicated in order to avoid checking the table // limits in a select and then conditionally inserting (which would be racey). @@ -955,20 +915,25 @@ RETURNING *; .fetch_one(&mut *tx) .await .map_err(|e| match e { - sqlx::Error::RowNotFound => Error::TableCreateLimitError { - table_name: name.to_string(), - namespace_id, + sqlx::Error::RowNotFound => Error::LimitExceeded { + descr: format!( + "couldn't create table {}; limit reached on namespace {}", + name, namespace_id + ), }, _ => { if is_unique_violation(&e) { - Error::TableNameExists { - name: name.to_string(), - namespace_id, + Error::AlreadyExists { + descr: format!("table '{name}' in namespace {namespace_id}"), } } else if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } + Error::NotFound { + descr: e.to_string(), + } } else { - Error::SqlxError { source: e } + Error::External { + source: Box::new(e), + } } } })?; @@ -984,9 +949,7 @@ RETURNING *; } } - tx.commit() - .await - .map_err(|source| Error::FailedToCommit { source })?; + tx.commit().await?; Ok(table) } @@ -1007,7 +970,7 @@ WHERE id = $1; return Ok(None); } - let table = rec.map_err(|e| Error::SqlxError { source: e })?; + let table = rec?; Ok(Some(table)) } @@ -1033,7 +996,7 @@ WHERE namespace_id = $1 AND name = $2; return Ok(None); } - let table = rec.map_err(|e| Error::SqlxError { source: e })?; + let table = rec?; Ok(Some(table)) } @@ -1048,8 +1011,7 @@ WHERE namespace_id = $1; ) .bind(namespace_id) .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; Ok(rec) } @@ -1057,11 +1019,52 @@ WHERE namespace_id = $1; async fn list(&mut self) -> Result> { let rec = sqlx::query_as::<_, Table>("SELECT * FROM table_name;") .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; Ok(rec) } + + async fn snapshot(&mut self, table_id: TableId) -> Result { + let mut tx = self.inner.pool.begin().await?; + let rec = sqlx::query_as::<_, Table>("SELECT * from table_name WHERE id = $1 FOR UPDATE;") + .bind(table_id) // $1 + .fetch_one(&mut *tx) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return Err(Error::NotFound { + descr: format!("table: {table_id}"), + }); + } + let table = rec?; + + let columns = sqlx::query_as::<_, Column>("SELECT * from column_name where table_id = $1;") + .bind(table_id) // $1 + .fetch_all(&mut *tx) + .await?; + + let partitions = + sqlx::query_as::<_, Partition>(r#"SELECT * FROM partition WHERE table_id = $1;"#) + .bind(table_id) // $1 + .fetch_all(&mut *tx) + .await?; + + let (generation,): (i64,) = sqlx::query_as( + "UPDATE table_name SET generation = generation + 1 where id = $1 RETURNING generation;", + ) + .bind(table_id) // $1 + .fetch_one(&mut *tx) + .await?; + + tx.commit().await?; + + Ok(TableSnapshot::encode( + table, + partitions, + columns, + generation as _, + )?) 
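(Editorial sketch, not part of the diff.) The Postgres table `snapshot` above serialises concurrent snapshots by locking the row with `SELECT ... FOR UPDATE`, reading the dependent rows under that lock, then bumping a `generation` column and committing, so returned generations are totally ordered. Stripped of the catalog types (only the table and column names come from the query above, the rest is illustrative):

use sqlx::{PgPool, Postgres, Transaction};

/// Read one table row consistently and return the post-bump generation.
async fn snapshot_generation(pool: &PgPool, table_id: i64) -> Result<i64, sqlx::Error> {
    let mut tx: Transaction<'_, Postgres> = pool.begin().await?;

    // Lock the row so no concurrent snapshot or writer interleaves with us.
    sqlx::query("SELECT id FROM table_name WHERE id = $1 FOR UPDATE;")
        .bind(table_id)
        .fetch_one(&mut *tx)
        .await?;

    // ... read columns / partitions under the same lock here ...

    // Bump the generation and return it together with the data read above.
    let (generation,): (i64,) = sqlx::query_as(
        "UPDATE table_name SET generation = generation + 1 WHERE id = $1 RETURNING generation;",
    )
    .bind(table_id)
    .fetch_one(&mut *tx)
    .await?;

    tx.commit().await?;
    Ok(generation)
}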
+ } } #[async_trait] @@ -1085,8 +1088,7 @@ WHERE table_name.namespace_id = $1; ) .bind(namespace_id) .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; Ok(rec) } @@ -1100,8 +1102,7 @@ WHERE table_id = $1; ) .bind(table_id) .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; Ok(rec) } @@ -1109,8 +1110,7 @@ WHERE table_id = $1; async fn list(&mut self) -> Result> { let rec = sqlx::query_as::<_, Column>("SELECT * FROM column_name;") .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; Ok(rec) } @@ -1150,9 +1150,13 @@ RETURNING *; .await .map_err(|e| { if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } + Error::NotFound { + descr: e.to_string(), + } } else { - Error::SqlxError { source: e } + Error::External { + source: Box::new(e), + } } })?; @@ -1162,10 +1166,11 @@ RETURNING *; let want = columns.get(existing.name.as_str()).unwrap(); ensure!( existing.column_type == *want, - ColumnTypeMismatchSnafu { - name: &existing.name, - existing: existing.column_type, - new: *want, + AlreadyExistsSnafu { + descr: format!( + "column {} is type {} but schema update has type {}", + existing.name, existing.column_type, want + ), } ); } @@ -1182,58 +1187,52 @@ impl PartitionRepo for PostgresTxn { let v = sqlx::query_as::<_, Partition>( r#" INSERT INTO partition - (partition_key, shard_id, table_id, hash_id, sort_key, sort_key_ids) + (partition_key, table_id, hash_id, sort_key_ids) VALUES - ( $1, $2, $3, $4, '{}', '{}') + ( $1, $2, $3, '{}') ON CONFLICT ON CONSTRAINT partition_key_unique DO UPDATE SET partition_key = partition.partition_key -RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at; +RETURNING id, hash_id, table_id, partition_key, sort_key_ids, new_file_at; "#, ) - .bind(key) // $1 - .bind(TRANSITION_SHARD_ID) // $2 - .bind(table_id) // $3 - .bind(&hash_id) // $4 + .bind(&key) // $1 + .bind(table_id) // $2 + .bind(&hash_id) // $3 .fetch_one(&mut self.inner) .await .map_err(|e| { if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } + Error::NotFound { + descr: e.to_string(), + } + } else if is_unique_violation(&e) { + // Logging more information to diagnose a production issue maybe + warn!( + error=?e, + %table_id, + %key, + %hash_id, + "possible duplicate partition_hash_id?" 
+ ); + Error::External { + source: Box::new(e), + } } else { - Error::SqlxError { source: e } + Error::External { + source: Box::new(e), + } } })?; Ok(v) } - async fn get_by_id(&mut self, partition_id: PartitionId) -> Result> { - let rec = sqlx::query_as::<_, Partition>( - r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at -FROM partition -WHERE id = $1; - "#, - ) - .bind(partition_id) // $1 - .fetch_one(&mut self.inner) - .await; - - if let Err(sqlx::Error::RowNotFound) = rec { - return Ok(None); - } - - let partition = rec.map_err(|e| Error::SqlxError { source: e })?; - - Ok(Some(partition)) - } - - async fn get_by_id_batch(&mut self, partition_ids: Vec) -> Result> { + async fn get_by_id_batch(&mut self, partition_ids: &[PartitionId]) -> Result> { let ids: Vec<_> = partition_ids.iter().map(|p| p.get()).collect(); sqlx::query_as::<_, Partition>( r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at +SELECT id, hash_id, table_id, partition_key, sort_key_ids, new_file_at FROM partition WHERE id = ANY($1); "#, @@ -1241,56 +1240,13 @@ WHERE id = ANY($1); .bind(&ids[..]) // $1 .fetch_all(&mut self.inner) .await - .map_err(|e| Error::SqlxError { source: e }) - } - - async fn get_by_hash_id( - &mut self, - partition_hash_id: &PartitionHashId, - ) -> Result> { - let rec = sqlx::query_as::<_, Partition>( - r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at -FROM partition -WHERE hash_id = $1; - "#, - ) - .bind(partition_hash_id) // $1 - .fetch_one(&mut self.inner) - .await; - - if let Err(sqlx::Error::RowNotFound) = rec { - return Ok(None); - } - - let partition = rec.map_err(|e| Error::SqlxError { source: e })?; - - Ok(Some(partition)) - } - - async fn get_by_hash_id_batch( - &mut self, - partition_ids: &[&PartitionHashId], - ) -> Result> { - let ids: Vec<_> = partition_ids.iter().map(|p| p.as_bytes()).collect(); - - sqlx::query_as::<_, Partition>( - r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at -FROM partition -WHERE hash_id = ANY($1); - "#, - ) - .bind(&ids[..]) // $1 - .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e }) + .map_err(Error::from) } async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { sqlx::query_as::<_, Partition>( r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at +SELECT id, hash_id, table_id, partition_key, sort_key_ids, new_file_at FROM partition WHERE table_id = $1; "#, @@ -1298,7 +1254,7 @@ WHERE table_id = $1; .bind(table_id) // $1 .fetch_all(&mut self.inner) .await - .map_err(|e| Error::SqlxError { source: e }) + .map_err(Error::from) } async fn list_ids(&mut self) -> Result> { @@ -1310,7 +1266,7 @@ WHERE table_id = $1; ) .fetch_all(&mut self.inner) .await - .map_err(|e| Error::SqlxError { source: e }) + .map_err(Error::from) } /// Update the sort key for `partition_id` if and only if `old_sort_key` @@ -1321,52 +1277,26 @@ WHERE table_id = $1; /// round trips to service a transaction in the happy path). async fn cas_sort_key( &mut self, - partition_id: &TransitionPartitionId, - old_sort_key: Option>, - old_sort_key_ids: Option, - new_sort_key: &[&str], - new_sort_key_ids: &SortedColumnSet, - ) -> Result, SortedColumnSet)>> { - // These asserts are here to cacth bugs. 
They will be removed when we remove the sort_key - // field from the Partition - assert_eq!( - old_sort_key.as_ref().map(|v| v.len()), - old_sort_key_ids.as_ref().map(|v| v.len()) - ); - assert_eq!(new_sort_key.len(), new_sort_key_ids.len()); - - let old_sort_key = old_sort_key.unwrap_or_default(); - let old_sort_key_ids = old_sort_key_ids.unwrap_or_default(); + partition_id: PartitionId, + old_sort_key_ids: Option<&SortKeyIds>, + new_sort_key_ids: &SortKeyIds, + ) -> Result> { + let old_sort_key_ids = old_sort_key_ids + .map(std::ops::Deref::deref) + .unwrap_or_default(); // This `match` will go away when all partitions have hash IDs in the database. - let query = match partition_id { - TransitionPartitionId::Deterministic(hash_id) => sqlx::query_as::<_, Partition>( - r#" -UPDATE partition -SET sort_key = $1, sort_key_ids = $4 -WHERE hash_id = $2 AND sort_key = $3 AND sort_key_ids = $5 -RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at; - "#, - ) - .bind(new_sort_key) // $1 - .bind(hash_id) // $2 - .bind(&old_sort_key) // $3 - .bind(new_sort_key_ids) // $4 - .bind(old_sort_key_ids), // $5 - TransitionPartitionId::Deprecated(id) => sqlx::query_as::<_, Partition>( - r#" + let query = sqlx::query_as::<_, Partition>( + r#" UPDATE partition -SET sort_key = $1, sort_key_ids = $4 -WHERE id = $2 AND sort_key = $3 AND sort_key_ids = $5 -RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at; +SET sort_key_ids = $1 +WHERE id = $2 AND sort_key_ids = $3 +RETURNING id, hash_id, table_id, partition_key, sort_key_ids, new_file_at; "#, - ) - .bind(new_sort_key) // $1 - .bind(id) // $2 - .bind(&old_sort_key) // $3 - .bind(new_sort_key_ids) // $4 - .bind(old_sort_key_ids), // $5 - }; + ) + .bind(new_sort_key_ids) // $1 + .bind(partition_id) // $2 + .bind(old_sort_key_ids); // $3; let res = query.fetch_one(&mut self.inner).await; @@ -1384,24 +1314,26 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file // // NOTE: this is racy, but documented - this might return "Sort // key differs! Old key: " - let partition = crate::partition_lookup(self, partition_id) + let partition = (self as &mut dyn PartitionRepo) + .get_by_id(partition_id) .await .map_err(CasFailure::QueryError)? 
- .ok_or(CasFailure::QueryError(Error::PartitionNotFound { - id: partition_id.clone(), + .ok_or(CasFailure::QueryError(Error::NotFound { + descr: partition_id.to_string(), }))?; - return Err(CasFailure::ValueMismatch(( - partition.sort_key, - partition.sort_key_ids, - ))); + return Err(CasFailure::ValueMismatch( + partition.sort_key_ids().cloned().unwrap_or_default(), + )); + } + Err(e) => { + return Err(CasFailure::QueryError(Error::External { + source: Box::new(e), + })) } - Err(e) => return Err(CasFailure::QueryError(Error::SqlxError { source: e })), }; debug!( ?partition_id, - ?old_sort_key, - ?new_sort_key, ?new_sort_key_ids, "partition sort key cas successful" ); @@ -1445,8 +1377,7 @@ skipped_at = EXCLUDED.skipped_at; .bind(estimated_bytes as i64) .bind(limit_bytes as i64) .execute(&mut self.inner) - .await - .context(interface::CouldNotRecordSkippedCompactionSnafu { partition_id })?; + .await?; Ok(()) } @@ -1465,7 +1396,7 @@ skipped_at = EXCLUDED.skipped_at; return Ok(Vec::new()); } - let skipped_partition_records = rec.map_err(|e| Error::SqlxError { source: e })?; + let skipped_partition_records = rec?; Ok(skipped_partition_records) } @@ -1478,7 +1409,7 @@ SELECT * FROM skipped_compactions ) .fetch_all(&mut self.inner) .await - .context(interface::CouldNotListSkippedCompactionsSnafu) + .map_err(Error::from) } async fn delete_skipped_compactions( @@ -1495,15 +1426,13 @@ RETURNING * .bind(partition_id) .fetch_optional(&mut self.inner) .await - .context(interface::CouldNotDeleteSkippedCompactionsSnafu) + .map_err(Error::from) } async fn most_recent_n(&mut self, n: usize) -> Result> { sqlx::query_as( - // TODO: Carol has confirmed the persisted_sequence_number is not needed anywhere so let us remove it - // but in a seperate PR to ensure we don't break anything r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, persisted_sequence_number, new_file_at +SELECT id, hash_id, table_id, partition_key, sort_key_ids, new_file_at FROM partition ORDER BY id DESC LIMIT $1;"#, @@ -1511,7 +1440,7 @@ LIMIT $1;"#, .bind(n as i64) // $1 .fetch_all(&mut self.inner) .await - .map_err(|e| Error::SqlxError { source: e }) + .map_err(Error::from) } async fn partitions_new_file_between( @@ -1536,7 +1465,7 @@ LIMIT $1;"#, .bind(maximum_time) // $2 .fetch_all(&mut self.inner) .await - .map_err(|e| Error::SqlxError { source: e }) + .map_err(Error::from) } async fn list_old_style(&mut self) -> Result> { @@ -1549,49 +1478,72 @@ LIMIT $1;"#, // The load this query saves vastly outsizes the load this query causes. 
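(Editorial sketch, not part of the diff.) `cas_sort_key` above is a single-statement compare-and-swap: the `UPDATE ... WHERE id = $2 AND sort_key_ids = $3` only matches when the stored value equals the caller's expectation, and a RowNotFound result is disambiguated by a follow-up read into either `CasFailure::ValueMismatch` (carrying the observed sort key IDs) or a genuine not-found error. A hypothetical caller would typically loop, merging the observed value into its desired one, which is why the mismatch variant returns the stored value at all:

// Stand-in types for illustration only; the real CasFailure and sort-key
// types live in the catalog interface and data_types crates.
enum CasFailure<T> {
    ValueMismatch(T),
    QueryError(String),
}

fn cas_loop<T>(
    mut observed: Option<T>,
    desired: T,
    mut cas: impl FnMut(Option<&T>, &T) -> Result<(), CasFailure<T>>,
    merge: impl Fn(&T, &T) -> T,
) -> Result<(), String> {
    let mut want = desired;
    loop {
        match cas(observed.as_ref(), &want) {
            Ok(()) => return Ok(()),
            Err(CasFailure::ValueMismatch(actual)) => {
                // Another writer won the race: recompute against what is stored.
                want = merge(&actual, &want);
                observed = Some(actual);
            }
            Err(CasFailure::QueryError(e)) => return Err(e),
        }
    }
}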
sqlx::query_as( r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, persisted_sequence_number, - new_file_at +SELECT id, hash_id, table_id, partition_key, sort_key_ids, new_file_at FROM partition WHERE hash_id IS NULL ORDER BY id DESC;"#, ) .fetch_all(&mut self.inner) .await - .map_err(|e| Error::SqlxError { source: e }) + .map_err(Error::from) } -} -#[async_trait] -impl ParquetFileRepo for PostgresTxn { - async fn create(&mut self, parquet_file_params: ParquetFileParams) -> Result { - let executor = &mut self.inner; - let id = create_parquet_file(executor, &parquet_file_params).await?; - Ok(ParquetFile::from_params(parquet_file_params, id)) - } + async fn snapshot(&mut self, partition_id: PartitionId) -> Result { + let mut tx = self.inner.pool.begin().await?; - async fn list_all(&mut self) -> Result> { - sqlx::query_as::<_, ParquetFile>( - r#" -SELECT parquet_file.id, parquet_file.namespace_id, parquet_file.table_id, - parquet_file.partition_id, parquet_file.partition_hash_id, parquet_file.object_store_id, - parquet_file.min_time, parquet_file.max_time, parquet_file.to_delete, - parquet_file.file_size_bytes, parquet_file.row_count, parquet_file.compaction_level, - parquet_file.created_at, parquet_file.column_set, parquet_file.max_l0_created_at -FROM parquet_file; - "#, + let rec = + sqlx::query_as::<_, Partition>("SELECT * from partition WHERE id = $1 FOR UPDATE;") + .bind(partition_id) // $1 + .fetch_one(&mut *tx) + .await; + if let Err(sqlx::Error::RowNotFound) = rec { + return Err(Error::NotFound { + descr: format!("partition: {partition_id}"), + }); + } + let partition = rec?; + + let files = + sqlx::query_as::<_, ParquetFile>("SELECT * from parquet_file where partition_id = $1 AND parquet_file.to_delete IS NULL;") + .bind(partition_id) // $1 + .fetch_all(&mut *tx) + .await?; + + let sc = sqlx::query_as::<_, SkippedCompaction>( + r#"SELECT * FROM skipped_compactions WHERE partition_id = $1;"#, ) - .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e }) + .bind(partition_id) // $1 + .fetch_optional(&mut *tx) + .await?; + + let (generation, namespace_id): (i64,NamespaceId) = sqlx::query_as( + "UPDATE partition SET generation = partition.generation + 1 from table_name where partition.id = $1 and table_name.id = partition.table_id RETURNING partition.generation, table_name.namespace_id;", + ) + .bind(partition_id) // $1 + .fetch_one(&mut *tx) + .await?; + + tx.commit().await?; + + Ok(PartitionSnapshot::encode( + namespace_id, + partition, + files, + sc, + generation as _, + )?) 
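(Editorial sketch, not part of the diff.) The partition snapshot above folds the generation bump and the namespace lookup into a single `UPDATE ... FROM ... RETURNING` statement, saving one round trip compared to a separate `SELECT` on `table_name`. In isolation the statement can be exercised like this (table and column names as in the query above, i64 stand-ins for the typed IDs):

use sqlx::PgPool;

/// Bump a partition's generation and learn its namespace in one statement.
async fn bump_and_lookup(pool: &PgPool, partition_id: i64) -> Result<(i64, i64), sqlx::Error> {
    sqlx::query_as::<_, (i64, i64)>(
        "UPDATE partition SET generation = partition.generation + 1 \
         FROM table_name \
         WHERE partition.id = $1 AND table_name.id = partition.table_id \
         RETURNING partition.generation, table_name.namespace_id;",
    )
    .bind(partition_id)
    .fetch_one(pool)
    .await
}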
} +} - async fn flag_for_delete_by_retention(&mut self) -> Result> { +#[async_trait] +impl ParquetFileRepo for PostgresTxn { + async fn flag_for_delete_by_retention(&mut self) -> Result> { let flagged_at = Timestamp::from(self.time_provider.now()); // TODO - include check of table retention period once implemented let flagged = sqlx::query( r#" WITH parquet_file_ids as ( - SELECT parquet_file.id + SELECT parquet_file.object_store_id FROM namespace, parquet_file WHERE namespace.retention_period_ns IS NOT NULL AND parquet_file.to_delete IS NULL @@ -1601,127 +1553,72 @@ WITH parquet_file_ids as ( ) UPDATE parquet_file SET to_delete = $1 -WHERE id IN (SELECT id FROM parquet_file_ids) -RETURNING id; +WHERE object_store_id IN (SELECT object_store_id FROM parquet_file_ids) +RETURNING partition_id, object_store_id; "#, ) .bind(flagged_at) // $1 .bind(MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION) // $2 .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; - let flagged = flagged.into_iter().map(|row| row.get("id")).collect(); + let flagged = flagged + .into_iter() + .map(|row| (row.get("partition_id"), row.get("object_store_id"))) + .collect(); Ok(flagged) } - async fn list_by_namespace_not_to_delete( - &mut self, - namespace_id: NamespaceId, - ) -> Result> { - sqlx::query_as::<_, ParquetFile>( - r#" -SELECT parquet_file.id, parquet_file.namespace_id, parquet_file.table_id, - parquet_file.partition_id, parquet_file.partition_hash_id, parquet_file.object_store_id, - parquet_file.min_time, parquet_file.max_time, parquet_file.to_delete, - parquet_file.file_size_bytes, parquet_file.row_count, parquet_file.compaction_level, - parquet_file.created_at, parquet_file.column_set, parquet_file.max_l0_created_at -FROM parquet_file -INNER JOIN table_name on table_name.id = parquet_file.table_id -WHERE table_name.namespace_id = $1 - AND parquet_file.to_delete IS NULL; - "#, - ) - .bind(namespace_id) // $1 - .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e }) - } - - async fn list_by_table_not_to_delete(&mut self, table_id: TableId) -> Result> { - sqlx::query_as::<_, ParquetFile>( - r#" -SELECT id, namespace_id, table_id, partition_id, partition_hash_id, object_store_id, - min_time, max_time, to_delete, file_size_bytes, row_count, compaction_level, created_at, - column_set, max_l0_created_at -FROM parquet_file -WHERE table_id = $1 AND to_delete IS NULL; - "#, - ) - .bind(table_id) // $1 - .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e }) - } - - async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result> { + async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result> { // see https://www.crunchydata.com/blog/simulating-update-or-delete-with-limit-in-postgres-ctes-to-the-rescue let deleted = sqlx::query( r#" WITH parquet_file_ids as ( - SELECT id + SELECT object_store_id FROM parquet_file WHERE to_delete < $1 LIMIT $2 ) DELETE FROM parquet_file -WHERE id IN (SELECT id FROM parquet_file_ids) -RETURNING id; +WHERE object_store_id IN (SELECT object_store_id FROM parquet_file_ids) +RETURNING object_store_id; "#, ) .bind(older_than) // $1 .bind(MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE) // $2 .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; - let deleted = deleted.into_iter().map(|row| row.get("id")).collect(); + let deleted = deleted + .into_iter() + .map(|row| row.get("object_store_id")) + .collect(); Ok(deleted) } - async fn 
list_by_partition_not_to_delete( + async fn list_by_partition_not_to_delete_batch( &mut self, - partition_id: &TransitionPartitionId, + partition_ids: Vec, ) -> Result> { - // This `match` will go away when all partitions have hash IDs in the database. - let query = match partition_id { - TransitionPartitionId::Deterministic(hash_id) => sqlx::query_as::<_, ParquetFile>( - r#" -SELECT parquet_file.id, namespace_id, parquet_file.table_id, partition_id, partition_hash_id, - object_store_id, min_time, max_time, parquet_file.to_delete, file_size_bytes, row_count, - compaction_level, created_at, column_set, max_l0_created_at -FROM parquet_file -INNER JOIN partition -ON partition.id = parquet_file.partition_id OR partition.hash_id = parquet_file.partition_hash_id -WHERE partition.hash_id = $1 - AND parquet_file.to_delete IS NULL; - "#, - ) - .bind(hash_id), // $1 - TransitionPartitionId::Deprecated(id) => sqlx::query_as::<_, ParquetFile>( - r#" + sqlx::query_as::<_, ParquetFile>( + r#" SELECT parquet_file.id, namespace_id, parquet_file.table_id, partition_id, partition_hash_id, object_store_id, min_time, max_time, parquet_file.to_delete, file_size_bytes, row_count, compaction_level, created_at, column_set, max_l0_created_at FROM parquet_file -INNER JOIN partition -ON partition.id = parquet_file.partition_id OR partition.hash_id = parquet_file.partition_hash_id -WHERE partition.id = $1 +WHERE parquet_file.partition_id = ANY($1) AND parquet_file.to_delete IS NULL; "#, - ) - .bind(id), // $1 - }; - - query - .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e }) + ) + .bind(partition_ids) // $1 + .fetch_all(&mut self.inner) + .await + .map_err(Error::from) } async fn get_by_object_store_id( &mut self, - object_store_id: Uuid, + object_store_id: ObjectStoreId, ) -> Result> { let rec = sqlx::query_as::<_, ParquetFile>( r#" @@ -1740,15 +1637,15 @@ WHERE object_store_id = $1; return Ok(None); } - let parquet_file = rec.map_err(|e| Error::SqlxError { source: e })?; + let parquet_file = rec?; Ok(Some(parquet_file)) } async fn exists_by_object_store_id_batch( &mut self, - object_store_ids: Vec, - ) -> Result> { + object_store_ids: Vec, + ) -> Result> { sqlx::query( // sqlx's readme suggests using PG's ANY operator instead of IN; see link below. 
// https://github.com/launchbadge/sqlx/blob/main/FAQ.md#how-can-i-do-a-select--where-foo-in--query @@ -1759,48 +1656,48 @@ WHERE object_store_id = ANY($1); "#, ) .bind(object_store_ids) // $1 - .map(|pgr| pgr.get::("object_store_id")) + .map(|pgr| pgr.get::("object_store_id")) .fetch_all(&mut self.inner) .await - .map_err(|e| Error::SqlxError { source: e }) + .map_err(Error::from) } async fn create_upgrade_delete( &mut self, - delete: &[ParquetFileId], - upgrade: &[ParquetFileId], + partition_id: PartitionId, + delete: &[ObjectStoreId], + upgrade: &[ObjectStoreId], create: &[ParquetFileParams], target_level: CompactionLevel, ) -> Result> { - let delete_set: HashSet<_> = delete.iter().map(|d| d.get()).collect(); - let upgrade_set: HashSet<_> = upgrade.iter().map(|u| u.get()).collect(); + let delete_set: HashSet<_> = delete.iter().map(|d| d.get_uuid()).collect(); + let upgrade_set: HashSet<_> = upgrade.iter().map(|u| u.get_uuid()).collect(); assert!( delete_set.is_disjoint(&upgrade_set), "attempted to upgrade a file scheduled for delete" ); - let mut tx = self - .inner - .pool - .begin() - .await - .map_err(|e| Error::StartTransaction { source: e })?; + let mut tx = self.inner.pool.begin().await?; let marked_at = Timestamp::from(self.time_provider.now()); - flag_for_delete(&mut *tx, delete, marked_at).await?; + flag_for_delete(&mut *tx, partition_id, delete, marked_at).await?; - update_compaction_level(&mut *tx, upgrade, target_level).await?; + update_compaction_level(&mut *tx, partition_id, upgrade, target_level).await?; let mut ids = Vec::with_capacity(create.len()); for file in create { - let id = create_parquet_file(&mut *tx, file).await?; + if file.partition_id != partition_id { + return Err(Error::External { + source: format!("Inconsistent ParquetFileParams, expected PartitionId({partition_id}) got PartitionId({})", file.partition_id).into(), + }); + } + let id = create_parquet_file(&mut *tx, partition_id, file).await?; ids.push(id); } - tx.commit() - .await - .map_err(|source| Error::FailedToCommit { source })?; + tx.commit().await?; + Ok(ids) } } @@ -1809,6 +1706,7 @@ WHERE object_store_id = ANY($1); // They are also used by the respective create/flag_for_delete/update_compaction_level methods. 
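(Editorial sketch, not part of the diff.) The batch lookups above all bind a single array and filter with Postgres's `ANY($1)`, as the sqlx FAQ linked in the comment suggests, instead of interpolating an `IN (...)` list whose shape changes with every batch size. Reduced to its essentials (the table and column come from the queries above, the rest is illustrative):

use sqlx::PgPool;
use uuid::Uuid;

/// Return which of the requested object store IDs exist, in one array-bound query.
async fn existing_ids(pool: &PgPool, ids: Vec<Uuid>) -> Result<Vec<Uuid>, sqlx::Error> {
    sqlx::query_scalar::<_, Uuid>(
        "SELECT object_store_id FROM parquet_file WHERE object_store_id = ANY($1);",
    )
    .bind(ids) // a Vec<Uuid> binds as a Postgres uuid[] array
    .fetch_all(pool)
    .await
}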
async fn create_parquet_file<'q, E>( executor: E, + partition_id: PartitionId, parquet_file_params: &ParquetFileParams, ) -> Result where @@ -1817,7 +1715,8 @@ where let ParquetFileParams { namespace_id, table_id, - partition_id, + partition_id: _, + partition_hash_id, object_store_id, min_time, max_time, @@ -1829,46 +1728,43 @@ where max_l0_created_at, } = parquet_file_params; - let (partition_id, partition_hash_id) = match partition_id { - TransitionPartitionId::Deterministic(hash_id) => (None, Some(hash_id)), - TransitionPartitionId::Deprecated(id) => (Some(id), None), - }; - - let partition_hash_id_ref = &partition_hash_id.as_ref(); let query = sqlx::query_scalar::<_, ParquetFileId>( r#" INSERT INTO parquet_file ( - shard_id, table_id, partition_id, partition_hash_id, object_store_id, + table_id, partition_id, partition_hash_id, object_store_id, min_time, max_time, file_size_bytes, row_count, compaction_level, created_at, namespace_id, column_set, max_l0_created_at ) -VALUES ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14 ) +VALUES ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13 ) RETURNING id; "#, ) - .bind(TRANSITION_SHARD_ID) // $1 - .bind(table_id) // $2 - .bind(partition_id) // $3 - .bind(partition_hash_id_ref) // $4 - .bind(object_store_id) // $5 - .bind(min_time) // $6 - .bind(max_time) // $7 - .bind(file_size_bytes) // $8 - .bind(row_count) // $9 - .bind(compaction_level) // $10 - .bind(created_at) // $11 - .bind(namespace_id) // $12 - .bind(column_set) // $13 - .bind(max_l0_created_at); // $14 + .bind(table_id) // $1 + .bind(partition_id) // $2 + .bind(partition_hash_id.as_ref()) // $3 + .bind(object_store_id) // $4 + .bind(min_time) // $5 + .bind(max_time) // $6 + .bind(file_size_bytes) // $7 + .bind(row_count) // $8 + .bind(compaction_level) // $9 + .bind(created_at) // $10 + .bind(namespace_id) // $11 + .bind(column_set) // $12 + .bind(max_l0_created_at); // $13 let parquet_file_id = query.fetch_one(executor).await.map_err(|e| { if is_unique_violation(&e) { - Error::FileExists { - object_store_id: *object_store_id, + Error::AlreadyExists { + descr: object_store_id.to_string(), } } else if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } + Error::NotFound { + descr: e.to_string(), + } } else { - Error::SqlxError { source: e } + Error::External { + source: Box::new(e), + } } })?; @@ -1877,44 +1773,57 @@ RETURNING id; async fn flag_for_delete<'q, E>( executor: E, - ids: &[ParquetFileId], + partition_id: PartitionId, + ids: &[ObjectStoreId], marked_at: Timestamp, ) -> Result<()> where E: Executor<'q, Database = Postgres>, { - let query = sqlx::query(r#"UPDATE parquet_file SET to_delete = $1 WHERE id = ANY($2);"#) - .bind(marked_at) // $1 - .bind(ids); // $2 - query - .execute(executor) - .await - .map_err(|e| Error::SqlxError { source: e })?; + let updated = + sqlx::query_as::<_, (i64,)>(r#"UPDATE parquet_file SET to_delete = $1 WHERE object_store_id = ANY($2) AND partition_id = $3 AND to_delete is NULL RETURNING id;"#) + .bind(marked_at) // $1 + .bind(ids) // $2 + .bind(partition_id) // $3 + .fetch_all(executor) + .await?; + + if updated.len() != ids.len() { + return Err(Error::NotFound { + descr: "parquet file(s) not found for delete".to_string(), + }); + } Ok(()) } async fn update_compaction_level<'q, E>( executor: E, - parquet_file_ids: &[ParquetFileId], + partition_id: PartitionId, + parquet_file_ids: &[ObjectStoreId], compaction_level: CompactionLevel, ) -> Result<()> where E: Executor<'q, Database = Postgres>, { - let query = 
sqlx::query( + let updated = sqlx::query_as::<_, (i64,)>( r#" UPDATE parquet_file SET compaction_level = $1 -WHERE id = ANY($2); +WHERE object_store_id = ANY($2) AND partition_id = $3 AND to_delete is NULL RETURNING id; "#, ) .bind(compaction_level) // $1 - .bind(parquet_file_ids); // $2 - query - .execute(executor) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .bind(parquet_file_ids) // $2 + .bind(partition_id) // $3 + .fetch_all(executor) + .await?; + + if updated.len() != parquet_file_ids.len() { + return Err(Error::NotFound { + descr: "parquet file(s) not found for upgrade".to_string(), + }); + } Ok(()) } @@ -1959,7 +1868,7 @@ pub(crate) mod test_utils { use rand::Rng; use sqlx::migrate::MigrateDatabase; - pub const TEST_DSN_ENV: &str = "TEST_INFLUXDB_IOX_CATALOG_DSN"; + pub(crate) const TEST_DSN_ENV: &str = "TEST_INFLUXDB_IOX_CATALOG_DSN"; /// Helper macro to skip tests if TEST_INTEGRATION and TEST_INFLUXDB_IOX_CATALOG_DSN environment /// variables are not set. @@ -2010,7 +1919,7 @@ pub(crate) mod test_utils { pub(crate) use maybe_skip_integration; - pub async fn create_db(dsn: &str) { + pub(crate) async fn create_db(dsn: &str) { // Create the catalog database if it doesn't exist if !Postgres::database_exists(dsn).await.unwrap() { // Ignore failure if another test has already created the database @@ -2018,7 +1927,7 @@ pub(crate) mod test_utils { } } - pub async fn setup_db_no_migration() -> PostgresCatalog { + pub(crate) async fn setup_db_no_migration() -> PostgresCatalog { // create a random schema for this particular pool let schema_name = { // use scope to make it clear to clippy / rust that `rng` is @@ -2030,7 +1939,9 @@ pub(crate) mod test_utils { .take(20) .map(char::from) .collect::() + .to_ascii_lowercase() }; + info!(schema_name, "test schema"); let metrics = Arc::new(metric::Registry::default()); let dsn = std::env::var("TEST_INFLUXDB_IOX_CATALOG_DSN").unwrap(); @@ -2068,7 +1979,7 @@ pub(crate) mod test_utils { pg } - pub async fn setup_db() -> PostgresCatalog { + pub(crate) async fn setup_db() -> PostgresCatalog { let pg = setup_db_no_migration().await; // Run the migrations against this random schema. pg.setup().await.expect("failed to initialise database"); @@ -2079,6 +1990,7 @@ pub(crate) mod test_utils { #[cfg(test)] mod tests { use super::*; + use crate::interface::ParquetFileRepoExt; use crate::{ postgres::test_utils::{ create_db, maybe_skip_integration, setup_db, setup_db_no_migration, @@ -2088,23 +2000,11 @@ mod tests { use assert_matches::assert_matches; use data_types::partition_template::TemplatePart; use generated_types::influxdata::iox::partition_template::v1 as proto; - use metric::{Attributes, DurationHistogram, Metric, Observation, RawReporter}; + use metric::{Observation, RawReporter}; use std::{io::Write, ops::Deref, sync::Arc, time::Instant}; use tempfile::NamedTempFile; use test_helpers::maybe_start_logging; - fn assert_metric_hit(metrics: &metric::Registry, name: &'static str) { - let histogram = metrics - .get_instrument::>("catalog_op_duration") - .expect("failed to read metric") - .get_observer(&Attributes::from(&[("op", name), ("result", "success")])) - .expect("failed to get observer") - .fetch(); - - let hit_count = histogram.sample_count(); - assert!(hit_count > 0, "metric did not record any calls"); - } - /// Small no-op test just to print out the migrations. /// /// This is helpful to look up migration checksums and debug parsing of the migration files. 
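The reworked `flag_for_delete` / `update_compaction_level` above no longer fire blind `UPDATE`s: they read the touched rows back with `RETURNING id` and treat a shorter-than-requested result as a missing file. A stripped-down sketch of that update-and-verify pattern, using a plain `String` error instead of the catalog's `Error` type; the table, the raw nanosecond timestamp, and the omitted partition scoping are simplifying assumptions:

use sqlx::{Executor, Postgres};
use uuid::Uuid;

/// Flag files as deleted; fail if any id did not match a live (not yet
/// flagged) row, instead of silently updating only a subset.
async fn flag_for_delete_checked<'q, E>(
    executor: E,
    ids: &[Uuid],
    marked_at_ns: i64,
) -> Result<(), String>
where
    E: Executor<'q, Database = Postgres>,
{
    let updated = sqlx::query_as::<_, (i64,)>(
        r#"
UPDATE parquet_file
SET to_delete = $1
WHERE object_store_id = ANY($2) AND to_delete IS NULL
RETURNING id;
        "#,
    )
    .bind(marked_at_ns) // $1
    .bind(ids) // $2
    .fetch_all(executor)
    .await
    .map_err(|e| e.to_string())?;

    // RETURNING yields exactly the rows that changed, so a length mismatch
    // means some id was unknown or already flagged for deletion.
    if updated.len() == ids.len() {
        Ok(())
    } else {
        Err(format!(
            "expected to flag {} file(s), matched {}",
            ids.len(),
            updated.len()
        ))
    }
}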
@@ -2159,7 +2059,7 @@ mod tests { let postgres: Arc = Arc::new(postgres); - crate::interface::test_helpers::test_catalog(|| async { + crate::interface_tests::test_catalog(|| async { // Clean the schema. pool .execute(format!("DROP SCHEMA {schema_name} CASCADE").as_str()) @@ -2191,55 +2091,6 @@ mod tests { .await; } - #[tokio::test] - async fn test_partition_create_or_get_idempotent() { - maybe_skip_integration!(); - - let postgres = setup_db().await; - let postgres: Arc = Arc::new(postgres); - let mut repos = postgres.repositories().await; - - let namespace = arbitrary_namespace(&mut *repos, "ns4").await; - let table_id = arbitrary_table(&mut *repos, "table", &namespace).await.id; - - let key = PartitionKey::from("bananas"); - - let hash_id = PartitionHashId::new(table_id, &key); - - let a = repos - .partitions() - .create_or_get(key.clone(), table_id) - .await - .expect("should create OK"); - - assert_eq!(a.hash_id().unwrap(), &hash_id); - // Test: sort_key_ids from partition_create_or_get_idempotent - assert!(a.sort_key_ids().is_empty()); - - // Call create_or_get for the same (key, table_id) pair, to ensure the write is idempotent. - let b = repos - .partitions() - .create_or_get(key.clone(), table_id) - .await - .expect("idempotent write should succeed"); - - assert_eq!(a, b); - - // Check that the hash_id is saved in the database and is returned when queried. - let table_partitions = postgres - .repositories() - .await - .partitions() - .list_by_table_id(table_id) - .await - .unwrap(); - assert_eq!(table_partitions.len(), 1); - assert_eq!(table_partitions[0].hash_id().unwrap(), &hash_id); - - // Test: sort_key_ids from partition_create_or_get_idempotent - assert!(table_partitions[0].sort_key_ids().is_empty()); - } - #[tokio::test] async fn existing_partitions_without_hash_id() { maybe_skip_integration!(); @@ -2247,7 +2098,7 @@ mod tests { let postgres = setup_db().await; let pool = postgres.pool.clone(); let postgres: Arc = Arc::new(postgres); - let mut repos = postgres.repositories().await; + let mut repos = postgres.repositories(); let namespace = arbitrary_namespace(&mut *repos, "ns4").await; let table = arbitrary_table(&mut *repos, "table", &namespace).await; @@ -2259,17 +2110,16 @@ mod tests { sqlx::query( r#" INSERT INTO partition - (partition_key, shard_id, table_id, sort_key, sort_key_ids) + (partition_key, table_id, sort_key_ids) VALUES - ( $1, $2, $3, '{}', '{}') + ( $1, $2, '{}') ON CONFLICT ON CONSTRAINT partition_key_unique DO UPDATE SET partition_key = partition.partition_key -RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at; +RETURNING id, hash_id, table_id, partition_key, sort_key_ids, new_file_at; "#, ) .bind(&key) // $1 - .bind(TRANSITION_SHARD_ID) // $2 - .bind(table_id) // $3 + .bind(table_id) // $2 .fetch_one(&pool) .await .unwrap(); @@ -2289,7 +2139,7 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file .expect("idempotent write should succeed"); // Test: sort_key_ids from freshly insert with empty value - assert!(inserted_again.sort_key_ids().is_empty()); + assert!(inserted_again.sort_key_ids().is_none()); assert_eq!(partition, &inserted_again); @@ -2301,10 +2151,7 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file .create(parquet_file_params) .await .unwrap(); - assert_matches!( - parquet_file.partition_id, - TransitionPartitionId::Deprecated(_) - ); + assert_eq!(parquet_file.partition_hash_id, None); // Add a partition record WITH a hash ID repos @@ -2404,164 
+2251,6 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file assert_eq!(application_name, TEST_APPLICATION_NAME_NEW); } - macro_rules! test_column_create_or_get_many_unchecked { - ( - $name:ident, - calls = {$([$($col_name:literal => $col_type:expr),+ $(,)?]),+}, - want = $($want:tt)+ - ) => { - paste::paste! { - #[tokio::test] - async fn []() { - maybe_skip_integration!(); - - let postgres = setup_db().await; - let metrics = Arc::clone(&postgres.metrics); - let postgres: Arc = Arc::new(postgres); - let mut repos = postgres.repositories().await; - - let namespace = arbitrary_namespace(&mut *repos, "ns4") - .await; - let table_id = arbitrary_table(&mut *repos, "table", &namespace) - .await - .id; - - $( - let mut insert = HashMap::new(); - $( - insert.insert($col_name, $col_type); - )+ - - let got = repos - .columns() - .create_or_get_many_unchecked(table_id, insert.clone()) - .await; - - // The returned columns MUST always match the requested - // column values if successful. - if let Ok(got) = &got { - assert_eq!(insert.len(), got.len()); - - for got in got { - assert_eq!(table_id, got.table_id); - let requested_column_type = insert - .get(got.name.as_str()) - .expect("Should have gotten back a column that was inserted"); - assert_eq!( - *requested_column_type, - ColumnType::try_from(got.column_type) - .expect("invalid column type") - ); - } - - assert_metric_hit(&metrics, "column_create_or_get_many_unchecked"); - } - )+ - - assert_matches!(got, $($want)+); - } - } - } - } - - // Issue a few calls to create_or_get_many that contain distinct columns and - // covers the full set of column types. - test_column_create_or_get_many_unchecked!( - insert, - calls = { - [ - "test1" => ColumnType::I64, - "test2" => ColumnType::U64, - "test3" => ColumnType::F64, - "test4" => ColumnType::Bool, - "test5" => ColumnType::String, - "test6" => ColumnType::Time, - "test7" => ColumnType::Tag, - ], - [ - "test8" => ColumnType::String, - "test9" => ColumnType::Bool, - ] - }, - want = Ok(_) - ); - - // Issue two calls with overlapping columns - request should succeed (upsert - // semantics). - test_column_create_or_get_many_unchecked!( - partial_upsert, - calls = { - [ - "test1" => ColumnType::I64, - "test2" => ColumnType::U64, - "test3" => ColumnType::F64, - "test4" => ColumnType::Bool, - ], - [ - "test1" => ColumnType::I64, - "test2" => ColumnType::U64, - "test3" => ColumnType::F64, - "test4" => ColumnType::Bool, - "test5" => ColumnType::String, - "test6" => ColumnType::Time, - "test7" => ColumnType::Tag, - "test8" => ColumnType::String, - ] - }, - want = Ok(_) - ); - - // Issue two calls with the same columns and types. - test_column_create_or_get_many_unchecked!( - full_upsert, - calls = { - [ - "test1" => ColumnType::I64, - "test2" => ColumnType::U64, - "test3" => ColumnType::F64, - "test4" => ColumnType::Bool, - ], - [ - "test1" => ColumnType::I64, - "test2" => ColumnType::U64, - "test3" => ColumnType::F64, - "test4" => ColumnType::Bool, - ] - }, - want = Ok(_) - ); - - // Issue two calls with overlapping columns with conflicting types and - // observe a correctly populated ColumnTypeMismatch error. - test_column_create_or_get_many_unchecked!( - partial_type_conflict, - calls = { - [ - "test1" => ColumnType::String, - "test2" => ColumnType::String, - "test3" => ColumnType::String, - "test4" => ColumnType::String, - ], - [ - "test1" => ColumnType::String, - "test2" => ColumnType::Bool, // This one differs - "test3" => ColumnType::String, - // 4 is missing. 
- "test5" => ColumnType::String, - "test6" => ColumnType::Time, - "test7" => ColumnType::Tag, - "test8" => ColumnType::String, - ] - }, - want = Err(e) => { - assert_matches!(e, Error::ColumnTypeMismatch { name, existing, new } => { - assert_eq!(name, "test2"); - assert_eq!(existing, ColumnType::String); - assert_eq!(new, ColumnType::Bool); - }) - } - ); - #[tokio::test] async fn test_billing_summary_on_parqet_file_creation() { maybe_skip_integration!(); @@ -2569,7 +2258,7 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file let postgres = setup_db().await; let pool = postgres.pool.clone(); let postgres: Arc = Arc::new(postgres); - let mut repos = postgres.repositories().await; + let mut repos = postgres.repositories(); let namespace = arbitrary_namespace(&mut *repos, "ns4").await; let table = arbitrary_table(&mut *repos, "table", &namespace).await; let key = "bananas"; @@ -2585,7 +2274,7 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file let f1 = repos.parquet_files().create(p1.clone()).await.unwrap(); // insert the same again with a different size; we should then have 3x1337 as total file // size - p1.object_store_id = Uuid::new_v4(); + p1.object_store_id = ObjectStoreId::new(); p1.file_size_bytes *= 2; let _f2 = repos .parquet_files() @@ -2604,7 +2293,13 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file // flag f1 for deletion and assert that the total file size is reduced accordingly. repos .parquet_files() - .create_upgrade_delete(&[f1.id], &[], &[], CompactionLevel::Initial) + .create_upgrade_delete( + partition.id, + &[f1.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) .await .expect("flag parquet file for deletion should succeed"); let total_file_size_bytes: i64 = @@ -2638,7 +2333,7 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file let postgres = setup_db().await; let pool = postgres.pool.clone(); let postgres: Arc = Arc::new(postgres); - let mut repos = postgres.repositories().await; + let mut repos = postgres.repositories(); let namespace_name = "apples"; @@ -2647,17 +2342,15 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file let insert_null_partition_template_namespace = sqlx::query( r#" INSERT INTO namespace ( - name, topic_id, query_pool_id, retention_period_ns, partition_template + name, retention_period_ns, partition_template ) -VALUES ( $1, $2, $3, $4, NULL ) +VALUES ( $1, $2, NULL ) RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, deleted_at, partition_template; "#, ) .bind(namespace_name) // $1 - .bind(SHARED_TOPIC_ID) // $2 - .bind(SHARED_QUERY_POOL_ID) // $3 - .bind(None::>); // $4 + .bind(None::>); // $2 insert_null_partition_template_namespace .fetch_one(&pool) @@ -2756,7 +2449,7 @@ RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, dele let postgres = setup_db().await; let pool = postgres.pool.clone(); let postgres: Arc = Arc::new(postgres); - let mut repos = postgres.repositories().await; + let mut repos = postgres.repositories(); let namespace_default_template_name = "oranges"; let namespace_default_template = repos diff --git a/iox_catalog/src/sqlite.rs b/iox_catalog/src/sqlite.rs index 7e29a2b9a4a..e91cde3e9bf 100644 --- a/iox_catalog/src/sqlite.rs +++ b/iox_catalog/src/sqlite.rs @@ -1,45 +1,46 @@ //! 
A SQLite backed implementation of the Catalog +use crate::interface::PartitionRepoExt; use crate::{ - interface::{ - self, CasFailure, Catalog, ColumnRepo, ColumnTypeMismatchSnafu, Error, NamespaceRepo, - ParquetFileRepo, PartitionRepo, RepoCollection, Result, SoftDeletedRows, TableRepo, - MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION, + constants::{ + MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE, MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION, }, - kafkaless_transition::{ - SHARED_QUERY_POOL, SHARED_QUERY_POOL_ID, SHARED_TOPIC_ID, SHARED_TOPIC_NAME, - TRANSITION_SHARD_ID, TRANSITION_SHARD_INDEX, + interface::{ + AlreadyExistsSnafu, CasFailure, Catalog, ColumnRepo, Error, NamespaceRepo, ParquetFileRepo, + PartitionRepo, RepoCollection, Result, SoftDeletedRows, TableRepo, }, metrics::MetricDecorator, }; use async_trait::async_trait; +use data_types::snapshot::partition::PartitionSnapshot; +use data_types::snapshot::table::TableSnapshot; use data_types::{ partition_template::{ NamespacePartitionTemplateOverride, TablePartitionTemplateOverride, TemplatePart, }, Column, ColumnId, ColumnSet, ColumnType, CompactionLevel, MaxColumnsPerTable, MaxTables, - Namespace, NamespaceId, NamespaceName, NamespaceServiceProtectionLimitsOverride, ParquetFile, - ParquetFileId, ParquetFileParams, Partition, PartitionHashId, PartitionId, PartitionKey, - SkippedCompaction, SortedColumnSet, Table, TableId, Timestamp, TransitionPartitionId, + Namespace, NamespaceId, NamespaceName, NamespaceServiceProtectionLimitsOverride, ObjectStoreId, + ParquetFile, ParquetFileId, ParquetFileParams, Partition, PartitionHashId, PartitionId, + PartitionKey, SkippedCompaction, SortKeyIds, Table, TableId, Timestamp, }; -use serde::{Deserialize, Serialize}; -use std::{collections::HashMap, fmt::Display}; -use std::{collections::HashSet, fmt::Write}; - -use crate::interface::MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE; use iox_time::{SystemProvider, TimeProvider}; use metric::Registry; use observability_deps::tracing::debug; use parking_lot::Mutex; +use serde::{Deserialize, Serialize}; use snafu::prelude::*; -use sqlx::sqlite::SqliteRow; -use sqlx::types::Json; use sqlx::{ - migrate::Migrator, sqlite::SqliteConnectOptions, types::Uuid, Executor, Pool, Row, Sqlite, - SqlitePool, + migrate::Migrator, + sqlite::{SqliteConnectOptions, SqliteRow}, + types::Json, + Executor, FromRow, Pool, Row, Sqlite, SqlitePool, +}; +use std::{ + collections::{HashMap, HashSet}, + fmt::Display, + str::FromStr, + sync::Arc, }; -use std::str::FromStr; -use std::sync::Arc; static MIGRATOR: Migrator = sqlx::migrate!("sqlite/migrations"); @@ -137,13 +138,9 @@ impl<'c> Executor<'c> for &'c mut SqliteTxnInner { impl SqliteCatalog { /// Connect to the catalog store. pub async fn connect(options: SqliteConnectionOptions, metrics: Arc) -> Result { - let opts = SqliteConnectOptions::from_str(&options.file_path) - .map_err(|e| Error::SqlxError { source: e })? 
- .create_if_missing(true); + let opts = SqliteConnectOptions::from_str(&options.file_path)?.create_if_missing(true); - let pool = SqlitePool::connect_with(opts) - .await - .map_err(|e| Error::SqlxError { source: e })?; + let pool = SqlitePool::connect_with(opts).await?; Ok(Self { metrics, pool, @@ -162,61 +159,12 @@ impl Display for SqliteCatalog { #[async_trait] impl Catalog for SqliteCatalog { async fn setup(&self) -> Result<()> { - MIGRATOR - .run(&self.pool) - .await - .map_err(|e| Error::Setup { source: e.into() })?; - - // We need to manually insert the topic here so that we can create the transition shard - // below. - sqlx::query( - r#" -INSERT INTO topic (name) -VALUES ($1) -ON CONFLICT (name) -DO NOTHING; - "#, - ) - .bind(SHARED_TOPIC_NAME) - .execute(&self.pool) - .await - .map_err(|e| Error::Setup { source: e })?; - - // The transition shard must exist and must have magic ID and INDEX. - sqlx::query( - r#" -INSERT INTO shard (id, topic_id, shard_index, min_unpersisted_sequence_number) -VALUES ($1, $2, $3, 0) -ON CONFLICT (topic_id, shard_index) -DO NOTHING; - "#, - ) - .bind(TRANSITION_SHARD_ID) - .bind(SHARED_TOPIC_ID) - .bind(TRANSITION_SHARD_INDEX) - .execute(&self.pool) - .await - .map_err(|e| Error::Setup { source: e })?; - - // We need to manually insert the query pool here so that we can create namespaces that - // reference it. - sqlx::query( - r#" -INSERT INTO query_pool (name) -VALUES ($1) -ON CONFLICT (name) -DO NOTHING; - "#, - ) - .bind(SHARED_QUERY_POOL) - .execute(&self.pool) - .await - .map_err(|e| Error::Setup { source: e })?; + MIGRATOR.run(&self.pool).await?; Ok(()) } - async fn repositories(&self) -> Box { + fn repositories(&self) -> Box { Box::new(MetricDecorator::new( SqliteTxn { inner: Mutex::new(SqliteTxnInner { @@ -225,6 +173,7 @@ DO NOTHING; time_provider: Arc::clone(&self.time_provider), }, Arc::clone(&self.metrics), + Arc::clone(&self.time_provider), )) } @@ -238,7 +187,6 @@ DO NOTHING; } } -#[async_trait] impl RepoCollection for SqliteTxn { fn namespaces(&mut self) -> &mut dyn NamespaceRepo { self @@ -279,29 +227,31 @@ impl NamespaceRepo for SqliteTxn { let rec = sqlx::query_as::<_, Namespace>( r#" -INSERT INTO namespace ( name, topic_id, query_pool_id, retention_period_ns, max_tables, max_columns_per_table, partition_template ) -VALUES ( $1, $2, $3, $4, $5, $6, $7 ) +INSERT INTO namespace ( name, retention_period_ns, max_tables, max_columns_per_table, partition_template ) +VALUES ( $1, $2, $3, $4, $5 ) RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, deleted_at, partition_template; "#, ) .bind(name.as_str()) // $1 - .bind(SHARED_TOPIC_ID) // $2 - .bind(SHARED_QUERY_POOL_ID) // $3 - .bind(retention_period_ns) // $4 - .bind(max_tables) // $5 - .bind(max_columns_per_table) // $6 - .bind(partition_template); // $7 + .bind(retention_period_ns) // $2 + .bind(max_tables) // $3 + .bind(max_columns_per_table) // $4 + .bind(partition_template); // $5 let rec = rec.fetch_one(self.inner.get_mut()).await.map_err(|e| { if is_unique_violation(&e) { - Error::NameExists { - name: name.to_string(), + Error::AlreadyExists { + descr: name.to_string(), } } else if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } + Error::NotFound { + descr: e.to_string(), + } } else { - Error::SqlxError { source: e } + Error::External { + source: Box::new(e), + } } })?; @@ -322,8 +272,7 @@ WHERE {v}; .as_str(), ) .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; Ok(rec) } @@ -353,7 +302,7 
@@ WHERE id=$1 AND {v}; return Ok(None); } - let namespace = rec.map_err(|e| Error::SqlxError { source: e })?; + let namespace = rec?; Ok(Some(namespace)) } @@ -383,7 +332,7 @@ WHERE name=$1 AND {v}; return Ok(None); } - let namespace = rec.map_err(|e| Error::SqlxError { source: e })?; + let namespace = rec?; Ok(Some(namespace)) } @@ -397,7 +346,7 @@ WHERE name=$1 AND {v}; .bind(name) // $2 .execute(self.inner.get_mut()) .await - .context(interface::CouldNotDeleteNamespaceSnafu) + .map_err(Error::from) .map(|_| ()) } @@ -417,10 +366,12 @@ RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, dele .await; let namespace = rec.map_err(|e| match e { - sqlx::Error::RowNotFound => Error::NamespaceNotFoundByName { - name: name.to_string(), + sqlx::Error::RowNotFound => Error::NotFound { + descr: name.to_string(), + }, + _ => Error::External { + source: Box::new(e), }, - _ => Error::SqlxError { source: e }, })?; Ok(namespace) @@ -446,10 +397,12 @@ RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, dele .await; let namespace = rec.map_err(|e| match e { - sqlx::Error::RowNotFound => Error::NamespaceNotFoundByName { - name: name.to_string(), + sqlx::Error::RowNotFound => Error::NotFound { + descr: name.to_string(), + }, + _ => Error::External { + source: Box::new(e), }, - _ => Error::SqlxError { source: e }, })?; Ok(namespace) @@ -475,10 +428,12 @@ RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, dele .await; let namespace = rec.map_err(|e| match e { - sqlx::Error::RowNotFound => Error::NamespaceNotFoundByName { - name: name.to_string(), + sqlx::Error::RowNotFound => Error::NotFound { + descr: name.to_string(), + }, + _ => Error::External { + source: Box::new(e), }, - _ => Error::SqlxError { source: e }, })?; Ok(namespace) @@ -520,24 +475,24 @@ RETURNING *; .fetch_one(executor) .await .map_err(|e| match e { - sqlx::Error::RowNotFound => Error::ColumnCreateLimitError { - column_name: name.to_string(), - table_id, + sqlx::Error::RowNotFound => Error::LimitExceeded { + descr: format!("couldn't create column {} in table {}; limit reached on namespace", name, table_id) }, _ => { if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } + Error::NotFound { descr: e.to_string() } } else { - Error::SqlxError { source: e } + Error::External { source: Box::new(e) } } }})?; ensure!( rec.column_type == column_type, - ColumnTypeMismatchSnafu { - name, - existing: rec.column_type, - new: column_type, + AlreadyExistsSnafu { + descr: format!( + "column {} is type {} but schema update has type {}", + name, rec.column_type, column_type + ), } ); @@ -552,13 +507,7 @@ impl TableRepo for SqliteTxn { partition_template: TablePartitionTemplateOverride, namespace_id: NamespaceId, ) -> Result
{ - let mut tx = self - .inner - .get_mut() - .pool - .begin() - .await - .map_err(|e| Error::StartTransaction { source: e })?; + let mut tx = self.inner.get_mut().pool.begin().await?; // A simple insert statement becomes quite complicated in order to avoid checking the table // limits in a select and then conditionally inserting (which would be racey). @@ -587,20 +536,25 @@ RETURNING *; .fetch_one(&mut *tx) .await .map_err(|e| match e { - sqlx::Error::RowNotFound => Error::TableCreateLimitError { - table_name: name.to_string(), - namespace_id, + sqlx::Error::RowNotFound => Error::LimitExceeded { + descr: format!( + "couldn't create table {}; limit reached on namespace {}", + name, namespace_id + ), }, _ => { if is_unique_violation(&e) { - Error::TableNameExists { - name: name.to_string(), - namespace_id, + Error::AlreadyExists { + descr: format!("table '{name}' in namespace {namespace_id}"), } } else if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } + Error::NotFound { + descr: e.to_string(), + } } else { - Error::SqlxError { source: e } + Error::External { + source: Box::new(e), + } } } })?; @@ -616,9 +570,7 @@ RETURNING *; } } - tx.commit() - .await - .map_err(|source| Error::FailedToCommit { source })?; + tx.commit().await?; Ok(table) } @@ -639,7 +591,7 @@ WHERE id = $1; return Ok(None); } - let table = rec.map_err(|e| Error::SqlxError { source: e })?; + let table = rec?; Ok(Some(table)) } @@ -665,7 +617,7 @@ WHERE namespace_id = $1 AND name = $2; return Ok(None); } - let table = rec.map_err(|e| Error::SqlxError { source: e })?; + let table = rec?; Ok(Some(table)) } @@ -680,8 +632,7 @@ WHERE namespace_id = $1; ) .bind(namespace_id) .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; Ok(rec) } @@ -689,11 +640,52 @@ WHERE namespace_id = $1; async fn list(&mut self) -> Result> { let rec = sqlx::query_as::<_, Table>("SELECT * FROM table_name;") .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; Ok(rec) } + + async fn snapshot(&mut self, table_id: TableId) -> Result { + let mut tx = self.inner.get_mut().pool.begin().await?; + + // This will upgrade the transaction to be exclusive + let rec = sqlx::query( + "UPDATE table_name SET generation = generation + 1 where id = $1 RETURNING *;", + ) + .bind(table_id) // $1 + .fetch_one(&mut *tx) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return Err(Error::NotFound { + descr: format!("table: {table_id}"), + }); + } + let row = rec?; + + let generation: i64 = row.get("generation"); + let table = Table::from_row(&row)?; + + let columns = sqlx::query_as::<_, Column>("SELECT * from column_name where table_id = $1;") + .bind(table_id) // $1 + .fetch_all(&mut *tx) + .await?; + + let partitions = + sqlx::query_as::<_, PartitionPod>("SELECT * from partition where table_id = $1;") + .bind(table_id) // $1 + .fetch_all(&mut *tx) + .await?; + + tx.commit().await?; + + Ok(TableSnapshot::encode( + table, + partitions.into_iter().map(Into::into).collect(), + columns, + generation as _, + )?) 
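The new `snapshot` implementation above leans on a small trick: issuing `UPDATE ... SET generation = generation + 1 ... RETURNING` as the first statement of the transaction both hands back a monotonically increasing snapshot generation and forces SQLite to take its write lock immediately, so concurrent snapshots of the same table serialize. A reduced sketch of just that part (the simplified signature and return value are assumptions for illustration):

use sqlx::{Row, SqlitePool};

/// Increment and read the snapshot generation for one table inside a single
/// transaction. The leading UPDATE upgrades the transaction to a write
/// transaction, so two concurrent callers cannot observe the same generation.
async fn bump_table_generation(pool: &SqlitePool, table_id: i64) -> Result<i64, sqlx::Error> {
    let mut tx = pool.begin().await?;

    let row = sqlx::query(
        "UPDATE table_name SET generation = generation + 1 WHERE id = $1 RETURNING generation;",
    )
    .bind(table_id) // $1
    .fetch_one(&mut *tx)
    .await?;
    let generation: i64 = row.get("generation");

    // ... read the table, column, and partition rows with `&mut *tx` here ...

    tx.commit().await?;
    Ok(generation)
}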
+ } } #[async_trait] @@ -717,8 +709,7 @@ WHERE table_name.namespace_id = $1; ) .bind(namespace_id) .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; Ok(rec) } @@ -732,8 +723,7 @@ WHERE table_id = $1; ) .bind(table_id) .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; Ok(rec) } @@ -741,8 +731,7 @@ WHERE table_id = $1; async fn list(&mut self) -> Result> { let rec = sqlx::query_as::<_, Column>("SELECT * FROM column_name;") .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; Ok(rec) } @@ -791,9 +780,13 @@ RETURNING *; .await .map_err(|e| { if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } + Error::NotFound { + descr: e.to_string(), + } } else { - Error::SqlxError { source: e } + Error::External { + source: Box::new(e), + } } })?; @@ -803,10 +796,11 @@ RETURNING *; let want = columns.get(existing.name.as_str()).unwrap(); ensure!( existing.column_type == *want, - ColumnTypeMismatchSnafu { - name: &existing.name, - existing: existing.column_type, - new: *want, + AlreadyExistsSnafu { + descr: format!( + "column {} is type {} but schema update has type {}", + existing.name, existing.column_type, want + ), } ); } @@ -824,21 +818,19 @@ struct PartitionPod { hash_id: Option, table_id: TableId, partition_key: PartitionKey, - sort_key: Json>, sort_key_ids: Json>, new_file_at: Option, } impl From for Partition { fn from(value: PartitionPod) -> Self { - let sort_key_ids = SortedColumnSet::from(value.sort_key_ids.0); + let sort_key_ids = SortKeyIds::from(value.sort_key_ids.0); - Self::new_with_hash_id_from_sqlite_catalog_only( + Self::new_catalog_only( value.id, value.hash_id, value.table_id, value.partition_key, - value.sort_key.0, sort_key_ids, value.new_file_at, ) @@ -857,59 +849,41 @@ impl PartitionRepo for SqliteTxn { let v = sqlx::query_as::<_, PartitionPod>( r#" INSERT INTO partition - (partition_key, shard_id, table_id, hash_id, sort_key, sort_key_ids) + (partition_key, table_id, hash_id, sort_key_ids) VALUES - ($1, $2, $3, $4, '[]', '[]') + ($1, $2, $3, '[]') ON CONFLICT (table_id, partition_key) DO UPDATE SET partition_key = partition.partition_key -RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at; +RETURNING id, hash_id, table_id, partition_key, sort_key_ids, new_file_at; "#, ) .bind(key) // $1 - .bind(TRANSITION_SHARD_ID) // $2 - .bind(table_id) // $3 - .bind(&hash_id) // $4 + .bind(table_id) // $2 + .bind(&hash_id) // $3 .fetch_one(self.inner.get_mut()) .await .map_err(|e| { if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } + Error::NotFound { + descr: e.to_string(), + } } else { - Error::SqlxError { source: e } + Error::External { + source: Box::new(e), + } } })?; Ok(v.into()) } - async fn get_by_id(&mut self, partition_id: PartitionId) -> Result> { - let rec = sqlx::query_as::<_, PartitionPod>( - r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at -FROM partition -WHERE id = $1; - "#, - ) - .bind(partition_id) // $1 - .fetch_one(self.inner.get_mut()) - .await; - - if let Err(sqlx::Error::RowNotFound) = rec { - return Ok(None); - } - - let partition = rec.map_err(|e| Error::SqlxError { source: e })?; - - Ok(Some(partition.into())) - } - - async fn get_by_id_batch(&mut self, partition_ids: Vec) -> Result> { + async fn get_by_id_batch(&mut self, partition_ids: &[PartitionId]) -> Result> { // We use a JSON-based "IS IN" check. 
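The JSON-based "IS IN" check used here (and in the parquet-file batch lookup further down) is the SQLite counterpart of the Postgres `= ANY($1)` binding: the ids are serialized to one JSON array and expanded server-side with `json_each`. A standalone sketch of the pattern, assuming the sqlx `json` feature and an illustrative return column:

use sqlx::{sqlite::SqliteRow, types::Json, Row, SqlitePool};

/// Fetch partition keys for a batch of partition ids. sqlx cannot bind a Rust
/// slice for SQLite, so the ids travel as a single JSON text parameter
/// (e.g. "[1,2,3]") and json_each() turns it back into rows to match against.
async fn partition_keys_by_ids(
    pool: &SqlitePool,
    ids: &[i64],
) -> Result<Vec<String>, sqlx::Error> {
    sqlx::query(
        r#"
SELECT partition_key
FROM partition
WHERE id IN (SELECT value FROM json_each($1));
        "#,
    )
    .bind(Json(ids)) // $1: the whole id list as one JSON array
    .map(|row: SqliteRow| row.get("partition_key"))
    .fetch_all(pool)
    .await
}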
let ids: Vec<_> = partition_ids.iter().map(|p| p.get()).collect(); sqlx::query_as::<_, PartitionPod>( r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at +SELECT id, hash_id, table_id, partition_key, sort_key_ids, new_file_at FROM partition WHERE id IN (SELECT value FROM json_each($1)); "#, @@ -918,77 +892,20 @@ WHERE id IN (SELECT value FROM json_each($1)); .fetch_all(self.inner.get_mut()) .await .map(|vals| vals.into_iter().map(Partition::from).collect()) - .map_err(|e| Error::SqlxError { source: e }) - } - - async fn get_by_hash_id( - &mut self, - partition_hash_id: &PartitionHashId, - ) -> Result> { - let rec = sqlx::query_as::<_, PartitionPod>( - r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at -FROM partition -WHERE hash_id = $1; - "#, - ) - .bind(partition_hash_id) // $1 - .fetch_one(self.inner.get_mut()) - .await; - - if let Err(sqlx::Error::RowNotFound) = rec { - return Ok(None); - } - - let partition = rec.map_err(|e| Error::SqlxError { source: e })?; - - Ok(Some(partition.into())) - } - - async fn get_by_hash_id_batch( - &mut self, - partition_hash_ids: &[&PartitionHashId], - ) -> Result> { - // We use a JSON-based "IS IN" check. - let ids: Vec<_> = partition_hash_ids - .iter() - .map(|id| { - // convert partiion hash ID to uppercase hex string - let bytes = id.as_bytes(); - let mut s = String::with_capacity(bytes.len() * 2); - for b in bytes { - write!(&mut s, "{:02X}", b).expect("never fails"); - } - s - }) - .collect(); - - sqlx::query_as::<_, PartitionPod>( - r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at -FROM partition -WHERE hex(hash_id) IN (SELECT value FROM json_each($1)); - "#, - ) - .bind(Json(&ids[..])) // $1 - .fetch_all(self.inner.get_mut()) - .await - .map(|vals| vals.into_iter().map(Partition::from).collect()) - .map_err(|e| Error::SqlxError { source: e }) + .map_err(Error::from) } async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { Ok(sqlx::query_as::<_, PartitionPod>( r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at +SELECT id, hash_id, table_id, partition_key, sort_key_ids, new_file_at FROM partition WHERE table_id = $1; "#, ) .bind(table_id) // $1 .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })? + .await? .into_iter() .map(Into::into) .collect()) @@ -1003,7 +920,7 @@ WHERE table_id = $1; ) .fetch_all(self.inner.get_mut()) .await - .map_err(|e| Error::SqlxError { source: e }) + .map_err(Error::from) } /// Update the sort key for `partition_id` if and only if `old_sort_key` @@ -1014,57 +931,26 @@ WHERE table_id = $1; /// round trips to service a transaction in the happy path). async fn cas_sort_key( &mut self, - partition_id: &TransitionPartitionId, - old_sort_key: Option>, - old_sort_key_ids: Option, - new_sort_key: &[&str], - new_sort_key_ids: &SortedColumnSet, - ) -> Result, SortedColumnSet)>> { - // These asserts are here to cacth bugs. 
They will be removed when we remove the sort_key - // field from the Partition - assert_eq!( - old_sort_key.as_ref().map(|v| v.len()), - old_sort_key_ids.as_ref().map(|v| v.len()) - ); - assert_eq!(new_sort_key.len(), new_sort_key_ids.len()); + partition_id: PartitionId, + old_sort_key_ids: Option<&SortKeyIds>, + new_sort_key_ids: &SortKeyIds, + ) -> Result> { + let old_sort_key_ids: Vec = old_sort_key_ids.map(Into::into).unwrap_or_default(); - let old_sort_key = old_sort_key.unwrap_or_default(); - let raw_old_sort_key_ids: Vec<_> = old_sort_key_ids - .unwrap_or_default() - .iter() - .map(|c| c.get()) - .collect(); - let raw_new_sort_key_ids: Vec<_> = new_sort_key_ids.iter().map(|cid| cid.get()).collect(); + let raw_new_sort_key_ids: Vec = new_sort_key_ids.into(); // This `match` will go away when all partitions have hash IDs in the database. - let query = match partition_id { - TransitionPartitionId::Deterministic(hash_id) => sqlx::query_as::<_, PartitionPod>( - r#" -UPDATE partition -SET sort_key = $1, sort_key_ids = $4 -WHERE hash_id = $2 AND sort_key = $3 AND sort_key_ids = $5 -RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at; - "#, - ) - .bind(Json(new_sort_key)) // $1 - .bind(hash_id) // $2 - .bind(Json(&old_sort_key)) // $3 - .bind(Json(&raw_new_sort_key_ids)) // $4 - .bind(Json(&raw_old_sort_key_ids)), // $5 - TransitionPartitionId::Deprecated(id) => sqlx::query_as::<_, PartitionPod>( - r#" + let query = sqlx::query_as::<_, PartitionPod>( + r#" UPDATE partition -SET sort_key = $1, sort_key_ids = $4 -WHERE id = $2 AND sort_key = $3 AND sort_key_ids = $5 -RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at; +SET sort_key_ids = $1 +WHERE id = $2 AND sort_key_ids = $3 +RETURNING id, hash_id, table_id, partition_key, sort_key_ids, new_file_at; "#, - ) - .bind(Json(new_sort_key)) // $1 - .bind(id) // $2 - .bind(Json(&old_sort_key)) // $3 - .bind(Json(&raw_new_sort_key_ids)) // $4 - .bind(Json(&raw_old_sort_key_ids)), // $5 - }; + ) + .bind(Json(raw_new_sort_key_ids)) // $1 + .bind(partition_id) // $2 + .bind(Json(old_sort_key_ids)); // $3 let res = query.fetch_one(self.inner.get_mut()).await; @@ -1082,26 +968,26 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file // // NOTE: this is racy, but documented - this might return "Sort // key differs! Old key: " - let partition = crate::partition_lookup(self, partition_id) + + let partition = (self as &mut dyn PartitionRepo) + .get_by_id(partition_id) .await .map_err(CasFailure::QueryError)? 
- .ok_or(CasFailure::QueryError(Error::PartitionNotFound { - id: partition_id.clone(), + .ok_or(CasFailure::QueryError(Error::NotFound { + descr: partition_id.to_string(), }))?; - return Err(CasFailure::ValueMismatch(( - partition.sort_key, - partition.sort_key_ids, - ))); + return Err(CasFailure::ValueMismatch( + partition.sort_key_ids().cloned().unwrap_or_default(), + )); + } + Err(e) => { + return Err(CasFailure::QueryError(Error::External { + source: Box::new(e), + })) } - Err(e) => return Err(CasFailure::QueryError(Error::SqlxError { source: e })), }; - debug!( - ?partition_id, - ?old_sort_key, - ?new_sort_key, - "partition sort key cas successful" - ); + debug!(?partition_id, "partition sort key cas successful"); Ok(partition.into()) } @@ -1143,8 +1029,7 @@ skipped_at = EXCLUDED.skipped_at; .bind(limit_bytes as i64) .bind(std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH).unwrap().as_secs() as i64) .execute(self.inner.get_mut()) - .await - .context(interface::CouldNotRecordSkippedCompactionSnafu { partition_id })?; + .await?; Ok(()) } @@ -1160,7 +1045,7 @@ skipped_at = EXCLUDED.skipped_at; .fetch_all(self.inner.get_mut()) .await; - let skipped_partition_records = rec.map_err(|e| Error::SqlxError { source: e })?; + let skipped_partition_records = rec?; Ok(skipped_partition_records) } @@ -1173,7 +1058,7 @@ SELECT * FROM skipped_compactions ) .fetch_all(self.inner.get_mut()) .await - .context(interface::CouldNotListSkippedCompactionsSnafu) + .map_err(Error::from) } async fn delete_skipped_compactions( @@ -1190,13 +1075,13 @@ RETURNING * .bind(partition_id) .fetch_optional(self.inner.get_mut()) .await - .context(interface::CouldNotDeleteSkippedCompactionsSnafu) + .map_err(Error::from) } async fn most_recent_n(&mut self, n: usize) -> Result> { Ok(sqlx::query_as::<_, PartitionPod>( r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at +SELECT id, hash_id, table_id, partition_key, sort_key_ids, new_file_at FROM partition ORDER BY id DESC LIMIT $1; @@ -1204,8 +1089,7 @@ LIMIT $1; ) .bind(n as i64) // $1 .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })? + .await? .into_iter() .map(Into::into) .collect()) @@ -1233,25 +1117,74 @@ LIMIT $1; .bind(maximum_time) // $2 .fetch_all(self.inner.get_mut()) .await - .map_err(|e| Error::SqlxError { source: e }) + .map_err(Error::from) } async fn list_old_style(&mut self) -> Result> { Ok(sqlx::query_as::<_, PartitionPod>( r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at +SELECT id, hash_id, table_id, partition_key, sort_key_ids, new_file_at FROM partition WHERE hash_id IS NULL ORDER BY id DESC; "#, ) .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })? + .await? 
.into_iter() .map(Into::into) .collect()) } + + async fn snapshot(&mut self, partition_id: PartitionId) -> Result { + let mut tx = self.inner.get_mut().pool.begin().await?; + + // This will upgrade the transaction to be exclusive + let rec = sqlx::query( + "UPDATE partition SET generation = generation + 1 where id = $1 RETURNING *;", + ) + .bind(partition_id) // $1 + .fetch_one(&mut *tx) + .await; + if let Err(sqlx::Error::RowNotFound) = rec { + return Err(Error::NotFound { + descr: format!("partition: {partition_id}"), + }); + } + let row = rec?; + + let generation: i64 = row.get("generation"); + let partition = PartitionPod::from_row(&row)?; + + let (namespace_id,): (NamespaceId,) = + sqlx::query_as("SELECT namespace_id from table_name where id = $1") + .bind(partition.table_id) // $1 + .fetch_one(&mut *tx) + .await?; + + let files = + sqlx::query_as::<_, ParquetFilePod>("SELECT * from parquet_file where partition_id = $1 AND parquet_file.to_delete IS NULL;") + .bind(partition_id) // $1 + .fetch_all(&mut *tx) + .await?; + + let sc = sqlx::query_as::( + r#"SELECT * FROM skipped_compactions WHERE partition_id = $1;"#, + ) + .bind(partition_id) + .fetch_optional(&mut *tx) + .await?; + + tx.commit().await?; + + Ok(PartitionSnapshot::encode( + namespace_id, + partition.into(), + files.into_iter().map(Into::into).collect(), + sc, + generation as _, + )?) + } } fn from_column_set(v: &ColumnSet) -> Json> { @@ -1267,9 +1200,9 @@ struct ParquetFilePod { id: ParquetFileId, namespace_id: NamespaceId, table_id: TableId, - #[sqlx(flatten)] - partition_id: TransitionPartitionId, - object_store_id: Uuid, + partition_id: PartitionId, + partition_hash_id: Option, + object_store_id: ObjectStoreId, min_time: Timestamp, max_time: Timestamp, to_delete: Option, @@ -1288,6 +1221,7 @@ impl From for ParquetFile { namespace_id: value.namespace_id, table_id: value.table_id, partition_id: value.partition_id, + partition_hash_id: value.partition_hash_id, object_store_id: value.object_store_id, min_time: value.min_time, max_time: value.max_time, @@ -1304,39 +1238,13 @@ impl From for ParquetFile { #[async_trait] impl ParquetFileRepo for SqliteTxn { - async fn create(&mut self, parquet_file_params: ParquetFileParams) -> Result { - let executor = self.inner.get_mut(); - create_parquet_file(executor, parquet_file_params).await - } - - async fn list_all(&mut self) -> Result> { - // Deliberately doesn't use `SELECT *` to avoid the performance hit of fetching the large - // `parquet_metadata` column!! - Ok(sqlx::query_as::<_, ParquetFilePod>( - r#" -SELECT parquet_file.id, parquet_file.namespace_id, parquet_file.table_id, - parquet_file.partition_id, parquet_file.partition_hash_id, parquet_file.object_store_id, - parquet_file.min_time, parquet_file.max_time, parquet_file.to_delete, - parquet_file.file_size_bytes, parquet_file.row_count, parquet_file.compaction_level, - parquet_file.created_at, parquet_file.column_set, parquet_file.max_l0_created_at -FROM parquet_file; - "#, - ) - .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })? 
- .into_iter() - .map(Into::into) - .collect()) - } - - async fn flag_for_delete_by_retention(&mut self) -> Result> { + async fn flag_for_delete_by_retention(&mut self) -> Result> { let flagged_at = Timestamp::from(self.time_provider.now()); // TODO - include check of table retention period once implemented let flagged = sqlx::query( r#" WITH parquet_file_ids as ( - SELECT parquet_file.id + SELECT parquet_file.object_store_id FROM namespace, parquet_file WHERE namespace.retention_period_ns IS NOT NULL AND parquet_file.to_delete IS NULL @@ -1346,130 +1254,71 @@ WITH parquet_file_ids as ( ) UPDATE parquet_file SET to_delete = $1 -WHERE id IN (SELECT id FROM parquet_file_ids) -RETURNING id; +WHERE object_store_id IN (SELECT object_store_id FROM parquet_file_ids) +RETURNING partition_id, object_store_id; "#, ) .bind(flagged_at) // $1 .bind(MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION) // $2 .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; - let flagged = flagged.into_iter().map(|row| row.get("id")).collect(); + let flagged = flagged + .into_iter() + .map(|row| (row.get("partition_id"), row.get("object_store_id"))) + .collect(); Ok(flagged) } - async fn list_by_namespace_not_to_delete( - &mut self, - namespace_id: NamespaceId, - ) -> Result> { - // Deliberately doesn't use `SELECT *` to avoid the performance hit of fetching the large - // `parquet_metadata` column!! - Ok(sqlx::query_as::<_, ParquetFilePod>( - r#" -SELECT parquet_file.id, parquet_file.namespace_id, parquet_file.table_id, - parquet_file.partition_id, parquet_file.partition_hash_id, parquet_file.object_store_id, - parquet_file.min_time, parquet_file.max_time, parquet_file.to_delete, - parquet_file.file_size_bytes, parquet_file.row_count, parquet_file.compaction_level, - parquet_file.created_at, parquet_file.column_set, parquet_file.max_l0_created_at -FROM parquet_file -INNER JOIN table_name on table_name.id = parquet_file.table_id -WHERE table_name.namespace_id = $1 - AND parquet_file.to_delete IS NULL; - "#, - ) - .bind(namespace_id) // $1 - .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })? - .into_iter() - .map(Into::into) - .collect()) - } - - async fn list_by_table_not_to_delete(&mut self, table_id: TableId) -> Result> { - Ok(sqlx::query_as::<_, ParquetFilePod>( - r#" -SELECT id, namespace_id, table_id, partition_id, partition_hash_id, object_store_id, - min_time, max_time, to_delete, file_size_bytes, - row_count, compaction_level, created_at, column_set, max_l0_created_at -FROM parquet_file -WHERE table_id = $1 AND to_delete IS NULL; - "#, - ) - .bind(table_id) // $1 - .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })? 
- .into_iter() - .map(Into::into) - .collect()) - } - - async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result> { + async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result> { // see https://www.crunchydata.com/blog/simulating-update-or-delete-with-limit-in-sqlite-ctes-to-the-rescue let deleted = sqlx::query( r#" WITH parquet_file_ids as ( - SELECT id + SELECT object_store_id FROM parquet_file WHERE to_delete < $1 LIMIT $2 ) DELETE FROM parquet_file -WHERE id IN (SELECT id FROM parquet_file_ids) -RETURNING id; +WHERE object_store_id IN (SELECT object_store_id FROM parquet_file_ids) +RETURNING object_store_id; "#, ) .bind(older_than) // $1 .bind(MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE) // $2 .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; - let deleted = deleted.into_iter().map(|row| row.get("id")).collect(); + let deleted = deleted + .into_iter() + .map(|row| row.get("object_store_id")) + .collect(); Ok(deleted) } - async fn list_by_partition_not_to_delete( + async fn list_by_partition_not_to_delete_batch( &mut self, - partition_id: &TransitionPartitionId, + partition_ids: Vec, ) -> Result> { - // This `match` will go away when all partitions have hash IDs in the database. - let query = match partition_id { - TransitionPartitionId::Deterministic(hash_id) => sqlx::query_as::<_, ParquetFilePod>( - r#" -SELECT parquet_file.id, namespace_id, parquet_file.table_id, partition_id, partition_hash_id, - object_store_id, min_time, max_time, parquet_file.to_delete, file_size_bytes, row_count, - compaction_level, created_at, column_set, max_l0_created_at -FROM parquet_file -INNER JOIN partition -ON partition.id = parquet_file.partition_id OR partition.hash_id = parquet_file.partition_hash_id -WHERE partition.hash_id = $1 - AND parquet_file.to_delete IS NULL; - "#, - ) - .bind(hash_id), // $1 - TransitionPartitionId::Deprecated(id) => sqlx::query_as::<_, ParquetFilePod>( - r#" + // We use a JSON-based "IS IN" check. + let ids: Vec<_> = partition_ids.iter().map(|p| p.get()).collect(); + + let query = sqlx::query_as::<_, ParquetFilePod>( + r#" SELECT parquet_file.id, namespace_id, parquet_file.table_id, partition_id, partition_hash_id, object_store_id, min_time, max_time, parquet_file.to_delete, file_size_bytes, row_count, compaction_level, created_at, column_set, max_l0_created_at FROM parquet_file -INNER JOIN partition -ON partition.id = parquet_file.partition_id OR partition.hash_id = parquet_file.partition_hash_id -WHERE partition.id = $1 +WHERE parquet_file.partition_id IN (SELECT value FROM json_each($1)) AND parquet_file.to_delete IS NULL; "#, - ) - .bind(id), // $1 - }; + ) + .bind(Json(&ids[..])); // $1 Ok(query .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })? + .await? 
.into_iter() .map(Into::into) .collect()) @@ -1477,7 +1326,7 @@ WHERE partition.id = $1 async fn get_by_object_store_id( &mut self, - object_store_id: Uuid, + object_store_id: ObjectStoreId, ) -> Result> { let rec = sqlx::query_as::<_, ParquetFilePod>( r#" @@ -1496,19 +1345,19 @@ WHERE object_store_id = $1; return Ok(None); } - let parquet_file = rec.map_err(|e| Error::SqlxError { source: e })?; + let parquet_file = rec?; Ok(Some(parquet_file.into())) } async fn exists_by_object_store_id_batch( &mut self, - object_store_ids: Vec, - ) -> Result> { + object_store_ids: Vec, + ) -> Result> { let in_value = object_store_ids .into_iter() // use a sqlite blob literal - .map(|id| format!("X'{}'", id.simple())) + .map(|id| format!("X'{}'", id.get_uuid().simple())) .collect::>() .join(","); @@ -1519,18 +1368,19 @@ FROM parquet_file WHERE object_store_id IN ({v});", v = in_value )) - .map(|slr: SqliteRow| slr.get::("object_store_id")) + .map(|slr: SqliteRow| slr.get::("object_store_id")) // limitation of sqlx: will not bind arrays // https://github.com/launchbadge/sqlx/blob/main/FAQ.md#how-can-i-do-a-select--where-foo-in--query .fetch_all(self.inner.get_mut()) .await - .map_err(|e| Error::SqlxError { source: e }) + .map_err(Error::from) } async fn create_upgrade_delete( &mut self, - delete: &[ParquetFileId], - upgrade: &[ParquetFileId], + partition_id: PartitionId, + delete: &[ObjectStoreId], + upgrade: &[ObjectStoreId], create: &[ParquetFileParams], target_level: CompactionLevel, ) -> Result> { @@ -1541,29 +1391,26 @@ WHERE object_store_id IN ({v});", delete_set.is_disjoint(&upgrade_set), "attempted to upgrade a file scheduled for delete" ); - let mut tx = self - .inner - .get_mut() - .pool - .begin() - .await - .map_err(|e| Error::StartTransaction { source: e })?; + let mut tx = self.inner.get_mut().pool.begin().await?; for id in delete { let marked_at = Timestamp::from(self.time_provider.now()); - flag_for_delete(&mut *tx, *id, marked_at).await?; + flag_for_delete(&mut *tx, partition_id, *id, marked_at).await?; } - update_compaction_level(&mut *tx, upgrade, target_level).await?; + update_compaction_level(&mut *tx, partition_id, upgrade, target_level).await?; let mut ids = Vec::with_capacity(create.len()); for file in create { + if file.partition_id != partition_id { + return Err(Error::External { + source: format!("Inconsistent ParquetFileParams, expected PartitionId({partition_id}) got PartitionId({})", file.partition_id).into(), + }); + } let res = create_parquet_file(&mut *tx, file.clone()).await?; ids.push(res.id); } - tx.commit() - .await - .map_err(|e| Error::FailedToCommit { source: e })?; + tx.commit().await?; Ok(ids) } @@ -1582,6 +1429,7 @@ where namespace_id, table_id, partition_id, + partition_hash_id, object_store_id, min_time, max_time, @@ -1593,96 +1441,116 @@ where max_l0_created_at, } = parquet_file_params; - let (partition_id, partition_hash_id) = match partition_id { - TransitionPartitionId::Deterministic(hash_id) => (None, Some(hash_id)), - TransitionPartitionId::Deprecated(id) => (Some(id), None), - }; let res = sqlx::query_as::<_, ParquetFilePod>( r#" INSERT INTO parquet_file ( - shard_id, table_id, partition_id, partition_hash_id, object_store_id, + table_id, partition_id, partition_hash_id, object_store_id, min_time, max_time, file_size_bytes, row_count, compaction_level, created_at, namespace_id, column_set, max_l0_created_at ) -VALUES ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14 ) +VALUES ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13 ) RETURNING 
id, table_id, partition_id, partition_hash_id, object_store_id, min_time, max_time, to_delete, file_size_bytes, row_count, compaction_level, created_at, namespace_id, column_set, max_l0_created_at; "#, ) - .bind(TRANSITION_SHARD_ID) // $1 - .bind(table_id) // $2 - .bind(partition_id) // $3 - .bind(partition_hash_id.as_ref()) // $4 - .bind(object_store_id) // $5 - .bind(min_time) // $6 - .bind(max_time) // $7 - .bind(file_size_bytes) // $8 - .bind(row_count) // $9 - .bind(compaction_level) // $10 - .bind(created_at) // $11 - .bind(namespace_id) // $12 - .bind(from_column_set(&column_set)) // $13 - .bind(max_l0_created_at) // $14 + .bind(table_id) // $1 + .bind(partition_id) // $2 + .bind(partition_hash_id.as_ref()) // $3 + .bind(object_store_id) // $4 + .bind(min_time) // $5 + .bind(max_time) // $6 + .bind(file_size_bytes) // $7 + .bind(row_count) // $8 + .bind(compaction_level) // $9 + .bind(created_at) // $10 + .bind(namespace_id) // $11 + .bind(from_column_set(&column_set)) // $12 + .bind(max_l0_created_at) // $13 .fetch_one(executor) .await; let rec = res.map_err(|e| { if is_unique_violation(&e) { - Error::FileExists { object_store_id } + Error::AlreadyExists { + descr: object_store_id.to_string(), + } } else if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } + Error::NotFound { + descr: e.to_string(), + } } else { - Error::SqlxError { source: e } + Error::External { + source: Box::new(e), + } } })?; Ok(rec.into()) } -async fn flag_for_delete<'q, E>(executor: E, id: ParquetFileId, marked_at: Timestamp) -> Result<()> +async fn flag_for_delete<'q, E>( + executor: E, + partition_id: PartitionId, + id: ObjectStoreId, + marked_at: Timestamp, +) -> Result<()> where E: Executor<'q, Database = Sqlite>, { - let query = sqlx::query(r#"UPDATE parquet_file SET to_delete = $1 WHERE id = $2;"#) - .bind(marked_at) // $1 - .bind(id); // $2 + let updated = + sqlx::query_as::<_, (i64,)>(r#"UPDATE parquet_file SET to_delete = $1 WHERE object_store_id = $2 AND partition_id = $3 AND to_delete is NULL returning id;"#) + .bind(marked_at) // $1 + .bind(id) // $2 + .bind(partition_id) // $3 + .fetch_all(executor) + .await?; - query - .execute(executor) - .await - .map_err(|e| Error::SqlxError { source: e })?; + if updated.len() != 1 { + return Err(Error::NotFound { + descr: format!("parquet file {id} not found for delete"), + }); + } Ok(()) } async fn update_compaction_level<'q, E>( executor: E, - parquet_file_ids: &[ParquetFileId], + partition_id: PartitionId, + object_store_ids: &[ObjectStoreId], compaction_level: CompactionLevel, -) -> Result> +) -> Result<()> where E: Executor<'q, Database = Sqlite>, { - // We use a JSON-based "IS IN" check. 
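For `object_store_id`, which is stored as a 16-byte BLOB rather than an integer, the rewrite below drops the JSON route and splices SQLite blob literals (`X'...'`) directly into the statement; that is safe here because the values are hex-encoded UUIDs rather than user input. A small sketch of the literal construction (the surrounding query text is illustrative):

use uuid::Uuid;

/// Build an UPDATE that targets a fixed set of object store ids, rendered as
/// SQLite blob literals, e.g. X'67e5504410b1426f9247bb680e5fe0c8'.
fn upgrade_statement(ids: &[Uuid]) -> String {
    let in_list = ids
        .iter()
        .map(|id| format!("X'{}'", id.simple())) // hex without dashes
        .collect::<Vec<_>>()
        .join(",");
    format!(
        "UPDATE parquet_file SET compaction_level = $1 \
         WHERE object_store_id IN ({in_list}) AND to_delete IS NULL RETURNING id;"
    )
}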
- let ids: Vec<_> = parquet_file_ids.iter().map(|p| p.get()).collect(); - let query = sqlx::query( + let in_value = object_store_ids + .iter() + // use a sqlite blob literal + .map(|id| format!("X'{}'", id.get_uuid().simple())) + .collect::>() + .join(","); + + let updated = sqlx::query_as::<_, (i64,)>(&format!( r#" UPDATE parquet_file SET compaction_level = $1 -WHERE id IN (SELECT value FROM json_each($2)) -RETURNING id; +WHERE object_store_id IN ({v}) AND partition_id = $2 AND to_delete is NULL returning id; "#, - ) + v = in_value, + )) .bind(compaction_level) // $1 - .bind(Json(&ids[..])); // $2 - let updated = query - .fetch_all(executor) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .bind(partition_id) // $2 + .fetch_all(executor) + .await?; - let updated = updated.into_iter().map(|row| row.get("id")).collect(); - Ok(updated) + if updated.len() != object_store_ids.len() { + return Err(Error::NotFound { + descr: "parquet file(s) not found for upgrade".to_string(), + }); + } + + Ok(()) } /// The error code returned by SQLite for a unique constraint violation. @@ -1722,27 +1590,15 @@ fn is_unique_violation(e: &sqlx::Error) -> bool { #[cfg(test)] mod tests { use super::*; + use crate::interface::ParquetFileRepoExt; use crate::test_helpers::{ arbitrary_namespace, arbitrary_parquet_file_params, arbitrary_table, }; use assert_matches::assert_matches; use data_types::partition_template::TemplatePart; use generated_types::influxdata::iox::partition_template::v1 as proto; - use metric::{Attributes, DurationHistogram, Metric}; use std::sync::Arc; - fn assert_metric_hit(metrics: &Registry, name: &'static str) { - let histogram = metrics - .get_instrument::>("catalog_op_duration") - .expect("failed to read metric") - .get_observer(&Attributes::from(&[("op", name), ("result", "success")])) - .expect("failed to get observer") - .fetch(); - - let hit_count = histogram.sample_count(); - assert!(hit_count > 0, "metric did not record any calls"); - } - async fn setup_db() -> SqliteCatalog { let dsn = std::env::var("TEST_INFLUXDB_SQLITE_DSN").unwrap_or("sqlite::memory:".to_string()); @@ -1757,7 +1613,7 @@ mod tests { #[tokio::test] async fn test_catalog() { - interface::test_helpers::test_catalog(|| async { + crate::interface_tests::test_catalog(|| async { let sqlite = setup_db().await; let sqlite: Arc = Arc::new(sqlite); sqlite @@ -1765,57 +1621,12 @@ mod tests { .await; } - #[tokio::test] - async fn test_partition_create_or_get_idempotent() { - let sqlite = setup_db().await; - let sqlite: Arc = Arc::new(sqlite); - let mut repos = sqlite.repositories().await; - - let namespace = arbitrary_namespace(&mut *repos, "ns4").await; - let table_id = arbitrary_table(&mut *repos, "table", &namespace).await.id; - - let key = PartitionKey::from("bananas"); - - let hash_id = PartitionHashId::new(table_id, &key); - - let a = repos - .partitions() - .create_or_get(key.clone(), table_id) - .await - .expect("should create OK"); - - assert_eq!(a.hash_id().unwrap(), &hash_id); - - // Call create_or_get for the same (key, table_id) pair, to ensure the write is idempotent. - let b = repos - .partitions() - .create_or_get(key.clone(), table_id) - .await - .expect("idempotent write should succeed"); - - assert_eq!(a, b); - - // Check that the hash_id is saved in the database and is returned when queried. 
- let table_partitions = sqlite - .repositories() - .await - .partitions() - .list_by_table_id(table_id) - .await - .unwrap(); - assert_eq!(table_partitions.len(), 1); - assert_eq!(table_partitions[0].hash_id().unwrap(), &hash_id); - - // Test: sort_key_ids from partition_create_or_get_idempotent - assert!(table_partitions[0].sort_key_ids().is_empty()); - } - #[tokio::test] async fn existing_partitions_without_hash_id() { let sqlite: SqliteCatalog = setup_db().await; let pool = sqlite.pool.clone(); let sqlite: Arc = Arc::new(sqlite); - let mut repos = sqlite.repositories().await; + let mut repos = sqlite.repositories(); let namespace = arbitrary_namespace(&mut *repos, "ns4").await; let table = arbitrary_table(&mut *repos, "table", &namespace).await; @@ -1827,17 +1638,16 @@ mod tests { sqlx::query( r#" INSERT INTO partition - (partition_key, shard_id, table_id, sort_key, sort_key_ids) + (partition_key, table_id, sort_key_ids) VALUES - ($1, $2, $3, '[]', '[]') + ($1, $2, '[]') ON CONFLICT (table_id, partition_key) DO UPDATE SET partition_key = partition.partition_key -RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at; +RETURNING id, hash_id, table_id, partition_key, sort_key_ids, new_file_at; "#, ) .bind(&key) // $1 - .bind(TRANSITION_SHARD_ID) // $2 - .bind(table_id) // $3 + .bind(table_id) // $2 .fetch_one(&pool) .await .unwrap(); @@ -1856,7 +1666,7 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file .expect("idempotent write should succeed"); // Test: sort_key_ids from freshly insert with empty value - assert!(inserted_again.sort_key_ids().is_empty()); + assert!(inserted_again.sort_key_ids().is_none()); assert_eq!(partition, &inserted_again); @@ -1868,10 +1678,7 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file .create(parquet_file_params) .await .unwrap(); - assert_matches!( - parquet_file.partition_id, - TransitionPartitionId::Deprecated(_) - ); + assert_eq!(parquet_file.partition_hash_id, None); // Add a partition record WITH a hash ID repos @@ -1886,168 +1693,12 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file assert_eq!(old_style_partitions[0].id, partition.id); } - macro_rules! test_column_create_or_get_many_unchecked { - ( - $name:ident, - calls = {$([$($col_name:literal => $col_type:expr),+ $(,)?]),+}, - want = $($want:tt)+ - ) => { - paste::paste! { - #[tokio::test] - async fn []() { - let sqlite = setup_db().await; - let metrics = Arc::clone(&sqlite.metrics); - let sqlite: Arc = Arc::new(sqlite); - let mut repos = sqlite.repositories().await; - - let namespace = arbitrary_namespace(&mut *repos, "ns4") - .await; - let table_id = arbitrary_table(&mut *repos, "table", &namespace) - .await - .id; - - $( - let mut insert = HashMap::new(); - $( - insert.insert($col_name, $col_type); - )+ - - let got = repos - .columns() - .create_or_get_many_unchecked(table_id, insert.clone()) - .await; - - // The returned columns MUST always match the requested - // column values if successful. 
- if let Ok(got) = &got { - assert_eq!(insert.len(), got.len()); - - for got in got { - assert_eq!(table_id, got.table_id); - let requested_column_type = insert - .get(got.name.as_str()) - .expect("Should have gotten back a column that was inserted"); - assert_eq!( - *requested_column_type, - ColumnType::try_from(got.column_type) - .expect("invalid column type") - ); - } - - assert_metric_hit(&metrics, "column_create_or_get_many_unchecked"); - } - )+ - - assert_matches!(got, $($want)+); - } - } - } - } - - // Issue a few calls to create_or_get_many that contain distinct columns and - // covers the full set of column types. - test_column_create_or_get_many_unchecked!( - insert, - calls = { - [ - "test1" => ColumnType::I64, - "test2" => ColumnType::U64, - "test3" => ColumnType::F64, - "test4" => ColumnType::Bool, - "test5" => ColumnType::String, - "test6" => ColumnType::Time, - "test7" => ColumnType::Tag, - ], - [ - "test8" => ColumnType::String, - "test9" => ColumnType::Bool, - ] - }, - want = Ok(_) - ); - - // Issue two calls with overlapping columns - request should succeed (upsert - // semantics). - test_column_create_or_get_many_unchecked!( - partial_upsert, - calls = { - [ - "test1" => ColumnType::I64, - "test2" => ColumnType::U64, - "test3" => ColumnType::F64, - "test4" => ColumnType::Bool, - ], - [ - "test1" => ColumnType::I64, - "test2" => ColumnType::U64, - "test3" => ColumnType::F64, - "test4" => ColumnType::Bool, - "test5" => ColumnType::String, - "test6" => ColumnType::Time, - "test7" => ColumnType::Tag, - "test8" => ColumnType::String, - ] - }, - want = Ok(_) - ); - - // Issue two calls with the same columns and types. - test_column_create_or_get_many_unchecked!( - full_upsert, - calls = { - [ - "test1" => ColumnType::I64, - "test2" => ColumnType::U64, - "test3" => ColumnType::F64, - "test4" => ColumnType::Bool, - ], - [ - "test1" => ColumnType::I64, - "test2" => ColumnType::U64, - "test3" => ColumnType::F64, - "test4" => ColumnType::Bool, - ] - }, - want = Ok(_) - ); - - // Issue two calls with overlapping columns with conflicting types and - // observe a correctly populated ColumnTypeMismatch error. - test_column_create_or_get_many_unchecked!( - partial_type_conflict, - calls = { - [ - "test1" => ColumnType::String, - "test2" => ColumnType::String, - "test3" => ColumnType::String, - "test4" => ColumnType::String, - ], - [ - "test1" => ColumnType::String, - "test2" => ColumnType::Bool, // This one differs - "test3" => ColumnType::String, - // 4 is missing. 
- "test5" => ColumnType::String, - "test6" => ColumnType::Time, - "test7" => ColumnType::Tag, - "test8" => ColumnType::String, - ] - }, - want = Err(e) => { - assert_matches!(e, Error::ColumnTypeMismatch { name, existing, new } => { - assert_eq!(name, "test2"); - assert_eq!(existing, ColumnType::String); - assert_eq!(new, ColumnType::Bool); - }) - } - ); - #[tokio::test] async fn test_billing_summary_on_parqet_file_creation() { let sqlite = setup_db().await; let pool = sqlite.pool.clone(); let sqlite: Arc = Arc::new(sqlite); - let mut repos = sqlite.repositories().await; + let mut repos = sqlite.repositories(); let namespace = arbitrary_namespace(&mut *repos, "ns4").await; let table = arbitrary_table(&mut *repos, "table", &namespace).await; let key = "bananas"; @@ -2067,7 +1718,7 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file .expect("create parquet file should succeed"); // insert the same again with a different size; we should then have 3x1337 as total file // size - p1.object_store_id = Uuid::new_v4(); + p1.object_store_id = ObjectStoreId::new(); p1.file_size_bytes *= 2; let _f2 = repos .parquet_files() @@ -2086,7 +1737,13 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file // flag f1 for deletion and assert that the total file size is reduced accordingly. repos .parquet_files() - .create_upgrade_delete(&[f1.id], &[], &[], CompactionLevel::Initial) + .create_upgrade_delete( + partition.id, + &[f1.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) .await .expect("flag parquet file for deletion should succeed"); let total_file_size_bytes: i64 = @@ -2117,7 +1774,7 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file let sqlite = setup_db().await; let pool = sqlite.pool.clone(); let sqlite: Arc = Arc::new(sqlite); - let mut repos = sqlite.repositories().await; + let mut repos = sqlite.repositories(); let namespace_name = "apples"; @@ -2126,17 +1783,15 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file let insert_null_partition_template_namespace = sqlx::query( r#" INSERT INTO namespace ( - name, topic_id, query_pool_id, retention_period_ns, partition_template + name, retention_period_ns, partition_template ) -VALUES ( $1, $2, $3, $4, NULL ) +VALUES ( $1, $2, NULL ) RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, deleted_at, partition_template; "#, ) .bind(namespace_name) // $1 - .bind(SHARED_TOPIC_ID) // $2 - .bind(SHARED_QUERY_POOL_ID) // $3 - .bind(None::>); // $4 + .bind(None::>); // $2 insert_null_partition_template_namespace .fetch_one(&pool) @@ -2233,7 +1888,7 @@ RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, dele let sqlite = setup_db().await; let pool = sqlite.pool.clone(); let sqlite: Arc = Arc::new(sqlite); - let mut repos = sqlite.repositories().await; + let mut repos = sqlite.repositories(); let namespace_default_template_name = "oranges"; let namespace_default_template = repos diff --git a/iox_catalog/src/test_helpers.rs b/iox_catalog/src/test_helpers.rs new file mode 100644 index 00000000000..0861d79fb8a --- /dev/null +++ b/iox_catalog/src/test_helpers.rs @@ -0,0 +1,92 @@ +//! 
Catalog helper functions for creation of catalog objects +use data_types::{ + partition_template::TablePartitionTemplateOverride, ColumnId, ColumnSet, CompactionLevel, + Namespace, NamespaceName, ObjectStoreId, ParquetFileParams, Partition, Table, TableSchema, + Timestamp, +}; + +use crate::interface::RepoCollection; + +/// When the details of the namespace don't matter, the test just needs *a* catalog namespace +/// with a particular name. +/// +/// Use [`NamespaceRepo::create`] directly if: +/// +/// - The values of the parameters to `create` need to be different than what's here +/// - The values of the parameters to `create` are relevant to the behavior under test +/// - You expect namespace creation to fail in the test +/// +/// [`NamespaceRepo::create`]: crate::interface::NamespaceRepo::create +pub async fn arbitrary_namespace<R: RepoCollection + ?Sized>( + repos: &mut R, + name: &str, +) -> Namespace { + let namespace_name = NamespaceName::new(name).unwrap(); + repos + .namespaces() + .create(&namespace_name, None, None, None) + .await + .unwrap() +} + +/// When the details of the table don't matter, the test just needs *a* catalog table +/// with a particular name in a particular namespace. +/// +/// Use [`TableRepo::create`] directly if: +/// +/// - The values of the parameters to `create` need to be different than what's here +/// - The values of the parameters to `create` are relevant to the behavior under test +/// - You expect table creation to fail in the test +/// +/// [`TableRepo::create`]: crate::interface::TableRepo::create +pub async fn arbitrary_table<R: RepoCollection + ?Sized>( + repos: &mut R, + name: &str, + namespace: &Namespace, +) -> Table { + repos + .tables() + .create( + name, + TablePartitionTemplateOverride::try_new(None, &namespace.partition_template).unwrap(), + namespace.id, + ) + .await + .unwrap() +} + +/// Load or create an arbitrary table schema in the same way that a write implicitly creates a +/// table, that is, with a time column. +pub async fn arbitrary_table_schema_load_or_create<R: RepoCollection + ?Sized>( + repos: &mut R, + name: &str, + namespace: &Namespace, +) -> TableSchema { + crate::util::table_load_or_create(repos, namespace.id, &namespace.partition_template, name) + .await + .unwrap() +} + +/// When the details of a Parquet file record don't matter, the test just needs *a* Parquet +/// file record in a particular namespace+table+partition. +pub fn arbitrary_parquet_file_params( + namespace: &Namespace, + table: &Table, + partition: &Partition, +) -> ParquetFileParams { + ParquetFileParams { + namespace_id: namespace.id, + table_id: table.id, + partition_id: partition.id, + partition_hash_id: partition.hash_id().cloned(), + object_store_id: ObjectStoreId::new(), + min_time: Timestamp::new(1), + max_time: Timestamp::new(10), + file_size_bytes: 1337, + row_count: 0, + compaction_level: CompactionLevel::Initial, + created_at: Timestamp::new(1), + column_set: ColumnSet::new([ColumnId::new(1), ColumnId::new(2)]), + max_l0_created_at: Timestamp::new(1), + } +} diff --git a/iox_catalog/src/util.rs b/iox_catalog/src/util.rs new file mode 100644 index 00000000000..d6d184fbf09 --- /dev/null +++ b/iox_catalog/src/util.rs @@ -0,0 +1,897 @@ +//! Helper methods to simplify catalog work. +//! +//! They all use the public [`Catalog`] interface and have no special access to internals, so in theory they can be +//! implemented downstream as well.
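A quick sketch of how the new `test_helpers` functions above are meant to compose inside a backend test. The test name and body here are assumptions for illustration only; the individual calls mirror ones that appear elsewhere in this patch (in-memory catalog construction, `create_or_get` for partitions, `ParquetFileRepoExt::create`):

use std::sync::Arc;

use data_types::PartitionKey;

use crate::{
    interface::{Catalog, ParquetFileRepoExt},
    mem::MemCatalog,
    test_helpers::{arbitrary_namespace, arbitrary_parquet_file_params, arbitrary_table},
};

#[tokio::test]
async fn example_of_the_helpers() {
    // An in-memory catalog, constructed the same way as in the tests further down.
    let catalog = MemCatalog::new(
        Default::default(),
        Arc::new(iox_time::SystemProvider::new()),
    );
    let mut repos = catalog.repositories();

    // *A* namespace, *a* table and *a* partition whose details don't matter.
    let namespace = arbitrary_namespace(&mut *repos, "ns").await;
    let table = arbitrary_table(&mut *repos, "table", &namespace).await;
    let partition = repos
        .partitions()
        .create_or_get(PartitionKey::from("key"), table.id)
        .await
        .unwrap();

    // *A* Parquet file record in that namespace+table+partition.
    let params = arbitrary_parquet_file_params(&namespace, &table, &partition);
    let _file = repos.parquet_files().create(params).await.unwrap();
}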
+ +use std::{ + borrow::Cow, + collections::{BTreeMap, HashMap, HashSet}, + sync::Arc, +}; + +use data_types::{ + partition_template::{NamespacePartitionTemplateOverride, TablePartitionTemplateOverride}, + ColumnType, ColumnsByName, Namespace, NamespaceId, NamespaceSchema, PartitionId, SortKeyIds, + TableId, TableSchema, +}; +use mutable_batch::MutableBatch; +use thiserror::Error; + +use crate::{ + constants::TIME_COLUMN, + interface::{CasFailure, Catalog, Error, RepoCollection, SoftDeletedRows}, +}; + +/// Gets the namespace schema including all tables and columns. +pub async fn get_schema_by_id( + id: NamespaceId, + repos: &mut R, + deleted: SoftDeletedRows, +) -> Result, crate::interface::Error> +where + R: RepoCollection + ?Sized, +{ + let Some(namespace) = repos.namespaces().get_by_id(id, deleted).await? else { + return Ok(None); + }; + + Ok(Some(get_schema_internal(namespace, repos).await?)) +} + +/// Gets the namespace schema including all tables and columns. +pub async fn get_schema_by_name( + name: &str, + repos: &mut R, + deleted: SoftDeletedRows, +) -> Result, crate::interface::Error> +where + R: RepoCollection + ?Sized, +{ + let Some(namespace) = repos.namespaces().get_by_name(name, deleted).await? else { + return Ok(None); + }; + + Ok(Some(get_schema_internal(namespace, repos).await?)) +} + +async fn get_schema_internal( + namespace: Namespace, + repos: &mut R, +) -> Result +where + R: RepoCollection + ?Sized, +{ + // get the columns first just in case someone else is creating schema while we're doing this. + let columns = repos.columns().list_by_namespace_id(namespace.id).await?; + let tables = repos.tables().list_by_namespace_id(namespace.id).await?; + + let mut namespace = NamespaceSchema::new_empty_from(&namespace); + + let mut table_id_to_schema = BTreeMap::new(); + for t in tables { + let table_schema = TableSchema::new_empty_from(&t); + table_id_to_schema.insert(t.id, (t.name, table_schema)); + } + + for c in columns { + let (_, t) = table_id_to_schema.get_mut(&c.table_id).unwrap(); + t.add_column(c); + } + + for (_, (table_name, schema)) in table_id_to_schema { + namespace.tables.insert(table_name, schema); + } + + Ok(namespace) +} + +/// Gets the schema for one particular table in a namespace. +pub async fn get_schema_by_namespace_and_table( + name: &str, + table_name: &str, + repos: &mut R, + deleted: SoftDeletedRows, +) -> Result, crate::interface::Error> +where + R: RepoCollection + ?Sized, +{ + let Some(namespace) = repos.namespaces().get_by_name(name, deleted).await? else { + return Ok(None); + }; + + let Some(table) = repos + .tables() + .get_by_namespace_and_name(namespace.id, table_name) + .await? + else { + return Ok(None); + }; + + let mut table_schema = TableSchema::new_empty_from(&table); + + let columns = repos.columns().list_by_table_id(table.id).await?; + for c in columns { + table_schema.add_column(c); + } + + let mut namespace = NamespaceSchema::new_empty_from(&namespace); + namespace + .tables + .insert(table_name.to_string(), table_schema); + + Ok(Some(namespace)) +} + +/// Gets all the table's columns. +pub async fn get_table_columns_by_id( + id: TableId, + repos: &mut R, +) -> Result +where + R: RepoCollection + ?Sized, +{ + let columns = repos.columns().list_by_table_id(id).await?; + + Ok(ColumnsByName::new(columns)) +} + +/// Fetch all [`NamespaceSchema`] in the catalog. +/// +/// This method performs the minimal number of queries needed to build the +/// result set. 
No table lock is obtained, nor are queries executed within a +/// transaction, but this method does return a point-in-time snapshot of the +/// catalog state. +/// +/// # Soft Deletion +/// +/// No schemas for soft-deleted namespaces are returned. +pub async fn list_schemas( + catalog: &dyn Catalog, +) -> Result, crate::interface::Error> { + let mut repos = catalog.repositories(); + + // In order to obtain a point-in-time snapshot, first fetch the columns, + // then the tables, and then resolve the namespace IDs to Namespace in order + // to construct the schemas. + // + // The set of columns returned forms the state snapshot, with the subsequent + // queries resolving only what is needed to construct schemas for the + // retrieved columns (ignoring any newly added tables/namespaces since the + // column snapshot was taken). + // + // This approach also tolerates concurrently deleted namespaces, which are + // simply ignored at the end when joining to the namespace query result. + + // First fetch all the columns - this is the state snapshot of the catalog + // schemas. + let columns = repos.columns().list().await?; + + // Construct the set of table IDs these columns belong to. + let retain_table_ids = columns.iter().map(|c| c.table_id).collect::>(); + + // Fetch all tables, and filter for those that are needed to construct + // schemas for "columns" only. + // + // Discard any tables that have no columns or have been created since + // the "columns" snapshot was retrieved, and construct a map of ID->Table. + let tables = repos + .tables() + .list() + .await? + .into_iter() + .filter_map(|t| { + if !retain_table_ids.contains(&t.id) { + return None; + } + + Some((t.id, t)) + }) + .collect::>(); + + // Drop the table ID set as it will not be referenced again. + drop(retain_table_ids); + + // Do all the I/O to fetch the namespaces in the background, while this + // thread constructs the NamespaceId->TableSchema map below. + let namespaces = tokio::spawn(async move { + repos + .namespaces() + .list(SoftDeletedRows::ExcludeDeleted) + .await + }); + + // A set of tables within a single namespace. + type NamespaceTables = BTreeMap; + + let mut joined = HashMap::::default(); + for column in columns { + // Resolve the table this column references + let table = tables.get(&column.table_id).expect("no table for column"); + + let table_schema = joined + // Find or create a record in the joined map + // for this namespace ID. + .entry(table.namespace_id) + .or_default() + // Fetch the schema record for this table, or create an empty one. + .entry(table.name.clone()) + .or_insert_with(|| TableSchema::new_empty_from(table)); + + table_schema.add_column(column); + } + + // The table map is no longer needed - immediately reclaim the memory. + drop(tables); + + // Convert the Namespace instances into NamespaceSchema instances. + let iter = namespaces + .await + .expect("namespace list task panicked")? + .into_iter() + // Ignore any namespaces that did not exist when the "columns" snapshot + // was created, or have no tables/columns (and therefore have no entry + // in "joined"). + .filter_map(move |v| { + // The catalog call explicitly asked for no soft deleted records. + assert!(v.deleted_at.is_none()); + + let mut ns = NamespaceSchema::new_empty_from(&v); + + ns.tables = joined.remove(&v.id)?; + Some((v, ns)) + }); + + Ok(iter) +} + +/// In a backoff loop, retry calling the compare-and-swap sort key catalog function if the catalog +/// returns a query error unrelated to the CAS operation. 
+/// +/// Returns with a value of `Ok` containing the new sort key if: +/// +/// - No concurrent updates were detected +/// - A concurrent update was detected, but the other update resulted in the same value this update +/// was attempting to set +/// +/// Returns with a value of `Err(newly_observed_value)` if a concurrent, conflicting update was +/// detected. It is expected that callers of this function will take the returned value into +/// account (in whatever manner is appropriate) before calling this function again. +/// +/// NOTE: it is expected that ONLY processes that ingest data (currently only the ingesters or the +/// bulk ingest API) update sort keys for existing partitions. Consider how calling this function +/// from new processes will interact with the existing calls. +pub async fn retry_cas_sort_key( + old_sort_key_ids: Option<&SortKeyIds>, + new_sort_key_ids: &SortKeyIds, + partition_id: PartitionId, + catalog: Arc, +) -> Result { + use backoff::Backoff; + use observability_deps::tracing::{info, warn}; + use std::ops::ControlFlow; + + Backoff::new(&Default::default()) + .retry_with_backoff("cas_sort_key", || { + let new_sort_key_ids = new_sort_key_ids.clone(); + let catalog = Arc::clone(&catalog); + async move { + let mut repos = catalog.repositories(); + match repos + .partitions() + .cas_sort_key(partition_id, old_sort_key_ids, &new_sort_key_ids) + .await + { + Ok(_) => ControlFlow::Break(Ok(new_sort_key_ids)), + Err(CasFailure::QueryError(e)) => ControlFlow::Continue(e), + Err(CasFailure::ValueMismatch(observed_sort_key_ids)) + if observed_sort_key_ids == new_sort_key_ids => + { + // A CAS failure occurred because of a concurrent + // sort key update, however the new catalog sort key + // exactly matches the sort key this node wants to + // commit. + // + // This is the sad-happy path, and this task can + // continue. + info!( + %partition_id, + ?old_sort_key_ids, + ?observed_sort_key_ids, + update_sort_key_ids=?new_sort_key_ids, + "detected matching concurrent sort key update" + ); + ControlFlow::Break(Ok(new_sort_key_ids)) + } + Err(CasFailure::ValueMismatch(observed_sort_key_ids)) => { + // Another ingester concurrently updated the sort + // key. + // + // This breaks a sort-key update invariant - sort + // key updates MUST be serialised. This operation must + // be retried. + // + // See: + // https://github.com/influxdata/influxdb_iox/issues/6439 + // + warn!( + %partition_id, + ?old_sort_key_ids, + ?observed_sort_key_ids, + update_sort_key_ids=?new_sort_key_ids, + "detected concurrent sort key update" + ); + // Stop the retry loop with an error containing the + // newly observed sort key. + ControlFlow::Break(Err(observed_sort_key_ids)) + } + } + } + }) + .await + .expect("retry forever") +} + +/// An [`crate::interface::Error`] scoped to a single table for schema validation errors. +#[derive(Debug, Error)] +#[error("table {}, {}", .0, .1)] +pub struct TableScopedError(String, Error); + +impl TableScopedError { + /// Return the table name for this error. + pub fn table(&self) -> &str { + &self.0 + } + + /// Return a reference to the error. + pub fn err(&self) -> &Error { + &self.1 + } + + /// Return ownership of the error, discarding the table name. + pub fn into_err(self) -> Error { + self.1 + } +} + +/// Given an iterator of `(table_name, batch)` to validate, this function +/// ensures all the columns within `batch` match the existing schema for +/// `table_name` in `schema`. 
If the column does not already exist in `schema`, +/// it is created and an updated [`NamespaceSchema`] is returned. +/// +/// This function pushes schema additions through to the backend catalog, and +/// relies on the catalog to serialize concurrent additions of a given column, +/// ensuring only one type is ever accepted per column. +pub async fn validate_or_insert_schema<'a, T, U, R>( + tables: T, + schema: &NamespaceSchema, + repos: &mut R, +) -> Result, TableScopedError> +where + T: IntoIterator + Send + Sync, + U: Iterator + Send, + R: RepoCollection + ?Sized, +{ + let tables = tables.into_iter(); + + // The (potentially updated) NamespaceSchema to return to the caller. + let mut schema = Cow::Borrowed(schema); + + for (table_name, batch) in tables { + validate_mutable_batch(batch, table_name, &mut schema, repos).await?; + } + + match schema { + Cow::Owned(v) => Ok(Some(v)), + Cow::Borrowed(_) => Ok(None), + } +} + +// &mut Cow is used to avoid a copy, so allow it +#[allow(clippy::ptr_arg)] +async fn validate_mutable_batch( + mb: &MutableBatch, + table_name: &str, + schema: &mut Cow<'_, NamespaceSchema>, + repos: &mut R, +) -> Result<(), TableScopedError> +where + R: RepoCollection + ?Sized, +{ + // Check if the table exists in the schema. + // + // Because the entry API requires &mut it is not used to avoid a premature + // clone of the Cow. + let mut table = match schema.tables.get(table_name) { + Some(t) => Cow::Borrowed(t), + None => { + // The table does not exist in the cached schema. + // + // Attempt to load an existing table from the catalog or create a new table in the + // catalog to populate the cache. + let table = + table_load_or_create(repos, schema.id, &schema.partition_template, table_name) + .await + .map_err(|e| TableScopedError(table_name.to_string(), e))?; + + assert!(schema + .to_mut() + .tables + .insert(table_name.to_string(), table) + .is_none()); + + Cow::Borrowed(schema.tables.get(table_name).unwrap()) + } + }; + + // The table is now in the schema (either by virtue of it already existing, + // or through adding it above). + // + // If the table itself needs to be updated during column validation it + // becomes a Cow::owned() copy and the modified copy should be inserted into + // the schema before returning. + validate_and_insert_columns( + mb.columns() + .map(|(name, col)| (name, col.influx_type().into())), + table_name, + &mut table, + repos, + ) + .await?; + + if let Cow::Owned(table) = table { + // The table schema was mutated and needs inserting into the namespace + // schema to make the changes visible to the caller. + assert!(schema + .to_mut() + .tables + .insert(table_name.to_string(), table) + .is_some()); + } + + Ok(()) +} + +/// Given an iterator of `(column_name, column_type)` to validate, this function ensures all the +/// columns match the existing `TableSchema` in `table`. If the column does not already exist in +/// `table`, it is created and the `table` is changed to the `Cow::Owned` variant. +/// +/// This function pushes schema additions through to the backend catalog, and relies on the catalog +/// to serialize concurrent additions of a given column, ensuring only one type is ever accepted +/// per column. 
+// &mut Cow is used to avoid a copy, so allow it +#[allow(clippy::ptr_arg)] +pub async fn validate_and_insert_columns( + columns: impl Iterator + Send, + table_name: &str, + table: &mut Cow<'_, TableSchema>, + repos: &mut R, +) -> Result<(), TableScopedError> +where + R: RepoCollection + ?Sized, +{ + let mut column_batch: HashMap<&str, ColumnType> = HashMap::new(); + + for (name, column_type) in columns { + // Check if the column exists in the cached schema. + // + // If it does, validate it. If it does not exist, create it and insert + // it into the cached schema. + + match table.columns.get(name.as_str()) { + Some(existing) if existing.column_type == column_type => { + // No action is needed as the column matches the existing column + // schema. + } + Some(existing) => { + // The column schema and the column in the schema change are of + // different types. + return Err(TableScopedError( + table_name.to_string(), + Error::AlreadyExists { + descr: format!( + "column {} is type {} but schema update has type {}", + name, existing.column_type, column_type + ), + }, + )); + } + None => { + // The column does not exist in the cache, add it to the column + // batch to be bulk inserted later. + let old = column_batch.insert(name.as_str(), column_type); + assert!( + old.is_none(), + "duplicate column name `{name}` in new column schema shouldn't be possible" + ); + } + } + } + + if !column_batch.is_empty() { + repos + .columns() + .create_or_get_many_unchecked(table.id, column_batch) + .await + .map_err(|e| TableScopedError(table_name.to_string(), e))? + .into_iter() + .for_each(|c| table.to_mut().add_column(c)); + } + + Ok(()) +} + +/// Load or create table. +pub async fn table_load_or_create( + repos: &mut R, + namespace_id: NamespaceId, + namespace_partition_template: &NamespacePartitionTemplateOverride, + table_name: &str, +) -> Result +where + R: RepoCollection + ?Sized, +{ + let table = match repos + .tables() + .get_by_namespace_and_name(namespace_id, table_name) + .await? + { + Some(table) => table, + None => { + // There is a possibility of a race condition here, if another request has also + // created this table after the `get_by_namespace_and_name` call but before + // this `create` call. In that (hopefully) rare case, do an additional fetch + // from the catalog for the record that should now exist. + let create_result = repos + .tables() + .create( + table_name, + // This table is being created implicitly by this write, so there's no + // possibility of a user-supplied partition template here, which is why there's + // a hardcoded `None`. If there is a namespace template, it must be valid because + // validity was checked during its creation, so that's why there's an `expect`. + TablePartitionTemplateOverride::try_new(None, namespace_partition_template) + .expect("no table partition template; namespace partition template has been validated"), + namespace_id, + ) + .await; + if let Err(Error::AlreadyExists { .. }) = create_result { + repos + .tables() + .get_by_namespace_and_name(namespace_id, table_name) + // Propagate any `Err` returned by the catalog + .await? + // Getting `Ok(None)` should be impossible if we're in this code path because + // the `create` request just said the table exists + .expect( + "Table creation failed because the table exists, so looking up the table \ + should return `Some(table)`, but it returned `None`", + ) + } else { + create_result? 
+ } + } + }; + + let mut table = TableSchema::new_empty_from(&table); + + // Always add a time column to all new tables. + let time_col = repos + .columns() + .create_or_get(TIME_COLUMN, table.id, ColumnType::Time) + .await?; + + table.add_column(time_col); + + Ok(table) +} + +#[cfg(test)] +mod tests { + use std::{collections::BTreeMap, sync::Arc}; + + use super::*; + use crate::{interface::SoftDeletedRows, mem::MemCatalog, util::get_schema_by_name}; + + // Generate a test that simulates multiple, sequential writes in `lp` and + // asserts the resulting schema. + // + // This test asserts the cached schema and the database entry are always in + // sync. + macro_rules! test_validate_schema { + ( + $name:ident, + lp = [$($lp:literal,)+], // An array of multi-line LP writes + want_observe_conflict = $want_observe_conflict:literal, // true if a schema validation error should be observed at some point + want_schema = {$($want_schema:tt) +} // The expected resulting schema after all writes complete. + ) => { + paste::paste! { + #[allow(clippy::bool_assert_comparison)] + #[tokio::test] + async fn []() { + use crate::{interface::Catalog, test_helpers::arbitrary_namespace}; + use std::ops::DerefMut; + use pretty_assertions::assert_eq; + const NAMESPACE_NAME: &str = "bananas"; + + let metrics = Arc::new(metric::Registry::default()); + let time_provider = Arc::new(iox_time::SystemProvider::new()); + let repo = MemCatalog::new(metrics, time_provider); + let mut txn = repo.repositories(); + + let namespace = arbitrary_namespace(&mut *txn, NAMESPACE_NAME) + .await; + let schema = NamespaceSchema::new_empty_from(&namespace); + + // Apply all the lp literals as individual writes, feeding + // the result of one validation into the next to drive + // incremental construction of the schemas. + let mut observed_conflict = false; + $( + let schema = { + let lp: String = $lp.to_string(); + + let writes = mutable_batch_lp::lines_to_batches(lp.as_str(), 42) + .expect("failed to build test writes from LP"); + + let got = validate_or_insert_schema(writes.iter().map(|(k, v)| (k.as_str(), v)), &schema, txn.deref_mut()) + .await; + + match got { + Err(TableScopedError(_, Error::AlreadyExists{ .. })) => { + observed_conflict = true; + schema + }, + Err(e) => panic!("unexpected error: {}", e), + Ok(Some(new_schema)) => new_schema, + Ok(None) => schema, + } + }; + )+ + + assert_eq!($want_observe_conflict, observed_conflict, "should error mismatch"); + + // Invariant: in absence of concurrency, the schema within + // the database must always match the incrementally built + // cached schema. + let db_schema = get_schema_by_name(NAMESPACE_NAME, txn.deref_mut(), SoftDeletedRows::ExcludeDeleted) + .await + .expect("database failed to query for namespace schema") + .expect("namespace exists"); + assert_eq!(schema, db_schema, "schema in DB and cached schema differ"); + + // Generate the map of tables => desired column types + let want_tables: BTreeMap, ColumnType>> = test_validate_schema!(@table, $($want_schema)+); + + // Generate a similarly structured map from the actual + // schema + let actual_tables: BTreeMap, ColumnType>> = schema + .tables + .iter() + .map(|(table, table_schema)| { + let desired_cols = table_schema + .columns + .iter() + .map(|(column, column_schema)| (Arc::clone(&column), column_schema.column_type)) + .collect::>(); + + (table.clone(), desired_cols) + }) + .collect(); + + // Assert the actual namespace contents matches the desired + // table schemas in the test args. 
+ assert_eq!(want_tables, actual_tables, "cached schema and desired schema differ"); + } + } + }; + // Generate a map of table names => column map (below) + // + // out: BTreeMap> + (@table, $($table_name:literal: [$($columns:tt) +],)*) => {{ + let mut tables = BTreeMap::new(); + $( + let want_cols = test_validate_schema!(@column, $($columns)+); + assert!(tables.insert($table_name.to_string(), want_cols).is_none()); + )* + tables + }}; + // Generate a map of column names => ColumnType + // + // out: BTreeMap + (@column, $($col_name:literal => $col_type:expr,)+) => {{ + let mut cols = BTreeMap::new(); + $( + assert!(cols.insert(Arc::from($col_name), $col_type).is_none()); + )* + cols + }}; + } + + test_validate_schema!( + one_write_multiple_tables, + lp = [ + " + m1,t1=a,t2=b f1=2i,f2=2.0 1\n\ + m1,t1=a f1=3i 2\n\ + m2,t3=b f1=true 1\n\ + ", + ], + want_observe_conflict = false, + want_schema = { + "m1": [ + "t1" => ColumnType::Tag, + "t2" => ColumnType::Tag, + "f1" => ColumnType::I64, + "f2" => ColumnType::F64, + "time" => ColumnType::Time, + ], + "m2": [ + "f1" => ColumnType::Bool, + "t3" => ColumnType::Tag, + "time" => ColumnType::Time, + ], + } + ); + + // test that a new table will be created + test_validate_schema!( + two_writes_incremental_new_table, + lp = [ + " + m1,t1=a,t2=b f1=2i,f2=2.0 1\n\ + m1,t1=a f1=3i 2\n\ + m2,t3=b f1=true 1\n\ + ", + " + m1,t1=c f1=1i 2\n\ + new_measurement,t9=a f10=true 1\n\ + ", + ], + want_observe_conflict = false, + want_schema = { + "m1": [ + "t1" => ColumnType::Tag, + "t2" => ColumnType::Tag, + "f1" => ColumnType::I64, + "f2" => ColumnType::F64, + "time" => ColumnType::Time, + ], + "m2": [ + "f1" => ColumnType::Bool, + "t3" => ColumnType::Tag, + "time" => ColumnType::Time, + ], + "new_measurement": [ + "t9" => ColumnType::Tag, + "f10" => ColumnType::Bool, + "time" => ColumnType::Time, + ], + } + ); + + // test that a new column for an existing table will be created + test_validate_schema!( + two_writes_incremental_new_column, + lp = [ + " + m1,t1=a,t2=b f1=2i,f2=2.0 1\n\ + m1,t1=a f1=3i 2\n\ + m2,t3=b f1=true 1\n\ + ", + "m1,new_tag=c new_field=1i 2", + ], + want_observe_conflict = false, + want_schema = { + "m1": [ + "t1" => ColumnType::Tag, + "t2" => ColumnType::Tag, + "f1" => ColumnType::I64, + "f2" => ColumnType::F64, + "time" => ColumnType::Time, + // These are the incremental additions: + "new_tag" => ColumnType::Tag, + "new_field" => ColumnType::I64, + ], + "m2": [ + "f1" => ColumnType::Bool, + "t3" => ColumnType::Tag, + "time" => ColumnType::Time, + ], + } + ); + + test_validate_schema!( + table_always_has_time_column, + lp = [ + "m1,t1=a f1=2i", + ], + want_observe_conflict = false, + want_schema = { + "m1": [ + "t1" => ColumnType::Tag, + "f1" => ColumnType::I64, + "time" => ColumnType::Time, + ], + } + ); + + test_validate_schema!( + two_writes_conflicting_column_types, + lp = [ + "m1,t1=a f1=2i", + // Second write has conflicting type for f1. 
+ "m1,t1=a f1=2.0", + ], + want_observe_conflict = true, + want_schema = { + "m1": [ + "t1" => ColumnType::Tag, + "f1" => ColumnType::I64, + "time" => ColumnType::Time, + ], + } + ); + + test_validate_schema!( + two_writes_tag_field_transposition, + lp = [ + // x is a tag + "m1,t1=a,x=t f1=2i", + // x is a field + "m1,t1=a x=t,f1=2i", + ], + want_observe_conflict = true, + want_schema = { + "m1": [ + "t1" => ColumnType::Tag, + "x" => ColumnType::Tag, + "f1" => ColumnType::I64, + "time" => ColumnType::Time, + ], + } + ); + + #[tokio::test] + async fn validate_table_create_race_doesnt_get_all_columns() { + use crate::{interface::Catalog, test_helpers::arbitrary_namespace}; + use std::{collections::BTreeSet, ops::DerefMut}; + const NAMESPACE_NAME: &str = "bananas"; + + let repo = MemCatalog::new( + Default::default(), + Arc::new(iox_time::SystemProvider::new()), + ); + let mut txn = repo.repositories(); + let namespace = arbitrary_namespace(&mut *txn, NAMESPACE_NAME).await; + + // One cached schema has no tables. + let empty_schema = NamespaceSchema::new_empty_from(&namespace); + + // Another cached schema gets a write that creates a table with some columns. + let schema_with_table = empty_schema.clone(); + let writes = mutable_batch_lp::lines_to_batches("m1,t1=a f1=2i", 42).unwrap(); + validate_or_insert_schema( + writes.iter().map(|(k, v)| (k.as_str(), v)), + &schema_with_table, + txn.deref_mut(), + ) + .await + .unwrap(); + + // then the empty schema adds the same table with some different columns + let other_writes = mutable_batch_lp::lines_to_batches("m1,t2=a f2=2i", 43).unwrap(); + let formerly_empty_schema = validate_or_insert_schema( + other_writes.iter().map(|(k, v)| (k.as_str(), v)), + &empty_schema, + txn.deref_mut(), + ) + .await + .unwrap() + .unwrap(); + + // the formerly-empty schema should NOT have all the columns; schema convergence is handled + // at a higher level by the namespace cache/gossip system + let table = formerly_empty_schema.tables.get("m1").unwrap(); + assert_eq!(table.columns.names(), BTreeSet::from(["t2", "f2", "time"])); + } +} diff --git a/iox_data_generator/Cargo.toml b/iox_data_generator/Cargo.toml index 4d96757474f..72898969d09 100644 --- a/iox_data_generator/Cargo.toml +++ b/iox_data_generator/Cargo.toml @@ -6,27 +6,30 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] bytes = "1.5" chrono = { version = "0.4", default-features = false } clap = { version = "4", features = ["derive", "env", "cargo"] } datafusion_util = { path = "../datafusion_util" } futures = "0.3" -handlebars = "4.4.0" +handlebars = "5.1.0" humantime = "2.1.0" influxdb2_client = { path = "../influxdb2_client" } -itertools = "0.11.0" +itertools = "0.12.0" mutable_batch_lp = { path = "../mutable_batch_lp" } mutable_batch = { path = "../mutable_batch" } parquet_file = { path = "../parquet_file" } rand = { version = "0.8.3", features = ["small_rng"] } -regex = "1.9" +regex = "1.10" schema = { path = "../schema" } serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.107" -snafu = "0.7" -tokio = { version = "1.32", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } -toml = "0.8.0" +serde_json = "1.0.111" +snafu = "0.8" +tokio = { version = "1.35", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } +toml = "0.8.8" tracing = "0.1" tracing-subscriber = "0.3" uuid = { version = "1", default_features = false } diff --git a/iox_data_generator/src/substitution.rs 
b/iox_data_generator/src/substitution.rs index 22ff6d5f6fa..b5e558a34fa 100644 --- a/iox_data_generator/src/substitution.rs +++ b/iox_data_generator/src/substitution.rs @@ -4,13 +4,13 @@ use crate::specification; use chrono::prelude::*; use handlebars::{ - Context, Handlebars, Helper, HelperDef, HelperResult, Output, RenderContext, RenderError, + Context, Handlebars, Helper, HelperDef, HelperResult, Output, RenderContext, RenderErrorReason, }; use rand::rngs::SmallRng; use rand::{distributions::Alphanumeric, seq::SliceRandom, Rng, RngCore}; use serde_json::Value; use snafu::{ResultExt, Snafu}; -use std::{collections::BTreeMap, convert::TryInto}; +use std::collections::BTreeMap; /// Substitution-specific Results pub type Result = std::result::Result; @@ -76,7 +76,7 @@ pub(crate) struct RandomHelper; impl HelperDef for RandomHelper { fn call<'reg: 'rc, 'rc>( &self, - h: &Helper<'_, '_>, + h: &Helper<'_>, _: &Handlebars<'_>, _: &Context, _: &mut RenderContext<'_, '_>, @@ -84,12 +84,20 @@ impl HelperDef for RandomHelper { ) -> HelperResult { let param = h .param(0) - .ok_or_else(|| RenderError::new("`random` requires a parameter"))? + .ok_or(RenderErrorReason::ParamNotFoundForIndex("random", 0))? .value() .as_u64() - .ok_or_else(|| RenderError::new("`random`'s parameter must be an unsigned integer"))? + .ok_or_else(|| { + RenderErrorReason::ParamTypeMismatchForName( + "random", + "0".to_string(), + "unsigned integer".to_string(), + ) + })? .try_into() - .map_err(|_| RenderError::new("`random`'s parameter must fit in a usize"))?; + .map_err(|_| { + RenderErrorReason::Other("`random`'s parameter must fit in a usize".to_string()) + })?; let mut rng = rand::thread_rng(); @@ -111,7 +119,7 @@ pub(crate) struct FormatNowHelper; impl HelperDef for FormatNowHelper { fn call<'reg: 'rc, 'rc>( &self, - h: &Helper<'_, '_>, + h: &Helper<'_>, _: &Handlebars<'_>, c: &Context, _: &mut RenderContext<'_, '_>, @@ -119,7 +127,7 @@ impl HelperDef for FormatNowHelper { ) -> HelperResult { let format = h .param(0) - .ok_or_else(|| RenderError::new("`format-time` requires a parameter"))? + .ok_or(RenderErrorReason::ParamNotFoundForIndex("format-time", 0))? .render(); let timestamp = c @@ -142,7 +150,7 @@ pub(crate) struct GuidHelper; impl HelperDef for GuidHelper { fn call<'reg: 'rc, 'rc>( &self, - _h: &Helper<'_, '_>, + _h: &Helper<'_>, _: &Handlebars<'_>, _: &Context, _: &mut RenderContext<'_, '_>, diff --git a/iox_data_generator/src/tag_pair.rs b/iox_data_generator/src/tag_pair.rs index 3adbff8d202..302fc4d8739 100644 --- a/iox_data_generator/src/tag_pair.rs +++ b/iox_data_generator/src/tag_pair.rs @@ -9,7 +9,7 @@ use std::fmt::Formatter; use std::sync::{Arc, Mutex}; /// Results specific to the tag_pair module -pub type Result = std::result::Result; +pub(crate) type Result = std::result::Result; /// Errors that may happen while creating or regenerating tag pairs #[derive(Snafu, Debug)] diff --git a/iox_query/Cargo.toml b/iox_query/Cargo.toml index 94b85ecd782..e4535319c03 100644 --- a/iox_query/Cargo.toml +++ b/iox_query/Cargo.toml @@ -6,6 +6,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + # This crate is designed to be independent of the rest of the IOx # server and specific storage systems such as Mutable Buffer and Read Buffer. # @@ -15,7 +18,7 @@ license.workspace = true # 2. Allow for query logic testing without bringing in all the storage systems. 
[dependencies] # In alphabetical order -arrow = { workspace = true, features = ["prettyprint"] } +arrow = { workspace = true } arrow_util = { path = "../arrow_util" } async-trait = "0.1" chrono = { version = "0.4", default-features = false } @@ -25,8 +28,9 @@ datafusion_util = { path = "../datafusion_util" } executor = { path = "../executor"} futures = "0.3" hashbrown = { workspace = true } -indexmap = { version = "2.0", features = ["std"] } -itertools = "0.11.0" +indexmap = { version = "2.1", features = ["std"] } +itertools = "0.12.0" +iox_time = { path = "../iox_time" } metric = { path = "../metric" } object_store = { workspace = true } observability_deps = { path = "../observability_deps" } @@ -35,11 +39,13 @@ parking_lot = "0.12" parquet_file = { path = "../parquet_file" } query_functions = { path = "../query_functions"} schema = { path = "../schema" } -snafu = "0.7" -tokio = { version = "1.32", features = ["macros", "parking_lot"] } +snafu = "0.8" +tokio = { version = "1.35", features = ["macros", "parking_lot"] } tokio-stream = "0.1" trace = { path = "../trace" } +tracker = { path = "../tracker" } predicate = { path = "../predicate" } +uuid = { version = "1", features = ["v4"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] # In alphabetical order diff --git a/iox_query/src/chunk_statistics.rs b/iox_query/src/chunk_statistics.rs index ca2ba742068..043034737bc 100644 --- a/iox_query/src/chunk_statistics.rs +++ b/iox_query/src/chunk_statistics.rs @@ -3,10 +3,12 @@ use std::{collections::HashMap, sync::Arc}; use data_types::TimestampMinMax; +use datafusion::common::stats::Precision; use datafusion::{ physical_plan::{ColumnStatistics, Statistics}, scalar::ScalarValue, }; +use datafusion_util::{option_to_precision, timestamptz_nano}; use schema::{InfluxColumnType, Schema}; /// Represent known min/max values for a specific column. @@ -23,12 +25,25 @@ pub struct ColumnRange { /// These ranges apply to ALL rows (esp. in ALL files and ingester chunks) within in given partition. pub type ColumnRanges = Arc, ColumnRange>>; +/// Returns the min/max values for the range, if present +fn range_to_min_max_stats( + range: Option<&ColumnRange>, +) -> (Precision, Precision) { + let Some(range) = range else { + return (Precision::Absent, Precision::Absent); + }; + ( + Precision::Exact(range.min_value.as_ref().clone()), + Precision::Exact(range.max_value.as_ref().clone()), + ) +} + /// Create chunk [statistics](Statistics). 
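For orientation on the `chunk_statistics.rs` hunks that follow: with this upgrade, DataFusion's `Statistics` no longer uses `Option` values plus a separate `is_exact` flag; every statistic is a `Precision`, and `column_statistics` is a plain `Vec`. A small standalone sketch of the new shape, with arbitrary values that are not taken from IOx:

use datafusion::common::stats::Precision;
use datafusion::physical_plan::{ColumnStatistics, Statistics};
use datafusion::scalar::ScalarValue;

/// Statistics for a single-column chunk with 42 rows whose min/max are known
/// exactly; anything unknown is expressed as `Precision::Absent` rather than `None`.
fn example_stats() -> Statistics {
    Statistics {
        num_rows: Precision::Exact(42),
        total_byte_size: Precision::Absent,
        column_statistics: vec![ColumnStatistics {
            null_count: Precision::Exact(0),
            min_value: Precision::Exact(ScalarValue::from(10i64)),
            max_value: Precision::Exact(ScalarValue::from(20i64)),
            distinct_count: Precision::Absent,
        }],
    }
}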
pub fn create_chunk_statistics( - row_count: u64, + row_count: Option, schema: &Schema, ts_min_max: Option, - ranges: &ColumnRanges, + ranges: Option<&ColumnRanges>, ) -> Statistics { let mut columns = Vec::with_capacity(schema.len()); @@ -38,43 +53,46 @@ pub fn create_chunk_statistics( // prefer explicitely given time range but fall back to column ranges let (min_value, max_value) = match ts_min_max { Some(ts_min_max) => ( - Some(ScalarValue::TimestampNanosecond(Some(ts_min_max.min), None)), - Some(ScalarValue::TimestampNanosecond(Some(ts_min_max.max), None)), + Precision::Exact(timestamptz_nano(ts_min_max.min)), + Precision::Exact(timestamptz_nano(ts_min_max.max)), ), None => { - let range = ranges.get::(field.name().as_ref()); - ( - range.map(|r| r.min_value.as_ref().clone()), - range.map(|r| r.max_value.as_ref().clone()), - ) + let range = + ranges.and_then(|ranges| ranges.get::(field.name().as_ref())); + + range_to_min_max_stats(range) } }; ColumnStatistics { - null_count: Some(0), + null_count: Precision::Exact(0), + min_value, max_value, + distinct_count: Precision::Absent, + } + } + _ => { + let range = ranges.and_then(|ranges| ranges.get::(field.name().as_ref())); + + let (min_value, max_value) = range_to_min_max_stats(range); + + ColumnStatistics { + null_count: Precision::Absent, min_value, - distinct_count: None, + max_value, + distinct_count: Precision::Absent, } } - _ => ranges - .get::(field.name().as_ref()) - .map(|range| ColumnStatistics { - null_count: None, - max_value: Some(range.max_value.as_ref().clone()), - min_value: Some(range.min_value.as_ref().clone()), - distinct_count: None, - }) - .unwrap_or_default(), }; columns.push(stats) } + let num_rows = option_to_precision(row_count); + Statistics { - num_rows: Some(row_count as usize), - total_byte_size: None, - column_statistics: Some(columns), - is_exact: true, + num_rows, + total_byte_size: Precision::Absent, + column_statistics: columns, } } @@ -89,12 +107,24 @@ mod tests { let schema = SchemaBuilder::new().build().unwrap(); let row_count = 0; - let actual = create_chunk_statistics(row_count, &schema, None, &Default::default()); + let actual = create_chunk_statistics(Some(row_count), &schema, None, None); + let expected = Statistics { + num_rows: Precision::Exact(row_count), + total_byte_size: Precision::Absent, + column_statistics: vec![], + }; + assert_eq!(actual, expected); + } + + #[test] + fn test_create_chunk_statistics_no_columns_null_rows() { + let schema = SchemaBuilder::new().build().unwrap(); + + let actual = create_chunk_statistics(None, &schema, None, None); let expected = Statistics { - num_rows: Some(row_count as usize), - total_byte_size: None, - column_statistics: Some(vec![]), - is_exact: true, + num_rows: Precision::Absent, + total_byte_size: Precision::Absent, + column_statistics: vec![], }; assert_eq!(actual, expected); } @@ -127,37 +157,45 @@ mod tests { ), ])); - for row_count in [0u64, 1337u64] { - let actual = create_chunk_statistics(row_count, &schema, Some(ts_min_max), &ranges); + for row_count in [0usize, 1337usize] { + let actual = + create_chunk_statistics(Some(row_count), &schema, Some(ts_min_max), Some(&ranges)); let expected = Statistics { - num_rows: Some(row_count as usize), - total_byte_size: None, - column_statistics: Some(vec![ + num_rows: Precision::Exact(row_count), + total_byte_size: Precision::Absent, + column_statistics: vec![ + // tag1 ColumnStatistics { - null_count: None, - min_value: Some(ScalarValue::from("aaa")), - max_value: Some(ScalarValue::from("bbb")), - 
distinct_count: None, + null_count: Precision::Absent, + min_value: Precision::Exact(ScalarValue::from("aaa")), + max_value: Precision::Exact(ScalarValue::from("bbb")), + distinct_count: Precision::Absent, }, + // tag2 ColumnStatistics::default(), + // field_bool ColumnStatistics::default(), + // field_float ColumnStatistics::default(), + // field_integer ColumnStatistics { - null_count: None, - min_value: Some(ScalarValue::from(10i64)), - max_value: Some(ScalarValue::from(20i64)), - distinct_count: None, + null_count: Precision::Absent, + min_value: Precision::Exact(ScalarValue::from(10i64)), + max_value: Precision::Exact(ScalarValue::from(20i64)), + distinct_count: Precision::Absent, }, + // field_string ColumnStatistics::default(), + // field_uinteger ColumnStatistics::default(), + // time ColumnStatistics { - null_count: Some(0), - min_value: Some(ScalarValue::TimestampNanosecond(Some(10), None)), - max_value: Some(ScalarValue::TimestampNanosecond(Some(20), None)), - distinct_count: None, + null_count: Precision::Exact(0), + min_value: Precision::Exact(timestamptz_nano(10)), + max_value: Precision::Exact(timestamptz_nano(20)), + distinct_count: Precision::Absent, }, - ]), - is_exact: true, + ], }; assert_eq!(actual, expected); } @@ -166,21 +204,22 @@ mod tests { #[test] fn test_create_chunk_statistics_ts_min_max_overrides_column_range() { let schema = full_schema(); - let row_count = 42u64; + let row_count = 42usize; let ts_min_max = TimestampMinMax { min: 10, max: 20 }; let ranges = Arc::new(HashMap::from([( Arc::from(TIME_COLUMN_NAME), ColumnRange { - min_value: Arc::new(ScalarValue::TimestampNanosecond(Some(12), None)), - max_value: Arc::new(ScalarValue::TimestampNanosecond(Some(22), None)), + min_value: Arc::new(timestamptz_nano(12)), + max_value: Arc::new(timestamptz_nano(22)), }, )])); - let actual = create_chunk_statistics(row_count, &schema, Some(ts_min_max), &ranges); + let actual = + create_chunk_statistics(Some(row_count), &schema, Some(ts_min_max), Some(&ranges)); let expected = Statistics { - num_rows: Some(row_count as usize), - total_byte_size: None, - column_statistics: Some(vec![ + num_rows: Precision::Exact(row_count), + total_byte_size: Precision::Absent, + column_statistics: vec![ ColumnStatistics::default(), ColumnStatistics::default(), ColumnStatistics::default(), @@ -189,13 +228,12 @@ mod tests { ColumnStatistics::default(), ColumnStatistics::default(), ColumnStatistics { - null_count: Some(0), - min_value: Some(ScalarValue::TimestampNanosecond(Some(10), None)), - max_value: Some(ScalarValue::TimestampNanosecond(Some(20), None)), - distinct_count: None, + null_count: Precision::Exact(0), + min_value: Precision::Exact(timestamptz_nano(10)), + max_value: Precision::Exact(timestamptz_nano(20)), + distinct_count: Precision::Absent, }, - ]), - is_exact: true, + ], }; assert_eq!(actual, expected); } @@ -203,20 +241,20 @@ mod tests { #[test] fn test_create_chunk_statistics_ts_min_max_none_so_fallback_to_column_range() { let schema = full_schema(); - let row_count = 42u64; + let row_count = 42usize; let ranges = Arc::new(HashMap::from([( Arc::from(TIME_COLUMN_NAME), ColumnRange { - min_value: Arc::new(ScalarValue::TimestampNanosecond(Some(12), None)), - max_value: Arc::new(ScalarValue::TimestampNanosecond(Some(22), None)), + min_value: Arc::new(timestamptz_nano(12)), + max_value: Arc::new(timestamptz_nano(22)), }, )])); - let actual = create_chunk_statistics(row_count, &schema, None, &ranges); + let actual = create_chunk_statistics(Some(row_count), &schema, None, 
Some(&ranges)); let expected = Statistics { - num_rows: Some(row_count as usize), - total_byte_size: None, - column_statistics: Some(vec![ + num_rows: Precision::Exact(row_count), + total_byte_size: Precision::Absent, + column_statistics: vec![ ColumnStatistics::default(), ColumnStatistics::default(), ColumnStatistics::default(), @@ -225,13 +263,12 @@ mod tests { ColumnStatistics::default(), ColumnStatistics::default(), ColumnStatistics { - null_count: Some(0), - min_value: Some(ScalarValue::TimestampNanosecond(Some(12), None)), - max_value: Some(ScalarValue::TimestampNanosecond(Some(22), None)), - distinct_count: None, + null_count: Precision::Exact(0), + min_value: Precision::Exact(timestamptz_nano(12)), + max_value: Precision::Exact(timestamptz_nano(22)), + distinct_count: Precision::Absent, }, - ]), - is_exact: true, + ], }; assert_eq!(actual, expected); } diff --git a/iox_query/src/exec.rs b/iox_query/src/exec.rs index dfaaf360450..abb8ba59d3a 100644 --- a/iox_query/src/exec.rs +++ b/iox_query/src/exec.rs @@ -10,6 +10,7 @@ mod non_null_checker; pub mod query_tracing; mod schema_pivot; pub mod seriesset; +pub mod sleep; pub(crate) mod split; pub mod stringset; use datafusion_util::config::register_iox_object_store; @@ -60,6 +61,18 @@ pub struct ExecutorConfig { pub mem_pool_size: usize, } +impl ExecutorConfig { + pub fn testing() -> Self { + Self { + num_threads: NonZeroUsize::new(1).unwrap(), + target_query_partitions: NonZeroUsize::new(1).unwrap(), + object_stores: HashMap::default(), + metric_registry: Arc::new(Registry::default()), + mem_pool_size: TESTING_MEM_POOL_SIZE, + } + } +} + impl Display for ExecutorConfig { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( @@ -172,13 +185,7 @@ impl Executor { /// Get testing executor that runs a on single thread and a low memory bound /// to preserve resources. pub fn new_testing() -> Self { - let config = ExecutorConfig { - num_threads: NonZeroUsize::new(1).unwrap(), - target_query_partitions: NonZeroUsize::new(1).unwrap(), - object_stores: HashMap::default(), - metric_registry: Arc::new(Registry::default()), - mem_pool_size: TESTING_MEM_POOL_SIZE, - }; + let config = ExecutorConfig::testing(); let executors = Arc::new(DedicatedExecutors::new_testing()); Self::new_with_config_and_executors(config, executors) } @@ -274,6 +281,11 @@ impl Executor { pub fn pool(&self) -> Arc { Arc::clone(&self.runtime.memory_pool) } + + /// Returns underlying config. 
+ pub fn config(&self) -> &ExecutorConfig { + &self.config + } } // No need to implement `Drop` because this is done by DedicatedExecutor already @@ -742,8 +754,10 @@ mod tests { Ok(Box::pin(stream)) } - fn statistics(&self) -> datafusion::physical_plan::Statistics { - Default::default() + fn statistics(&self) -> Result { + Ok(datafusion::physical_plan::Statistics::new_unknown( + &self.schema(), + )) } } diff --git a/iox_query/src/exec/context.rs b/iox_query/src/exec/context.rs index 4060eb48c83..ad60c7a3cc9 100644 --- a/iox_query/src/exec/context.rs +++ b/iox_query/src/exec/context.rs @@ -6,6 +6,7 @@ use super::{ gapfill::{plan_gap_fill, GapFill}, non_null_checker::NonNullCheckerNode, seriesset::{series::Either, SeriesSet}, + sleep::SleepNode, split::StreamSplitNode, }; use crate::{ @@ -34,6 +35,7 @@ use arrow::record_batch::RecordBatch; use async_trait::async_trait; use datafusion::{ catalog::CatalogProvider, + common::ParamValues, execution::{ context::{QueryPlanner, SessionState, TaskContext}, memory_pool::MemoryPool, @@ -55,7 +57,7 @@ use query_functions::{register_scalar_functions, selectors::register_selector_ag use std::{fmt, num::NonZeroUsize, sync::Arc}; use trace::{ ctx::SpanContext, - span::{MetaValue, Span, SpanExt, SpanRecorder}, + span::{MetaValue, Span, SpanEvent, SpanExt, SpanRecorder}, }; // Reuse DataFusion error and Result types for this module @@ -150,6 +152,9 @@ impl ExtensionPlanner for IOxExtensionPlanner { physical_inputs, )?; Some(Arc::new(gap_fill_exec) as Arc) + } else if let Some(sleep) = any.downcast_ref::() { + let sleep = sleep.plan(planner, logical_inputs, physical_inputs, session_state)?; + Some(Arc::new(sleep) as _) } else { None }; @@ -252,12 +257,12 @@ impl IOxSessionConfig { .session_config .with_extension(Arc::new(recorder.span().cloned())); - let state = SessionState::with_config_rt(session_config, self.runtime) + let state = SessionState::new_with_config_rt(session_config, self.runtime) .with_query_planner(Arc::new(IOxQueryPlanner {})); let state = register_iox_physical_optimizers(state); let state = register_iox_logical_optimizers(state); - let inner = SessionContext::with_state(state); + let inner = SessionContext::new_with_state(state); register_selector_aggregates(&inner); register_scalar_functions(&inner); if let Some(default_catalog) = self.default_catalog { @@ -340,9 +345,27 @@ impl IOxSessionContext { /// in the SQL have been registered with this context. Use /// `create_physical_plan` to actually execute the query. pub async fn sql_to_logical_plan(&self, sql: &str) -> Result { + Self::sql_to_logical_plan_with_params(self, sql, ParamValues::List(vec![])).await + } + + /// Plan a SQL statement, providing a list of parameter values + /// to supply to `$placeholder` variables. This assumes that + /// any tables referenced in the SQL have been registered with + /// this context. Use `create_physical_plan` to actually execute + /// the query. + pub async fn sql_to_logical_plan_with_params( + &self, + sql: &str, + params: impl Into + Send, + ) -> Result { let ctx = self.child_ctx("sql_to_logical_plan"); debug!(text=%sql, "planning SQL query"); - let plan = ctx.inner.state().create_logical_plan(sql).await?; + let plan = ctx + .inner + .state() + .create_logical_plan(sql) + .await? + .with_param_values(params.into())?; // ensure the plan does not contain unwanted statements let verifier = SQLOptions::new() .with_allow_ddl(false) // no CREATE ... 
@@ -363,9 +386,20 @@ impl IOxSessionContext { /// Plan a SQL statement and convert it to an execution plan. This assumes that any /// tables referenced in the SQL have been registered with this context pub async fn sql_to_physical_plan(&self, sql: &str) -> Result> { - let logical_plan = self.sql_to_logical_plan(sql).await?; + Self::sql_to_physical_plan_with_params(self, sql, ParamValues::List(vec![])).await + } + /// Plan a SQL statement and convert it to an execution plan, providing a list of + /// parameter values to supply to `$placeholder` variables. This assumes that any + /// tables referenced in the SQL have been registered with this context + pub async fn sql_to_physical_plan_with_params( + &self, + sql: &str, + params: impl Into + Send, + ) -> Result> { let ctx = self.child_ctx("sql_to_physical_plan"); + + let logical_plan = ctx.sql_to_logical_plan_with_params(sql, params).await?; ctx.create_physical_plan(&logical_plan).await } @@ -378,7 +412,7 @@ impl IOxSessionContext { debug!(text=%logical_plan.display_indent_schema(), "create_physical_plan: initial plan"); let physical_plan = ctx.inner.state().create_physical_plan(logical_plan).await?; - ctx.recorder.event("physical plan"); + ctx.recorder.event(SpanEvent::new("physical plan")); debug!(text=%displayable(physical_plan.as_ref()).indent(false), "create_physical_plan: plan to run"); Ok(physical_plan) } @@ -671,7 +705,7 @@ impl IOxSessionContext { /// Record an event on the span recorder pub fn record_event(&mut self, name: &'static str) { - self.recorder.event(name); + self.recorder.event(SpanEvent::new(name)); } /// Record an event on the span recorder diff --git a/iox_query/src/exec/field.rs b/iox_query/src/exec/field.rs index a0614b01691..58388900c86 100644 --- a/iox_query/src/exec/field.rs +++ b/iox_query/src/exec/field.rs @@ -55,7 +55,7 @@ impl From<&[&str]> for FieldColumns { } /// Column indexes for a field: a value and corresponding timestamp -#[derive(Debug, PartialEq, Eq, Clone)] +#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct FieldIndex { pub value_index: usize, pub timestamp_index: usize, diff --git a/iox_query/src/exec/fieldlist.rs b/iox_query/src/exec/fieldlist.rs index c2bbf4d2dc7..e74954362a3 100644 --- a/iox_query/src/exec/fieldlist.rs +++ b/iox_query/src/exec/fieldlist.rs @@ -190,7 +190,7 @@ mod tests { array::{Int64Array, StringArray}, datatypes::{DataType as ArrowDataType, Field as ArrowField, Schema}, }; - use schema::TIME_DATA_TYPE; + use schema::{TIME_DATA_TIMEZONE, TIME_DATA_TYPE}; #[test] fn test_convert_single_batch() { @@ -200,9 +200,10 @@ mod tests { ])); let string_array: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar", "baz", "foo"])); - let timestamp_array: ArrayRef = Arc::new(TimestampNanosecondArray::from_iter_values(vec![ - 1000, 2000, 3000, 4000, - ])); + let timestamp_array: ArrayRef = Arc::new( + TimestampNanosecondArray::from_iter_values(vec![1000, 2000, 3000, 4000]) + .with_timezone_opt(TIME_DATA_TIMEZONE()), + ); let actual = do_conversion( Arc::clone(&schema), @@ -226,9 +227,10 @@ mod tests { // expect same even if the timestamp order is different let string_array: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar", "baz", "foo"])); - let timestamp_array: ArrayRef = Arc::new(TimestampNanosecondArray::from_iter_values(vec![ - 1000, 4000, 2000, 3000, - ])); + let timestamp_array: ArrayRef = Arc::new( + TimestampNanosecondArray::from_iter_values(vec![1000, 4000, 2000, 3000]) + .with_timezone_opt(TIME_DATA_TIMEZONE()), + ); let actual = do_conversion(schema, 
vec![vec![string_array, timestamp_array]]) .expect("convert correctly"); @@ -247,12 +249,16 @@ mod tests { ])); let string_array1: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar"])); - let timestamp_array1: ArrayRef = - Arc::new(TimestampNanosecondArray::from_iter_values(vec![1000, 3000])); + let timestamp_array1: ArrayRef = Arc::new( + TimestampNanosecondArray::from_iter_values(vec![1000, 3000]) + .with_timezone_opt(TIME_DATA_TIMEZONE()), + ); let string_array2: ArrayRef = Arc::new(StringArray::from(vec!["foo", "foo"])); - let timestamp_array2: ArrayRef = - Arc::new(TimestampNanosecondArray::from_iter_values(vec![1000, 4000])); + let timestamp_array2: ArrayRef = Arc::new( + TimestampNanosecondArray::from_iter_values(vec![1000, 4000]) + .with_timezone_opt(TIME_DATA_TIMEZONE()), + ); let actual = do_conversion( schema, @@ -287,9 +293,10 @@ mod tests { // string array has no actual values, so should not be returned as a field let string_array: ArrayRef = Arc::new(StringArray::from(vec![None::<&str>, None, None, None])); - let timestamp_array: ArrayRef = Arc::new(TimestampNanosecondArray::from_iter_values(vec![ - 1000, 2000, 3000, 4000, - ])); + let timestamp_array: ArrayRef = Arc::new( + TimestampNanosecondArray::from_iter_values(vec![1000, 2000, 3000, 4000]) + .with_timezone_opt(TIME_DATA_TIMEZONE()), + ); let actual = do_conversion(schema, vec![vec![string_array, timestamp_array]]) .expect("convert correctly"); @@ -314,9 +321,10 @@ mod tests { let string_array: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar", "baz", "foo"])); let int_array: ArrayRef = Arc::new(Int64Array::from(vec![Some(10), Some(20), Some(30), None])); - let timestamp_array: ArrayRef = Arc::new(TimestampNanosecondArray::from_iter_values(vec![ - 1000, 2000, 3000, 4000, - ])); + let timestamp_array: ArrayRef = Arc::new( + TimestampNanosecondArray::from_iter_values(vec![1000, 2000, 3000, 4000]) + .with_timezone_opt(TIME_DATA_TIMEZONE()), + ); let expected = FieldList { fields: vec![ diff --git a/iox_query/src/exec/gapfill/algo.rs b/iox_query/src/exec/gapfill/algo.rs index 05fcaa8f2c3..0733038f8f6 100644 --- a/iox_query/src/exec/gapfill/algo.rs +++ b/iox_query/src/exec/gapfill/algo.rs @@ -274,7 +274,7 @@ impl GapFiller { output_arrays.sort_by(|(a, _), (b, _)| a.cmp(b)); let output_arrays: Vec<_> = output_arrays.into_iter().map(|(_, arr)| arr).collect(); let batch = RecordBatch::try_new(Arc::clone(&schema), output_arrays) - .map_err(DataFusionError::ArrowError)?; + .map_err(|err| DataFusionError::ArrowError(err, None))?; self.cursor = final_cursor; Ok(batch) @@ -596,7 +596,8 @@ impl Cursor { self.build_vec(params, input_time_array, series_ends, &mut aggr_builder)?; let take_arr = UInt64Array::from(aggr_builder.take_idxs); - take::take(input_aggr_array, &take_arr, None).map_err(DataFusionError::ArrowError) + take::take(input_aggr_array, &take_arr, None) + .map_err(|err| DataFusionError::ArrowError(err, None)) } /// Builds an array using the [`take`](take::take) kernel @@ -668,7 +669,8 @@ impl Cursor { }); let take_arr = UInt64Array::from(take_idxs); - take::take(input_aggr_array, &take_arr, None).map_err(DataFusionError::ArrowError) + take::take(input_aggr_array, &take_arr, None) + .map_err(|err| DataFusionError::ArrowError(err, None)) } /// Builds an array using the [`interleave`](arrow::compute::interleave) kernel @@ -969,15 +971,15 @@ impl StashedAggrBuilder<'_> { /// kernel. 
fn create_stash(input_aggr_array: &ArrayRef, offset: u64) -> Result { let take_arr: UInt64Array = vec![None, Some(offset)].into(); - let stash = - take::take(input_aggr_array, &take_arr, None).map_err(DataFusionError::ArrowError)?; + let stash = take::take(input_aggr_array, &take_arr, None) + .map_err(|err| DataFusionError::ArrowError(err, None))?; Ok(stash) } /// Build the output column. fn build(&self) -> Result { arrow::compute::interleave(&[&self.stash, self.input_aggr_array], &self.interleave_idxs) - .map_err(DataFusionError::ArrowError) + .map_err(|err| DataFusionError::ArrowError(err, None)) } fn buffered_input(offset: usize) -> (usize, usize) { @@ -1043,7 +1045,7 @@ mod tests { use arrow_util::test_util::batches_to_lines; use datafusion::error::Result; use hashbrown::HashMap; - use schema::InfluxColumnType; + use schema::{InfluxColumnType, TIME_DATA_TIMEZONE}; use crate::exec::gapfill::{ algo::{AggrColState, Cursor}, @@ -1188,12 +1190,14 @@ mod tests { let output_batch_size = 10000; let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); - let time_arr: TimestampNanosecondArray = cursor - .clone_for_aggr_col(None) - .unwrap() - .build_time_vec(¶ms, &[series], &input_times) - .unwrap() - .into(); + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &[series], &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let arr = cursor .build_aggr_fill_null(¶ms, &[series], &input_times, &input_aggr_array) .unwrap(); @@ -1234,12 +1238,14 @@ mod tests { let output_batch_size = 10000; let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); - let time_arr: TimestampNanosecondArray = cursor - .clone_for_aggr_col(None) - .unwrap() - .build_time_vec(¶ms, &[series], &input_times) - .unwrap() - .into(); + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &[series], &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let arr = cursor.build_aggr_fill_null(¶ms, &[series], &input_times, &input_aggr_array)?; insta::assert_yaml_snapshot!(array_to_lines(&time_arr, &arr), @r###" @@ -1287,12 +1293,14 @@ mod tests { let output_batch_size = 10000; let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); - let time_arr: TimestampNanosecondArray = cursor - .clone_for_aggr_col(None) - .unwrap() - .build_time_vec(¶ms, &[series], &input_times) - .unwrap() - .into(); + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &[series], &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let arr = cursor .build_aggr_fill_prev(¶ms, &[series], &input_times, &input_aggr_array) .unwrap(); @@ -1343,12 +1351,14 @@ mod tests { let output_batch_size = 10000; let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); - let time_arr: TimestampNanosecondArray = cursor - .clone_for_aggr_col(None) - .unwrap() - .build_time_vec(¶ms, &[series], &input_times) - .unwrap() - .into(); + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &[series], &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let arr = cursor .build_aggr_fill_prev(¶ms, &[series], &input_times, &input_aggr_array) .unwrap(); @@ -1384,7 +1394,8 @@ mod tests { // 1000 Some(1050), // 1100 - ]); + ]) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let input_aggr_array: 
ArrayRef = Arc::new(Float64Array::from(vec![10.0, 11.0])); let series_ends = vec![1, 2]; @@ -1399,12 +1410,14 @@ mod tests { let output_batch_size = 10000; let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); - let time_arr: TimestampNanosecondArray = cursor - .clone_for_aggr_col(None) - .unwrap() - .build_time_vec(¶ms, &series_ends, &input_times) - .unwrap() - .into(); + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &series_ends, &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let arr = cursor .build_aggr_fill_null(¶ms, &series_ends, &input_times, &input_aggr_array) .unwrap(); @@ -1439,7 +1452,8 @@ mod tests { Some(1000), Some(1050), Some(1100), - ]); + ]) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let input_aggr_array: ArrayRef = Arc::new(Float64Array::from(vec![ // 950 // 1000 @@ -1463,12 +1477,14 @@ mod tests { let output_batch_size = 10000; let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); - let time_arr: TimestampNanosecondArray = cursor - .clone_for_aggr_col(None) - .unwrap() - .build_time_vec(¶ms, &series_ends, &input_times) - .unwrap() - .into(); + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &series_ends, &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let arr = cursor .build_aggr_fill_prev(¶ms, &series_ends, &input_times, &input_aggr_array) .unwrap(); @@ -1511,7 +1527,8 @@ mod tests { Some(1050), Some(1100), Some(1100), - ]); + ]) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let input_aggr_array: ArrayRef = Arc::new(Float64Array::from(vec![ // Some(9.0) // 950 // ^^^^^^^^^ this element has been sliced off @@ -1552,12 +1569,14 @@ mod tests { .collect(), }; - let time_arr: TimestampNanosecondArray = cursor - .clone_for_aggr_col(None) - .unwrap() - .build_time_vec(¶ms, &series_ends, &input_times) - .unwrap() - .into(); + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &series_ends, &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let arr = cursor .build_aggr_fill_prev_stashed(¶ms, &series_ends, &input_times, &input_aggr_array) .unwrap(); diff --git a/iox_query/src/exec/gapfill/algo/interpolate.rs b/iox_query/src/exec/gapfill/algo/interpolate.rs index 0e3c68e7949..277e01b6fec 100644 --- a/iox_query/src/exec/gapfill/algo/interpolate.rs +++ b/iox_query/src/exec/gapfill/algo/interpolate.rs @@ -353,6 +353,7 @@ mod test { use arrow::array::{ArrayRef, Float64Array, Int64Array, TimestampNanosecondArray, UInt64Array}; use hashbrown::HashMap; + use schema::TIME_DATA_TIMEZONE; use crate::exec::gapfill::{ algo::tests::{array_to_lines, assert_cursor_end_state, new_cursor_with_batch_size}, @@ -404,12 +405,14 @@ mod test { let output_batch_size = 10000; let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); - let time_arr: TimestampNanosecondArray = cursor - .clone_for_aggr_col(None) - .unwrap() - .build_time_vec(¶ms, &series_ends, &input_times) - .unwrap() - .into(); + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &series_ends, &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let arr = cursor .build_aggr_fill_interpolate(¶ms, &series_ends, &input_times, &input_aggr_array) .unwrap(); @@ -476,12 +479,14 @@ mod test { let output_batch_size = 10000; let mut 
cursor = new_cursor_with_batch_size(¶ms, output_batch_size); - let time_arr: TimestampNanosecondArray = cursor - .clone_for_aggr_col(None) - .unwrap() - .build_time_vec(¶ms, &series_ends, &input_times) - .unwrap() - .into(); + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &series_ends, &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let arr = cursor .build_aggr_fill_interpolate(¶ms, &series_ends, &input_times, &input_aggr_array) .unwrap(); @@ -548,12 +553,14 @@ mod test { let output_batch_size = 10000; let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); - let time_arr: TimestampNanosecondArray = cursor - .clone_for_aggr_col(None) - .unwrap() - .build_time_vec(¶ms, &series_ends, &input_times) - .unwrap() - .into(); + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &series_ends, &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let arr = cursor .build_aggr_fill_interpolate(¶ms, &series_ends, &input_times, &input_aggr_array) .unwrap(); diff --git a/iox_query/src/exec/gapfill/buffered_input.rs b/iox_query/src/exec/gapfill/buffered_input.rs index a7ea743c00e..59ae3111d40 100644 --- a/iox_query/src/exec/gapfill/buffered_input.rs +++ b/iox_query/src/exec/gapfill/buffered_input.rs @@ -189,8 +189,8 @@ impl BufferedInput { .iter() .map(|c| SortField::new(batch.column(*c).data_type().clone())) .collect(); - let row_converter = - RowConverter::new(sort_fields).map_err(DataFusionError::ArrowError)?; + let row_converter = RowConverter::new(sort_fields) + .map_err(|err| DataFusionError::ArrowError(err, None))?; self.row_converter = Some(row_converter); } Ok(self.row_converter.as_mut().expect("cannot be none")) @@ -206,7 +206,7 @@ impl BufferedInput { .collect(); self.get_row_converter()? 
.convert_columns(&columns) - .map_err(DataFusionError::ArrowError) + .map_err(|err| DataFusionError::ArrowError(err, None)) } /// Returns the row-oriented representation of the last buffered row that may appear in the next diff --git a/iox_query/src/exec/gapfill/exec_tests.rs b/iox_query/src/exec/gapfill/exec_tests.rs index 78eee423644..cc0a19086e2 100644 --- a/iox_query/src/exec/gapfill/exec_tests.rs +++ b/iox_query/src/exec/gapfill/exec_tests.rs @@ -1456,8 +1456,8 @@ impl TryFrom for Vec { ))); } - let one_batch = - RecordBatch::try_new(value.schema(), arrs).map_err(DataFusionError::ArrowError)?; + let one_batch = RecordBatch::try_new(value.schema(), arrs) + .map_err(|err| DataFusionError::ArrowError(err, None))?; let mut batches = vec![]; let mut offset = 0; while offset < one_batch.num_rows() { @@ -1479,7 +1479,7 @@ struct TestCase { impl TestCase { fn run(self) -> Result> { block_on(async { - let session_ctx = SessionContext::with_config( + let session_ctx = SessionContext::new_with_config( SessionConfig::default().with_batch_size(self.output_batch_size), ) .into(); @@ -1489,7 +1489,7 @@ impl TestCase { fn run_with_memory_limit(self, limit: usize) -> Result> { block_on(async { - let session_ctx = SessionContext::with_config_rt( + let session_ctx = SessionContext::new_with_config_rt( SessionConfig::default().with_batch_size(self.output_batch_size), RuntimeEnv::new(RuntimeConfig::default().with_memory_limit(limit, 1.0))?.into(), ) @@ -1560,10 +1560,7 @@ fn phys_fill_strategies( let end = start + records.agg_cols.len() + records.struct_cols.len(); let mut v = Vec::with_capacity(records.agg_cols.len()); for f in &records.schema().fields()[start..end] { - v.push(( - phys_col(f.name(), &records.schema())?, - fill_strategy.clone(), - )); + v.push((phys_col(f.name(), &records.schema())?, fill_strategy)); } Ok(v) } diff --git a/iox_query/src/exec/gapfill/mod.rs b/iox_query/src/exec/gapfill/mod.rs index 90b20254be8..30ef8a52275 100644 --- a/iox_query/src/exec/gapfill/mod.rs +++ b/iox_query/src/exec/gapfill/mod.rs @@ -70,7 +70,7 @@ pub struct GapFillParams { } /// Describes how to fill gaps in an aggregate column. -#[derive(Clone, Debug, Hash, PartialEq, Eq)] +#[derive(Clone, Debug, Hash, PartialEq, Eq, Copy)] pub enum FillStrategy { /// Fill with null values. /// This is the InfluxQL behavior for `FILL(NULL)` or `FILL(NONE)`. 
@@ -318,7 +318,7 @@ pub(crate) fn plan_gap_fill( .map(|(e, fs)| { Ok(( create_physical_expr(e, input_dfschema, input_schema, execution_props)?, - fs.clone(), + *fs, )) }) .collect::, FillStrategy)>>>()?; @@ -534,8 +534,8 @@ impl ExecutionPlan for GapFillExec { )?)) } - fn statistics(&self) -> Statistics { - Statistics::default() + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema())) } } @@ -589,9 +589,10 @@ mod test { datasource::empty::EmptyTable, error::Result, logical_expr::{logical_plan, Extension, UserDefinedLogicalNode}, - prelude::{col, lit, lit_timestamp_nano}, + prelude::{col, lit}, scalar::ScalarValue, }; + use datafusion_util::lit_timestamptz_nano; use test_helpers::assert_error; @@ -628,7 +629,7 @@ mod test { time_column: col("time"), origin: None, time_range: Range { - start: Bound::Included(lit_timestamp_nano(1000)), + start: Bound::Included(lit_timestamptz_nano(1000)), end: Bound::Unbounded, }, fill_strategy: fill_strategy_null(vec![col("temp")]), @@ -669,7 +670,7 @@ mod test { origin: None, time_range: Range { start: Bound::Unbounded, - end: Bound::Excluded(lit_timestamp_nano(2000)), + end: Bound::Excluded(lit_timestamptz_nano(2000)), }, fill_strategy: fill_strategy_null(vec![col("temp")]), }, @@ -679,8 +680,8 @@ mod test { time_column: col("time"), origin: None, time_range: Range { - start: Bound::Included(lit_timestamp_nano(1000)), - end: Bound::Excluded(lit_timestamp_nano(2000)), + start: Bound::Included(lit_timestamptz_nano(1000)), + end: Bound::Excluded(lit_timestamptz_nano(2000)), }, fill_strategy: fill_strategy_null(vec![col("temp")]), }, @@ -688,10 +689,10 @@ mod test { GapFillParams { stride: lit(ScalarValue::IntervalDayTime(Some(60_000))), time_column: col("time"), - origin: Some(lit_timestamp_nano(1_000_000_000)), + origin: Some(lit_timestamptz_nano(1_000_000_000)), time_range: Range { start: Bound::Unbounded, - end: Bound::Excluded(lit_timestamp_nano(2000)), + end: Bound::Excluded(lit_timestamptz_nano(2000)), }, fill_strategy: fill_strategy_null(vec![col("temp")]), }, @@ -699,10 +700,10 @@ mod test { GapFillParams { stride: lit(ScalarValue::IntervalDayTime(Some(60_000))), time_column: col("time"), - origin: Some(lit_timestamp_nano(1_000_000_000)), + origin: Some(lit_timestamptz_nano(1_000_000_000)), time_range: Range { - start: Bound::Included(lit_timestamp_nano(1000)), - end: Bound::Excluded(lit_timestamp_nano(2000)), + start: Bound::Included(lit_timestamptz_nano(1000)), + end: Bound::Excluded(lit_timestamptz_nano(2000)), }, fill_strategy: fill_strategy_null(vec![col("temp")]), }, @@ -734,8 +735,8 @@ mod test { time_column: col("time"), origin: None, time_range: Range { - start: Bound::Included(lit_timestamp_nano(1000)), - end: Bound::Excluded(lit_timestamp_nano(2000)), + start: Bound::Included(lit_timestamptz_nano(1000)), + end: Bound::Excluded(lit_timestamptz_nano(2000)), }, fill_strategy: fill_strategy_null(vec![col("temp")]), }, @@ -784,7 +785,7 @@ mod test { - " SortExec: expr=[date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0 ASC]" - " AggregateExec: mode=Final, gby=[date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0 as date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))], aggr=[AVG(temps.temp)]" - " AggregateExec: mode=Partial, gby=[date_bin(60000000000, time@0, 0) as date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))], 
aggr=[AVG(temps.temp)]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); Ok(()) @@ -814,7 +815,7 @@ mod test { - " SortExec: expr=[loc@0 ASC,concat(Utf8(\"zz\"),temps.loc)@2 ASC,date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1 ASC]" - " AggregateExec: mode=Final, gby=[loc@0 as loc, date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1 as date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\")), concat(Utf8(\"zz\"),temps.loc)@2 as concat(Utf8(\"zz\"),temps.loc)], aggr=[AVG(temps.temp)]" - " AggregateExec: mode=Partial, gby=[loc@1 as loc, date_bin(60000000000, time@0, 0) as date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\")), concat(zz, loc@1) as concat(Utf8(\"zz\"),temps.loc)], aggr=[AVG(temps.temp)]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); Ok(()) diff --git a/iox_query/src/exec/gapfill/params.rs b/iox_query/src/exec/gapfill/params.rs index b3ead749be1..5e9d0c42bc7 100644 --- a/iox_query/src/exec/gapfill/params.rs +++ b/iox_query/src/exec/gapfill/params.rs @@ -98,7 +98,7 @@ impl GapFillParams { "fill strategy aggr expr was not a column: {e:?}", )))? .index(); - Ok((idx, fs.clone())) + Ok((idx, *fs)) }) .collect::>>()?; diff --git a/iox_query/src/exec/gapfill/stream.rs b/iox_query/src/exec/gapfill/stream.rs index 823c3c173a0..499de06b077 100644 --- a/iox_query/src/exec/gapfill/stream.rs +++ b/iox_query/src/exec/gapfill/stream.rs @@ -182,14 +182,15 @@ impl GapFillStream { let old_size = batches.iter().map(|rb| rb.get_array_memory_size()).sum(); let mut batch = arrow::compute::concat_batches(&self.schema, &batches) - .map_err(DataFusionError::ArrowError)?; + .map_err(|err| DataFusionError::ArrowError(err, None))?; self.reservation.try_grow(batch.get_array_memory_size())?; if batches.len() > 1 { // Optimize the dictionaries. The output of this operator uses the take kernel to produce // its output. Since the input batches will usually be smaller than the output, it should // be less work to optimize here vs optimizing the output. - batch = optimize_dictionaries(&batch).map_err(DataFusionError::ArrowError)?; + batch = optimize_dictionaries(&batch) + .map_err(|err| DataFusionError::ArrowError(err, None))?; } self.reservation.shrink(old_size); @@ -205,7 +206,7 @@ impl GapFillStream { let input_time_array = self .time_expr .evaluate(&input_batch)? - .into_array(input_batch.num_rows()); + .into_array(input_batch.num_rows())?; let input_time_array: &TimestampNanosecondArray = input_time_array .as_any() .downcast_ref() @@ -247,7 +248,8 @@ impl GapFillStream { .map(|e| { Ok(( expr_to_index(e), - e.evaluate(input_batch)?.into_array(input_batch.num_rows()), + e.evaluate(input_batch)? + .into_array(input_batch.num_rows())?, )) }) .collect::>>() @@ -261,7 +263,8 @@ impl GapFillStream { .map(|e| { Ok(( expr_to_index(e), - e.evaluate(input_batch)?.into_array(input_batch.num_rows()), + e.evaluate(input_batch)? 
+ .into_array(input_batch.num_rows())?, )) }) .collect::>>() diff --git a/iox_query/src/exec/non_null_checker.rs b/iox_query/src/exec/non_null_checker.rs index 1de84cc656e..8a60bd73f57 100644 --- a/iox_query/src/exec/non_null_checker.rs +++ b/iox_query/src/exec/non_null_checker.rs @@ -46,6 +46,7 @@ use arrow::{ datatypes::{DataType, Field, Schema, SchemaRef}, record_batch::RecordBatch, }; +use datafusion::logical_expr::expr_vec_fmt; use datafusion::{ common::{DFSchemaRef, ToDFSchema}, error::{DataFusionError, Result}, @@ -79,6 +80,10 @@ pub struct NonNullCheckerNode { } impl NonNullCheckerNode { + /// Creates a new NonNullChecker node + /// + /// # Panics + /// If the input schema is empty pub fn new(value: &str, input: LogicalPlan) -> Self { let schema = make_non_null_checker_output_schema(); @@ -91,6 +96,8 @@ impl NonNullCheckerNode { .map(|field| Expr::Column(field.qualified_column())) .collect::>(); + assert!(!exprs.is_empty(), "NonNullChecker: input schema was empty"); + Self { input, schema, @@ -130,17 +137,23 @@ impl UserDefinedLogicalNodeCore for NonNullCheckerNode { self.exprs.clone() } - /// For example: `NonNullChecker('the_value')` + /// For example: `NonNullChecker('the_value'), exprs=[foo]` fn fmt_for_explain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}('{}')", self.name(), self.value) + write!( + f, + "{}('{}') exprs={}", + self.name(), + self.value, + expr_vec_fmt!(self.exprs) + ) } fn from_template(&self, exprs: &[Expr], inputs: &[LogicalPlan]) -> Self { - assert_eq!(inputs.len(), 1, "NonNullChecker: input sizes inconistent"); + assert_eq!(inputs.len(), 1, "NonNullChecker: input sizes inconsistent"); assert_eq!( exprs.len(), self.exprs.len(), - "NonNullChecker: expression sizes inconistent" + "NonNullChecker: expression sizes inconsistent" ); Self::new(self.value.as_ref(), inputs[0].clone()) } @@ -276,9 +289,8 @@ impl ExecutionPlan for NonNullCheckerExec { Some(self.metrics.clone_inner()) } - fn statistics(&self) -> Statistics { - // don't know anything about the statistics - Statistics::default() + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema())) } } diff --git a/iox_query/src/exec/query_tracing.rs b/iox_query/src/exec/query_tracing.rs index a4b81bd2c39..de639c33b7a 100644 --- a/iox_query/src/exec/query_tracing.rs +++ b/iox_query/src/exec/query_tracing.rs @@ -672,8 +672,10 @@ mod tests { unimplemented!() } - fn statistics(&self) -> datafusion::physical_plan::Statistics { - unimplemented!() + fn statistics(&self) -> Result { + Ok(datafusion::physical_plan::Statistics::new_unknown( + &self.schema(), + )) } fn metrics(&self) -> Option { diff --git a/iox_query/src/exec/schema_pivot.rs b/iox_query/src/exec/schema_pivot.rs index b6192f61f1a..a3e3d3adb2f 100644 --- a/iox_query/src/exec/schema_pivot.rs +++ b/iox_query/src/exec/schema_pivot.rs @@ -251,9 +251,8 @@ impl ExecutionPlan for SchemaPivotExec { Some(self.metrics.clone_inner()) } - fn statistics(&self) -> Statistics { - // don't know anything about the statistics - Statistics::default() + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema())) } } diff --git a/iox_query/src/exec/seriesset/converter.rs b/iox_query/src/exec/seriesset/converter.rs index 2ad6a63fd6c..81e83844927 100644 --- a/iox_query/src/exec/seriesset/converter.rs +++ b/iox_query/src/exec/seriesset/converter.rs @@ -48,7 +48,7 @@ pub enum Error { pub type Result = std::result::Result; // Handles converting record batches into SeriesSets -#[derive(Debug, Default)] +#[derive(Debug, 
Default, Copy, Clone)] pub struct SeriesSetConverter {} impl SeriesSetConverter { @@ -386,7 +386,7 @@ impl Stream for SeriesSetConverterStream { Err(e) => { // internal state is broken, end this stream this.we_finished = true; - return Poll::Ready(Some(Err(DataFusionError::ArrowError(e)))); + return Poll::Ready(Some(Err(DataFusionError::ArrowError(e, None)))); } }; @@ -435,7 +435,7 @@ impl Stream for SeriesSetConverterStream { Err(e) => { // internal state is broken, end this stream this.we_finished = true; - return Poll::Ready(Some(Err(DataFusionError::ArrowError(e)))); + return Poll::Ready(Some(Err(DataFusionError::ArrowError(e, None)))); } }; @@ -625,9 +625,8 @@ impl PartialEq for SortableSeries { impl Eq for SortableSeries {} impl PartialOrd for SortableSeries { - #[allow(clippy::non_canonical_partial_ord_impl)] fn partial_cmp(&self, other: &Self) -> Option { - self.tag_vals.partial_cmp(&other.tag_vals) + Some(self.cmp(other)) } } diff --git a/iox_query/src/exec/sleep.rs b/iox_query/src/exec/sleep.rs new file mode 100644 index 00000000000..b7fa5050fd3 --- /dev/null +++ b/iox_query/src/exec/sleep.rs @@ -0,0 +1,265 @@ +/// Implementation of a "sleep" operation in DataFusion. +/// +/// The sleep operation passes through its input data and sleeps asynchronously for a duration determined by an +/// expression. The async sleep is implemented as a special [execution plan](SleepExpr) so we can perform this as part +/// of the async data stream. In contrast to a UDF, this will NOT block any threads. +use std::{sync::Arc, time::Duration}; + +use arrow::{ + array::{Array, Float32Array, Float64Array, Int64Array}, + datatypes::{DataType, SchemaRef, TimeUnit}, +}; +use datafusion::{ + common::DFSchemaRef, + error::DataFusionError, + execution::{context::SessionState, TaskContext}, + logical_expr::{LogicalPlan, UserDefinedLogicalNodeCore}, + physical_plan::{ + stream::RecordBatchStreamAdapter, DisplayAs, DisplayFormatType, ExecutionPlan, + PhysicalExpr, SendableRecordBatchStream, Statistics, + }, + physical_planner::PhysicalPlanner, + prelude::Expr, +}; +use futures::TryStreamExt; + +/// Logical plan note that represents a "sleep" operation. +/// +/// This will be lowered to [`SleepExpr`]. +/// +/// See [module](super) docs for more details. 
+#[derive(Clone, Debug, Hash, PartialEq, Eq)] +pub struct SleepNode { + input: LogicalPlan, + duration: Vec, +} + +impl SleepNode { + pub fn new(input: LogicalPlan, duration: Vec) -> Self { + Self { input, duration } + } + + pub fn plan( + &self, + planner: &dyn PhysicalPlanner, + logical_inputs: &[&LogicalPlan], + physical_inputs: &[Arc], + session_state: &SessionState, + ) -> Result { + let duration = self + .duration + .iter() + .map(|e| { + planner.create_physical_expr( + e, + logical_inputs[0].schema(), + &physical_inputs[0].schema(), + session_state, + ) + }) + .collect::, _>>()?; + Ok(SleepExpr::new(Arc::clone(&physical_inputs[0]), duration)) + } +} + +impl UserDefinedLogicalNodeCore for SleepNode { + fn name(&self) -> &str { + "Sleep" + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.input] + } + + fn schema(&self) -> &DFSchemaRef { + self.input.schema() + } + + fn expressions(&self) -> Vec { + self.duration.clone() + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let duration = self + .duration + .iter() + .map(|e| e.to_string()) + .collect::>() + .join(", "); + + write!(f, "{}: duration=[{}]", self.name(), duration) + } + + fn from_template(&self, exprs: &[Expr], inputs: &[LogicalPlan]) -> Self { + Self::new(inputs[0].clone(), exprs.to_vec()) + } +} + +/// Physical node that implements a "sleep" operation. +/// +/// This was lowered from [`SleepNode`]. +/// +/// See [module](super) docs for more details. +#[derive(Debug)] +pub struct SleepExpr { + /// Input data. + input: Arc, + + /// Expression that determines the sum of the sleep duration. + duration: Vec>, +} + +impl SleepExpr { + pub fn new(input: Arc, duration: Vec>) -> Self { + Self { input, duration } + } +} + +impl DisplayAs for SleepExpr { + fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + let duration = self + .duration + .iter() + .map(|e| e.to_string()) + .collect::>() + .join(", "); + + write!(f, "Sleep: duration=[{}]", duration) + } + } + } +} + +impl ExecutionPlan for SleepExpr { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn schema(&self) -> SchemaRef { + self.input.schema() + } + + fn output_partitioning(&self) -> datafusion::physical_plan::Partitioning { + self.input.output_partitioning() + } + + fn output_ordering(&self) -> Option<&[datafusion::physical_expr::PhysicalSortExpr]> { + self.input.output_ordering() + } + + fn children(&self) -> Vec> { + vec![Arc::clone(&self.input)] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> datafusion::error::Result> { + assert_eq!(children.len(), 1); + + Ok(Arc::new(Self::new( + Arc::clone(&children[0]), + self.duration.clone(), + ))) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> datafusion::error::Result { + let stream = self.input.execute(partition, context)?; + + let duration = self.duration.clone(); + let stream = RecordBatchStreamAdapter::new( + stream.schema(), + stream.and_then(move |batch| { + let duration = duration.clone(); + + async move { + let mut sum = Duration::ZERO; + for expr in duration { + let array = expr.evaluate(&batch)?.into_array(batch.num_rows())?; + let d = array_to_duration(&array)?; + if let Some(d) = d { + sum += d; + } + } + if !sum.is_zero() { + tokio::time::sleep(sum).await; + } + Ok(batch) + } + }), + ); + Ok(Box::pin(stream)) + } + + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema())) 
+ } +} + +fn array_to_duration(array: &dyn Array) -> Result, DataFusionError> { + match array.data_type() { + DataType::Null => Ok(None), + DataType::Duration(tunit) => { + let array = arrow::compute::cast(array, &DataType::Int64)?; + let array = array + .as_any() + .downcast_ref::() + .expect("just casted"); + let Some(sum) = arrow::compute::sum(array) else { + return Ok(None); + }; + if sum < 0 { + return Err(DataFusionError::Execution(format!( + "duration must be non-negative but is {sum}{tunit:?}" + ))); + } + let sum = sum as u64; + let duration = match tunit { + TimeUnit::Second => Duration::from_secs(sum), + TimeUnit::Millisecond => Duration::from_millis(sum), + TimeUnit::Microsecond => Duration::from_micros(sum), + TimeUnit::Nanosecond => Duration::from_nanos(sum), + }; + Ok(Some(duration)) + } + DataType::Float32 => { + let array = array + .as_any() + .downcast_ref::() + .expect("just checked"); + let Some(sum) = arrow::compute::sum(array) else { + return Ok(None); + }; + if sum < 0.0 || !sum.is_finite() { + return Err(DataFusionError::Execution(format!( + "duration must be non-negative but is {sum}s" + ))); + } + Ok(Some(Duration::from_secs_f32(sum))) + } + DataType::Float64 => { + let array = array + .as_any() + .downcast_ref::() + .expect("just checked"); + let Some(sum) = arrow::compute::sum(array) else { + return Ok(None); + }; + if sum < 0.0 || !sum.is_finite() { + return Err(DataFusionError::Execution(format!( + "duration must be non-negative but is {sum}s" + ))); + } + Ok(Some(Duration::from_secs_f64(sum))) + } + other => Err(DataFusionError::Internal(format!( + "Expected duration pattern to sleep(...), got: {other:?}" + ))), + } +} diff --git a/iox_query/src/exec/split.rs b/iox_query/src/exec/split.rs index 736ab02131a..30108844d74 100644 --- a/iox_query/src/exec/split.rs +++ b/iox_query/src/exec/split.rs @@ -271,10 +271,10 @@ impl ExecutionPlan for StreamSplitExec { Some(self.metrics.clone_inner()) } - fn statistics(&self) -> Statistics { + fn statistics(&self) -> Result { // For now, don't return any statistics (in the future we // could potentially estimate the output cardinalities) - Statistics::default() + Ok(Statistics::new_unknown(&self.schema())) } } @@ -567,7 +567,7 @@ mod tests { let input = make_input(vec![vec![batch0, batch1]]); // int_col < 3 - let split_expr = df_physical_expr(input.as_ref(), col("int_col").lt(lit(3))).unwrap(); + let split_expr = df_physical_expr(input.schema(), col("int_col").lt(lit(3))).unwrap(); let split_exec: Arc = Arc::new(StreamSplitExec::new(input, vec![split_expr])); @@ -625,12 +625,12 @@ mod tests { let input = make_input(vec![vec![batch0, batch1]]); // int_col < 2 let split_expr1 = - df_physical_expr(input.as_ref(), col("int_col").lt(lit::(2))).unwrap(); + df_physical_expr(input.schema(), col("int_col").lt(lit::(2))).unwrap(); // 2 <= int_col < 3 let expr = col("int_col") .gt_eq(lit::(2)) .and(col("int_col").lt(lit::(3))); - let split_expr2 = df_physical_expr(input.as_ref(), expr).unwrap(); + let split_expr2 = df_physical_expr(input.schema(), expr).unwrap(); let split_exec: Arc = Arc::new(StreamSplitExec::new(input, vec![split_expr1, split_expr2])); @@ -679,7 +679,7 @@ mod tests { let input = make_input(vec![vec![batch0]]); // use `false` to send all outputs to second stream - let split_expr = df_physical_expr(input.as_ref(), lit(false)).unwrap(); + let split_expr = df_physical_expr(input.schema(), lit(false)).unwrap(); let split_exec: Arc = Arc::new(StreamSplitExec::new(input, vec![split_expr])); @@ -713,8 +713,8 @@ mod tests { 
// Test 1: 3 streams but all data is sent to the second one let input = make_input(vec![vec![batch0.clone()]]); // use `false` & `true` to send all outputs to second stream - let split_expr1 = df_physical_expr(input.as_ref(), lit(false)).unwrap(); - let split_expr2 = df_physical_expr(input.as_ref(), lit(true)).unwrap(); + let split_expr1 = df_physical_expr(input.schema(), lit(false)).unwrap(); + let split_expr2 = df_physical_expr(input.schema(), lit(true)).unwrap(); let split_exec: Arc = Arc::new(StreamSplitExec::new(input, vec![split_expr1, split_expr2])); @@ -743,8 +743,8 @@ mod tests { let input = make_input(vec![vec![batch0.clone()]]); // use `false` & `false` to send all outputs to third stream - let split_expr1 = df_physical_expr(input.as_ref(), lit(false)).unwrap(); - let split_expr2 = df_physical_expr(input.as_ref(), lit(false)).unwrap(); + let split_expr1 = df_physical_expr(input.schema(), lit(false)).unwrap(); + let split_expr2 = df_physical_expr(input.schema(), lit(false)).unwrap(); let split_exec: Arc = Arc::new(StreamSplitExec::new(input, vec![split_expr1, split_expr2])); @@ -773,8 +773,8 @@ mod tests { let input = make_input(vec![vec![batch0]]); // use `true` & `false` to send all outputs to first stream - let split_expr1 = df_physical_expr(input.as_ref(), lit(true)).unwrap(); - let split_expr2 = df_physical_expr(input.as_ref(), lit(false)).unwrap(); + let split_expr1 = df_physical_expr(input.schema(), lit(true)).unwrap(); + let split_expr2 = df_physical_expr(input.schema(), lit(false)).unwrap(); let split_exec: Arc = Arc::new(StreamSplitExec::new(input, vec![split_expr1, split_expr2])); @@ -812,7 +812,7 @@ mod tests { let input = make_input(vec![vec![batch0]]); // int_col < 3 - let split_expr = df_physical_expr(input.as_ref(), col("int_col").lt(lit(3))).unwrap(); + let split_expr = df_physical_expr(input.schema(), col("int_col").lt(lit(3))).unwrap(); let split_exec: Arc = Arc::new(StreamSplitExec::new(input, vec![split_expr])); @@ -853,12 +853,12 @@ mod tests { let input = make_input(vec![vec![batch0]]); // int_col < 2 let split_expr1 = - df_physical_expr(input.as_ref(), col("int_col").lt(lit::(2))).unwrap(); + df_physical_expr(input.schema(), col("int_col").lt(lit::(2))).unwrap(); // 2 <= int_col < 3 let expr = col("int_col") .gt_eq(lit::(2)) .and(col("int_col").lt(lit::(3))); - let split_expr2 = df_physical_expr(input.as_ref(), expr).unwrap(); + let split_expr2 = df_physical_expr(input.schema(), expr).unwrap(); let split_exec: Arc = Arc::new(StreamSplitExec::new(input, vec![split_expr1, split_expr2])); @@ -908,7 +908,7 @@ mod tests { let input = make_input(vec![vec![batch0]]); // int_col (not a boolean) - let split_expr = df_physical_expr(input.as_ref(), col("int_col")).unwrap(); + let split_expr = df_physical_expr(input.schema(), col("int_col")).unwrap(); let split_exec: Arc = Arc::new(StreamSplitExec::new(input, vec![split_expr])); diff --git a/iox_query/src/frontend/reorg.rs b/iox_query/src/frontend/reorg.rs index 1149d6cb443..9bf8259a76a 100644 --- a/iox_query/src/frontend/reorg.rs +++ b/iox_query/src/frontend/reorg.rs @@ -2,10 +2,8 @@ use std::sync::Arc; -use datafusion::{ - logical_expr::LogicalPlan, - prelude::{col, lit_timestamp_nano}, -}; +use datafusion::{logical_expr::LogicalPlan, prelude::col}; +use datafusion_util::lit_timestamptz_nano; use observability_deps::tracing::debug; use schema::{sort::SortKey, Schema, TIME_COLUMN_NAME}; @@ -44,7 +42,7 @@ impl From for Error { /// Planner for physically rearranging chunk data. 
This planner /// creates COMPACT and SPLIT plans for use in the database lifecycle manager -#[derive(Debug, Default)] +#[derive(Debug, Default, Copy, Clone)] pub struct ReorgPlanner {} impl ReorgPlanner { @@ -203,7 +201,7 @@ impl ReorgPlanner { let mut split_exprs = Vec::with_capacity(split_times.len()); // time <= split_times[0] - split_exprs.push(col(TIME_COLUMN_NAME).lt_eq(lit_timestamp_nano(split_times[0]))); + split_exprs.push(col(TIME_COLUMN_NAME).lt_eq(lit_timestamptz_nano(split_times[0]))); // split_times[i-1] , time <= split_time[i] for i in 1..split_times.len() { if split_times[i - 1] >= split_times[i] { @@ -217,8 +215,8 @@ impl ReorgPlanner { } split_exprs.push( col(TIME_COLUMN_NAME) - .gt(lit_timestamp_nano(split_times[i - 1])) - .and(col(TIME_COLUMN_NAME).lt_eq(lit_timestamp_nano(split_times[i]))), + .gt(lit_timestamptz_nano(split_times[i - 1])) + .and(col(TIME_COLUMN_NAME).lt_eq(lit_timestamptz_nano(split_times[i]))), ); } let plan = make_stream_split(plan, split_exprs); @@ -389,12 +387,79 @@ mod test { } #[tokio::test] - async fn test_compact_plan() { + async fn test_compact_plan_default_sort() { + test_helpers::maybe_start_logging(); + + let (schema, chunks) = get_test_chunks().await; + + let sort_key = SortKeyBuilder::with_capacity(2) + .with_col("tag1") + .with_col(TIME_COLUMN_NAME) + .build(); + + let compact_plan = ReorgPlanner::new() + .compact_plan(Arc::from("t"), &schema, chunks, sort_key) + .expect("created compact plan"); + + let executor = Executor::new_testing(); + let physical_plan = executor + .new_context(ExecutorType::Reorg) + .create_physical_plan(&compact_plan) + .await + .unwrap(); + + // It is critical that the plan only sorts the inputs and is not resorted after the UnionExec. + insta::assert_yaml_snapshot!( + format_execution_plan(&physical_plan), + @r###" + --- + - " SortPreservingMergeExec: [tag1@2 ASC,time@3 ASC]" + - " UnionExec" + - " SortExec: expr=[tag1@2 ASC,time@3 ASC]" + - " RecordBatchesExec: chunks=1, projection=[field_int, field_int2, tag1, time]" + - " ProjectionExec: expr=[field_int@1 as field_int, field_int2@2 as field_int2, tag1@3 as tag1, time@4 as time]" + - " DeduplicateExec: [tag1@3 ASC,time@4 ASC]" + - " SortExec: expr=[tag1@3 ASC,time@4 ASC,__chunk_order@0 ASC]" + - " RecordBatchesExec: chunks=1, projection=[__chunk_order, field_int, field_int2, tag1, time]" + "### + ); + + assert_eq!( + physical_plan.output_partitioning().partition_count(), + 1, + "{:?}", + physical_plan.output_partitioning() + ); + + let batches = test_collect(physical_plan).await; + + // sorted on state ASC and time ASC (defaults) + let expected = vec![ + "+-----------+------------+------+--------------------------------+", + "| field_int | field_int2 | tag1 | time |", + "+-----------+------------+------+--------------------------------+", + "| 100 | | AL | 1970-01-01T00:00:00.000000050Z |", + "| 70 | | CT | 1970-01-01T00:00:00.000000100Z |", + "| 1000 | | MT | 1970-01-01T00:00:00.000001Z |", + "| 5 | | MT | 1970-01-01T00:00:00.000005Z |", + "| 10 | | MT | 1970-01-01T00:00:00.000007Z |", + "| 70 | 70 | UT | 1970-01-01T00:00:00.000220Z |", + "| 50 | 50 | VT | 1970-01-01T00:00:00.000210Z |", + "| 1000 | 1000 | WA | 1970-01-01T00:00:00.000028Z |", + "+-----------+------------+------+--------------------------------+", + ]; + + assert_batches_eq!(&expected, &batches); + } + + #[tokio::test] + async fn test_compact_plan_alternate_sort() { test_helpers::maybe_start_logging(); let (schema, chunks) = get_test_chunks().await; let sort_key = 
SortKeyBuilder::with_capacity(2) + // use something other than the default sort .with_col_opts("tag1", true, true) .with_col_opts(TIME_COLUMN_NAME, false, false) .build(); @@ -417,12 +482,12 @@ mod test { - " SortPreservingMergeExec: [tag1@2 DESC,time@3 ASC NULLS LAST]" - " UnionExec" - " SortExec: expr=[tag1@2 DESC,time@3 ASC NULLS LAST]" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field_int, field_int2, tag1, time]" - " SortExec: expr=[tag1@2 DESC,time@3 ASC NULLS LAST]" - " ProjectionExec: expr=[field_int@1 as field_int, field_int2@2 as field_int2, tag1@3 as tag1, time@4 as time]" - " DeduplicateExec: [tag1@3 ASC,time@4 ASC]" - " SortExec: expr=[tag1@3 ASC,time@4 ASC,__chunk_order@0 ASC]" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[__chunk_order, field_int, field_int2, tag1, time]" "### ); @@ -435,7 +500,7 @@ mod test { let batches = test_collect(physical_plan).await; - // sorted on state ASC and time + // sorted on state DESC and time ASC let expected = vec![ "+-----------+------------+------+--------------------------------+", "| field_int | field_int2 | tag1 | time |", @@ -486,12 +551,12 @@ mod test { - " SortPreservingMergeExec: [time@3 ASC NULLS LAST,tag1@2 ASC]" - " UnionExec" - " SortExec: expr=[time@3 ASC NULLS LAST,tag1@2 ASC]" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field_int, field_int2, tag1, time]" - " SortExec: expr=[time@3 ASC NULLS LAST,tag1@2 ASC]" - " ProjectionExec: expr=[field_int@1 as field_int, field_int2@2 as field_int2, tag1@3 as tag1, time@4 as time]" - " DeduplicateExec: [tag1@3 ASC,time@4 ASC]" - " SortExec: expr=[tag1@3 ASC,time@4 ASC,__chunk_order@0 ASC]" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[__chunk_order, field_int, field_int2, tag1, time]" "### ); @@ -567,12 +632,12 @@ mod test { - " SortPreservingMergeExec: [time@3 ASC NULLS LAST,tag1@2 ASC]" - " UnionExec" - " SortExec: expr=[time@3 ASC NULLS LAST,tag1@2 ASC]" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field_int, field_int2, tag1, time]" - " SortExec: expr=[time@3 ASC NULLS LAST,tag1@2 ASC]" - " ProjectionExec: expr=[field_int@1 as field_int, field_int2@2 as field_int2, tag1@3 as tag1, time@4 as time]" - " DeduplicateExec: [tag1@3 ASC,time@4 ASC]" - " SortExec: expr=[tag1@3 ASC,time@4 ASC,__chunk_order@0 ASC]" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[__chunk_order, field_int, field_int2, tag1, time]" "### ); diff --git a/iox_query/src/frontend/sql.rs b/iox_query/src/frontend/sql.rs index 28536e41cf2..4008e3c8f27 100644 --- a/iox_query/src/frontend/sql.rs +++ b/iox_query/src/frontend/sql.rs @@ -1,10 +1,10 @@ use std::sync::Arc; use crate::exec::context::IOxSessionContext; -use datafusion::{error::Result, physical_plan::ExecutionPlan}; +use datafusion::{common::ParamValues, error::Result, physical_plan::ExecutionPlan}; /// This struct can create plans for running SQL queries against databases -#[derive(Debug, Default)] +#[derive(Debug, Default, Copy, Clone)] pub struct SqlQueryPlanner {} impl SqlQueryPlanner { @@ -17,8 +17,10 @@ impl SqlQueryPlanner { pub async fn query( &self, query: &str, + params: impl Into + Send, ctx: &IOxSessionContext, ) -> Result> { - ctx.sql_to_physical_plan(query).await + let ctx = ctx.child_ctx("SqlQueryPlanner::query"); + ctx.sql_to_physical_plan_with_params(query, params).await } } diff --git a/iox_query/src/lib.rs b/iox_query/src/lib.rs 
index a4c7d3f95c7..e5afb924238 100644 --- a/iox_query/src/lib.rs +++ b/iox_query/src/lib.rs @@ -11,11 +11,14 @@ clippy::dbg_macro, unused_crate_dependencies )] +#![allow(unreachable_pub)] use datafusion_util::MemoryStream; use futures::TryStreamExt; -use trace::ctx::SpanContext; +use query_log::{QueryCompletedToken, QueryText, StateReceived}; +use trace::{ctx::SpanContext, span::Span}; +use tracker::InstrumentedAsyncOwnedSemaphorePermit; // Workaround for "unused crate" lint false positives. use workspace_hack as _; @@ -45,6 +48,7 @@ pub mod physical_optimizer; pub mod plan; pub mod provider; pub mod pruning; +pub mod query_log; pub mod statistics; pub mod util; @@ -98,54 +102,6 @@ pub trait QueryChunk: Debug + Send + Sync + 'static { fn as_any(&self) -> &dyn Any; } -/// A `QueryCompletedToken` is returned by `record_query` implementations of -/// a `QueryNamespace`. It is used to trigger side-effects (such as query timing) -/// on query completion. -/// -pub struct QueryCompletedToken { - /// If this query completed successfully - success: bool, - - /// Function invoked when the token is dropped. It is passed the - /// vaue of `self.success` - f: Option>, -} - -impl Debug for QueryCompletedToken { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("QueryCompletedToken") - .field("success", &self.success) - .finish() - } -} - -impl QueryCompletedToken { - pub fn new(f: impl FnOnce(bool) + Send + 'static) -> Self { - Self { - success: false, - f: Some(Box::new(f)), - } - } - - /// Record that this query completed successfully - pub fn set_success(&mut self) { - self.success = true; - } -} - -impl Drop for QueryCompletedToken { - fn drop(&mut self) { - if let Some(f) = self.f.take() { - (f)(self.success) - } - } -} - -/// Boxed description of a query that knows how to render to a string -/// -/// This avoids storing potentially large strings -pub type QueryText = Box; - /// `QueryNamespace` is the main trait implemented by the IOx subsystems that store actual data. /// /// Namespaces store data organized by partitions and each partition stores data in Chunks. @@ -186,12 +142,33 @@ pub trait QueryNamespace: Debug + Send + Sync { span_ctx: Option<&SpanContext>, query_type: &'static str, query_text: QueryText, - ) -> QueryCompletedToken; + ) -> QueryCompletedToken; /// Returns a new execution context suitable for running queries fn new_query_context(&self, span_ctx: Option) -> IOxSessionContext; } +/// Trait that allows the query engine (which includes flight and storage/InfluxRPC) to access a +/// virtual set of namespaces. +/// +/// This is the only entry point for the query engine. This trait and the traits reachable by it (e.g. +/// [`QueryNamespace`]) are the only wait to access the catalog and payload data. +#[async_trait] +pub trait QueryNamespaceProvider: std::fmt::Debug + Send + Sync + 'static { + /// Get namespace if it exists. + /// + /// System tables may contain debug information depending on `include_debug_info_tables`. + async fn db( + &self, + name: &str, + span: Option, + include_debug_info_tables: bool, + ) -> Option>; + + /// Acquire concurrency-limiting sempahore + async fn acquire_semaphore(&self, span: Option) -> InstrumentedAsyncOwnedSemaphorePermit; +} + /// Raw data of a [`QueryChunk`]. pub enum QueryChunkData { /// Record batches. @@ -236,110 +213,6 @@ impl std::fmt::Debug for QueryChunkData { } } -impl

<P> QueryChunk for Arc<P>
-where - P: QueryChunk, -{ - fn stats(&self) -> Arc { - self.as_ref().stats() - } - - fn schema(&self) -> &Schema { - self.as_ref().schema() - } - - fn partition_id(&self) -> &TransitionPartitionId { - self.as_ref().partition_id() - } - - fn sort_key(&self) -> Option<&SortKey> { - self.as_ref().sort_key() - } - - fn id(&self) -> ChunkId { - self.as_ref().id() - } - - fn may_contain_pk_duplicates(&self) -> bool { - self.as_ref().may_contain_pk_duplicates() - } - - fn data(&self) -> QueryChunkData { - self.as_ref().data() - } - - fn chunk_type(&self) -> &str { - self.as_ref().chunk_type() - } - - fn order(&self) -> ChunkOrder { - self.as_ref().order() - } - - fn as_any(&self) -> &dyn Any { - // present the underlying implementation, not the wrapper - self.as_ref().as_any() - } -} - -impl QueryChunk for Arc { - fn stats(&self) -> Arc { - self.as_ref().stats() - } - - fn schema(&self) -> &Schema { - self.as_ref().schema() - } - - fn partition_id(&self) -> &TransitionPartitionId { - self.as_ref().partition_id() - } - - fn sort_key(&self) -> Option<&SortKey> { - self.as_ref().sort_key() - } - - fn id(&self) -> ChunkId { - self.as_ref().id() - } - - fn may_contain_pk_duplicates(&self) -> bool { - self.as_ref().may_contain_pk_duplicates() - } - - fn data(&self) -> QueryChunkData { - self.as_ref().data() - } - - fn chunk_type(&self) -> &str { - self.as_ref().chunk_type() - } - - fn order(&self) -> ChunkOrder { - self.as_ref().order() - } - - fn as_any(&self) -> &dyn Any { - // present the underlying implementation, not the wrapper - self.as_ref().as_any() - } -} - -/// return true if all the chunks include distinct counts for all columns. -pub fn chunks_have_distinct_counts<'a>( - chunks: impl IntoIterator>, -) -> bool { - // If at least one of the provided chunk cannot provide stats, - // do not need to compute potential duplicates. We will treat - // as all of them have duplicates - chunks.into_iter().all(|chunk| { - let Some(col_stats) = &chunk.stats().column_statistics else { - return false; - }; - col_stats.iter().all(|col| col.distinct_count.is_some()) - }) -} - // Note: I would like to compile this module only in the 'test' cfg, // but when I do so then other modules can not find them. For example: // diff --git a/iox_query/src/logical_optimizer/extract_sleep.rs b/iox_query/src/logical_optimizer/extract_sleep.rs new file mode 100644 index 00000000000..2f11446ec29 --- /dev/null +++ b/iox_query/src/logical_optimizer/extract_sleep.rs @@ -0,0 +1,100 @@ +use std::sync::Arc; + +use datafusion::logical_expr::expr::ScalarFunction; +use datafusion::{ + common::{tree_node::TreeNodeRewriter, DFSchema}, + error::DataFusionError, + logical_expr::{expr_rewriter::rewrite_preserving_name, Extension, LogicalPlan}, + optimizer::{OptimizerConfig, OptimizerRule}, + prelude::{lit, Expr}, + scalar::ScalarValue, +}; +use query_functions::SLEEP_UDF_NAME; + +use crate::exec::sleep::SleepNode; + +/// Rewrites the ["sleep" UDF](SLEEP_UDF_NAME) to a NULL expression and a [`SleepNode`]. +/// +/// See [`crate::exec::sleep`] for more details. +#[derive(Debug, Clone)] +pub struct ExtractSleep {} + +impl ExtractSleep { + /// Create new optimizer rule. 
+ pub fn new() -> Self { + Self {} + } +} + +impl OptimizerRule for ExtractSleep { + fn name(&self) -> &str { + "extract_sleep" + } + + fn try_optimize( + &self, + plan: &LogicalPlan, + _config: &dyn OptimizerConfig, + ) -> datafusion::error::Result> { + optimize(plan).map(Some) + } +} + +fn optimize(plan: &LogicalPlan) -> Result { + let new_inputs = plan + .inputs() + .iter() + .map(|input| optimize(input)) + .collect::, DataFusionError>>()?; + + let mut schema = + new_inputs + .iter() + .map(|input| input.schema()) + .fold(DFSchema::empty(), |mut lhs, rhs| { + lhs.merge(rhs); + lhs + }); + + schema.merge(plan.schema()); + + let mut expr_rewriter = Rewriter::default(); + + let new_exprs = plan + .expressions() + .into_iter() + .map(|expr| rewrite_preserving_name(expr, &mut expr_rewriter)) + .collect::, DataFusionError>>()?; + let mut plan = plan.with_new_exprs(new_exprs, &new_inputs)?; + + if !expr_rewriter.found_exprs.is_empty() { + plan = LogicalPlan::Extension(Extension { + node: Arc::new(SleepNode::new(plan, expr_rewriter.found_exprs)), + }); + } + + Ok(plan) +} + +#[derive(Default)] +struct Rewriter { + found_exprs: Vec, +} + +impl TreeNodeRewriter for Rewriter { + type N = Expr; + + fn mutate(&mut self, expr: Expr) -> Result { + match expr { + Expr::ScalarFunction(ScalarFunction { func_def, mut args }) => { + if func_def.name() == SLEEP_UDF_NAME { + self.found_exprs.append(&mut args); + return Ok(lit(ScalarValue::Null)); + } + + Ok(Expr::ScalarFunction(ScalarFunction { func_def, args })) + } + _ => Ok(expr), + } + } +} diff --git a/iox_query/src/logical_optimizer/handle_gapfill.rs b/iox_query/src/logical_optimizer/handle_gapfill.rs index 291b88e986b..bd046b14df1 100644 --- a/iox_query/src/logical_optimizer/handle_gapfill.rs +++ b/iox_query/src/logical_optimizer/handle_gapfill.rs @@ -4,16 +4,17 @@ pub mod range_predicate; use crate::exec::gapfill::{FillStrategy, GapFill, GapFillParams}; +use datafusion::logical_expr::ScalarFunctionDefinition; use datafusion::{ common::tree_node::{RewriteRecursion, TreeNode, TreeNodeRewriter, VisitRecursion}, error::{DataFusionError, Result}, logical_expr::{ - expr::{Alias, ScalarFunction, ScalarUDF}, + expr::{Alias, ScalarFunction}, utils::expr_to_columns, Aggregate, BuiltinScalarFunction, Extension, LogicalPlan, Projection, }, optimizer::{optimizer::ApplyOrder, OptimizerConfig, OptimizerRule}, - prelude::{col, Expr}, + prelude::{col, Column, Expr}, }; use hashbrown::{hash_map, HashMap}; use query_functions::gapfill::{DATE_BIN_GAPFILL_UDF_NAME, INTERPOLATE_UDF_NAME, LOCF_UDF_NAME}; @@ -100,8 +101,12 @@ impl OptimizerRule for HandleGapFill { fn handle_gap_fill(plan: &LogicalPlan) -> Result> { let res = match plan { - LogicalPlan::Aggregate(aggr) => handle_aggregate(aggr)?, - LogicalPlan::Projection(proj) => handle_projection(proj)?, + LogicalPlan::Aggregate(aggr) => { + handle_aggregate(aggr).map_err(|e| e.context("handle_aggregate"))? + } + LogicalPlan::Projection(proj) => { + handle_projection(proj).map_err(|e| e.context("handle_projection"))? + } _ => None, }; @@ -129,7 +134,9 @@ fn handle_aggregate(aggr: &Aggregate) -> Result> { new_group_expr, date_bin_gapfill_index, date_bin_gapfill_args, - } = if let Some(v) = replace_date_bin_gapfill(group_expr)? { + } = if let Some(v) = + replace_date_bin_gapfill(group_expr).map_err(|e| e.context("replace_date_bin_gapfill"))? 
+ { v } else { return Ok(None); @@ -145,14 +152,16 @@ fn handle_aggregate(aggr: &Aggregate) -> Result> { new_group_expr, aggr_expr.clone(), Arc::clone(schema), - )?; + ) + .map_err(|e| e.context("Aggregate::try_new_with_schema"))?; let new_aggr_plan = LogicalPlan::Aggregate(new_aggr_plan); - check_node(&new_aggr_plan)?; + check_node(&new_aggr_plan).map_err(|e| e.context("check_node"))?; new_aggr_plan }; let new_gap_fill_plan = - build_gapfill_node(new_aggr_plan, date_bin_gapfill_index, date_bin_gapfill_args)?; + build_gapfill_node(new_aggr_plan, date_bin_gapfill_index, date_bin_gapfill_args) + .map_err(|e| e.context("build_gapfill_node"))?; Ok(Some(new_gap_fill_plan)) } @@ -174,23 +183,33 @@ fn build_gapfill_node( // Ensure that stride argument is a scalar let stride = args_iter.next().unwrap(); - validate_scalar_expr("stride argument to DATE_BIN_GAPFILL", &stride)?; + validate_scalar_expr("stride argument to DATE_BIN_GAPFILL", &stride) + .map_err(|e| e.context("validate_scalar_expr"))?; + + fn get_column(expr: Expr) -> Result { + match expr { + Expr::Column(c) => Ok(c), + Expr::Cast(c) => get_column(*c.expr), + _ => Err(DataFusionError::Plan( + "DATE_BIN_GAPFILL requires a column as the source argument".to_string(), + )), + } + } // Ensure that the source argument is a column - let time_col = args_iter.next().unwrap().try_into_col().map_err(|_| { - DataFusionError::Plan( - "DATE_BIN_GAPFILL requires a column as the source argument".to_string(), - ) - })?; + let time_col = + get_column(args_iter.next().unwrap()).map_err(|e| e.context("get time column"))?; // Ensure that a time range was specified and is valid for gap filling - let time_range = range_predicate::find_time_range(new_aggr_plan.inputs()[0], &time_col)?; - validate_time_range(&time_range)?; + let time_range = range_predicate::find_time_range(new_aggr_plan.inputs()[0], &time_col) + .map_err(|e| e.context("find time range"))?; + validate_time_range(&time_range).map_err(|e| e.context("validate time range"))?; // Ensure that origin argument is a scalar let origin = args_iter.next(); if let Some(ref origin) = origin { - validate_scalar_expr("origin argument to DATE_BIN_GAPFILL", origin)?; + validate_scalar_expr("origin argument to DATE_BIN_GAPFILL", origin) + .map_err(|e| e.context("validate origin"))?; } // Make sure the time output to the gapfill node matches what the @@ -219,18 +238,21 @@ fn build_gapfill_node( .collect(); Ok(LogicalPlan::Extension(Extension { - node: Arc::new(GapFill::try_new( - Arc::new(new_aggr_plan), - new_group_expr, - aggr_expr, - GapFillParams { - stride, - time_column, - origin, - time_range, - fill_strategy: fill_behavior, - }, - )?), + node: Arc::new( + GapFill::try_new( + Arc::new(new_aggr_plan), + new_group_expr, + aggr_expr, + GapFillParams { + stride, + time_column, + origin, + time_range, + fill_strategy: fill_behavior, + }, + ) + .map_err(|e| e.context("GapFill::try_new"))?, + ), })) } @@ -358,7 +380,7 @@ impl TreeNodeRewriter for DateBinGapfillRewriter { type N = Expr; fn pre_visit(&mut self, expr: &Expr) -> Result { match expr { - Expr::ScalarUDF(ScalarUDF { fun, .. }) if fun.name == DATE_BIN_GAPFILL_UDF_NAME => { + Expr::ScalarFunction(fun) if fun.func_def.name() == DATE_BIN_GAPFILL_UDF_NAME => { Ok(RewriteRecursion::Mutate) } _ => Ok(RewriteRecursion::Continue), @@ -370,10 +392,12 @@ impl TreeNodeRewriter for DateBinGapfillRewriter { // so that everything stays wired up. 
let orig_name = expr.display_name()?; match expr { - Expr::ScalarUDF(ScalarUDF { fun, args }) if fun.name == DATE_BIN_GAPFILL_UDF_NAME => { + Expr::ScalarFunction(ScalarFunction { func_def, args }) + if func_def.name() == DATE_BIN_GAPFILL_UDF_NAME => + { self.args = Some(args.clone()); Ok(Expr::ScalarFunction(ScalarFunction { - fun: BuiltinScalarFunction::DateBin, + func_def: ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::DateBin), args, }) .alias(orig_name)) @@ -422,7 +446,11 @@ fn handle_projection(proj: &Projection) -> Result> { }; let new_proj_exprs = proj_exprs .iter() - .map(|e| e.clone().rewrite(&mut fill_fn_rewriter)) + .map(|expr| { + expr.clone() + .rewrite(&mut fill_fn_rewriter) + .map_err(|e| e.context(format!("rewrite: {expr}"))) + }) .collect::>>()?; let FillFnRewriter { aggr_col_fill_map } = fill_fn_rewriter; @@ -434,7 +462,7 @@ fn handle_projection(proj: &Projection) -> Result> { // to reflect the new fill strategy. let mut new_gapfill = child_gapfill.clone(); for (e, fs) in aggr_col_fill_map { - let udf = fill_strategy_to_udf(&fs)?; + let udf = fill_strategy_to_udf(&fs).map_err(|e| e.context("fill_strategy_to_udf"))?; if new_gapfill.replace_fill_strategy(&e, fs).is_none() { // There was a gap filling function called on a non-aggregate column. return Err(DataFusionError::Plan(format!( @@ -470,7 +498,7 @@ impl TreeNodeRewriter for FillFnRewriter { type N = Expr; fn pre_visit(&mut self, expr: &Expr) -> Result { match expr { - Expr::ScalarUDF(ScalarUDF { fun, .. }) if udf_to_fill_strategy(&fun.name).is_some() => { + Expr::ScalarFunction(fun) if udf_to_fill_strategy(fun.func_def.name()).is_some() => { Ok(RewriteRecursion::Mutate) } _ => Ok(RewriteRecursion::Continue), @@ -480,14 +508,14 @@ impl TreeNodeRewriter for FillFnRewriter { fn mutate(&mut self, expr: Expr) -> Result { let orig_name = expr.display_name()?; match expr { - Expr::ScalarUDF(ScalarUDF { ref fun, .. }) - if udf_to_fill_strategy(&fun.name).is_none() => + Expr::ScalarFunction(ref fun) + if udf_to_fill_strategy(fun.func_def.name()).is_none() => { Ok(expr) } - Expr::ScalarUDF(ScalarUDF { fun, mut args }) => { - let fs = udf_to_fill_strategy(&fun.name).expect("must be a fill fn"); - let arg = args.remove(0); + Expr::ScalarFunction(mut fun) => { + let fs = udf_to_fill_strategy(fun.func_def.name()).expect("must be a fill fn"); + let arg = fun.args.remove(0); self.add_fill_strategy(arg.clone(), fs)?; Ok(arg.alias(orig_name)) } @@ -524,7 +552,7 @@ fn count_udf(e: &Expr, name: &str) -> Result { fn matches_udf(e: &Expr, name: &str) -> bool { matches!( e, - Expr::ScalarUDF(ScalarUDF { fun, .. 
}) if fun.name == name + Expr::ScalarFunction(fun) if fun.func_def.name() == name ) } @@ -556,18 +584,19 @@ mod test { use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; use datafusion::error::Result; - use datafusion::logical_expr::expr::ScalarUDF; + use datafusion::logical_expr::builder::table_scan_with_filters; use datafusion::logical_expr::{logical_plan, LogicalPlan, LogicalPlanBuilder}; use datafusion::optimizer::optimizer::Optimizer; use datafusion::optimizer::OptimizerContext; - use datafusion::prelude::{avg, case, col, lit, lit_timestamp_nano, min, Expr}; + use datafusion::prelude::{avg, case, col, lit, min, Expr}; use datafusion::scalar::ScalarValue; + use datafusion_util::lit_timestamptz_nano; use query_functions::gapfill::{ DATE_BIN_GAPFILL_UDF_NAME, INTERPOLATE_UDF_NAME, LOCF_UDF_NAME, }; - fn table_scan() -> Result { - let schema = Schema::new(vec![ + fn schema() -> Schema { + Schema::new(vec![ Field::new( "time", DataType::Timestamp(TimeUnit::Nanosecond, None), @@ -580,8 +609,11 @@ mod test { ), Field::new("loc", DataType::Utf8, false), Field::new("temp", DataType::Float64, false), - ]); - logical_plan::table_scan(Some("temps"), &schema, None)?.build() + ]) + } + + fn table_scan() -> Result { + logical_plan::table_scan(Some("temps"), &schema(), None)?.build() } fn date_bin_gapfill(interval: Expr, time: Expr) -> Result { @@ -597,33 +629,27 @@ mod test { if let Some(origin) = origin { args.push(origin) } - Ok(Expr::ScalarUDF(ScalarUDF { - fun: query_functions::registry().udf(DATE_BIN_GAPFILL_UDF_NAME)?, - args, - })) + + Ok(query_functions::registry() + .udf(DATE_BIN_GAPFILL_UDF_NAME)? + .call(args)) } fn locf(arg: Expr) -> Result { - Ok(Expr::ScalarUDF(ScalarUDF { - fun: query_functions::registry().udf(LOCF_UDF_NAME)?, - args: vec![arg], - })) + Ok(query_functions::registry() + .udf(LOCF_UDF_NAME)? + .call(vec![arg])) } fn interpolate(arg: Expr) -> Result { - Ok(Expr::ScalarUDF(ScalarUDF { - fun: query_functions::registry().udf(INTERPOLATE_UDF_NAME)?, - args: vec![arg], - })) + Ok(query_functions::registry() + .udf(INTERPOLATE_UDF_NAME)? + .call(vec![arg])) } fn optimize(plan: &LogicalPlan) -> Result> { let optimizer = Optimizer::with_rules(vec![Arc::new(HandleGapFill)]); - optimizer.optimize_recursively( - optimizer.rules.first().unwrap(), - plan, - &OptimizerContext::new(), - ) + optimizer.optimize_recursively(&optimizer.rules[0], plan, &OptimizerContext::new()) } fn assert_optimizer_err(plan: &LogicalPlan, expected: &str) { @@ -713,8 +739,8 @@ mod test { let plan = LogicalPlanBuilder::from(table_scan()?) .filter( col("time") - .gt_eq(lit_timestamp_nano(1000)) - .and(col("time").lt(lit_timestamp_nano(2000))), + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), )? .aggregate( vec![ @@ -742,8 +768,8 @@ mod test { let plan = LogicalPlanBuilder::from(table_scan()?) .filter( col("time") - .gt_eq(lit_timestamp_nano(1000)) - .and(col("time").lt(lit_timestamp_nano(2000))), + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), )? .aggregate( vec![ @@ -771,8 +797,8 @@ mod test { let plan = LogicalPlanBuilder::from(table_scan()?) .filter( col("time") - .gt_eq(lit_timestamp_nano(1000)) - .and(col("time").lt(lit_timestamp_nano(2000))), + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), )? .aggregate( vec![date_bin_gapfill_with_origin( @@ -803,8 +829,8 @@ mod test { let plan = LogicalPlanBuilder::from(table_scan()?) 
.filter( col("time") - .gt_eq(lit_timestamp_nano(1000)) - .and(col("time").lt(lit_timestamp_nano(2000))), + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), )? .aggregate( vec![date_bin_gapfill(stride, col("time"))?], @@ -826,20 +852,20 @@ mod test { "Error during planning: gap-filling query is missing both upper and lower time bounds", ), ( - col("time").gt_eq(lit_timestamp_nano(1000)), + col("time").gt_eq(lit_timestamptz_nano(1000)), "Error during planning: gap-filling query is missing upper time bound", ), ( - col("time").lt(lit_timestamp_nano(2000)), + col("time").lt(lit_timestamptz_nano(2000)), "Error during planning: gap-filling query is missing lower time bound", ), ( col("time").gt_eq(col("time2")).and( - col("time").lt(lit_timestamp_nano(2000))), + col("time").lt(lit_timestamptz_nano(2000))), "Error during planning: lower time bound for gap fill query must evaluate to a scalar", ), ( - col("time").gt_eq(lit_timestamp_nano(2000)).and( + col("time").gt_eq(lit_timestamptz_nano(2000)).and( col("time").lt(col("time2"))), "Error during planning: upper time bound for gap fill query must evaluate to a scalar", ) @@ -874,8 +900,8 @@ mod test { let plan = LogicalPlanBuilder::from(table_scan()?) .filter( col("time") - .gt_eq(lit_timestamp_nano(1000)) - .and(col("time").lt(lit_timestamp_nano(2000))), + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), )? .aggregate( vec![date_bin_gapfill( @@ -903,14 +929,14 @@ mod test { let plan = LogicalPlanBuilder::from(table_scan()?) .filter( col("time") - .gt_eq(lit_timestamp_nano(1000)) - .and(col("time").lt(lit_timestamp_nano(2000))), + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), )? .aggregate( vec![date_bin_gapfill_with_origin( lit(ScalarValue::IntervalDayTime(Some(60_000))), col("time"), - Some(lit_timestamp_nano(7)), + Some(lit_timestamptz_nano(7)), )?], vec![avg(col("temp"))], )? @@ -933,8 +959,8 @@ mod test { let plan = LogicalPlanBuilder::from(table_scan()?) .filter( col("time") - .gt_eq(lit_timestamp_nano(1000)) - .and(col("time").lt(lit_timestamp_nano(2000))), + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), )? .aggregate( vec![ @@ -980,8 +1006,8 @@ mod test { let plan = LogicalPlanBuilder::from(table_scan()?) .filter( col("time") - .gt_eq(lit_timestamp_nano(1000)) - .and(col("time").lt(lit_timestamp_nano(2000))), + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), )? .aggregate( vec![date_bin_gapfill( @@ -1014,8 +1040,8 @@ mod test { let plan = LogicalPlanBuilder::from(table_scan()?) .filter( col("time") - .gt_eq(lit_timestamp_nano(1000)) - .and(col("time").lt(lit_timestamp_nano(2000))), + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), )? .aggregate( vec![date_bin_gapfill( @@ -1049,8 +1075,8 @@ mod test { let plan = LogicalPlanBuilder::from(table_scan()?) .filter( col("time") - .gt_eq(lit_timestamp_nano(1000)) - .and(col("time").lt(lit_timestamp_nano(2000))), + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), )? .aggregate( vec![date_bin_gapfill( @@ -1083,8 +1109,8 @@ mod test { let plan = LogicalPlanBuilder::from(table_scan()?) .filter( col("time") - .gt_eq(lit_timestamp_nano(1000)) - .and(col("time").lt(lit_timestamp_nano(2000))), + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), )? 
.aggregate( vec![date_bin_gapfill( @@ -1112,4 +1138,39 @@ mod test { "###); Ok(()) } + + #[test] + fn scan_filter_not_part_of_projection() { + let schema = schema(); + let plan = table_scan_with_filters( + Some("temps"), + &schema, + Some(vec![schema.index_of("time").unwrap()]), + vec![ + col("temps.time").gt_eq(lit_timestamptz_nano(1000)), + col("temps.time").lt(lit_timestamptz_nano(2000)), + col("temps.loc").eq(lit("foo")), + ], + ) + .unwrap() + .aggregate( + vec![ + date_bin_gapfill(lit(ScalarValue::IntervalDayTime(Some(60_000))), col("time")) + .unwrap(), + ], + std::iter::empty::(), + ) + .unwrap() + .build() + .unwrap(); + + insta::assert_yaml_snapshot!( + format_optimized_plan(&plan).unwrap(), + @r###" + --- + - "GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time)], aggr=[[]], time_column=date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time), stride=IntervalDayTime(\"60000\"), range=Included(Literal(TimestampNanosecond(1000, None)))..Excluded(Literal(TimestampNanosecond(2000, None)))" + - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"60000\"), temps.time) AS date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time)]], aggr=[[]]" + - " TableScan: temps projection=[time], full_filters=[temps.time >= TimestampNanosecond(1000, None), temps.time < TimestampNanosecond(2000, None), temps.loc = Utf8(\"foo\")]" + "###); + } } diff --git a/iox_query/src/logical_optimizer/handle_gapfill/range_predicate.rs b/iox_query/src/logical_optimizer/handle_gapfill/range_predicate.rs index 97a31e232bd..26b9682b454 100644 --- a/iox_query/src/logical_optimizer/handle_gapfill/range_predicate.rs +++ b/iox_query/src/logical_optimizer/handle_gapfill/range_predicate.rs @@ -1,5 +1,8 @@ //! Find the time range from the filters in a logical plan. -use std::ops::{Bound, Range}; +use std::{ + ops::{Bound, Range}, + sync::Arc, +}; use datafusion::{ common::{ @@ -7,8 +10,9 @@ use datafusion::{ DFSchema, }, error::Result, - logical_expr::{Between, BinaryExpr, LogicalPlan, Operator}, - optimizer::utils::split_conjunction, + logical_expr::{ + utils::split_conjunction, Between, BinaryExpr, LogicalPlan, LogicalPlanBuilder, Operator, + }, prelude::{Column, Expr}, }; @@ -57,12 +61,23 @@ impl TreeNodeVisitor for TimeRangeVisitor { } LogicalPlan::TableScan(t) => { let range = self.range.clone(); + + // filters may use columns that are NOT part of a projection, so we need the underlying schema. Because + // that's a bit of a mess in DF, we reconstruct the schema using the plan builder. 
+ let unprojected_scan = LogicalPlanBuilder::scan_with_filters( + t.table_name.to_owned(), + Arc::clone(&t.source), + None, + t.filters.clone(), + ) + .map_err(|e| e.context("reconstruct unprojected scheam"))?; + let unprojected_schema = unprojected_scan.schema(); let range = t .filters .iter() .flat_map(split_conjunction) .try_fold(range, |range, expr| { - range.with_expr(&t.projected_schema, &self.col, expr) + range.with_expr(unprojected_schema, &self.col, expr) })?; self.range = range; Ok(VisitRecursion::Continue) @@ -166,9 +181,10 @@ mod tests { logical_plan::{self, builder::LogicalTableSource}, Between, LogicalPlan, LogicalPlanBuilder, }, - prelude::{col, lit, lit_timestamp_nano, Column, Expr, Partitioning}, + prelude::{col, lit, Column, Expr, Partitioning}, sql::TableReference, }; + use datafusion_util::lit_timestamptz_nano; use super::find_time_range; @@ -225,88 +241,88 @@ mod tests { ), ( "time_gt_val", - col("time").gt(lit_timestamp_nano(1000)), + col("time").gt(lit_timestamptz_nano(1000)), Range { - start: Bound::Excluded(lit_timestamp_nano(1000)), + start: Bound::Excluded(lit_timestamptz_nano(1000)), end: Bound::Unbounded, }, ), ( "time_gt_eq_val", - col("time").gt_eq(lit_timestamp_nano(1000)), + col("time").gt_eq(lit_timestamptz_nano(1000)), Range { - start: Bound::Included(lit_timestamp_nano(1000)), + start: Bound::Included(lit_timestamptz_nano(1000)), end: Bound::Unbounded, }, ), ( "time_lt_val", - col("time").lt(lit_timestamp_nano(1000)), + col("time").lt(lit_timestamptz_nano(1000)), Range { start: Bound::Unbounded, - end: Bound::Excluded(lit_timestamp_nano(1000)), + end: Bound::Excluded(lit_timestamptz_nano(1000)), }, ), ( "time_lt_eq_val", - col("time").lt_eq(lit_timestamp_nano(1000)), + col("time").lt_eq(lit_timestamptz_nano(1000)), Range { start: Bound::Unbounded, - end: Bound::Included(lit_timestamp_nano(1000)), + end: Bound::Included(lit_timestamptz_nano(1000)), }, ), ( "val_gt_time", - lit_timestamp_nano(1000).gt(col("time")), + lit_timestamptz_nano(1000).gt(col("time")), Range { start: Bound::Unbounded, - end: Bound::Excluded(lit_timestamp_nano(1000)), + end: Bound::Excluded(lit_timestamptz_nano(1000)), }, ), ( "val_gt_eq_time", - lit_timestamp_nano(1000).gt_eq(col("time")), + lit_timestamptz_nano(1000).gt_eq(col("time")), Range { start: Bound::Unbounded, - end: Bound::Included(lit_timestamp_nano(1000)), + end: Bound::Included(lit_timestamptz_nano(1000)), }, ), ( "val_lt_time", - lit_timestamp_nano(1000).lt(col("time")), + lit_timestamptz_nano(1000).lt(col("time")), Range { - start: Bound::Excluded(lit_timestamp_nano(1000)), + start: Bound::Excluded(lit_timestamptz_nano(1000)), end: Bound::Unbounded, }, ), ( "val_lt_eq_time", - lit_timestamp_nano(1000).lt_eq(col("time")), + lit_timestamptz_nano(1000).lt_eq(col("time")), Range { - start: Bound::Included(lit_timestamp_nano(1000)), + start: Bound::Included(lit_timestamptz_nano(1000)), end: Bound::Unbounded, }, ), ( "and", col("time") - .gt_eq(lit_timestamp_nano(1000)) - .and(col("time").lt(lit_timestamp_nano(2000))), + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), Range { - start: Bound::Included(lit_timestamp_nano(1000)), - end: Bound::Excluded(lit_timestamp_nano(2000)), + start: Bound::Included(lit_timestamptz_nano(1000)), + end: Bound::Excluded(lit_timestamptz_nano(2000)), }, ), ( "between", between( col("time"), - lit_timestamp_nano(1000), - lit_timestamp_nano(2000), + lit_timestamptz_nano(1000), + lit_timestamptz_nano(2000), ), Range { - start: 
Bound::Included(lit_timestamp_nano(1000)), - end: Bound::Included(lit_timestamp_nano(2000)), + start: Bound::Included(lit_timestamptz_nano(1000)), + end: Bound::Included(lit_timestamptz_nano(2000)), }, ), ]; @@ -330,11 +346,11 @@ mod tests { // - even when predicates are in different filter nodes // - through projections that alias columns let plan = LogicalPlanBuilder::from(table_scan()?) - .filter(col("time").gt_eq(lit_timestamp_nano(1000)))? + .filter(col("time").gt_eq(lit_timestamptz_nano(1000)))? .sort(vec![col("time")])? .limit(0, Some(10))? .project(vec![col("time").alias("other_time")])? - .filter(col("other_time").lt(lit_timestamp_nano(2000)))? + .filter(col("other_time").lt(lit_timestamptz_nano(2000)))? .distinct()? .repartition(Partitioning::RoundRobinBatch(1))? .project(vec![col("other_time").alias("my_time")])? @@ -342,8 +358,8 @@ mod tests { let time_col = Column::from_name("my_time"); let actual = find_time_range(&plan, &time_col)?; let expected = Range { - start: Bound::Included(lit_timestamp_nano(1000)), - end: Bound::Excluded(lit_timestamp_nano(2000)), + start: Bound::Included(lit_timestamptz_nano(1000)), + end: Bound::Excluded(lit_timestamptz_nano(2000)), }; assert_eq!(expected, actual); Ok(()) diff --git a/iox_query/src/logical_optimizer/influx_regex_to_datafusion_regex.rs b/iox_query/src/logical_optimizer/influx_regex_to_datafusion_regex.rs index 216b663011e..3660cdbbd2a 100644 --- a/iox_query/src/logical_optimizer/influx_regex_to_datafusion_regex.rs +++ b/iox_query/src/logical_optimizer/influx_regex_to_datafusion_regex.rs @@ -1,9 +1,8 @@ +use datafusion::logical_expr::expr::ScalarFunction; use datafusion::{ common::{tree_node::TreeNodeRewriter, DFSchema}, error::DataFusionError, - logical_expr::{ - expr::ScalarUDF, expr_rewriter::rewrite_preserving_name, LogicalPlan, Operator, - }, + logical_expr::{expr_rewriter::rewrite_preserving_name, LogicalPlan, Operator}, optimizer::{OptimizerConfig, OptimizerRule}, prelude::{binary_expr, lit, Expr}, scalar::ScalarValue, @@ -73,14 +72,14 @@ impl TreeNodeRewriter for InfluxRegexToDataFusionRegex { fn mutate(&mut self, expr: Expr) -> Result { match expr { - Expr::ScalarUDF(ScalarUDF { fun, mut args }) => { + Expr::ScalarFunction(ScalarFunction { func_def, mut args }) => { + let name = func_def.name(); if (args.len() == 2) - && ((fun.name == REGEX_MATCH_UDF_NAME) - || (fun.name == REGEX_NOT_MATCH_UDF_NAME)) + && ((name == REGEX_MATCH_UDF_NAME) || (name == REGEX_NOT_MATCH_UDF_NAME)) { if let Expr::Literal(ScalarValue::Utf8(Some(s))) = &args[1] { let s = clean_non_meta_escapes(s); - let op = match fun.name.as_str() { + let op = match name { REGEX_MATCH_UDF_NAME => Operator::RegexMatch, REGEX_NOT_MATCH_UDF_NAME => Operator::RegexNotMatch, _ => unreachable!(), @@ -89,7 +88,7 @@ impl TreeNodeRewriter for InfluxRegexToDataFusionRegex { } } - Ok(Expr::ScalarUDF(ScalarUDF { fun, args })) + Ok(Expr::ScalarFunction(ScalarFunction { func_def, args })) } _ => Ok(expr), } diff --git a/iox_query/src/logical_optimizer/mod.rs b/iox_query/src/logical_optimizer/mod.rs index 6e88a65bd23..42b72e18be3 100644 --- a/iox_query/src/logical_optimizer/mod.rs +++ b/iox_query/src/logical_optimizer/mod.rs @@ -3,9 +3,11 @@ use std::sync::Arc; use datafusion::execution::context::SessionState; use self::{ - handle_gapfill::HandleGapFill, influx_regex_to_datafusion_regex::InfluxRegexToDataFusionRegex, + extract_sleep::ExtractSleep, handle_gapfill::HandleGapFill, + influx_regex_to_datafusion_regex::InfluxRegexToDataFusionRegex, }; +mod extract_sleep; mod 
handle_gapfill; mod influx_regex_to_datafusion_regex; pub use handle_gapfill::range_predicate; @@ -16,5 +18,6 @@ pub use handle_gapfill::range_predicate; pub fn register_iox_logical_optimizers(state: SessionState) -> SessionState { state .add_optimizer_rule(Arc::new(InfluxRegexToDataFusionRegex::new())) + .add_optimizer_rule(Arc::new(ExtractSleep::new())) .add_optimizer_rule(Arc::new(HandleGapFill::new())) } diff --git a/iox_query/src/physical_optimizer/chunk_extraction.rs b/iox_query/src/physical_optimizer/chunk_extraction.rs index a462b2973ea..488b5df7854 100644 --- a/iox_query/src/physical_optimizer/chunk_extraction.rs +++ b/iox_query/src/physical_optimizer/chunk_extraction.rs @@ -5,8 +5,8 @@ use datafusion::{ datasource::physical_plan::ParquetExec, error::DataFusionError, physical_plan::{ - empty::EmptyExec, union::UnionExec, visit_execution_plan, ExecutionPlan, - ExecutionPlanVisitor, + empty::EmptyExec, placeholder_row::PlaceholderRowExec, union::UnionExec, + visit_execution_plan, ExecutionPlan, ExecutionPlanVisitor, }, }; use observability_deps::tracing::debug; @@ -141,14 +141,12 @@ impl ExecutionPlanVisitor for ExtractChunksVisitor { self.add_chunk(Arc::clone(&ext.chunk)); } } - } else if let Some(empty_exec) = plan_any.downcast_ref::() { + } else if plan_any.downcast_ref::().is_some() { // should not produce dummy data - if empty_exec.produce_one_row() { - return Err(DataFusionError::External( - String::from("EmptyExec produces row").into(), - )); - } - + return Err(DataFusionError::External( + String::from("EmptyExec produces row").into(), + )); + } else if let Some(empty_exec) = plan_any.downcast_ref::() { self.add_schema_from_exec(empty_exec).map_err(|e| { DataFusionError::Context("add schema from EmptyExec".to_owned(), Box::new(e)) })?; @@ -228,8 +226,8 @@ mod tests { let schema1 = iox_schema.as_arrow(); let schema2 = iox_schema.select_by_indices(&[]).as_arrow(); let plan = UnionExec::new(vec![ - Arc::new(EmptyExec::new(false, schema1)), - Arc::new(EmptyExec::new(false, schema2)), + Arc::new(EmptyExec::new(schema1)), + Arc::new(EmptyExec::new(schema2)), ]); assert!(extract_chunks(&plan).is_none()); } @@ -237,7 +235,7 @@ mod tests { #[test] fn test_empty_exec_with_rows() { let schema = chunk(1).schema().as_arrow(); - let plan = EmptyExec::new(true, schema); + let plan = PlaceholderRowExec::new(schema); assert!(extract_chunks(&plan).is_none()); } @@ -248,7 +246,7 @@ mod tests { DataType::Float64, true, )])); - let plan = EmptyExec::new(false, Arc::clone(&schema)); + let plan = EmptyExec::new(Arc::clone(&schema)); let (schema2, chunks, sort_key) = extract_chunks(&plan).unwrap(); assert_eq!(schema, schema2); assert!(chunks.is_empty()); @@ -274,7 +272,7 @@ mod tests { let schema = chunk1.schema().as_arrow(); let plan = chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk1)], 2); let plan = FilterExec::try_new( - df_physical_expr(plan.as_ref(), col("tag1").eq(lit("foo"))).unwrap(), + df_physical_expr(plan.schema(), col("tag1").eq(lit("foo"))).unwrap(), plan, ) .unwrap(); diff --git a/iox_query/src/physical_optimizer/combine_chunks.rs b/iox_query/src/physical_optimizer/combine_chunks.rs index a0138cd4761..d09681ea47d 100644 --- a/iox_query/src/physical_optimizer/combine_chunks.rs +++ b/iox_query/src/physical_optimizer/combine_chunks.rs @@ -1,20 +1,50 @@ use std::sync::Arc; +use arrow::compute::SortOptions; use datafusion::{ - common::tree_node::{Transformed, TreeNode}, + common::{ + plan_err, + tree_node::{Transformed, TreeNode}, + }, config::ConfigOptions, 
    error::{DataFusionError, Result},
    physical_optimizer::PhysicalOptimizerRule,
    physical_plan::{union::UnionExec, ExecutionPlan},
};
+use observability_deps::tracing::trace;
+use schema::TIME_COLUMN_NAME;

 use crate::{
-    physical_optimizer::chunk_extraction::extract_chunks, provider::chunks_to_physical_nodes,
+    physical_optimizer::{
+        chunk_extraction::extract_chunks,
+        sort::util::{collect_statistics_min_max, sort_by_value_ranges},
+    },
+    provider::chunks_to_physical_nodes,
 };

 /// Collects [`QueryChunk`]s and re-creates a appropriate physical nodes.
 ///
-/// This only works if there no filters, projections, sorts, or de-duplicate operations in the affected subtree.
+/// Invariants of inputs of the union:
+/// 1. They do not overlap on time ranges (done in previous step: TimeSplit)
+/// 2. Each input of the union is either with_chunks or other_plans.
+///    - An input with_chunks is a plan that contains only (union of) ParquetExecs or RecordBatchesExec
+///    - An input of other_plans is a plan that contains at least one node that is not a ParquetExec or
+///      RecordBatchesExec or Union of them. Examples of those other nodes are FilterExec, DeduplicateExec,
+///      ProjectionExec, etc.
+//
+/// Goals of this optimization step:
+/// i.  Combine **possible** plans with_chunks into a single union
+/// ii. - Keep the combined plan non-overlapped on time ranges. This will likely help later optimization steps.
+///     - If time ranges cannot be computed, combine all plans with_chunks into a single union.
+///
+/// Example: w = with_chunks, o = other_plans
+///   Input: |--P1 w --| |--P2 w --| |-- P3 o --| |-- P4 w --| |-- P5 w --| |-- P6 o --| |--P7 w --|
+///   Output when time ranges can be computed: Only two sets of plans that are combined: [P1, P2], [P4, P5]
+///     |------ P1 & P2 w ----| |-- P3 o --| |------ P4 & P5 w ------| |-- P6 o --| |--P7 w --|
+///   Output when time ranges cannot be computed: all plans with_chunks are combined into a single union
+///     |-------------------------- P1, P2, P4, P5, P7 w -------------------------------------|
+///     |-- P3 o --| |-- P6 o --|
+///
 ///
 /// This is mostly useful after multiple re-arrangements (e.g. [`PartitionSplit`]-[`TimeSplit`]-[`RemoveDedup`]) created
 /// a bunch of freestanding chunks that can be re-arranged into more packed, more efficient physical nodes.
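To make the grouping described above concrete, here is a minimal standalone sketch of the idea; it is not code from this patch. Contiguous with_chunks inputs are collected into one group, while any other plan closes the current run and stays in its own group. The names `group_contiguous` and `is_with_chunks` are illustrative stand-ins for the `sort_and_group_plans`/`extract_chunks` machinery in the hunks that follow.

// Hypothetical illustration only; `P` stands in for `Arc<dyn ExecutionPlan>` and
// `is_with_chunks` for `extract_chunks(plan.as_ref()).is_some()`.
fn group_contiguous<P>(plans: Vec<P>, is_with_chunks: impl Fn(&P) -> bool) -> Vec<Vec<P>> {
    let mut groups: Vec<Vec<P>> = Vec::new();
    let mut run: Vec<P> = Vec::new();
    for plan in plans {
        if is_with_chunks(&plan) {
            // extend the current run of combinable plans
            run.push(plan);
        } else {
            // a non-combinable plan closes the run and stays on its own
            if !run.is_empty() {
                groups.push(std::mem::take(&mut run));
            }
            groups.push(vec![plan]);
        }
    }
    if !run.is_empty() {
        groups.push(run);
    }
    groups
}

fn main() {
    // flags follow the w w o w w o w pattern from the example above
    let flags = vec![true, true, false, true, true, false, true];
    let groups = group_contiguous(flags, |is_w| *is_w);
    // five groups, matching [P1, P2], [P3], [P4, P5], [P6], [P7]
    assert_eq!(groups.len(), 5);
}

Grouping only contiguous runs (rather than all with_chunks inputs) is what keeps the combined plan non-overlapping on time ranges when the inputs have already been ordered by their time statistics.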
@@ -35,32 +65,31 @@ impl PhysicalOptimizerRule for CombineChunks { ) -> Result> { plan.transform_up(&|plan| { if let Some(union_exec) = plan.as_any().downcast_ref::() { - let (inputs_with_chunks, inputs_other): (Vec<_>, Vec<_>) = union_exec - .inputs() - .iter() - .cloned() - .partition(|plan| { - extract_chunks(plan.as_ref()).is_some() - }); - - if inputs_with_chunks.is_empty() { - return Ok(Transformed::No(plan)); - } - let union_of_chunks = UnionExec::new(inputs_with_chunks); - - if let Some((schema, chunks, output_sort_key)) = extract_chunks(&union_of_chunks) { - let union_of_chunks = chunks_to_physical_nodes( - &schema, - output_sort_key.as_ref(), - chunks, - config.execution.target_partitions, - ); - let Some(union_of_chunks) = union_of_chunks.as_any().downcast_ref::() else { - return Err(DataFusionError::External(format!("Expected chunks_to_physical_nodes to produce UnionExec but got {union_of_chunks:?}").into())); - }; - let final_union = UnionExec::new(union_of_chunks.inputs().iter().cloned().chain(inputs_other).collect()); - return Ok(Transformed::Yes(Arc::new(final_union))); - } + // sort and group the inputs by time range + let inputs = union_exec.inputs(); + // We only need to ensure the input are sorted by time range, + // any order is fine and hence we choose to go with ASC here + let groups = sort_and_group_plans( + inputs.clone(), + TIME_COLUMN_NAME, + SortOptions { + descending: false, + nulls_first: false, + }, + )?; + + // combine plans from each group + let plans = groups + .into_iter() + .map(|group| combine_plans(group, config)) + .collect::>>()? + .into_iter() + .flatten() + .collect::>(); + + let final_union = UnionExec::new(plans); + trace!(?final_union, "-------- final union"); + return Ok(Transformed::Yes(Arc::new(final_union))); } Ok(Transformed::No(plan)) @@ -76,6 +105,117 @@ impl PhysicalOptimizerRule for CombineChunks { } } +/// Sort the given plans on the given column name and a given sort order. +/// +/// Then group them into non-overlapped groups based on the ranges of the given column, and return the groups. +/// +/// # Input Invariants +/// - Plans do not overlap on the given column +/// +/// # Output Invariants +/// - Plans in the same group do not overlap on the given column +/// -The groups do not overlap on the given column +/// +/// # Example +/// Input: +/// +/// ```text +/// 7 plans with value ranges : |--P1 w --| |--P2 w --| |-- P3 o --| |-- P4 w --| |-- P5 w --| |-- P6 o --| |--P7 w --| +/// ``` +/// +/// Output: +/// +/// ```text +/// 5 groups: [P1, P2], [P3], [P4, P5], [P6], [P7] +/// ``` +fn sort_and_group_plans( + plans: Vec>, + col_name: &str, + sort_options: SortOptions, +) -> Result>>> { + if plans.len() <= 1 { + return Ok(vec![plans]); + } + + let Some(value_ranges) = collect_statistics_min_max(&plans, col_name)? else { + // No statistics to sort and group the plans. + // Return all plans in the same group + trace!("-------- combine chunks - cannot collect statistics min max for column {col_name}"); + return Ok(vec![plans]); + }; + + // Sort the plans by their value ranges + trace!("-------- value_ranges: {:?}", value_ranges); + let Some(plans_value_ranges) = sort_by_value_ranges(plans.clone(), value_ranges, sort_options)? + else { + // The inputs are not being sorted by value ranges, cannot group them + // Return all plans in the same group + trace!("-------- inputs are not sorted by value ranges. 
No optimization"); + return Ok(vec![plans]); + }; + + // Group plans that can be combined + let plans = plans_value_ranges.plans; + let mut final_groups = Vec::with_capacity(plans.len()); + let mut combinable_plans = Vec::new(); + for plan in plans { + if extract_chunks(plan.as_ref()).is_some() { + combinable_plans.push(plan); + } else { + if !combinable_plans.is_empty() { + final_groups.push(combinable_plans); + combinable_plans = Vec::new(); + } + final_groups.push(vec![plan]); + } + } + + if !combinable_plans.is_empty() { + final_groups.push(combinable_plans); + } + + Ok(final_groups) +} + +/// Combine the given plans with chunks into a single union. The other plans stay as is. +fn combine_plans( + plans: Vec>, + config: &ConfigOptions, +) -> Result>> { + let (inputs_with_chunks, inputs_other): (Vec<_>, Vec<_>) = plans + .iter() + .cloned() + .partition(|plan| extract_chunks(plan.as_ref()).is_some()); + + if inputs_with_chunks.is_empty() { + return Ok(plans); + } + let union_of_chunks = UnionExec::new(inputs_with_chunks); + + if let Some((schema, chunks, output_sort_key)) = extract_chunks(&union_of_chunks) { + let union_of_chunks = chunks_to_physical_nodes( + &schema, + output_sort_key.as_ref(), + chunks, + config.execution.target_partitions, + ); + let Some(union_of_chunks) = union_of_chunks.as_any().downcast_ref::() else { + return plan_err!("Expected chunks_to_physical_nodes to produce UnionExec but got {union_of_chunks:?}"); + }; + + // return other_plans and the union_of_chunks + let plans = union_of_chunks + .inputs() + .iter() + .cloned() + .chain(inputs_other) + .collect(); + return Ok(plans); + } + + Ok(plans) +} + #[cfg(test)] mod tests { use datafusion::{ @@ -89,11 +229,24 @@ mod tests { #[test] fn test_combine_single_union_tree() { - let chunk1 = TestChunk::new("table").with_id(1); - let chunk2 = TestChunk::new("table").with_id(2).with_dummy_parquet_file(); - let chunk3 = TestChunk::new("table").with_id(3); - let chunk4 = TestChunk::new("table").with_id(4).with_dummy_parquet_file(); - let chunk5 = TestChunk::new("table").with_id(5).with_dummy_parquet_file(); + let chunk1 = TestChunk::new("table") + .with_id(1) + .with_time_column_with_stats(Some(1), Some(2)); + let chunk2 = TestChunk::new("table") + .with_id(2) + .with_dummy_parquet_file() + .with_time_column_with_stats(Some(3), Some(4)); + let chunk3 = TestChunk::new("table") + .with_id(3) + .with_time_column_with_stats(Some(5), Some(6)); + let chunk4 = TestChunk::new("table") + .with_id(4) + .with_dummy_parquet_file() + .with_time_column_with_stats(Some(7), Some(8)); + let chunk5 = TestChunk::new("table") + .with_id(5) + .with_dummy_parquet_file() + .with_time_column_with_stats(Some(9), Some(10)); let schema = chunk1.schema().as_arrow(); let plan = Arc::new(UnionExec::new(vec![ chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk1), Arc::new(chunk2)], 2), @@ -114,16 +267,75 @@ mod tests { input: - " UnionExec" - " UnionExec" - - " RecordBatchesExec: chunks=1" - - " ParquetExec: file_groups={1 group: [[2.parquet]]}" + - " RecordBatchesExec: chunks=1, projection=[time]" + - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[time]" - " UnionExec" - - " RecordBatchesExec: chunks=1" - - " ParquetExec: file_groups={2 groups: [[4.parquet], [5.parquet]]}" + - " RecordBatchesExec: chunks=1, projection=[time]" + - " ParquetExec: file_groups={2 groups: [[4.parquet], [5.parquet]]}, projection=[time]" output: Ok: - " UnionExec" - - " RecordBatchesExec: chunks=2" - - " ParquetExec: file_groups={2 groups: 
[[2.parquet, 5.parquet], [4.parquet]]}" + - " RecordBatchesExec: chunks=2, projection=[time]" + - " ParquetExec: file_groups={2 groups: [[2.parquet, 5.parquet], [4.parquet]]}, projection=[time]" + "### + ); + } + + #[test] + fn test_only_combine_contiguous_arms() { + let chunk1 = TestChunk::new("table") + .with_id(1) + .with_dummy_parquet_file() + .with_time_column_with_stats(Some(1), Some(2)); + let chunk2 = TestChunk::new("table") + .with_id(2) + .with_dummy_parquet_file() + .with_time_column_with_stats(Some(3), Some(4)); + let chunk3 = TestChunk::new("table") + .with_id(3) + .with_dummy_parquet_file() + .with_time_column_with_stats(Some(5), Some(6)); + let chunk4 = TestChunk::new("table") + .with_id(4) + .with_dummy_parquet_file() + .with_time_column_with_stats(Some(7), Some(8)); + let schema = chunk1.schema().as_arrow(); + let plan = Arc::new(UnionExec::new(vec![ + chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk1)], 2), + chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk2)], 2), + Arc::new( + FilterExec::try_new( + Arc::new(Literal::new(ScalarValue::from(false))), + chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk3)], 2), + ) + .unwrap(), + ), + chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk4)], 2), + ])); + let opt = CombineChunks; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " UnionExec" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[1.parquet]]}, projection=[time]" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[time]" + - " FilterExec: false" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[3.parquet]]}, projection=[time]" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[4.parquet]]}, projection=[time]" + output: + Ok: + - " UnionExec" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[time]" + - " FilterExec: false" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[3.parquet]]}, projection=[time]" + - " ParquetExec: file_groups={1 group: [[4.parquet]]}, projection=[time]" "### ); } @@ -183,10 +395,10 @@ mod tests { @r###" --- input: - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } diff --git a/iox_query/src/physical_optimizer/dedup/dedup_null_columns.rs b/iox_query/src/physical_optimizer/dedup/dedup_null_columns.rs index 9f6539ea4c6..341ae4774cc 100644 --- a/iox_query/src/physical_optimizer/dedup/dedup_null_columns.rs +++ b/iox_query/src/physical_optimizer/dedup/dedup_null_columns.rs @@ -119,11 +119,11 @@ mod tests { --- input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - " DeduplicateExec: []" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } diff --git a/iox_query/src/physical_optimizer/dedup/dedup_sort_order.rs b/iox_query/src/physical_optimizer/dedup/dedup_sort_order.rs index 08e94e87dc6..c4b39248d2d 100644 --- a/iox_query/src/physical_optimizer/dedup/dedup_sort_order.rs +++ b/iox_query/src/physical_optimizer/dedup/dedup_sort_order.rs @@ -191,11 +191,11 @@ mod tests { --- input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } diff --git 
a/iox_query/src/physical_optimizer/dedup/partition_split.rs b/iox_query/src/physical_optimizer/dedup/partition_split.rs index 07154149854..386cd9cd94e 100644 --- a/iox_query/src/physical_optimizer/dedup/partition_split.rs +++ b/iox_query/src/physical_optimizer/dedup/partition_split.rs @@ -126,11 +126,11 @@ mod tests { --- input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } @@ -150,13 +150,13 @@ mod tests { input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={1 group: [[3.parquet]]}, projection=[field, tag1, tag2, time]" output: Ok: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={1 group: [[3.parquet]]}, projection=[field, tag1, tag2, time]" "### ); @@ -183,18 +183,18 @@ mod tests { input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={2 groups: [[3.parquet, 5.parquet], [4.parquet, 6.parquet]]}, projection=[field, tag1, tag2, time]" output: Ok: - " UnionExec" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={2 groups: [[3.parquet, 6.parquet], [5.parquet]]}, projection=[field, tag1, tag2, time]" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={1 group: [[4.parquet]]}, projection=[field, tag1, tag2, time]" "### ); @@ -238,18 +238,18 @@ mod tests { input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={2 groups: [[3.parquet, 5.parquet], [4.parquet, 6.parquet]]}, projection=[field, tag1, tag2, time]" output: Ok: - " UnionExec" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={2 groups: [[3.parquet, 6.parquet], [5.parquet]]}, projection=[field, tag1, tag2, time]" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={1 group: [[4.parquet]]}, projection=[field, tag1, tag2, time]" "### ); @@ -275,12 +275,12 @@ mod tests { input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=3" + - " RecordBatchesExec: chunks=3, projection=[field, tag1, tag2, time]" output: Ok: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=3" + - " RecordBatchesExec: chunks=3, projection=[field, tag1, tag2, time]" "### ); } diff --git 
a/iox_query/src/physical_optimizer/dedup/remove_dedup.rs b/iox_query/src/physical_optimizer/dedup/remove_dedup.rs index 4bfab071505..9558c5a205f 100644 --- a/iox_query/src/physical_optimizer/dedup/remove_dedup.rs +++ b/iox_query/src/physical_optimizer/dedup/remove_dedup.rs @@ -80,10 +80,10 @@ mod tests { --- input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } @@ -101,11 +101,11 @@ mod tests { input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" output: Ok: - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" "### ); } @@ -123,12 +123,12 @@ mod tests { input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" output: Ok: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" "### ); } @@ -147,12 +147,12 @@ mod tests { input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" output: Ok: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" "### ); } diff --git a/iox_query/src/physical_optimizer/dedup/time_split.rs b/iox_query/src/physical_optimizer/dedup/time_split.rs index 57f18baf5fa..29acccb9f7a 100644 --- a/iox_query/src/physical_optimizer/dedup/time_split.rs +++ b/iox_query/src/physical_optimizer/dedup/time_split.rs @@ -119,11 +119,11 @@ mod tests { --- input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } @@ -145,13 +145,13 @@ mod tests { input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={1 group: [[3.parquet]]}, projection=[field, tag1, tag2, time]" output: Ok: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={1 group: [[3.parquet]]}, projection=[field, tag1, tag2, time]" "### ); @@ -186,18 +186,18 @@ mod tests { input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={2 groups: [[3.parquet, 5.parquet], [4.parquet, 6.parquet]]}, projection=[field, tag1, tag2, time]" output: Ok: - " UnionExec" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={2 groups: [[6.parquet, 5.parquet], [3.parquet]]}, 
projection=[field, tag1, tag2, time]" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={1 group: [[4.parquet]]}, projection=[field, tag1, tag2, time]" "### ); @@ -223,12 +223,12 @@ mod tests { input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=3" + - " RecordBatchesExec: chunks=3, projection=[field, tag1, tag2, time]" output: Ok: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=3" + - " RecordBatchesExec: chunks=3, projection=[field, tag1, tag2, time]" "### ); } diff --git a/iox_query/src/physical_optimizer/mod.rs b/iox_query/src/physical_optimizer/mod.rs index c12331e427a..a0bf7a4cb0c 100644 --- a/iox_query/src/physical_optimizer/mod.rs +++ b/iox_query/src/physical_optimizer/mod.rs @@ -10,7 +10,7 @@ use self::{ }, predicate_pushdown::PredicatePushdown, projection_pushdown::ProjectionPushdown, - sort::parquet_sortness::ParquetSortness, + sort::{order_union_sorted_inputs::OrderUnionSortedInputs, parquet_sortness::ParquetSortness}, union::{nested_union::NestedUnion, one_union::OneUnion}, }; @@ -25,6 +25,9 @@ mod union; #[cfg(test)] mod test_util; +#[cfg(test)] +mod tests; + /// Register IOx-specific [`PhysicalOptimizerRule`]s with the SessionContext pub fn register_iox_physical_optimizers(state: SessionState) -> SessionState { // prepend IOx-specific rules to DataFusion builtins @@ -42,7 +45,12 @@ pub fn register_iox_physical_optimizers(state: SessionState) -> SessionState { Arc::new(NestedUnion), Arc::new(OneUnion), ]; + + // Append DataFUsion physical rules to the IOx-specific rules optimizers.append(&mut state.physical_optimizers().to_vec()); + // Add a rule to optimize plan with limit + optimizers.push(Arc::new(OrderUnionSortedInputs)); + state.with_physical_optimizer_rules(optimizers) } diff --git a/iox_query/src/physical_optimizer/predicate_pushdown.rs b/iox_query/src/physical_optimizer/predicate_pushdown.rs index 3e3b8b92f50..ab8ccd4bcfc 100644 --- a/iox_query/src/physical_optimizer/predicate_pushdown.rs +++ b/iox_query/src/physical_optimizer/predicate_pushdown.rs @@ -38,10 +38,8 @@ impl PhysicalOptimizerRule for PredicatePushdown { let child = children.remove(0); let child_any = child.as_any(); - if let Some(child_empty) = child_any.downcast_ref::() { - if !child_empty.produce_one_row() { - return Ok(Transformed::Yes(child)); - } + if child_any.downcast_ref::().is_some() { + return Ok(Transformed::Yes(child)); } else if let Some(child_union) = child_any.downcast_ref::() { let new_inputs = child_union .inputs() @@ -170,6 +168,7 @@ mod tests { physical_expr::PhysicalSortExpr, physical_plan::{ expressions::{BinaryExpr, Column, Literal}, + placeholder_row::PlaceholderRowExec, PhysicalExpr, Statistics, }, scalar::ScalarValue, @@ -184,11 +183,7 @@ mod tests { fn test_empty_no_rows() { let schema = schema(); let plan = Arc::new( - FilterExec::try_new( - predicate_tag(&schema), - Arc::new(EmptyExec::new(false, schema)), - ) - .unwrap(), + FilterExec::try_new(predicate_tag(&schema), Arc::new(EmptyExec::new(schema))).unwrap(), ); let opt = PredicatePushdown; insta::assert_yaml_snapshot!( @@ -197,10 +192,10 @@ mod tests { --- input: - " FilterExec: tag1@0 = foo" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } @@ -211,7 +206,7 @@ mod tests 
{ let plan = Arc::new( FilterExec::try_new( predicate_tag(&schema), - Arc::new(EmptyExec::new(true, schema)), + Arc::new(PlaceholderRowExec::new(schema)), ) .unwrap(), ); @@ -222,11 +217,11 @@ mod tests { --- input: - " FilterExec: tag1@0 = foo" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" output: Ok: - " FilterExec: tag1@0 = foo" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" "### ); } @@ -239,7 +234,7 @@ mod tests { predicate_tag(&schema), Arc::new(UnionExec::new( (0..2) - .map(|_| Arc::new(EmptyExec::new(true, Arc::clone(&schema))) as _) + .map(|_| Arc::new(PlaceholderRowExec::new(Arc::clone(&schema))) as _) .collect(), )), ) @@ -253,15 +248,15 @@ mod tests { input: - " FilterExec: tag1@0 = foo" - " UnionExec" - - " EmptyExec: produce_one_row=true" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" + - " PlaceholderRowExec" output: Ok: - " UnionExec" - " FilterExec: tag1@0 = foo" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" - " FilterExec: tag1@0 = foo" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" "### ); } @@ -274,7 +269,7 @@ mod tests { predicate_tag(&schema), Arc::new(UnionExec::new(vec![Arc::new(UnionExec::new( (0..2) - .map(|_| Arc::new(EmptyExec::new(true, Arc::clone(&schema))) as _) + .map(|_| Arc::new(PlaceholderRowExec::new(Arc::clone(&schema))) as _) .collect(), ))])), ) @@ -289,16 +284,16 @@ mod tests { - " FilterExec: tag1@0 = foo" - " UnionExec" - " UnionExec" - - " EmptyExec: produce_one_row=true" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" + - " PlaceholderRowExec" output: Ok: - " UnionExec" - " UnionExec" - " FilterExec: tag1@0 = foo" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" - " FilterExec: tag1@0 = foo" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" "### ); } @@ -310,12 +305,11 @@ mod tests { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(&schema), file_groups: vec![], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(&schema), projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![], - infinite_source: false, }; let plan = Arc::new( FilterExec::try_new( @@ -351,7 +345,7 @@ mod tests { FilterExec::try_new( predicate_field(&schema), Arc::new(DeduplicateExec::new( - Arc::new(EmptyExec::new(true, Arc::clone(&schema))), + Arc::new(PlaceholderRowExec::new(Arc::clone(&schema))), sort_expr(&schema), false, )), @@ -366,12 +360,12 @@ mod tests { input: - " FilterExec: field@2 = val" - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC]" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" output: Ok: - " FilterExec: field@2 = val" - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC]" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" "### ); } @@ -383,7 +377,7 @@ mod tests { FilterExec::try_new( predicate_tag(&schema), Arc::new(DeduplicateExec::new( - Arc::new(EmptyExec::new(true, Arc::clone(&schema))), + Arc::new(PlaceholderRowExec::new(Arc::clone(&schema))), sort_expr(&schema), false, )), @@ -398,12 +392,12 @@ mod tests { input: - " FilterExec: tag1@0 = foo" - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC]" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" output: Ok: - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC]" - " FilterExec: tag1@0 = foo" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" "### ); } @@ -422,7 +416,7 @@ mod tests { ]) .expect("not empty"), Arc::new(DeduplicateExec::new( - 
Arc::new(EmptyExec::new(true, Arc::clone(&schema))), + Arc::new(PlaceholderRowExec::new(Arc::clone(&schema))), sort_expr(&schema), false, )), @@ -437,13 +431,13 @@ mod tests { input: - " FilterExec: tag1@0 = foo AND tag1@0 = tag2@1 AND field@2 = val AND tag1@0 = field@2 AND true" - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC]" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" output: Ok: - " FilterExec: field@2 = val AND tag1@0 = field@2" - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC]" - " FilterExec: tag1@0 = foo AND tag1@0 = tag2@1 AND true" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" "### ); } diff --git a/iox_query/src/physical_optimizer/projection_pushdown.rs b/iox_query/src/physical_optimizer/projection_pushdown.rs index df26c84ecf8..0efe5977b95 100644 --- a/iox_query/src/physical_optimizer/projection_pushdown.rs +++ b/iox_query/src/physical_optimizer/projection_pushdown.rs @@ -18,6 +18,7 @@ use datafusion::{ empty::EmptyExec, expressions::Column, filter::FilterExec, + placeholder_row::PlaceholderRowExec, projection::ProjectionExec, sorts::{sort::SortExec, sort_preserving_merge::SortPreservingMergeExec}, union::UnionExec, @@ -63,10 +64,15 @@ impl PhysicalOptimizerRule for ProjectionPushdown { let child_any = child.as_any(); if let Some(child_empty) = child_any.downcast_ref::() { - let new_child = EmptyExec::new( - child_empty.produce_one_row(), - Arc::new(child_empty.schema().project(&column_indices)?), - ); + let new_child = + EmptyExec::new(Arc::new(child_empty.schema().project(&column_indices)?)); + return Ok(Transformed::Yes(Arc::new(new_child))); + } else if let Some(child_placeholder) = + child_any.downcast_ref::() + { + let new_child = PlaceholderRowExec::new(Arc::new( + child_placeholder.schema().project(&column_indices)?, + )); return Ok(Transformed::Yes(Arc::new(new_child))); } else if let Some(child_union) = child_any.downcast_ref::() { let new_inputs = child_union @@ -453,7 +459,7 @@ mod tests { let plan = Arc::new( ProjectionExec::try_new( vec![(expr_col("tag1", &schema), String::from("tag1"))], - Arc::new(EmptyExec::new(false, schema)), + Arc::new(EmptyExec::new(schema)), ) .unwrap(), ); @@ -465,10 +471,10 @@ mod tests { --- input: - " ProjectionExec: expr=[tag1@0 as tag1]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); @@ -492,7 +498,7 @@ mod tests { (expr_col("tag1", &schema), String::from("tag1")), (expr_col("field", &schema), String::from("field")), ], - Arc::new(EmptyExec::new(false, schema)), + Arc::new(EmptyExec::new(schema)), ) .unwrap(), ); @@ -504,10 +510,10 @@ mod tests { --- input: - " ProjectionExec: expr=[tag2@1 as tag2, tag1@0 as tag1, field@2 as field]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); @@ -531,7 +537,7 @@ mod tests { let plan = Arc::new( ProjectionExec::try_new( vec![(expr_col("tag2", &schema), String::from("tag1"))], - Arc::new(EmptyExec::new(false, schema)), + Arc::new(EmptyExec::new(schema)), ) .unwrap(), ); @@ -542,11 +548,11 @@ mod tests { --- input: - " ProjectionExec: expr=[tag2@1 as tag1]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - " ProjectionExec: expr=[tag2@1 as tag1]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } @@ -560,7 +566,7 @@ mod tests { (expr_col("tag1", &schema), String::from("tag1")), (expr_col("tag2", &schema), String::from("tag3")), ], - 
Arc::new(EmptyExec::new(false, schema)), + Arc::new(EmptyExec::new(schema)), ) .unwrap(), ); @@ -571,11 +577,11 @@ mod tests { --- input: - " ProjectionExec: expr=[tag1@0 as tag1, tag2@1 as tag3]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - " ProjectionExec: expr=[tag1@0 as tag1, tag2@1 as tag3]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } @@ -589,7 +595,7 @@ mod tests { Arc::new(Literal::new(ScalarValue::from("foo"))), String::from("tag1"), )], - Arc::new(EmptyExec::new(false, schema)), + Arc::new(EmptyExec::new(schema)), ) .unwrap(), ); @@ -600,11 +606,11 @@ mod tests { --- input: - " ProjectionExec: expr=[foo as tag1]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - " ProjectionExec: expr=[foo as tag1]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } @@ -725,7 +731,7 @@ mod tests { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(&schema), file_groups: vec![], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(&schema), projection: Some(projection), limit: None, table_partition_cols: vec![], @@ -743,7 +749,6 @@ mod tests { options: Default::default(), }, ]], - infinite_source: false, }; let inner = ParquetExec::new(base_config, Some(expr_string_cmp("tag1", &schema)), None); let plan = Arc::new( @@ -987,12 +992,12 @@ mod tests { --- input: - " ProjectionExec: expr=[tag1@0 as tag1]" - - " SortExec: fetch=42, expr=[tag2@1 DESC]" + - " SortExec: TopK(fetch=42), expr=[tag2@1 DESC]" - " Test" output: Ok: - " ProjectionExec: expr=[tag1@0 as tag1]" - - " SortExec: fetch=42, expr=[tag2@1 DESC]" + - " SortExec: TopK(fetch=42), expr=[tag2@1 DESC]" - " ProjectionExec: expr=[tag1@0 as tag1, tag2@1 as tag2]" - " Test" "### @@ -1033,12 +1038,12 @@ mod tests { --- input: - " ProjectionExec: expr=[tag1@0 as tag1]" - - " SortExec: fetch=42, expr=[tag2@1 DESC]" + - " SortExec: TopK(fetch=42), expr=[tag2@1 DESC]" - " Test" output: Ok: - " ProjectionExec: expr=[tag1@0 as tag1]" - - " SortExec: fetch=42, expr=[tag2@1 DESC]" + - " SortExec: TopK(fetch=42), expr=[tag2@1 DESC]" - " ProjectionExec: expr=[tag1@0 as tag1, tag2@1 as tag2]" - " Test" "### @@ -1089,7 +1094,7 @@ mod tests { #[test] fn test_nested_proj_inner_is_impure() { let schema = schema(); - let plan = Arc::new(EmptyExec::new(false, schema)); + let plan = Arc::new(EmptyExec::new(schema)); let plan = Arc::new( ProjectionExec::try_new( vec![ @@ -1121,11 +1126,11 @@ mod tests { input: - " ProjectionExec: expr=[tag1@0 as tag1]" - " ProjectionExec: expr=[foo as tag1, bar as tag2]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - " ProjectionExec: expr=[foo as tag1]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } @@ -1133,7 +1138,7 @@ mod tests { #[test] fn test_nested_proj_inner_is_pure() { let schema = schema(); - let plan = Arc::new(EmptyExec::new(false, schema)); + let plan = Arc::new(EmptyExec::new(schema)); let plan = Arc::new( ProjectionExec::try_new( vec![ @@ -1160,10 +1165,10 @@ mod tests { input: - " ProjectionExec: expr=[tag1@0 as tag1]" - " ProjectionExec: expr=[tag1@0 as tag1, tag2@1 as tag2]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); let empty_exec = test @@ -1297,10 +1302,10 @@ mod tests { --- input: - " ProjectionExec: expr=[tag1@0 as tag1]" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[tag1, tag2, field]" output: Ok: - - 
" RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[tag1]" "### ); @@ -1326,12 +1331,11 @@ mod tests { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(&schema), file_groups: vec![], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(&schema), projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![], - infinite_source: false, }; let plan = Arc::new(ParquetExec::new(base_config, None, None)); let plan = Arc::new(UnionExec::new(vec![plan])); @@ -1695,8 +1699,10 @@ mod tests { unimplemented!() } - fn statistics(&self) -> datafusion::physical_plan::Statistics { - Statistics::default() + fn statistics(&self) -> Result { + Ok(datafusion::physical_plan::Statistics::new_unknown( + &self.schema(), + )) } } diff --git a/iox_query/src/physical_optimizer/sort/mod.rs b/iox_query/src/physical_optimizer/sort/mod.rs index d0cdabb621a..9a9be8b7a80 100644 --- a/iox_query/src/physical_optimizer/sort/mod.rs +++ b/iox_query/src/physical_optimizer/sort/mod.rs @@ -2,5 +2,7 @@ //! //! [`SortExec`]: datafusion::physical_plan::sorts::sort::SortExec +pub mod order_union_sorted_inputs; pub mod parquet_sortness; pub mod push_sort_through_union; +pub mod util; diff --git a/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs.rs b/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs.rs new file mode 100644 index 00000000000..026610870b6 --- /dev/null +++ b/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs.rs @@ -0,0 +1,1487 @@ +use std::sync::Arc; + +use datafusion::{ + common::tree_node::{Transformed, TreeNode}, + config::ConfigOptions, + error::Result, + physical_optimizer::PhysicalOptimizerRule, + physical_plan::{ + displayable, expressions::Column, sorts::sort_preserving_merge::SortPreservingMergeExec, + union::UnionExec, ExecutionPlan, + }, +}; +use observability_deps::tracing::{trace, warn}; + +use crate::{ + physical_optimizer::sort::util::{collect_statistics_min_max, sort_by_value_ranges}, + provider::progressive_eval::ProgressiveEvalExec, +}; + +/// IOx specific optimization that eliminates a `SortPreservingMerge` +/// by reordering inputs in terms of their value ranges. If all inputs are non overlapping and ordered +/// by value range, they can be concatenated by `ProgressiveEval` while +/// maintaining the desired output order without actually merging. +/// +/// Find this structure: +/// SortPreservingMergeExec - on one column (DESC or ASC) +/// UnionExec +/// and if +/// - all inputs of UnionExec are already sorted (or has SortExec) with sortExpr also on time DESC or ASC accarsdingly and +/// - the streams do not overlap in values of the sorted column +/// do: +/// - order them by the sorted column DESC or ASC accordingly and +/// - replace SortPreservingMergeExec with ProgressiveEvalExec +/// +/// Notes: The difference between SortPreservingMergeExec & ProgressiveEvalExec +/// - SortPreservingMergeExec do the merge of sorted input streams. It needs each stream sorted but the streams themselves +/// can be in any random order and they can also overlap in values of sorted columns. +/// - ProgressiveEvalExec only outputs data in their input order of the streams and not do any merges. Thus in order to +/// output data in the right sort order, these three conditions must be true: +/// 1. Each input stream must sorted on the same column DESC or ASC accordingly +/// 2. The streams must be sorted on the column DESC or ASC accordingly +/// 3. 
The streams must not overlap in the values of that column.
+///
+/// Example: for col_name ranges:
+/// |--- r1---|-- r2 ---|-- r3 ---|-- r4 --|
+///
+/// Here is what the input looks like:
+///
+/// SortPreservingMergeExec: time@2 DESC, fetch=1
+/// UnionExec
+/// SortExec: expr=col_name@2 DESC <--- input stream with col_name range r3
+/// ...
+/// SortExec: expr=col_name@2 DESC <--- input stream with col_name range r1
+/// ...
+/// SortExec: expr=col_name@2 DESC <--- input stream with col_name range r4
+/// ...
+/// SortExec: expr=col_name@2 DESC <--- input stream with col_name range r2 -- assuming this SortExec has 2 output sorted streams
+/// ...
+///
+/// The streams do not overlap in time, and they are already sorted by time DESC.
+///
+/// The output will be the same except that all the input streams will be sorted by time DESC too, and it looks like
+///
+/// SortPreservingMergeExec: time@2 DESC, fetch=1
+/// UnionExec
+/// SortExec: expr=col_name@2 DESC <--- input stream with col_name range r1
+/// ...
+/// SortPreservingMergeExec: -- need this extra to merge the 2 streams into one
+/// SortExec: expr=col_name@2 DESC <--- input stream with col_name range r2
+/// ...
+/// SortExec: expr=col_name@2 DESC <--- input stream with col_name range r3
+/// ...
+/// SortExec: expr=col_name@2 DESC <--- input stream with col_name range r4
+/// ...
+///
+
+pub(crate) struct OrderUnionSortedInputs;
+
+impl PhysicalOptimizerRule for OrderUnionSortedInputs {
+ fn optimize(
+ &self,
+ plan: Arc<dyn ExecutionPlan>,
+ _config: &ConfigOptions,
+ ) -> Result<Arc<dyn ExecutionPlan>> {
+ plan.transform_up(&|plan| {
+ // Find SortPreservingMergeExec
+ let Some(sort_preserving_merge_exec) =
+ plan.as_any().downcast_ref::<SortPreservingMergeExec>()
+ else {
+ return Ok(Transformed::No(plan));
+ };
+
+ // Check if the sortExpr is only on one column
+ let sort_expr = sort_preserving_merge_exec.expr();
+ if sort_expr.len() != 1 {
+ trace!(
+ ?sort_expr,
+ "-------- sortExpr is not on one column. No optimization"
+ );
+ return Ok(Transformed::No(plan));
+ };
+ let Some(sorted_col) = sort_expr[0].expr.as_any().downcast_ref::<Column>() else {
+ trace!(
+ ?sort_expr,
+ "-------- sortExpr is not on pure column but expression. No optimization"
+ );
+ return Ok(Transformed::No(plan));
+ };
+ let sort_options = sort_expr[0].options;
+
+ // Find UnionExec
+ let Some(union_exec) = sort_preserving_merge_exec
+ .input()
+ .as_any()
+ .downcast_ref::<UnionExec>()
+ else {
+ trace!("-------- SortPreservingMergeExec input is not UnionExec. No optimization");
+ return Ok(Transformed::No(plan));
+ };
+
+ // Check that all inputs of UnionExec are already sorted on the same sort_expr as the SortPreservingMergeExec
+ let Some(union_output_ordering) = union_exec.output_ordering() else {
+ warn!(plan=%displayable(plan.as_ref()).indent(false), "Union input to SortPreservingMerge is not sorted");
+ return Ok(Transformed::No(plan));
+ };
+
+ // Check if the first PhysicalSortExpr is the same as the sortExpr[0] in SortPreservingMergeExec
+ if sort_expr[0] != union_output_ordering[0] {
+ warn!(?sort_expr, ?union_output_ordering, plan=%displayable(plan.as_ref()).indent(false), "-------- Sort order of SortPreservingMerge and its children are different");
+ return Ok(Transformed::No(plan));
+ }
+
+ let Some(value_ranges) = collect_statistics_min_max(union_exec.inputs(), sorted_col.name())?
+ else {
+ return Ok(Transformed::No(plan));
+ };
+
+ // Sort the inputs by their value ranges
+ trace!("-------- value_ranges: {:?}", value_ranges);
+ let Some(plans_value_ranges) =
+ sort_by_value_ranges(union_exec.inputs().to_vec(), value_ranges, sort_options)?
+ else {
+ trace!("-------- inputs are not sorted by value ranges. No optimization");
+ return Ok(Transformed::No(plan));
+ };
+
+ // If each input of UnionExec outputs many sorted streams, data of different streams may overlap and
+ // even if they do not overlap, their streams can be in any order. We need to (sort) merge them first
+ // to have a single output stream to guarantee the output is sorted.
+ let new_inputs = plans_value_ranges.plans
+ .iter()
+ .map(|input| {
+ if input.output_partitioning().partition_count() > 1 {
+ // Add SortPreservingMergeExec on top of this input
+ let sort_preserving_merge_exec = Arc::new(
+ SortPreservingMergeExec::new(sort_expr.to_vec(), Arc::clone(input))
+ .with_fetch(sort_preserving_merge_exec.fetch()),
+ );
+ Ok(sort_preserving_merge_exec as _)
+ } else {
+ Ok(Arc::clone(input))
+ }
+ })
+ .collect::<Result<Vec<_>>>()?;
+
+ let new_union_exec = Arc::new(UnionExec::new(new_inputs));
+
+ // Replace SortPreservingMergeExec with ProgressiveEvalExec
+ let progresive_eval_exec = Arc::new(ProgressiveEvalExec::new(
+ new_union_exec,
+ Some(plans_value_ranges.value_ranges),
+ sort_preserving_merge_exec.fetch(),
+ ));
+
+ Ok(Transformed::Yes(progresive_eval_exec))
+ })
+ }
+
+ fn name(&self) -> &str {
+ "order_union_sorted_inputs"
+ }
+
+ fn schema_check(&self) -> bool {
+ true
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use std::sync::Arc;
+
+ use arrow::{compute::SortOptions, datatypes::SchemaRef};
+ use datafusion::{
+ logical_expr::Operator,
+ physical_expr::PhysicalSortExpr,
+ physical_plan::{
+ expressions::{BinaryExpr, Column},
+ limit::GlobalLimitExec,
+ projection::ProjectionExec,
+ repartition::RepartitionExec,
+ sorts::{sort::SortExec, sort_preserving_merge::SortPreservingMergeExec},
+ union::UnionExec,
+ ExecutionPlan, Partitioning, PhysicalExpr,
+ },
+ scalar::ScalarValue,
+ };
+ use schema::{InfluxFieldType, SchemaBuilder as IOxSchemaBuilder};
+
+ use crate::{
+ physical_optimizer::{
+ sort::order_union_sorted_inputs::OrderUnionSortedInputs, test_util::OptimizationTest,
+ },
+ provider::{chunks_to_physical_nodes, DeduplicateExec, RecordBatchesExec},
+ statistics::{column_statistics_min_max, compute_stats_column_min_max},
+ test::{format_execution_plan, TestChunk},
+ QueryChunk, CHUNK_ORDER_COLUMN_NAME,
+ };
+
+ // ------------------------------------------------------------------
+ // Positive tests: the right structure found -> plan optimized
+ // ------------------------------------------------------------------
+
+ #[test]
+ fn test_limit_mix_record_batch_parquet_1_desc() {
+ test_helpers::maybe_start_logging();
+
+ // Input plan:
+ //
+ // GlobalLimitExec: skip=0, fetch=2
+ // SortPreservingMerge: [time@2 DESC]
+ // UnionExec
+ // SortExec: expr=[time@2 DESC] -- time range [1000, 2000]
+ // ParquetExec -- [1000, 2000]
+ // SortExec: expr=[time@2 DESC] -- time range [2001, 3500] from combining the time ranges of the two record batches
+ // UnionExec
+ // RecordBatchesExec -- 3 chunks [2001, 3000]
+ // RecordBatchesExec -- 2 chunks [2500, 3500]
+ //
+ // Output plan: the 2 SortExecs will have their order swapped so that time range [2001, 3500] comes first
+
+ let schema = schema();
+
+ let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000);
+ let plan_batches1 = record_batches_exec_with_value_range(3,
2001, 3000); + let plan_batches2 = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_batches1, plan_batches2])); + + let sort_order = ordering_with_options([("time", SortOp::Desc)], &schema); + let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + // min max of plan_sorted1 is [1000, 2000] + // structure of plan_sorted1 + let p_sort1 = Arc::clone(&plan_sort1) as Arc; + insta::assert_yaml_snapshot!( + format_execution_plan(&p_sort1), + @r###" + --- + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + let min_max_sort1 = compute_stats_column_min_max(&*plan_sort1, "time").unwrap(); + let min_max = column_statistics_min_max(&min_max_sort1).unwrap(); + assert_eq!( + min_max, + ( + ScalarValue::TimestampNanosecond(Some(1000), None), + ScalarValue::TimestampNanosecond(Some(2000), None) + ) + ); + // + // min max of plan_sorted2 is [2001, 3500] + let p_sort2 = Arc::clone(&plan_sort2) as Arc; + insta::assert_yaml_snapshot!( + format_execution_plan(&p_sort2), + @r###" + --- + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + "### + ); + let min_max_sort2 = compute_stats_column_min_max(&*plan_sort2, "time").unwrap(); + let min_max = column_statistics_min_max(&min_max_sort2).unwrap(); + assert_eq!( + min_max, + ( + ScalarValue::TimestampNanosecond(Some(2001), None), + ScalarValue::TimestampNanosecond(Some(3500), None) + ) + ); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + + // min max of plan_spm is [1000, 3500] + let p_spm = Arc::clone(&plan_spm) as Arc; + insta::assert_yaml_snapshot!( + format_execution_plan(&p_spm), + @r###" + --- + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + "### + ); + let min_max_spm = compute_stats_column_min_max(&*plan_spm, "time").unwrap(); + let min_max = column_statistics_min_max(&min_max_spm).unwrap(); + assert_eq!( + min_max, + ( + ScalarValue::TimestampNanosecond(Some(1000), None), + ScalarValue::TimestampNanosecond(Some(3500), None) + ) + ); + + let plan_limit = Arc::new(GlobalLimitExec::new(plan_spm, 0, Some(1))); + + // Output plan: the 2 SortExecs will be swapped the order + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_limit, opt), + @r###" + --- + input: + - " GlobalLimitExec: skip=0, fetch=1" + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, 
__chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + output: + Ok: + - " GlobalLimitExec: skip=0, fetch=1" + - " ProgressiveEvalExec: input_ranges=[(TimestampNanosecond(2001, None), TimestampNanosecond(3500, None)), (TimestampNanosecond(1000, None), TimestampNanosecond(2000, None))]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + #[test] + fn test_limit_mix_record_batch_parquet_2_desc() { + test_helpers::maybe_start_logging(); + + // Input plan: + // + // GlobalLimitExec: skip=0, fetch=2 + // SortPreservingMerge: [time@2 DESC] + // UnionExec + // SortExec: expr=[time@2 DESC] -- time range [1000, 2000] + // ParquetExec -- [1000, 2000] + // SortExec: expr=[time@2 DESC] -- time range [2001, 3500] from combine time range of two record batches + // UnionExec + // SortExec: expr=[time@2 DESC] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // ParquetExec -- [2001, 3000] + // + // Output plan: the 2 SortExecs will be swapped the order to have time range [2001, 3500] first + + let schema = schema(); + let order = ordering_with_options( + [ + ("col2", SortOp::Asc), + ("col1", SortOp::Asc), + ("time", SortOp::Asc), + ], + &schema, + ); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_sort1 = Arc::new(SortExec::new(order.clone(), plan_batches)); + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_sort1, plan_parquet2])); + + let sort_order = ordering_with_options([("time", SortOp::Desc)], &schema); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort3 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort2, plan_sort3])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + + let plan_limit = Arc::new(GlobalLimitExec::new(plan_spm, 0, Some(1))); + + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_limit, opt), + @r###" + --- + input: + - " GlobalLimitExec: skip=0, fetch=1" + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " 
GlobalLimitExec: skip=0, fetch=1" + - " ProgressiveEvalExec: input_ranges=[(TimestampNanosecond(2001, None), TimestampNanosecond(3500, None)), (TimestampNanosecond(1000, None), TimestampNanosecond(2000, None))]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // test on non-time column & order desc + #[test] + fn test_limit_mix_record_batch_parquet_non_time_sort_desc() { + test_helpers::maybe_start_logging(); + + // Input plan: + // + // GlobalLimitExec: skip=0, fetch=2 + // SortPreservingMerge: [field1@2 DESC] + // UnionExec + // SortExec: expr=[field1@2 DESC] -- time range [1000, 2000] + // ParquetExec -- [1000, 2000] + // SortExec: expr=[field1@2 DESC] -- time range [2001, 3500] from combine time range of two record batches + // UnionExec + // SortExec: expr=[field1@2 DESC] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // ParquetExec -- [2001, 3000] + // + // Output plan: the 2 SortExecs will be swapped the order to have time range [2001, 3500] first + + let schema = schema(); + let order = ordering_with_options( + [ + ("col2", SortOp::Asc), + ("col1", SortOp::Asc), + ("field1", SortOp::Asc), + ("time", SortOp::Asc), + ], + &schema, + ); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_sort1 = Arc::new(SortExec::new(order.clone(), plan_batches)); + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_sort1, plan_parquet2])); + + let sort_order = ordering_with_options([("field1", SortOp::Desc)], &schema); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort3 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort2, plan_sort3])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + + let plan_limit = Arc::new(GlobalLimitExec::new(plan_spm, 0, Some(1))); + + // Output plan: the 2 SortExecs will be swapped the order + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_limit, opt), + @r###" + --- + input: + - " GlobalLimitExec: skip=0, fetch=1" + - " SortPreservingMergeExec: [field1@2 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[field1@2 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[field1@2 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,field1@2 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + 
Ok: + - " GlobalLimitExec: skip=0, fetch=1" + - " ProgressiveEvalExec: input_ranges=[(Int64(2001), Int64(3500)), (Int64(1000), Int64(2000))]" + - " UnionExec" + - " SortExec: expr=[field1@2 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,field1@2 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[field1@2 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // test on non-time column & order asc + #[test] + fn test_limit_mix_record_batch_parquet_non_time_sort_asc() { + test_helpers::maybe_start_logging(); + + // Input plan: + // + // GlobalLimitExec: skip=0, fetch=2 + // SortPreservingMerge: [field1@2 ASC] + // UnionExec + // SortExec: expr=[field1@2 ASC] -- time range [1000, 2000] + // ParquetExec -- [1000, 2000] + // SortExec: expr=[field1@2 ASC] -- time range [2001, 3500] from combine time range of two record batches + // UnionExec + // SortExec: expr=[field1@2 ASC] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // ParquetExec -- [2001, 3000] + // + // Output plan: same as input plan + + let schema = schema(); + let order = ordering_with_options( + [ + ("col2", SortOp::Asc), + ("col1", SortOp::Asc), + ("field1", SortOp::Asc), + ("time", SortOp::Asc), + ], + &schema, + ); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_sort1 = Arc::new(SortExec::new(order.clone(), plan_batches)); + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_sort1, plan_parquet2])); + + let sort_order = ordering_with_options([("field1", SortOp::Asc)], &schema); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort3 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort2, plan_sort3])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + + let plan_limit = Arc::new(GlobalLimitExec::new(plan_spm, 0, Some(1))); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_limit, opt), + @r###" + --- + input: + - " GlobalLimitExec: skip=0, fetch=1" + - " SortPreservingMergeExec: [field1@2 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[field1@2 ASC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[field1@2 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,field1@2 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " GlobalLimitExec: skip=0, fetch=1" + - " ProgressiveEvalExec: input_ranges=[(Int64(1000), Int64(2000)), (Int64(2001), 
Int64(3500))]" + - " UnionExec" + - " SortExec: expr=[field1@2 ASC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[field1@2 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,field1@2 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // No limit & but the input is in the right sort preserving merge struct --> optimize + #[test] + fn test_spm_time_desc() { + test_helpers::maybe_start_logging(); + + // plan: + // SortPreservingMerge: [time@2 DESC] + // UnionExec + // SortExec: expr=[time@2 DESC] + // ParquetExec + // SortExec: expr=[time@2 DESC] + // UnionExec + // RecordBatchesExec + // ParquetExec + // + // Output: 2 SortExec are swapped + + let schema = schema(); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_batches, plan_parquet2])); + + let sort_order = ordering_with_options([("time", SortOp::Desc)], &schema); + let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + + // Output plan: the 2 SortExecs will be swapped the order + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_spm, opt), + @r###" + --- + input: + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " ProgressiveEvalExec: input_ranges=[(TimestampNanosecond(2001, None), TimestampNanosecond(3500, None)), (TimestampNanosecond(1000, None), TimestampNanosecond(2000, None))]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // No limit & but the input is in the right sort preserving merge struct --> optimize + #[test] + fn test_spm_non_time_desc() { + test_helpers::maybe_start_logging(); + + // plan: + // SortPreservingMerge: [field1@2 DESC] + 
// UnionExec + // SortExec: expr=[field1@2 DESC] + // ParquetExec + // SortExec: expr=[field1@2 DESC] + // UnionExec + // RecordBatchesExec + // ParquetExec + // + // Output: 2 SortExec are swapped + + let schema = schema(); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_batches, plan_parquet2])); + + let sort_order = ordering_with_options([("field1", SortOp::Desc)], &schema); + let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + + // Output plan: the 2 SortExecs will be swapped the order + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_spm, opt), + @r###" + --- + input: + - " SortPreservingMergeExec: [field1@2 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[field1@2 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[field1@2 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " ProgressiveEvalExec: input_ranges=[(Int64(2001), Int64(3500)), (Int64(1000), Int64(2000))]" + - " UnionExec" + - " SortExec: expr=[field1@2 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[field1@2 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // No limit & but the input is in the right sort preserving merge struct --> optimize + #[test] + fn test_spm_non_time_asc() { + test_helpers::maybe_start_logging(); + + // plan: + // SortPreservingMerge: [field1@2 ASC] + // UnionExec + // SortExec: expr=[field1@2 ASC] + // ParquetExec + // SortExec: expr=[field1@2 ASC] + // UnionExec + // RecordBatchesExec + // ParquetExec + // + // Output: 2 SortExec ordered as above + + let schema = schema(); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_batches, plan_parquet2])); + + let sort_order = ordering_with_options([("field1", SortOp::Asc)], &schema); + let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + 
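+ // field1 value ranges here are [1000, 2000] and [2001, 3500]. With an ASC sort they are
+ // already in range order, so the union inputs keep their positions; apart from the
+ // SortPreservingMergeExec being replaced by a ProgressiveEvalExec, the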
+ // output stays the same as input + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_spm, opt), + @r###" + --- + input: + - " SortPreservingMergeExec: [field1@2 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[field1@2 ASC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[field1@2 ASC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " ProgressiveEvalExec: input_ranges=[(Int64(1000), Int64(2000)), (Int64(2001), Int64(3500))]" + - " UnionExec" + - " SortExec: expr=[field1@2 ASC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[field1@2 ASC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // Plan starts with SortPreservingMerge and includes deduplication & projections. + // All conditions meet --> optimize + #[test] + fn test_spm_time_desc_with_dedupe_and_proj() { + test_helpers::maybe_start_logging(); + + // plan: + // SortPreservingMerge: [time@2 DESC] + // UnionExec + // SortExec: expr=[time@2 DESC] -- time range [1000, 2000] + // ProjectionExec: expr=[time] + // ParquetExec -- [1000, 2000] + // SortExec: expr=[time@2 DESC] -- time range [2001, 3500] from combine time range of record batches & parquet + // ProjectionExec: expr=[time] + // DeduplicateExec: [col1, col2, time] + // SortPreservingMergeExec: [col1 ASC, col2 ASC, time ASC] + // UnionExec + // SortExec: expr=[col1 ASC, col2 ASC, time ASC] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // SortExec: expr=[col1 ASC, col2 ASC, time ASC] + // ParquetExec -- [2001, 3000] + // + // Output: 2 SortExec are swapped + + let schema = schema(); + + let final_sort_order = ordering_with_options([("time", SortOp::Desc)], &schema); + + // Sort plan of the first parquet: + // SortExec: expr=[time@2 DESC] -- time range [1000, 2000] + // ProjectionExec: expr=[time] + // ParquetExec + let plan_parquet_1 = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_projection_1 = Arc::new( + ProjectionExec::try_new( + vec![(expr_col("time", &schema), String::from("time"))], + plan_parquet_1, + ) + .unwrap(), + ); + let plan_sort1 = Arc::new(SortExec::new(final_sort_order.clone(), plan_projection_1)); + + // Sort plan of the second parquet and the record batch + // SortExec: expr=[time@2 DESC] -- time range [2001, 3500] from combine time range of record batches & parquet + // ProjectionExec: expr=[time] + // DeduplicateExec: [col1, col2, time] + // SortPreservingMergeExec: [col1 ASC, col2 ASC, time ASC] + // UnionExec + // SortExec: expr=[col1 ASC, col2 ASC, time ASC] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // SortExec: expr=[col1 ASC, col2 ASC, time ASC] + // ParquetExec -- [2001, 3000] + let plan_parquet_2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + let 
dedupe_sort_order = ordering_with_options( + [ + ("col1", SortOp::Asc), + ("col2", SortOp::Asc), + ("time", SortOp::Asc), + ], + &schema, + ); + let plan_sort_rb = Arc::new(SortExec::new(dedupe_sort_order.clone(), plan_batches)); + let plan_sort_pq = Arc::new(SortExec::new(dedupe_sort_order.clone(), plan_parquet_2)); + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_sort_rb, plan_sort_pq])); + let plan_spm_1 = Arc::new(SortPreservingMergeExec::new( + dedupe_sort_order.clone(), + plan_union_1, + )); + let plan_dedupe = Arc::new(DeduplicateExec::new(plan_spm_1, dedupe_sort_order, false)); + let plan_projection_2 = Arc::new( + ProjectionExec::try_new( + vec![(expr_col("time", &schema), String::from("time"))], + plan_dedupe, + ) + .unwrap(), + ); + let plan_sort2 = Arc::new(SortExec::new(final_sort_order.clone(), plan_projection_2)); + + // Union them together + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + // SortPreservingMerge them + let plan_spm = Arc::new(SortPreservingMergeExec::new( + final_sort_order.clone(), + plan_union_2, + )); + + // compute statistics + let min_max_spm = compute_stats_column_min_max(&*plan_spm, "time").unwrap(); + let min_max = column_statistics_min_max(&min_max_spm).unwrap(); + assert_eq!( + min_max, + ( + ScalarValue::TimestampNanosecond(Some(1000), None), + ScalarValue::TimestampNanosecond(Some(3500), None) + ) + ); + + // Output plan: the 2 SortExecs will be swapped the order + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_spm, opt), + @r###" + --- + input: + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ProjectionExec: expr=[time@3 as time]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ProjectionExec: expr=[time@3 as time]" + - " DeduplicateExec: [col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortPreservingMergeExec: [col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " ProgressiveEvalExec: input_ranges=[(TimestampNanosecond(2001, None), TimestampNanosecond(3500, None)), (TimestampNanosecond(1000, None), TimestampNanosecond(2000, None))]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ProjectionExec: expr=[time@3 as time]" + - " DeduplicateExec: [col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortPreservingMergeExec: [col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], 
output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ProjectionExec: expr=[time@3 as time]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // ------------------------------------------------------------------ + // Negative tests: the right structure not found -> nothing optimized + // ------------------------------------------------------------------ + + // Right stucture but sort on 2 columns --> plan stays the same + #[test] + fn test_negative_spm_2_column_sort_desc() { + test_helpers::maybe_start_logging(); + + // plan: + // SortPreservingMerge: [time@3 DESC, field1@2 DESC] + // UnionExec + // SortExec: expr=[time@3 DESC, field1@2 DESC] + // ParquetExec + // SortExec: expr=[time@3 DESC, field1@2 DESC] + // UnionExec + // RecordBatchesExec + // ParquetExec + // + // Output: same as input + + let schema = schema(); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_batches, plan_parquet2])); + + let sort_order = + ordering_with_options([("time", SortOp::Desc), ("field1", SortOp::Desc)], &schema); + let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_spm, opt), + @r###" + --- + input: + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST,field1@2 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST,field1@2 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST,field1@2 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST,field1@2 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST,field1@2 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST,field1@2 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // No limit & random plan --> plan stay the same + #[test] + fn test_negative_no_limit() { + test_helpers::maybe_start_logging(); + + let schema = schema(); + let order = ordering_with_options( + [ + ("col2", SortOp::Asc), + ("col1", SortOp::Asc), + ("time", SortOp::Asc), + ], + &schema, + ); + + let 
plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_batches = record_batches_exec_with_value_range(2, 1500, 2500); + + let plan = Arc::new(UnionExec::new(vec![plan_batches, plan_parquet])); + let plan = + Arc::new(RepartitionExec::try_new(plan, Partitioning::RoundRobinBatch(8)).unwrap()); + let hash_exprs = order.iter().cloned().map(|e| e.expr).collect(); + let plan = + Arc::new(RepartitionExec::try_new(plan, Partitioning::Hash(hash_exprs, 8)).unwrap()); + let plan = Arc::new(SortExec::new(order.clone(), plan)); + let plan = Arc::new(DeduplicateExec::new(plan, order, true)); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3], 8), input_partitions=8" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=3" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " DeduplicateExec: [col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3], 8), input_partitions=8" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=3" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // has limit but no sort preserving merge --> plan stay the same + #[test] + fn test_negative_limit_no_preserving_merge() { + test_helpers::maybe_start_logging(); + + let schema = schema(); + + let plan_batches1 = record_batches_exec_with_value_range(1, 1000, 2000); + let plan_batches2 = record_batches_exec_with_value_range(3, 2001, 3000); + let plan_batches3 = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_batches2, plan_batches3])); + + let sort_order = ordering_with_options([("time", SortOp::Desc)], &schema); + let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_batches1)); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + let plan_limit = Arc::new(GlobalLimitExec::new(plan_union_2, 0, Some(1))); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_limit, opt), + @r###" + --- + input: + - " GlobalLimitExec: skip=0, fetch=1" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " RecordBatchesExec: chunks=1, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + output: + Ok: + - " 
GlobalLimitExec: skip=0, fetch=1" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " RecordBatchesExec: chunks=1, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + "### + ); + } + + // right structure and same sort order but inputs of uion overlap --> plan stay the same + #[test] + fn test_negative_overlap() { + test_helpers::maybe_start_logging(); + + // Input plan: + // + // GlobalLimitExec: skip=0, fetch=2 + // SortPreservingMerge: [time@2 DESC] + // UnionExec + // SortExec: expr=[time@2 DESC] -- time range [1000, 2000] that overlaps with the other SorExec + // ParquetExec -- [1000, 2000] + // SortExec: expr=[time@2 DESC] -- time range [2000, 3500] from combine time range of two record batches + // UnionExec + // SortExec: expr=[time@2 DESC] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // ParquetExec -- [2000, 3000] + + let schema = schema(); + let order = ordering_with_options( + [ + ("col2", SortOp::Asc), + ("col1", SortOp::Asc), + ("time", SortOp::Asc), + ], + &schema, + ); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2000, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_sort1 = Arc::new(SortExec::new(order.clone(), plan_batches)); + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_sort1, plan_parquet2])); + + let sort_order = ordering_with_options([("time", SortOp::Desc)], &schema); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort3 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort2, plan_sort3])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + + let plan_limit = Arc::new(GlobalLimitExec::new(plan_spm, 0, Some(1))); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_limit, opt), + @r###" + --- + input: + - " GlobalLimitExec: skip=0, fetch=1" + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " GlobalLimitExec: skip=0, fetch=1" + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, 
field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // No limit & but the input is in the right union struct --> plan stay the same + #[test] + fn test_negative_no_sortpreservingmerge_input_union() { + test_helpers::maybe_start_logging(); + + // plan: + // UnionExec + // SortExec: expr=[time@2 DESC] + // ParquetExec + // SortExec: expr=[time@2 DESC] + // UnionExec + // RecordBatchesExec + // ParquetExec + + let schema = schema(); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_batches, plan_parquet2])); + + let sort_order = ordering_with_options([("time", SortOp::Desc)], &schema); + let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_union_2, opt), + @r###" + --- + input: + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // Projection expression (field + field) ==> not optimze. 
Plan stays the same + #[test] + fn test_negative_spm_time_desc_with_dedupe_and_proj_on_expr() { + test_helpers::maybe_start_logging(); + + // plan: + // SortPreservingMerge: [time@2 DESC] + // UnionExec + // SortExec: expr=[time@2 DESC] -- time range [1000, 2000] + // ProjectionExec: expr=[field1 + field1, time] <-- NOTE: has expresssion col1+col2 + // ParquetExec -- [1000, 2000] + // SortExec: expr=[time@2 DESC] -- time range [2001, 3500] from combine time range of record batches & parquet + // ProjectionExec: expr=[field1 + field1, time] <-- NOTE: has expresssion col1+col2 + // DeduplicateExec: [col1, col2, time] + // SortPreservingMergeExec: [col1 ASC, col2 ASC, time ASC] + // UnionExec + // SortExec: expr=[col1 ASC, col2 ASC, time ASC] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // SortExec: expr=[col1 ASC, col2 ASC, time ASC] + // ParquetExec -- [2001, 3000] + + let schema = schema(); + + let final_sort_order = ordering_with_options([("time", SortOp::Desc)], &schema); + + // Sort plan of the first parquet: + // SortExec: expr=[time@2 DESC] -- time range [1000, 2000] + // ProjectionExec: expr=[field1 + field1, time] + // ParquetExec + let plan_parquet_1 = parquet_exec_with_value_range(&schema, 1000, 2000); + + let field_expr = Arc::new(BinaryExpr::new( + Arc::new(Column::new_with_schema("field1", &schema).unwrap()), + Operator::Plus, + Arc::new(Column::new_with_schema("field1", &schema).unwrap()), + )); + let plan_projection_1 = Arc::new( + ProjectionExec::try_new( + vec![ + (Arc::::clone(&field_expr), String::from("field")), + (expr_col("time", &schema), String::from("time")), + ], + plan_parquet_1, + ) + .unwrap(), + ); + let plan_sort1 = Arc::new(SortExec::new(final_sort_order.clone(), plan_projection_1)); + + // Sort plan of the second parquet and the record batch + // SortExec: expr=[time@2 DESC] -- time range [2001, 3500] from combine time range of record batches & parquet + // ProjectionExec: expr=[field1 + field1, time] + // DeduplicateExec: [col1, col2, time] + // SortPreservingMergeExec: [col1 ASC, col2 ASC, time ASC] + // UnionExec + // SortExec: expr=[col1 ASC, col2 ASC, time ASC] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // SortExec: expr=[col1 ASC, col2 ASC, time ASC] + // ParquetExec -- [2001, 3000] + let plan_parquet_2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + let dedupe_sort_order = ordering_with_options( + [ + ("col1", SortOp::Asc), + ("col2", SortOp::Asc), + ("time", SortOp::Asc), + ], + &schema, + ); + let plan_sort_rb = Arc::new(SortExec::new(dedupe_sort_order.clone(), plan_batches)); + let plan_sort_pq = Arc::new(SortExec::new(dedupe_sort_order.clone(), plan_parquet_2)); + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_sort_rb, plan_sort_pq])); + let plan_spm_1 = Arc::new(SortPreservingMergeExec::new( + dedupe_sort_order.clone(), + plan_union_1, + )); + let plan_dedupe = Arc::new(DeduplicateExec::new(plan_spm_1, dedupe_sort_order, false)); + let plan_projection_2 = Arc::new( + ProjectionExec::try_new( + vec![ + (field_expr, String::from("field")), + (expr_col("time", &schema), String::from("time")), + ], + plan_dedupe, + ) + .unwrap(), + ); + let plan_sort2 = Arc::new(SortExec::new(final_sort_order.clone(), plan_projection_2)); + + // Union them together + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + // SortPreservingMerge them + let plan_spm = Arc::new(SortPreservingMergeExec::new( + final_sort_order.clone(), + 
plan_union_2, + )); + + // compute statistics: no stats becasue the ProjectionExec includes expression + let min_max_spm = compute_stats_column_min_max(&*plan_spm, "time").unwrap(); + let min_max = column_statistics_min_max(&min_max_spm); + assert!(min_max.is_none()); + + // output plan stays the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_spm, opt), + @r###" + --- + input: + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ProjectionExec: expr=[field1@2 + field1@2 as field, time@3 as time]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ProjectionExec: expr=[field1@2 + field1@2 as field, time@3 as time]" + - " DeduplicateExec: [col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortPreservingMergeExec: [col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ProjectionExec: expr=[field1@2 + field1@2 as field, time@3 as time]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ProjectionExec: expr=[field1@2 + field1@2 as field, time@3 as time]" + - " DeduplicateExec: [col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortPreservingMergeExec: [col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // ------------------------------------------------------------------ + // Helper functions + // ------------------------------------------------------------------ + + fn schema() -> SchemaRef { + IOxSchemaBuilder::new() + .tag("col1") + .tag("col2") + .influx_field("field1", InfluxFieldType::Float) + .timestamp() + .influx_field(CHUNK_ORDER_COLUMN_NAME, InfluxFieldType::Integer) + .build() + .unwrap() + .into() + } + + fn expr_col(name: &str, schema: &SchemaRef) -> Arc { + Arc::new(Column::new_with_schema(name, schema).unwrap()) + } + + // test chunk with time range and field1's value range + fn test_chunk(min: i64, max: i64, parquet_data: bool) -> Arc { + let chunk = TestChunk::new("t") + .with_time_column_with_stats(Some(min), Some(max)) + .with_tag_column_with_stats("col1", Some("AL"), Some("MT")) + .with_tag_column_with_stats("col2", Some("MA"), Some("VY")) + 
.with_i64_field_column_with_stats("field1", Some(min), Some(max));
+
+ let chunk = if parquet_data {
+ chunk.with_dummy_parquet_file()
+ } else {
+ chunk
+ };
+
+ Arc::new(chunk) as Arc<dyn QueryChunk>
+ }
+
+ fn record_batches_exec_with_value_range(
+ n_chunks: usize,
+ min: i64,
+ max: i64,
+ ) -> Arc<dyn ExecutionPlan> {
+ let chunks = std::iter::repeat(test_chunk(min, max, false))
+ .take(n_chunks)
+ .collect::<Vec<_>>();
+
+ Arc::new(RecordBatchesExec::new(chunks, schema(), None))
+ }
+
+ fn parquet_exec_with_value_range(
+ schema: &SchemaRef,
+ min: i64,
+ max: i64,
+ ) -> Arc<dyn ExecutionPlan> {
+ let chunk = test_chunk(min, max, true);
+ let plan = chunks_to_physical_nodes(schema, None, vec![chunk], 1);
+
+ if let Some(union_exec) = plan.as_any().downcast_ref::<UnionExec>() {
+ if union_exec.inputs().len() == 1 {
+ Arc::clone(&union_exec.inputs()[0])
+ } else {
+ plan
+ }
+ } else {
+ plan
+ }
+ }
+
+ fn ordering_with_options<const N: usize>(
+ cols: [(&str, SortOp); N],
+ schema: &SchemaRef,
+ ) -> Vec<PhysicalSortExpr> {
+ cols.into_iter()
+ .map(|col| PhysicalSortExpr {
+ expr: Arc::new(Column::new_with_schema(col.0, schema.as_ref()).unwrap()),
+ options: SortOptions {
+ descending: col.1 == SortOp::Desc,
+ nulls_first: false,
+ },
+ })
+ .collect()
+ }
+
+ #[derive(Debug, PartialEq)]
+ enum SortOp {
+ Asc,
+ Desc,
+ }
+}
diff --git a/iox_query/src/physical_optimizer/sort/parquet_sortness.rs b/iox_query/src/physical_optimizer/sort/parquet_sortness.rs
index bf2c9440733..c0f4a132dab 100644
--- a/iox_query/src/physical_optimizer/sort/parquet_sortness.rs
+++ b/iox_query/src/physical_optimizer/sort/parquet_sortness.rs
@@ -180,8 +180,8 @@ mod tests {
datasource::{listing::PartitionedFile, object_store::ObjectStoreUrl},
physical_expr::PhysicalSortExpr,
physical_plan::{
- empty::EmptyExec, expressions::Column, sorts::sort::SortExec, union::UnionExec,
- Statistics,
+ expressions::Column, placeholder_row::PlaceholderRowExec, sorts::sort::SortExec,
+ union::UnionExec, Statistics,
},
};
use object_store::{path::Path, ObjectMeta};
@@ -202,12 +202,11 @@ mod tests {
object_store_url: ObjectStoreUrl::parse("test://").unwrap(),
file_schema: Arc::clone(&schema),
file_groups: vec![vec![file(1), file(2)]],
- statistics: Statistics::default(),
+ statistics: Statistics::new_unknown(&schema),
projection: None,
limit: None,
table_partition_cols: vec![],
output_ordering: vec![ordering(["col2", "col1"], &schema)],
- infinite_source: false,
};
let inner = ParquetExec::new(base_config, None, None);
let plan = Arc::new(
@@ -220,11 +219,11 @@ mod tests {
@r###"
---
input:
- - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]"
+ - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]"
- " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]"
output:
Ok:
- - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]"
+ - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]"
- " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]"
"###
);
@@ -237,12 +236,11 @@ mod tests {
object_store_url: ObjectStoreUrl::parse("test://").unwrap(),
file_schema: Arc::clone(&schema),
file_groups: vec![vec![file(1), file(2)]],
- statistics: Statistics::default(),
+ statistics: Statistics::new_unknown(&schema),
projection: None,
limit: None,
table_partition_cols: vec![],
output_ordering: vec![ordering(["col2", "col1", CHUNK_ORDER_COLUMN_NAME], &schema)],
- infinite_source: false,
};
let inner = ParquetExec::new(base_config, None, None);
let plan =
Arc::new(DeduplicateExec::new( @@ -273,12 +271,11 @@ mod tests { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(&schema), file_groups: vec![vec![file(1), file(2)], vec![file(3)]], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(&schema), projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![ordering(["col2", "col1"], &schema)], - infinite_source: false, }; let inner = ParquetExec::new(base_config, None, None); let plan = Arc::new( @@ -296,11 +293,11 @@ mod tests { @r###" --- input: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" - " ParquetExec: file_groups={2 groups: [[1.parquet, 2.parquet], [3.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]" output: Ok: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" - " ParquetExec: file_groups={3 groups: [[1.parquet], [2.parquet], [3.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]" "### ); @@ -315,12 +312,11 @@ mod tests { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(&schema), file_groups: vec![vec![file(1)], vec![file(2)]], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(&schema), projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![ordering(["col2", "col1"], &schema)], - infinite_source: false, }; let inner = ParquetExec::new(base_config, None, None); let plan = Arc::new( @@ -333,11 +329,11 @@ mod tests { @r###" --- input: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]" output: Ok: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]" "### ); @@ -350,12 +346,11 @@ mod tests { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(&schema), file_groups: vec![vec![file(1), file(2)]], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(&schema), projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![ordering(["col1", "col2"], &schema)], - infinite_source: false, }; let inner = ParquetExec::new(base_config, None, None); let plan = Arc::new( @@ -368,11 +363,11 @@ mod tests { @r###" --- input: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col1@0 ASC, col2@1 ASC]" output: Ok: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col1@0 ASC, col2@1 ASC]" "### ); @@ -385,12 +380,11 @@ mod tests { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(&schema), file_groups: vec![vec![file(1), file(2)]], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(&schema), 
projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![], - infinite_source: false, }; let inner = ParquetExec::new(base_config, None, None); let plan = Arc::new( @@ -403,11 +397,11 @@ mod tests { @r###" --- input: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3]" output: Ok: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3]" "### ); @@ -420,12 +414,11 @@ mod tests { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(&schema), file_groups: vec![vec![file(1), file(2), file(3)]], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(&schema), projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![ordering(["col2", "col1"], &schema)], - infinite_source: false, }; let inner = ParquetExec::new(base_config, None, None); let plan = Arc::new( @@ -443,11 +436,11 @@ mod tests { @r###" --- input: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet, 3.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]" output: Ok: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet, 3.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]" "### ); @@ -456,7 +449,7 @@ mod tests { #[test] fn test_other_node() { let schema = schema(); - let inner = EmptyExec::new(true, Arc::clone(&schema)); + let inner = PlaceholderRowExec::new(Arc::clone(&schema)); let plan = Arc::new( SortExec::new(ordering(["col2", "col1"], &schema), Arc::new(inner)) .with_fetch(Some(42)), @@ -467,12 +460,12 @@ mod tests { @r###" --- input: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" - - " EmptyExec: produce_one_row=true" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " PlaceholderRowExec" output: Ok: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" - - " EmptyExec: produce_one_row=true" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " PlaceholderRowExec" "### ); } @@ -484,12 +477,11 @@ mod tests { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(&schema), file_groups: vec![vec![file(1), file(2)]], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(&schema), projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![ordering(["col2", "col1"], &schema)], - infinite_source: false, }; let plan = Arc::new(ParquetExec::new(base_config, None, None)); let opt = ParquetSortness; @@ -513,12 +505,11 @@ mod tests { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(&schema), file_groups: vec![vec![file(1), file(2)]], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(&schema), projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![ordering(["col1", "col2"], &schema)], - infinite_source: false, }; let plan = Arc::new(ParquetExec::new(base_config, None, None)); let plan = @@ -531,13 +522,13 @@ mod 
tests { @r###" --- input: - - " SortExec: fetch=42, expr=[col1@0 ASC,col2@1 ASC]" - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col1@0 ASC,col2@1 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col1@0 ASC, col2@1 ASC]" output: Ok: - - " SortExec: fetch=42, expr=[col1@0 ASC,col2@1 ASC]" - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col1@0 ASC,col2@1 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col1@0 ASC, col2@1 ASC]" "### ); @@ -550,12 +541,11 @@ mod tests { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(&schema), file_groups: vec![vec![file(1), file(2)]], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(&schema), projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![ordering(["col1", "col2"], &schema)], - infinite_source: false, }; let plan = Arc::new(ParquetExec::new(base_config, None, None)); let plan = @@ -568,13 +558,13 @@ mod tests { @r###" --- input: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" - - " SortExec: fetch=42, expr=[col1@0 ASC,col2@1 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col1@0 ASC,col2@1 ASC]" - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col1@0 ASC, col2@1 ASC]" output: Ok: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" - - " SortExec: fetch=42, expr=[col1@0 ASC,col2@1 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col1@0 ASC,col2@1 ASC]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col1@0 ASC, col2@1 ASC]" "### ); @@ -588,12 +578,11 @@ mod tests { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(&schema), file_groups: vec![vec![file(1), file(2)]], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(&schema), projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![ordering(["col2", "col1", CHUNK_ORDER_COLUMN_NAME], &schema)], - infinite_source: false, }; let plan_parquet = Arc::new(ParquetExec::new(base_config, None, None)); let plan_batches = Arc::new(RecordBatchesExec::new(vec![], Arc::clone(&schema), None)); @@ -612,13 +601,13 @@ mod tests { input: - " DeduplicateExec: [col2@1 ASC,col1@0 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=0" + - " RecordBatchesExec: chunks=0, projection=[col1, col2, col3, __chunk_order]" - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, __chunk_order@3 ASC]" output: Ok: - " DeduplicateExec: [col2@1 ASC,col1@0 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=0" + - " RecordBatchesExec: chunks=0, projection=[col1, col2, col3, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, col3, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, __chunk_order@3 ASC]" "### ); @@ -650,6 +639,7 @@ mod tests { last_modified: Default::default(), size: 0, e_tag: None, + 
version: None, }, partition_values: vec![], range: None, diff --git a/iox_query/src/physical_optimizer/sort/push_sort_through_union.rs b/iox_query/src/physical_optimizer/sort/push_sort_through_union.rs index 6563a86512e..f76772abe3f 100644 --- a/iox_query/src/physical_optimizer/sort/push_sort_through_union.rs +++ b/iox_query/src/physical_optimizer/sort/push_sort_through_union.rs @@ -1,13 +1,13 @@ use std::sync::Arc; use datafusion::{ - common::tree_node::{RewriteRecursion, Transformed, TreeNode, TreeNodeRewriter}, - config::ConfigOptions, - error::Result, - physical_expr::{ - utils::ordering_satisfy_requirement, - {PhysicalSortExpr, PhysicalSortRequirement}, + common::{ + internal_err, + tree_node::{RewriteRecursion, Transformed, TreeNode, TreeNodeRewriter}, }, + config::ConfigOptions, + error::{DataFusionError, Result}, + physical_expr::{PhysicalSortExpr, PhysicalSortRequirement}, physical_optimizer::PhysicalOptimizerRule, physical_plan::{ repartition::RepartitionExec, sorts::sort::SortExec, union::UnionExec, ExecutionPlan, @@ -67,7 +67,7 @@ impl PhysicalOptimizerRule for PushSortThroughUnion { return Ok(Transformed::No(plan)); }; - if !sort_should_be_pushed_down(sort_exec) { + if !sort_should_be_pushed_down(sort_exec)? { return Ok(Transformed::No(plan)); } @@ -80,16 +80,16 @@ impl PhysicalOptimizerRule for PushSortThroughUnion { // As a sanity check, make sure plan has the same ordering as before. // If this fails, there is a bug in this optimization. - let required_order = sort_exec.output_ordering().map(sort_exprs_to_requirement); - if !ordering_satisfy_requirement( - plan.output_ordering(), - required_order.as_deref(), - || plan.equivalence_properties(), - || plan.ordering_equivalence_properties(), - ) { - return Err(datafusion::error::DataFusionError::Internal( - "PushSortThroughUnion corrupted plan sort order".into(), - )); + let Some(required_order) = sort_exec.output_ordering().map(sort_exprs_to_requirement) + else { + return internal_err!("No sort order after a sort"); + }; + + if !plan + .equivalence_properties() + .ordering_satisfy_requirement(&required_order) + { + return internal_err!("PushSortThroughUnion corrupted plan sort order"); } Ok(Transformed::Yes(plan)) @@ -106,7 +106,7 @@ impl PhysicalOptimizerRule for PushSortThroughUnion { } /// Returns true if the [`SortExec`] can be pushed down beneath a [`UnionExec`]. -fn sort_should_be_pushed_down(sort_exec: &SortExec) -> bool { +fn sort_should_be_pushed_down(sort_exec: &SortExec) -> Result { // Skip over any RepartitionExecs let mut input = sort_exec.input(); while input.as_any().is::() { @@ -118,22 +118,21 @@ fn sort_should_be_pushed_down(sort_exec: &SortExec) -> bool { } let Some(union_exec) = input.as_any().downcast_ref::() else { - return false; + return Ok(false); }; - let required_ordering = sort_exec.output_ordering().map(sort_exprs_to_requirement); + let Some(required_order) = sort_exec.output_ordering().map(sort_exprs_to_requirement) else { + return internal_err!("No sort order after a sort"); + }; // Push down the sort if any of the children are already sorted. // This means we will need to sort fewer rows than if we didn't // push down the sort. 
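+    // For example: if the UnionExec has a ParquetExec child whose output_ordering already satisfies the required order + // and an unsorted RecordBatchesExec child, only the RecordBatchesExec side needs a SortExec after the push down + // (illustrative summary of the snapshot tests below).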
- union_exec.children().iter().any(|child| { - ordering_satisfy_requirement( - child.output_ordering(), - required_ordering.as_deref(), - || child.equivalence_properties(), - || child.ordering_equivalence_properties(), - ) - }) + Ok(union_exec.children().iter().any(|child| { + child + .equivalence_properties() + .ordering_satisfy_requirement(&required_order) + })) } /// Rewrites a plan: @@ -166,23 +165,21 @@ impl TreeNodeRewriter for SortRewriter { Arc::clone(repartition_exec.input()), repartition_exec.output_partitioning(), )? - .with_preserve_order(true), + .with_preserve_order(), )) } else if let Some(union_exec) = plan.as_any().downcast_ref::() { // Any children of the UnionExec that are not already sorted, // need to be sorted. - let required_ordering = Some(sort_exprs_to_requirement(self.ordering.as_ref())); + let required_ordering = sort_exprs_to_requirement(self.ordering.as_ref()); let new_children = union_exec .children() .into_iter() .map(|child| { - if !ordering_satisfy_requirement( - child.output_ordering(), - required_ordering.as_deref(), - || child.equivalence_properties(), - || child.ordering_equivalence_properties(), - ) { + if !child + .equivalence_properties() + .ordering_satisfy_requirement(&required_ordering) + { let sort_exec = SortExec::new(self.ordering.clone(), child) .with_preserve_partitioning(true); Arc::new(sort_exec) @@ -266,16 +263,16 @@ mod test { - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" output: Ok: - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" - - " SortPreservingRepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - - " SortPreservingRepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8, preserve_order=true, sort_exprs=col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4, preserve_order=true, sort_exprs=col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC" - " UnionExec" - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" "### ); @@ -317,17 +314,17 @@ mod test { - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" output: 
Ok: - " SortExec: expr=[time@3 ASC]" - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" - - " SortPreservingRepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - - " SortPreservingRepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8, preserve_order=true, sort_exprs=col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4, preserve_order=true, sort_exprs=col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC" - " UnionExec" - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" "### ); @@ -358,14 +355,14 @@ mod test { - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" output: Ok: - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" - " UnionExec" - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" "### ); @@ -402,8 +399,8 @@ mod test { - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" - " UnionExec" - - " RecordBatchesExec: chunks=2" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" output: Ok: - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" @@ -411,8 +408,8 @@ mod test { - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" - " UnionExec" - - " RecordBatchesExec: chunks=2" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" "### ); } @@ -454,8 +451,8 @@ mod test { output: Ok: - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" - - " SortPreservingRepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - - " SortPreservingRepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" + - " RepartitionExec: 
partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8, preserve_order=true, sort_exprs=col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4, preserve_order=true, sort_exprs=col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC" - " UnionExec" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" @@ -537,16 +534,16 @@ mod test { - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" output: Ok: - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" - - " SortPreservingRepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - - " SortPreservingRepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8, preserve_order=true, sort_exprs=col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4, preserve_order=true, sort_exprs=col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC" - " UnionExec" - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" "### ); @@ -586,7 +583,7 @@ mod test { - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" output: Ok: @@ -596,7 +593,7 @@ mod test { - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" "### ); @@ -635,7 +632,7 @@ mod test { - " RepartitionExec: 
partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col1@0 ASC, col2@1 ASC, time@3 ASC, __chunk_order@4 ASC]" output: Ok: @@ -644,7 +641,7 @@ mod test { - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col1@0 ASC, col2@1 ASC, time@3 ASC, __chunk_order@4 ASC]" "### ); @@ -662,12 +659,11 @@ mod test { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(schema), file_groups: vec![vec![file(1)], vec![file(2)]], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(schema), projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![order.to_vec()], - infinite_source: false, }; Arc::new(ParquetExec::new(base_config, None, None)) } @@ -691,6 +687,7 @@ mod test { last_modified: Default::default(), size: 0, e_tag: None, + version: None, }, partition_values: vec![], range: None, diff --git a/iox_query/src/physical_optimizer/sort/util.rs b/iox_query/src/physical_optimizer/sort/util.rs new file mode 100644 index 00000000000..274b016c225 --- /dev/null +++ b/iox_query/src/physical_optimizer/sort/util.rs @@ -0,0 +1,102 @@ +use std::sync::Arc; + +use crate::statistics::{column_statistics_min_max, compute_stats_column_min_max, overlap}; +use arrow::compute::{rank, SortOptions}; +use datafusion::{error::Result, physical_plan::ExecutionPlan, scalar::ScalarValue}; +use observability_deps::tracing::trace; + +/// Compute statistics for the given plans on a given column name +/// Return none if the statistics are not available +pub(crate) fn collect_statistics_min_max( + plans: &[Arc], + col_name: &str, +) -> Result>> { + // temp solution while waiting for DF's statistics to get mature + // Compute min max stats for all inputs of UnionExec on the sorted column + // https://github.com/apache/arrow-datafusion/issues/8078 + let col_stats = plans + .iter() + .map(|plan| compute_stats_column_min_max(&**plan, col_name)) + .collect::>>()?; + + // If min and max not available, return none + let mut value_ranges = Vec::with_capacity(col_stats.len()); + for stats in &col_stats { + let Some((min, max)) = column_statistics_min_max(stats) else { + trace!("-------- min_max not available"); + return Ok(None); + }; + + value_ranges.push((min, max)); + } + + // todo: use this when DF satistics is ready + // // Get statistics for the inputs of UnionExec on the sorted column + // let Some(value_ranges) = statistics_min_max(plans, col_name) + // else { + // return Ok(None); + // }; + + Ok(Some(value_ranges)) +} + +/// Plans and their corresponding value ranges +pub(crate) struct PlansValueRanges { + pub plans: Vec>, + // Min and max values of the plan on a specific column + pub value_ranges: Vec<(ScalarValue, ScalarValue)>, +} + +/// Sort the given plans by value ranges +/// 
Return none if +/// . the number of plans is not the same as the number of value ranges +/// . the value ranges overlap +pub(crate) fn sort_by_value_ranges( + plans: Vec>, + value_ranges: Vec<(ScalarValue, ScalarValue)>, + sort_options: SortOptions, +) -> Result> { + if plans.len() != value_ranges.len() { + trace!( + plans.len = plans.len(), + value_ranges.len = value_ranges.len(), + "--------- number of plans is not the same as the number of value ranges" + ); + return Ok(None); + } + + if overlap(&value_ranges)? { + trace!("--------- value ranges overlap"); + return Ok(None); + } + + // get the min value of each value range + let min_iter = value_ranges.iter().map(|(min, _)| min.clone()); + let mins = ScalarValue::iter_to_array(min_iter)?; + + // rank the min values + let ranks = rank(&*mins, Some(sort_options))?; + + // sort the plans by the ranks of their min values + let mut plan_rank_zip: Vec<(Arc, u32)> = + plans.into_iter().zip(ranks.clone()).collect::>(); + plan_rank_zip.sort_by(|(_, min1), (_, min2)| min1.cmp(min2)); + let plans = plan_rank_zip + .into_iter() + .map(|(plan, _)| plan) + .collect::>(); + + // Sort the value ranges by the ranks of their min values + let mut value_range_rank_zip: Vec<((ScalarValue, ScalarValue), u32)> = + value_ranges.into_iter().zip(ranks).collect::>(); + value_range_rank_zip.sort_by(|(_, min1), (_, min2)| min1.cmp(min2)); + let value_ranges = value_range_rank_zip + .into_iter() + .map(|(value_range, _)| value_range) + .collect::>(); + + Ok(Some(PlansValueRanges { + plans, + value_ranges, + })) +} diff --git a/iox_query/src/physical_optimizer/tests.rs b/iox_query/src/physical_optimizer/tests.rs new file mode 100644 index 00000000000..4e582278627 --- /dev/null +++ b/iox_query/src/physical_optimizer/tests.rs @@ -0,0 +1,210 @@ +//! Optimizer edge cases. +//! +//! These are NOT part of the usual end2end query tests because they depend on very specific chunk arrangements that are +//! hard to reproduce in an end2end setting. + +use std::sync::Arc; + +use arrow::datatypes::DataType; +use datafusion::{ + common::DFSchema, + datasource::provider_as_source, + logical_expr::{col, count, lit, Expr, ExprSchemable, LogicalPlanBuilder}, + scalar::ScalarValue, +}; +use schema::sort::SortKey; +use test_helpers::maybe_start_logging; + +use crate::{ + exec::{DedicatedExecutors, Executor, ExecutorConfig, ExecutorType}, + provider::ProviderBuilder, + test::{format_execution_plan, TestChunk}, + QueryChunk, +}; + +/// Test that reconstructs specific case where parquet files may unnecessarily be sorted. 
+/// +/// See: +/// - +/// - +#[tokio::test] +async fn test_parquet_should_not_be_resorted() { + // DF session setup + let config = ExecutorConfig { + target_query_partitions: 16.try_into().unwrap(), + ..ExecutorConfig::testing() + }; + let exec = Executor::new_with_config_and_executors( + config, + Arc::new(DedicatedExecutors::new_testing()), + ); + let ctx = exec.new_context(ExecutorType::Query); + let state = ctx.inner().state(); + + // chunks + let c = TestChunk::new("t") + .with_tag_column("tag") + .with_time_column_with_full_stats(Some(0), Some(10), 10_000, None); + let c_mem = c.clone().with_may_contain_pk_duplicates(true); + let c_file = c + .clone() + .with_dummy_parquet_file() + .with_may_contain_pk_duplicates(false) + .with_sort_key(SortKey::from_columns([Arc::from("tag"), Arc::from("time")])); + let schema = c.schema().clone(); + let provider = ProviderBuilder::new("t".into(), schema) + .add_chunk(Arc::new(c_mem.clone().with_id(1).with_order(i64::MAX))) + .add_chunk(Arc::new(c_file.clone().with_id(2).with_order(2))) + .add_chunk(Arc::new(c_file.clone().with_id(3).with_order(3))) + .build() + .unwrap(); + + // initial plan + // NOTE: we NEED two time predicates for the bug to trigger! + let expr = col("time") + .gt(lit(ScalarValue::TimestampNanosecond(Some(0), None))) + .and(col("time").gt(lit(ScalarValue::TimestampNanosecond(Some(2), None)))); + + let plan = + LogicalPlanBuilder::scan("t".to_owned(), provider_as_source(Arc::new(provider)), None) + .unwrap() + .filter(expr) + .unwrap() + .aggregate( + std::iter::empty::(), + [count(lit(true)).alias("count")], + ) + .unwrap() + .project([col("count")]) + .unwrap() + .build() + .unwrap(); + + let plan = state.create_physical_plan(&plan).await.unwrap(); + + // The output of the parquet files should not be resorted + insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + @r###" + --- + - " AggregateExec: mode=Final, gby=[], aggr=[count]" + - " CoalescePartitionsExec" + - " AggregateExec: mode=Partial, gby=[], aggr=[count]" + - " RepartitionExec: partitioning=RoundRobinBatch(16), input_partitions=1" + - " ProjectionExec: expr=[]" + - " DeduplicateExec: [tag@1 ASC,time@2 ASC]" + - " SortPreservingMergeExec: [tag@1 ASC,time@2 ASC,__chunk_order@0 ASC]" + - " UnionExec" + - " SortExec: expr=[tag@1 ASC,time@2 ASC,__chunk_order@0 ASC]" + - " CoalesceBatchesExec: target_batch_size=8192" + - " FilterExec: time@2 > 0 AND time@2 > 2" + - " RepartitionExec: partitioning=RoundRobinBatch(16), input_partitions=1" + - " RecordBatchesExec: chunks=1, projection=[__chunk_order, tag, time]" + - " SortExec: expr=[tag@1 ASC,time@2 ASC,__chunk_order@0 ASC]" + - " CoalesceBatchesExec: target_batch_size=8192" + - " FilterExec: time@2 > 0 AND time@2 > 2" + - " RepartitionExec: partitioning=RoundRobinBatch(16), input_partitions=2" + - " ParquetExec: file_groups={2 groups: [[2.parquet], [3.parquet]]}, projection=[__chunk_order, tag, time], output_ordering=[tag@1 ASC, time@2 ASC, __chunk_order@0 ASC], predicate=time@1 > 0 AND time@1 > 2, pruning_predicate=time_max@0 > 0 AND time_max@0 > 2" + "### + ); +} + +/// Bug reproducer for: +/// - +/// - +#[tokio::test] +async fn test_parquet_must_resorted() { + maybe_start_logging(); + + // DF session setup + let config = ExecutorConfig { + target_query_partitions: 6.try_into().unwrap(), + ..ExecutorConfig::testing() + }; + let exec = Executor::new_with_config_and_executors( + config, + Arc::new(DedicatedExecutors::new_testing()), + ); + let ctx = exec.new_context(ExecutorType::Query); + let state = 
ctx.inner().state(); + + // chunks + let c = TestChunk::new("t") + .with_tag_column("tag") + .with_f64_field_column("field") + .with_time_column_with_full_stats(Some(0), Some(10), 10_000, None) + .with_may_contain_pk_duplicates(false) + .with_sort_key(SortKey::from_columns([Arc::from("tag"), Arc::from("time")])); + let schema = c.schema().clone(); + let df_schema = DFSchema::try_from(schema.as_arrow().as_ref().clone()).unwrap(); + let provider = ProviderBuilder::new("t".into(), schema) + // need a small file followed by a big one + .add_chunk(Arc::new( + c.clone() + .with_id(1) + .with_order(1) + .with_dummy_parquet_file_and_size(1), + )) + .add_chunk(Arc::new( + c.clone() + .with_id(2) + .with_order(2) + .with_dummy_parquet_file_and_size(100_000_000), + )) + .build() + .unwrap(); + + // initial plan + let expr = col("tag") + .gt(lit("foo")) + .and(col("time").gt(lit(ScalarValue::TimestampNanosecond(Some(2), None)))) + .and( + col("field") + .cast_to(&DataType::Utf8, &df_schema) + .unwrap() + .not_eq(lit("")), + ); + + let plan = + LogicalPlanBuilder::scan("t".to_owned(), provider_as_source(Arc::new(provider)), None) + .unwrap() + .filter(expr) + .unwrap() + .project([col("tag")]) + .unwrap() + .build() + .unwrap(); + + let plan = state.create_physical_plan(&plan).await.unwrap(); + + // The output of the parquet files must be sorted prior to merging + // if the first file_group has more than one file + // + // Prior to https://github.com/influxdata/influxdb_iox/issues/9450, the plan + // called for the ParquetExec to read the files in parallel (using subranges) like: + // ``` + // {6 groups: [[1.parquet:0..1, 2.parquet:0..16666666], [2.parquet:16666666..33333333],... + // ``` + // + // Groups with more than one file produce an output partition that is the + // result of concatenating them together, so even if the output of each + // individual file is sorted, the output of the partition is not, due to the + // concatenation. 
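+    // For example: if one group read 1.parquet (say, tag values "a".."c") followed by 2.parquet (say, "b".."d"), + // the concatenated partition would emit "a".."c" and then "b".."d", which is not globally sorted on tag even + // though each file is; in the plan below each file therefore ends up in its own group (illustrative values).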
+ insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + @r###" + --- + - " ProjectionExec: expr=[tag@1 as tag]" + - " CoalesceBatchesExec: target_batch_size=8192" + - " FilterExec: CAST(field@0 AS Utf8) != " + - " RepartitionExec: partitioning=RoundRobinBatch(6), input_partitions=1" + - " ProjectionExec: expr=[field@1 as field, tag@3 as tag]" + - " DeduplicateExec: [tag@3 ASC,time@2 ASC]" + - " SortPreservingMergeExec: [tag@3 ASC,time@2 ASC,__chunk_order@0 ASC]" + - " CoalesceBatchesExec: target_batch_size=8192" + - " FilterExec: tag@3 > foo AND time@2 > 2" + - " RepartitionExec: partitioning=RoundRobinBatch(6), input_partitions=2, preserve_order=true, sort_exprs=tag@3 ASC,time@2 ASC,__chunk_order@0 ASC" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[__chunk_order, field, time, tag], output_ordering=[tag@3 ASC, time@2 ASC, __chunk_order@0 ASC], predicate=tag@1 > foo AND time@2 > 2, pruning_predicate=tag_max@0 > foo AND time_max@1 > 2" + "### + ); +} diff --git a/iox_query/src/physical_optimizer/union/nested_union.rs b/iox_query/src/physical_optimizer/union/nested_union.rs index 6e94423dec9..7a051396d1a 100644 --- a/iox_query/src/physical_optimizer/union/nested_union.rs +++ b/iox_query/src/physical_optimizer/union/nested_union.rs @@ -95,11 +95,11 @@ mod tests { --- input: - " UnionExec" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - " UnionExec" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } @@ -118,15 +118,15 @@ mod tests { input: - " UnionExec" - " UnionExec" - - " EmptyExec: produce_one_row=false" - - " EmptyExec: produce_one_row=false" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" + - " EmptyExec" + - " EmptyExec" output: Ok: - " UnionExec" - - " EmptyExec: produce_one_row=false" - - " EmptyExec: produce_one_row=false" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" + - " EmptyExec" + - " EmptyExec" "### ); } @@ -148,16 +148,16 @@ mod tests { input: - " UnionExec" - " UnionExec" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" - " UnionExec" - - " EmptyExec: produce_one_row=false" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" + - " EmptyExec" output: Ok: - " UnionExec" - - " EmptyExec: produce_one_row=false" - - " EmptyExec: produce_one_row=false" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" + - " EmptyExec" + - " EmptyExec" "### ); } @@ -171,16 +171,16 @@ mod tests { @r###" --- input: - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } fn other_node() -> Arc { - Arc::new(EmptyExec::new(false, schema())) + Arc::new(EmptyExec::new(schema())) } fn schema() -> SchemaRef { diff --git a/iox_query/src/physical_optimizer/union/one_union.rs b/iox_query/src/physical_optimizer/union/one_union.rs index c43bedcadea..15f277a40af 100644 --- a/iox_query/src/physical_optimizer/union/one_union.rs +++ b/iox_query/src/physical_optimizer/union/one_union.rs @@ -77,10 +77,10 @@ mod tests { --- input: - " UnionExec" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } @@ -95,13 +95,13 @@ mod tests { --- input: - " UnionExec" - - " EmptyExec: produce_one_row=false" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" + - " EmptyExec" output: Ok: - " UnionExec" - - " EmptyExec: produce_one_row=false" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" + - " EmptyExec" "### ); } @@ -115,16 +115,16 @@ 
mod tests { @r###" --- input: - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } fn other_node() -> Arc { - Arc::new(EmptyExec::new(false, schema())) + Arc::new(EmptyExec::new(schema())) } fn schema() -> SchemaRef { diff --git a/iox_query/src/provider.rs b/iox_query/src/provider.rs index 30f50c20b69..3fab97e578c 100644 --- a/iox_query/src/provider.rs +++ b/iox_query/src/provider.rs @@ -11,8 +11,10 @@ use datafusion::{ datasource::{provider_as_source, TableProvider}, error::{DataFusionError, Result as DataFusionResult}, execution::context::SessionState, - logical_expr::{LogicalPlanBuilder, TableProviderFilterPushDown, TableType}, - optimizer::utils::{conjunction, split_conjunction}, + logical_expr::{ + utils::{conjunction, split_conjunction}, + LogicalPlanBuilder, TableProviderFilterPushDown, TableType, + }, physical_plan::{ expressions::col as physical_col, filter::FilterExec, projection::ProjectionExec, ExecutionPlan, @@ -35,6 +37,7 @@ mod adapter; mod deduplicate; pub mod overlap; mod physical; +pub(crate) mod progressive_eval; mod record_batch_exec; pub use self::overlap::group_potential_duplicates; pub use deduplicate::{DeduplicateExec, RecordBatchDeduplicator}; @@ -82,7 +85,7 @@ impl From for ArrowError { impl From for DataFusionError { // Wrap an error into a datafusion error fn from(e: Error) -> Self { - Self::ArrowError(e.into()) + Self::ArrowError(e.into(), None) } } @@ -195,6 +198,14 @@ impl TableProvider for ChunkTableProvider { self.arrow_schema() } + /// Creates a plan like the following: + /// + /// ```text + /// Project (keep only columns needed in the rest of the plan) + /// Filter (optional, apply any push down predicates) + /// Deduplicate (optional, if chunks overlap) + /// ... Scan of Chunks (RecordBatchExec / ParquetExec / UnionExec, etc) ... + /// ``` async fn scan( &self, ctx: &SessionState, @@ -256,7 +267,7 @@ impl TableProvider for ChunkTableProvider { if let Some(expr) = maybe_expr { Arc::new(FilterExec::try_new( - df_physical_expr(plan.as_ref(), expr)?, + df_physical_expr(plan.schema(), expr)?, plan, )?) 
} else { @@ -358,7 +369,7 @@ mod test { - " ProjectionExec: expr=[field@0 as field, tag1@1 as tag1, tag2@2 as tag2, time@3 as time]" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" "### ); @@ -375,7 +386,7 @@ mod test { - " ProjectionExec: expr=[tag1@1 as tag1, time@3 as time]" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" "### ); @@ -396,7 +407,7 @@ mod test { - " FilterExec: false" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" "### ); @@ -410,7 +421,7 @@ mod test { - " ProjectionExec: expr=[field@0 as field, tag1@1 as tag1, tag2@2 as tag2, time@3 as time]" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" "### ); @@ -456,7 +467,7 @@ mod test { --- - " ProjectionExec: expr=[field@0 as field, tag1@1 as tag1, tag2@2 as tag2, time@3 as time]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" "### ); @@ -472,7 +483,7 @@ mod test { --- - " ProjectionExec: expr=[tag1@1 as tag1, time@3 as time]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" "### ); @@ -501,7 +512,7 @@ mod test { - " ProjectionExec: expr=[field@0 as field, tag1@1 as tag1, tag2@2 as tag2, time@3 as time]" - " FilterExec: false AND tag1@1 = CAST(foo AS Dictionary(Int32, Utf8))" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" "### ); @@ -514,7 +525,7 @@ mod test { --- - " ProjectionExec: expr=[field@0 as field, tag1@1 as tag1, tag2@2 as tag2, time@3 as time]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" "### ); @@ -565,7 +576,7 @@ mod test { - " 
FilterExec: time@3 > 100" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" "### ); @@ -583,7 +594,7 @@ mod test { - " FilterExec: time@3 > 100" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" "### ); @@ -607,7 +618,7 @@ mod test { - " FilterExec: false AND time@3 > 100" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" "### ); @@ -622,7 +633,7 @@ mod test { - " FilterExec: time@3 > 100" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" "### ); diff --git a/iox_query/src/provider/adapter.rs b/iox_query/src/provider/adapter.rs index 5d928852d41..a0f1ad9b8b9 100644 --- a/iox_query/src/provider/adapter.rs +++ b/iox_query/src/provider/adapter.rs @@ -204,11 +204,13 @@ impl SchemaAdapterStream { .mappings .iter() .map(|mapping| match mapping { - ColumnMapping::FromInput(input_index) => Arc::clone(batch.column(*input_index)), - ColumnMapping::MakeNull(data_type) => new_null_array(data_type, batch.num_rows()), + ColumnMapping::FromInput(input_index) => Ok(Arc::clone(batch.column(*input_index))), + ColumnMapping::MakeNull(data_type) => { + Ok(new_null_array(data_type, batch.num_rows())) + } ColumnMapping::Virtual(value) => value.to_array_of_size(batch.num_rows()), }) - .collect::>(); + .collect::, DataFusionError>>()?; Ok(RecordBatch::try_new( Arc::clone(&self.output_schema), diff --git a/iox_query/src/provider/deduplicate.rs b/iox_query/src/provider/deduplicate.rs index 5744f4ec5df..45c02503d60 100644 --- a/iox_query/src/provider/deduplicate.rs +++ b/iox_query/src/provider/deduplicate.rs @@ -1,6 +1,5 @@ //! 
Implemention of DeduplicateExec operator (resolves primary key conflicts) plumbing and tests mod algo; -mod key_ranges; use std::{collections::HashSet, fmt, sync::Arc}; @@ -11,6 +10,7 @@ use crate::CHUNK_ORDER_COLUMN_NAME; use self::algo::get_col_name; pub use self::algo::RecordBatchDeduplicator; +use datafusion::physical_expr::EquivalenceProperties; use datafusion::{ error::{DataFusionError, Result}, execution::context::TaskContext, @@ -188,6 +188,7 @@ impl ExecutionPlan for DeduplicateExec { } fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + trace!("Deduplicate output ordering: {:?}", self.sort_keys); Some(&self.sort_keys) } @@ -209,6 +210,11 @@ impl ExecutionPlan for DeduplicateExec { vec![Arc::clone(&self.input)] } + fn equivalence_properties(&self) -> EquivalenceProperties { + // deduplicate does not change the equivalence properties + self.input.equivalence_properties() + } + fn with_new_children( self: Arc, children: Vec>, @@ -271,12 +277,9 @@ impl ExecutionPlan for DeduplicateExec { Some(self.metrics.clone_inner()) } - fn statistics(&self) -> Statistics { + fn statistics(&self) -> Result { // use a guess from our input but they are NOT exact - Statistics { - is_exact: false, - ..self.input.statistics() - } + Ok(self.input.statistics()?.into_inexact()) } } @@ -369,6 +372,7 @@ mod test { use super::*; use arrow::array::{DictionaryArray, Int64Array}; + use schema::TIME_DATA_TIMEZONE; use std::iter::FromIterator; #[tokio::test] @@ -465,7 +469,8 @@ mod test { let f1 = Float64Array::from(vec![Some(1.0), None]); let f2 = Float64Array::from(vec![None, Some(3.0)]); - let time = TimestampNanosecondArray::from(vec![Some(100), Some(100)]); + let time = TimestampNanosecondArray::from(vec![Some(100), Some(100)]) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let batch = RecordBatch::try_from_iter(vec![ ("f1", Arc::new(f1) as ArrayRef), @@ -1219,9 +1224,9 @@ mod test { Ok(AdapterStream::adapt_unbounded(self.schema(), rx, handle)) } - fn statistics(&self) -> Statistics { + fn statistics(&self) -> Result { // don't know anything about the statistics - Statistics::default() + Ok(Statistics::new_unknown(&self.schema())) } } diff --git a/iox_query/src/provider/deduplicate/algo.rs b/iox_query/src/provider/deduplicate/algo.rs index d0a13ce2bc9..a4c24e6e344 100644 --- a/iox_query/src/provider/deduplicate/algo.rs +++ b/iox_query/src/provider/deduplicate/algo.rs @@ -16,8 +16,6 @@ use datafusion::physical_plan::{ }; use observability_deps::tracing::{debug, trace}; -use crate::provider::deduplicate::key_ranges::key_ranges; - // Handles the deduplication across potentially multiple // [`RecordBatch`]es which are already sorted on a primary key, // including primary keys which straddle RecordBatch boundaries @@ -240,12 +238,7 @@ impl RecordBatchDeduplicator { is_sort_key[index] = true; - let array = batch.column(index); - - arrow::compute::SortColumn { - values: Arc::clone(array), - options: Some(skey.options), - } + Arc::clone(batch.column(index)) }) .collect(); // @@ -256,19 +249,18 @@ impl RecordBatchDeduplicator { // the column with the highest cardinality let len = columns.len(); if len > 1 { - if let DataType::Timestamp(TimeUnit::Nanosecond, _) = - columns[len - 1].values.data_type() - { + if let DataType::Timestamp(TimeUnit::Nanosecond, _) = columns[len - 1].data_type() { columns.swap(len - 2, len - 1); } } // Reverse the list - let columns: Vec<_> = columns.into_iter().rev().collect(); + columns.reverse(); // Compute partitions (aka breakpoints between the ranges) // Each range (or partition) 
includes a unique sort key value which is // a unique combination of PK columns. PK columns consist of all tags and the time col. - let ranges = key_ranges(&columns)?.collect(); + let partitions = arrow::compute::partition(&columns)?; + let ranges = partitions.ranges(); Ok(DuplicateRanges { is_sort_key, @@ -411,8 +403,6 @@ mod test { use datafusion::physical_plan::expressions::col; use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricBuilder}; - use crate::provider::deduplicate::key_ranges::range; - use super::*; #[tokio::test] @@ -844,4 +834,8 @@ mod test { let metrics = ExecutionPlanMetricsSet::new(); MetricBuilder::new(&metrics).counter("num_dupes", 0) } + + fn range(start: usize, end: usize) -> Range { + Range { start, end } + } } diff --git a/iox_query/src/provider/overlap.rs b/iox_query/src/provider/overlap.rs index 4ba1d84e6fd..4b90162bb98 100644 --- a/iox_query/src/provider/overlap.rs +++ b/iox_query/src/provider/overlap.rs @@ -23,7 +23,7 @@ pub fn group_potential_duplicates( // If at least one of the chunks has no time range, // all chunks are considered to overlap with each other. if ts.iter().any(|ts| ts.is_none()) { - debug!("At least one chunk has not timestamp mim max"); + debug!("At least one chunk has not timestamp min max"); return vec![chunks]; } @@ -92,21 +92,16 @@ pub fn group_potential_duplicates( } fn timestamp_min_max(chunk: &dyn QueryChunk) -> Option { + let stats = chunk.stats(); chunk - .stats() - .column_statistics - .as_ref() - .and_then(|stats| { - chunk - .schema() - .find_index_of(TIME_COLUMN_NAME) - .map(|idx| &stats[idx]) - }) + .schema() + .find_index_of(TIME_COLUMN_NAME) + .map(|idx| &stats.column_statistics[idx]) .and_then(|stats| { if let ( - Some(ScalarValue::TimestampNanosecond(Some(min), None)), - Some(ScalarValue::TimestampNanosecond(Some(max), None)), - ) = (&stats.min_value, &stats.max_value) + Some(ScalarValue::TimestampNanosecond(Some(min), _)), + Some(ScalarValue::TimestampNanosecond(Some(max), _)), + ) = (stats.min_value.get_value(), stats.max_value.get_value()) { Some(TimestampMinMax::new(*min, *max)) } else { diff --git a/iox_query/src/provider/physical.rs b/iox_query/src/provider/physical.rs index 9616653c9df..3114cf8b397 100644 --- a/iox_query/src/provider/physical.rs +++ b/iox_query/src/provider/physical.rs @@ -1,10 +1,11 @@ //! 
Implementation of a DataFusion PhysicalPlan node across partition chunks +use crate::statistics::build_statistics_for_chunks; use crate::{ provider::record_batch_exec::RecordBatchesExec, util::arrow_sort_key_exprs, QueryChunk, QueryChunkData, CHUNK_ORDER_COLUMN_NAME, }; -use arrow::datatypes::{DataType, Fields, Schema as ArrowSchema, SchemaRef}; +use arrow::datatypes::{Fields, Schema as ArrowSchema, SchemaRef}; use datafusion::{ datasource::{ listing::PartitionedFile, @@ -12,10 +13,7 @@ use datafusion::{ physical_plan::{FileScanConfig, ParquetExec}, }, physical_expr::PhysicalSortExpr, - physical_plan::{ - empty::EmptyExec, expressions::Column, union::UnionExec, ColumnStatistics, ExecutionPlan, - Statistics, - }, + physical_plan::{empty::EmptyExec, expressions::Column, union::UnionExec, ExecutionPlan}, scalar::ScalarValue, }; use object_store::ObjectMeta; @@ -145,7 +143,7 @@ pub fn chunks_to_physical_nodes( target_partitions: usize, ) -> Arc { if chunks.is_empty() { - return Arc::new(EmptyExec::new(false, Arc::clone(schema))); + return Arc::new(EmptyExec::new(Arc::clone(schema))); } let mut record_batch_chunks: Vec> = vec![]; @@ -199,24 +197,12 @@ pub fn chunks_to_physical_nodes( // ensure that chunks are actually ordered by chunk order chunks.sort_by_key(|(_meta, c)| c.order()); - #[allow(clippy::manual_try_fold)] - let num_rows = chunks.iter().map(|(_meta, c)| c.stats().num_rows).fold( - Some(0usize), - |accu, x| match (accu, x) { - (Some(accu), Some(x)) => Some(accu + x), - _ => None, - }, - ); - let chunk_order_min = chunks + // Compute statistics for the chunks + let query_chunks = chunks .iter() - .map(|(_meta, c)| c.order().get()) - .min() - .expect("at least one chunk"); - let chunk_order_max = chunks - .iter() - .map(|(_meta, c)| c.order().get()) - .max() - .expect("at least one chunk"); + .map(|(_meta, chunk)| Arc::clone(chunk)) + .collect::>(); + let statistics = build_statistics_for_chunks(&query_chunks, Arc::clone(schema)); let file_groups = distribute( chunks.into_iter().map(|(object_meta, chunk)| { @@ -242,7 +228,10 @@ pub fn chunks_to_physical_nodes( let output_ordering = sort_key.map(|sort_key| arrow_sort_key_exprs(&sort_key, schema)); let (table_partition_cols, file_schema, output_ordering) = if has_chunk_order_col { - let table_partition_cols = vec![(CHUNK_ORDER_COLUMN_NAME.to_owned(), DataType::Int64)]; + let table_partition_cols = vec![schema + .field_with_name(CHUNK_ORDER_COLUMN_NAME) + .unwrap() + .clone()]; let file_schema = Arc::new(ArrowSchema::new( schema .fields @@ -269,40 +258,6 @@ pub fn chunks_to_physical_nodes( (vec![], Arc::clone(schema), output_ordering) }; - let statistics = Statistics { - num_rows, - total_byte_size: None, - column_statistics: Some( - schema - .fields - .iter() - .map(|f| { - let null_count = if f.is_nullable() { None } else { Some(0) }; - - let (min_value, max_value) = if f.name() == CHUNK_ORDER_COLUMN_NAME { - ( - Some(ScalarValue::from(chunk_order_min)), - Some(ScalarValue::from(chunk_order_max)), - ) - } else { - (None, None) - }; - - ColumnStatistics { - null_count, - min_value, - max_value, - distinct_count: None, - } - }) - .collect(), - ), - - // this does NOT account for predicate pushdown - // Also see https://github.com/apache/arrow-datafusion/issues/5614 - is_exact: false, - }; - // No sort order is represented by an empty Vec let output_ordering = vec![output_ordering.unwrap_or_default()]; @@ -315,7 +270,6 @@ pub fn chunks_to_physical_nodes( limit: None, table_partition_cols, output_ordering, - infinite_source: false, }; let 
meta_size_hint = None; @@ -350,10 +304,15 @@ where #[cfg(test)] mod tests { - use schema::{sort::SortKeyBuilder, SchemaBuilder, TIME_COLUMN_NAME}; + use datafusion::{ + common::stats::Precision, + physical_plan::{ColumnStatistics, Statistics}, + }; + use schema::{sort::SortKeyBuilder, InfluxFieldType, SchemaBuilder, TIME_COLUMN_NAME}; use crate::{ chunk_order_field, + statistics::build_statistics_for_chunks, test::{format_execution_plan, TestChunk}, }; @@ -455,7 +414,7 @@ mod tests { format_execution_plan(&plan), @r###" --- - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } @@ -575,9 +534,192 @@ mod tests { @r###" --- - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[tag, __chunk_order]" - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[tag, __chunk_order], output_ordering=[__chunk_order@1 ASC]" "### ); } + + // reproducer of https://github.com/influxdata/idpe/issues/18287 + #[test] + fn reproduce_schema_bug_in_parquet_exec() { + // schema with one tag, one filed, time and CHUNK_ORDER_COLUMN_NAME + let schema: SchemaRef = SchemaBuilder::new() + .tag("tag") + .influx_field("field", InfluxFieldType::Float) + .timestamp() + .influx_field(CHUNK_ORDER_COLUMN_NAME, InfluxFieldType::Integer) + .build() + .unwrap() + .into(); + + // create a test chunk with one tag, one filed, time and CHUNK_ORDER_COLUMN_NAME + let record_batch_chunk = Arc::new( + TestChunk::new("t") + .with_tag_column_with_stats("tag", Some("AL"), Some("MT")) + .with_time_column_with_stats(Some(10), Some(20)) + .with_i64_field_column_with_stats("field", Some(0), Some(100)) + .with_i64_field_column_with_stats(CHUNK_ORDER_COLUMN_NAME, Some(5), Some(6)), + ); + + // create them same test chunk but with a parquet file + let parquet_chunk = Arc::new( + TestChunk::new("t") + .with_tag_column_with_stats("tag", Some("AL"), Some("MT")) + .with_i64_field_column_with_stats("field", Some(0), Some(100)) + .with_time_column_with_stats(Some(10), Some(20)) + .with_i64_field_column_with_stats(CHUNK_ORDER_COLUMN_NAME, Some(5), Some(6)) + .with_dummy_parquet_file(), + ); + + // Build a RecordBatchsExec for record_batch_chunk + // + // Use chunks_to_physical_nodes to build a plan with UnionExec on top of RecordBatchesExec + // Note: I purposely use chunks_to_physical_node to create plan for both record_batch_chunk and parquet_chunk to + // consistently create their plan. 
Also chunks_to_physical_node is used to do create plan in optimization + // passes that I will need + let plan = chunks_to_physical_nodes( + &schema, + None, + vec![Arc::clone(&record_batch_chunk) as Arc], + 1, + ); + // remove union + let Some(union_exec) = plan.as_any().downcast_ref::() else { + panic!("plan is not a UnionExec"); + }; + let plan_record_batches_exec = Arc::clone(&union_exec.inputs()[0]); + // verify this is a RecordBatchesExec + assert!(plan_record_batches_exec + .as_any() + .downcast_ref::() + .is_some()); + + // Build a ParquetExec for parquet_chunk + // + // Use chunks_to_physical_nodes to build a plan with UnionExec on top of ParquetExec + let plan = chunks_to_physical_nodes( + &schema, + None, + vec![Arc::clone(&parquet_chunk) as Arc], + 1, + ); + // remove union + let Some(union_exec) = plan.as_any().downcast_ref::() else { + panic!("plan is not a UnionExec"); + }; + let plan_parquet_exec = Arc::clone(&union_exec.inputs()[0]); + // verify this is a ParquetExec + assert!(plan_parquet_exec + .as_any() + .downcast_ref::() + .is_some()); + + // Schema of 2 chunks are the same + assert_eq!(record_batch_chunk.schema(), parquet_chunk.schema()); + + // Schema of the corresponding plans are also the same + assert_eq!( + plan_record_batches_exec.schema(), + plan_parquet_exec.schema() + ); + + // Statistics of 2 chunks are the same + let record_batch_stats = + build_statistics_for_chunks(&[record_batch_chunk], Arc::clone(&schema)); + let parquet_stats = build_statistics_for_chunks(&[parquet_chunk], schema); + assert_eq!(record_batch_stats, parquet_stats); + + // Statistics of the corresponding plans should also be the same except the CHUNK_ORDER_COLUMN_NAME + // Notes: + // 1. We do compute stats for CHUNK_ORDER_COLUMN_NAME and store it as in FileScanConfig.statistics + // See: https://github.com/influxdata/influxdb_iox/blob/0e5b97d9e913111641f65b9af31e3b3f45f3b14b/iox_query/src/provider/physical.rs#L311C24-L311C24 + // So, if we get statistics there, we have everything + // 2. However, if we get statistics through the DF plan's statistics() method, we will not get stats for CHUNK_ORDER_COLUMN_NAME + // The reason is we store CHUNK_ORDER_COLUMN_NAME as table_partition_cols in DF and DF has not computed stats for it yet. + // See: https://github.com/apache/arrow-datafusion/blob/a9d66e2b492843c2fb335a7dfe27fed073629b09/datafusion/core/src/datasource/physical_plan/file_scan_config.rs#L139 + // When we get the plan's statistics, we won't care about CHUNK_ORDER_COLUMN_NAME becasue it is not a real column. + // Thus, we are good for now. 
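+ // A minimal sketch of the two read paths compared further down (only APIs already used in
+ // this test; `plan_parquet_exec` must first be downcast to `ParquetExec` for the second line):
+ //     let plan_stats = plan_parquet_exec.statistics().unwrap();       // DF-propagated; no range for __chunk_order
+ //     let file_stats = &plan_parquet_exec.base_config().statistics;   // FileScanConfig; exact range for __chunk_order
+ //     plan_stats.column_statistics.last().unwrap().min_value.get_value(); // -> None (Precision::Absent)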
In the future, if we want a 100% consistent for CHUNK_ORDER_COLUMN_NAME, we need + // to modify DF to compute stats for table_partition_cols + // + // Here both parquet's plan stats and FileScanConfig stats + // + // Cast to ParquetExec to get statistics + let plan_parquet_exec = plan_parquet_exec + .as_any() + .downcast_ref::() + .unwrap(); + // stats of the parquet plan generally computed from propagating stats from input plans/chunks/columns + let parquet_plan_stats = plan_parquet_exec.statistics().unwrap(); + // stats stored in FileScanConfig + let parqet_file_stats = &plan_parquet_exec.base_config().statistics; + + // stats of IOx specific recod batch plan + let record_batch_plan_stats = plan_record_batches_exec.statistics().unwrap(); + + // Record batch plan stats is the same as parquet file stats and includes everything + assert_eq!(record_batch_plan_stats, *parqet_file_stats); + + // Verify content + // + // Actual columns have stats + let col_stats = vec![ + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Utf8(Some("MT".to_string()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("AL".to_string()))), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Int64(Some(100))), + min_value: Precision::Exact(ScalarValue::Int64(Some(0))), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::TimestampNanosecond(Some(20), None)), + min_value: Precision::Exact(ScalarValue::TimestampNanosecond(Some(10), None)), + distinct_count: Precision::Absent, + }, + ]; + // + // Add CHUNK_ORDER_COLUMN_NAME with stats + let mut parquet_file_col_stats = col_stats.clone(); + parquet_file_col_stats.push(ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Int64(Some(6))), + min_value: Precision::Exact(ScalarValue::Int64(Some(0))), + distinct_count: Precision::Absent, + }); + // + // Add CHUNK_ORDER_COLUMN_NAME without stats + let mut parquet_plan_stats_col_stats = col_stats; + parquet_plan_stats_col_stats.push(ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Absent, + min_value: Precision::Absent, + distinct_count: Precision::Absent, + }); + // + let expected_parquet_plan_stats = Statistics { + num_rows: Precision::Exact(0), + total_byte_size: Precision::Absent, + column_statistics: parquet_plan_stats_col_stats, + }; + // + let expected_parquet_file_stats = Statistics { + num_rows: Precision::Exact(0), + total_byte_size: Precision::Absent, + column_statistics: parquet_file_col_stats, + }; + + // Content of Record batch plan stats that include stats of CHUNK_ORDER_COLUMN_NAME + assert_eq!(record_batch_plan_stats, expected_parquet_file_stats); + // Content of parquet file stats that also include stats of CHUNK_ORDER_COLUMN_NAME + assert_eq!(*parqet_file_stats, expected_parquet_file_stats); + // + // Content of parquet plan stats that does not include stats of CHUNK_ORDER_COLUMN_NAME + assert_eq!(parquet_plan_stats, expected_parquet_plan_stats); + } } diff --git a/iox_query/src/provider/progressive_eval.rs b/iox_query/src/provider/progressive_eval.rs new file mode 100644 index 00000000000..80109e4baca --- /dev/null +++ b/iox_query/src/provider/progressive_eval.rs @@ -0,0 +1,1206 @@ +// ProgressiveEvalExec (step 1 in https://docs.google.com/document/d/1x1yf9ggyxD4JPT8Gf9YlIKxUawqoKTJ1HFyTbGin9xY/edit) +// This will be 
moved to DF once it is ready + +//! Defines the progressive eval plan + +use std::any::Any; +use std::sync::Arc; + +use arrow::datatypes::SchemaRef; +use arrow::record_batch::RecordBatch; +use datafusion::common::{internal_err, DataFusionError, Result}; +use datafusion::execution::TaskContext; +use datafusion::physical_expr::{EquivalenceProperties, PhysicalSortExpr, PhysicalSortRequirement}; +use datafusion::physical_plan::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; +use datafusion::physical_plan::stream::RecordBatchReceiverStream; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, RecordBatchStream, + SendableRecordBatchStream, Statistics, +}; +use datafusion::scalar::ScalarValue; +use futures::{ready, Stream, StreamExt}; +use std::pin::Pin; +use std::task::{Context, Poll}; + +use observability_deps::tracing::{debug, trace}; + +/// ProgressiveEval return a stream of record batches in the order of its inputs. +/// It will stop when the number of output rows reach the given limit. +/// +/// This takes an input execution plan and a number n, and provided each partition of +/// the input plan is in an expected order, this operator will return top record batches that covers the top n rows +/// in the order of the input plan. +/// +/// ```text +/// ┌─────────────────────────┐ +/// │ ┌───┬───┬───┬───┐ │ +/// │ │ A │ B │ C │ D │ │──┐ +/// │ └───┴───┴───┴───┘ │ │ +/// └─────────────────────────┘ │ ┌───────────────────┐ ┌───────────────────────────────┐ +/// Stream 1 │ │ │ │ ┌───┬───╦═══╦───┬───╦═══╗ │ +/// ├─▶│ ProgressiveEval │───▶│ │ A │ B ║ C ║ D │ M ║ N ║ ... │ +/// │ │ │ │ └───┴─▲─╩═══╩───┴───╩═══╝ │ +/// ┌─────────────────────────┐ │ └───────────────────┘ └─┬─────┴───────────────────────┘ +/// │ ╔═══╦═══╗ │ │ +/// │ ║ M ║ N ║ │──┘ │ +/// │ ╚═══╩═══╝ │ Output only include top record batches that cover top N rows +/// └─────────────────────────┘ +/// Stream 2 +/// +/// +/// Input Streams Output stream +/// (in some order) (in same order) +/// ``` +#[derive(Debug)] +pub(crate) struct ProgressiveEvalExec { + /// Input plan + input: Arc, + + /// Corresponding value ranges of the input plan + /// None if the value ranges are not available + value_ranges: Option>, + + /// Execution metrics + metrics: ExecutionPlanMetricsSet, + + /// Optional number of rows to fetch. 
Stops producing rows after this fetch + fetch: Option, +} + +impl ProgressiveEvalExec { + /// Create a new progressive execution plan + pub fn new( + input: Arc, + value_ranges: Option>, + fetch: Option, + ) -> Self { + Self { + input, + value_ranges, + metrics: ExecutionPlanMetricsSet::new(), + fetch, + } + } + + /// Input schema + pub fn input(&self) -> &Arc { + &self.input + } +} + +impl DisplayAs for ProgressiveEvalExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!(f, "ProgressiveEvalExec: ")?; + if let Some(fetch) = self.fetch { + write!(f, "fetch={fetch}, ")?; + }; + if let Some(value_ranges) = &self.value_ranges { + write!(f, "input_ranges={value_ranges:?}")?; + }; + + Ok(()) + } + } + } +} + +impl ExecutionPlan for ProgressiveEvalExec { + /// Return a reference to Any that can be used for downcasting + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.input.schema() + } + + /// Get the output partitioning of this plan + fn output_partitioning(&self) -> Partitioning { + // This node serializes all the data to a single partition + Partitioning::UnknownPartitioning(1) + } + + /// Specifies whether this plan generates an infinite stream of records. + /// If the plan does not support pipelining, but its input(s) are + /// infinite, returns an error to indicate this. + fn unbounded_output(&self, children: &[bool]) -> Result { + Ok(children[0]) + } + + fn required_input_distribution(&self) -> Vec { + vec![Distribution::UnspecifiedDistribution] + } + + fn benefits_from_input_partitioning(&self) -> Vec { + vec![false] + } + + fn required_input_ordering(&self) -> Vec>> { + self.input() + .output_ordering() + .map(|_| None) + .into_iter() + .collect() + } + + fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + self.input.output_ordering() + } + + /// ProgressiveEvalExec will only accept sorted input + /// and will maintain the input order + fn maintains_input_order(&self) -> Vec { + vec![true] + } + + fn children(&self) -> Vec> { + vec![Arc::::clone(&self.input)] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + Ok(Arc::new(Self::new( + Arc::::clone(&children[0]), + self.value_ranges.clone(), + self.fetch, + ))) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + trace!( + "Start ProgressiveEvalExec::execute for partition: {}", + partition + ); + if 0 != partition { + return internal_err!("ProgressiveEvalExec invalid partition {partition}"); + } + + let input_partitions = self.input.output_partitioning().partition_count(); + trace!( + "Number of input partitions of ProgressiveEvalExec::execute: {}", + input_partitions + ); + let schema = self.schema(); + + // Have the input streams run in parallel + // todo: maybe in the future we do not need this parallelism if number of fecthed rows is in the fitst stream + let receivers = (0..input_partitions) + .map(|partition| { + let stream = self + .input + .execute(partition, Arc::::clone(&context))?; + + Ok(spawn_buffered(stream, 1)) + }) + .collect::>()?; + + debug!("Done setting up sender-receiver for ProgressiveEvalExec::execute"); + + let result = ProgressiveEvalStream::new( + receivers, + schema, + BaselineMetrics::new(&self.metrics, partition), + self.fetch, + )?; + + debug!("Got stream result from ProgressiveEvalStream::new_from_receivers"); + + Ok(Box::pin(result)) + } + + fn metrics(&self) -> Option { + 
Some(self.metrics.clone_inner()) + } + + fn statistics(&self) -> Result { + self.input.statistics() + } + + fn equivalence_properties(&self) -> EquivalenceProperties { + // progressive eval does not change the equivalence properties of its input + self.input.equivalence_properties() + } +} + +/// Concat input streams until reaching the fetch limit +struct ProgressiveEvalStream { + /// input streams + input_streams: Vec, + + /// The schema of the input and output. + schema: SchemaRef, + + /// used to record execution metrics + metrics: BaselineMetrics, + + /// Index of current stream + current_stream_idx: usize, + + /// If the stream has encountered an error + aborted: bool, + + /// Optional number of rows to fetch + fetch: Option, + + /// number of rows produced + produced: usize, +} + +impl ProgressiveEvalStream { + fn new( + input_streams: Vec, + schema: SchemaRef, + metrics: BaselineMetrics, + fetch: Option, + ) -> Result { + Ok(Self { + input_streams, + schema, + metrics, + current_stream_idx: 0, + aborted: false, + fetch, + produced: 0, + }) + } +} + +impl Stream for ProgressiveEvalStream { + type Item = Result; + + // Return the next record batch until reaching the fetch limit or the end of all input streams + // Return pending if the next record batch is not ready + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + // Error in previous poll + if self.aborted { + return Poll::Ready(None); + } + + // Have reached the fetch limit + if self.produced >= self.fetch.unwrap_or(std::usize::MAX) { + return Poll::Ready(None); + } + + // Have reached the end of all input streams + if self.current_stream_idx >= self.input_streams.len() { + return Poll::Ready(None); + } + + // Get next record batch + let mut poll; + loop { + let idx = self.current_stream_idx; + poll = self.input_streams[idx].poll_next_unpin(cx); + match poll { + // This input stream no longer has data, move to next stream + Poll::Ready(None) => { + self.current_stream_idx += 1; + if self.current_stream_idx >= self.input_streams.len() { + break; + } + } + _ => break, + } + } + + let poll = match ready!(poll) { + // This input stream has data, return its next record batch + Some(Ok(batch)) => { + self.produced += batch.num_rows(); + Poll::Ready(Some(Ok(batch))) + } + // This input stream has an error, return the error and set aborted to true to stop polling next round + Some(Err(e)) => { + self.aborted = true; + Poll::Ready(Some(Err(e))) + } + // This input stream has no more data, return None (aka finished) + None => { + // Reaching here means data of all streams have read + assert!( + self.current_stream_idx >= self.input_streams.len(), + "ProgressiveEvalStream::poll_next should not return None before all input streams are read",); + + Poll::Ready(None) + } + }; + + self.metrics.record_poll(poll) + } +} + +impl RecordBatchStream for ProgressiveEvalStream { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } +} + +// todo: this is a copy from DF code. 
When this ProgressiveEval operator is moved to DF, this can be removed +/// If running in a tokio context spawns the execution of `stream` to a separate task +/// allowing it to execute in parallel with an intermediate buffer of size `buffer` +pub(crate) fn spawn_buffered( + mut input: SendableRecordBatchStream, + buffer: usize, +) -> SendableRecordBatchStream { + // Use tokio only if running from a multi-thread tokio context + match tokio::runtime::Handle::try_current() { + Ok(handle) if handle.runtime_flavor() == tokio::runtime::RuntimeFlavor::MultiThread => { + let mut builder = RecordBatchReceiverStream::builder(input.schema(), buffer); + + let sender = builder.tx(); + + builder.spawn(async move { + while let Some(item) = input.next().await { + if sender.send(item).await.is_err() { + // receiver dropped when query is shutdown early (e.g., limit) or error, + // no need to return propagate the send error. + return Ok(()); + } + } + + Ok(()) + }); + + builder.build() + } + _ => input, + } +} + +#[cfg(test)] +mod tests { + use std::iter::FromIterator; + use std::sync::Weak; + + use arrow::array::ArrayRef; + use arrow::array::{Int32Array, StringArray, TimestampNanosecondArray}; + use arrow::datatypes::Schema; + use arrow::datatypes::{DataType, Field}; + use arrow::record_batch::RecordBatch; + use datafusion::assert_batches_eq; + use datafusion::physical_plan::collect; + use datafusion::physical_plan::memory::MemoryExec; + use datafusion::physical_plan::metrics::{MetricValue, Timestamp}; + use futures::{Future, FutureExt}; + + use super::*; + + #[tokio::test] + async fn test_no_input_stream() { + let task_ctx = Arc::new(TaskContext::default()); + _test_progressive_eval( + &[], + None, + None, // no fetch limit --> return all rows + &["++", "++"], + task_ctx, + ) + .await; + } + + #[tokio::test] + async fn test_one_input_stream() { + let task_ctx = Arc::new(TaskContext::default()); + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("a"), + Some("c"), + Some("e"), + Some("g"), + Some("j"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + // return all + _test_progressive_eval( + &[vec![b1.clone()]], + None, + None, // no fetch limit --> return all rows + &[ + "+---+---+-------------------------------+", + "| a | b | c |", + "+---+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | c | 1970-01-01T00:00:00.000000007 |", + "| 7 | e | 1970-01-01T00:00:00.000000006 |", + "| 9 | g | 1970-01-01T00:00:00.000000005 |", + "| 3 | j | 1970-01-01T00:00:00.000000008 |", + "+---+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + + // fetch no rows + _test_progressive_eval( + &[vec![b1.clone()]], + None, + Some(0), + &["++", "++"], + Arc::clone(&task_ctx), + ) + .await; + + // still return all even select 3 rows becasue first record batch is returned + _test_progressive_eval( + &[vec![b1.clone()]], + None, + Some(3), + &[ + "+---+---+-------------------------------+", + "| a | b | c |", + "+---+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | c | 1970-01-01T00:00:00.000000007 |", + "| 7 | e | 1970-01-01T00:00:00.000000006 |", + "| 9 | g | 1970-01-01T00:00:00.000000005 |", + "| 3 | j | 1970-01-01T00:00:00.000000008 |", + "+---+---+-------------------------------+", + ], + 
Arc::clone(&task_ctx), + ) + .await; + + // return all because fetch limit is larger + _test_progressive_eval( + &[vec![b1.clone()]], + None, + Some(7), + &[ + "+---+---+-------------------------------+", + "| a | b | c |", + "+---+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | c | 1970-01-01T00:00:00.000000007 |", + "| 7 | e | 1970-01-01T00:00:00.000000006 |", + "| 9 | g | 1970-01-01T00:00:00.000000005 |", + "| 3 | j | 1970-01-01T00:00:00.000000008 |", + "+---+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + } + + #[tokio::test] + async fn test_return_all() { + let task_ctx = Arc::new(TaskContext::default()); + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("a"), + Some("c"), + Some("e"), + Some("g"), + Some("j"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![10, 20, 70, 90, 30])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("b"), + Some("d"), + Some("f"), + Some("h"), + Some("j"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![4, 6, 2, 2, 6])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + // [b1, b2] + _test_progressive_eval( + &[vec![b1.clone()], vec![b2.clone()]], + None, + None, // no fetch limit --> return all rows + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | c | 1970-01-01T00:00:00.000000007 |", + "| 7 | e | 1970-01-01T00:00:00.000000006 |", + "| 9 | g | 1970-01-01T00:00:00.000000005 |", + "| 3 | j | 1970-01-01T00:00:00.000000008 |", + "| 10 | b | 1970-01-01T00:00:00.000000004 |", + "| 20 | d | 1970-01-01T00:00:00.000000006 |", + "| 70 | f | 1970-01-01T00:00:00.000000002 |", + "| 90 | h | 1970-01-01T00:00:00.000000002 |", + "| 30 | j | 1970-01-01T00:00:00.000000006 |", + "+----+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + + // [b2, b1] + _test_progressive_eval( + &[vec![b2], vec![b1]], + None, + None, + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 10 | b | 1970-01-01T00:00:00.000000004 |", + "| 20 | d | 1970-01-01T00:00:00.000000006 |", + "| 70 | f | 1970-01-01T00:00:00.000000002 |", + "| 90 | h | 1970-01-01T00:00:00.000000002 |", + "| 30 | j | 1970-01-01T00:00:00.000000006 |", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | c | 1970-01-01T00:00:00.000000007 |", + "| 7 | e | 1970-01-01T00:00:00.000000006 |", + "| 9 | g | 1970-01-01T00:00:00.000000005 |", + "| 3 | j | 1970-01-01T00:00:00.000000008 |", + "+----+---+-------------------------------+", + ], + task_ctx, + ) + .await; + } + + #[tokio::test] + async fn test_return_all_on_different_length_batches() { + let task_ctx = Arc::new(TaskContext::default()); + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("a"), + Some("b"), + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = 
Arc::new(Int32Array::from(vec![70, 90, 30])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![4, 6, 2])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + // [b1, b2] + _test_progressive_eval( + &[vec![b1.clone()], vec![b2.clone()]], + None, + None, + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | d | 1970-01-01T00:00:00.000000005 |", + "| 3 | e | 1970-01-01T00:00:00.000000008 |", + "| 70 | c | 1970-01-01T00:00:00.000000004 |", + "| 90 | d | 1970-01-01T00:00:00.000000006 |", + "| 30 | e | 1970-01-01T00:00:00.000000002 |", + "+----+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + + // [b2, b1] + _test_progressive_eval( + &[vec![b2], vec![b1]], + None, + None, + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 70 | c | 1970-01-01T00:00:00.000000004 |", + "| 90 | d | 1970-01-01T00:00:00.000000006 |", + "| 30 | e | 1970-01-01T00:00:00.000000002 |", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | d | 1970-01-01T00:00:00.000000005 |", + "| 3 | e | 1970-01-01T00:00:00.000000008 |", + "+----+---+-------------------------------+", + ], + task_ctx, + ) + .await; + } + + #[tokio::test] + async fn test_fetch_limit_1() { + let task_ctx = Arc::new(TaskContext::default()); + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("a"), + Some("b"), + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![70, 90, 30])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![4, 6, 2])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + // [b2, b1] + // b2 has 3 rows. b1 has 5 rows + // Fetch limit is 1 --> return all 3 rows of the first batch (b2) that covers that limit + _test_progressive_eval( + &[vec![b2.clone()], vec![b1.clone()]], + None, + Some(1), + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 70 | c | 1970-01-01T00:00:00.000000004 |", + "| 90 | d | 1970-01-01T00:00:00.000000006 |", + "| 30 | e | 1970-01-01T00:00:00.000000002 |", + "+----+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + + // [b1, b2] + // b1 has 5 rows. 
b2 has 3 rows + // Fetch limit is 1 --> return all 5 rows of the first batch (b1) that covers that limit + _test_progressive_eval( + &[vec![b1], vec![b2]], + None, + Some(1), + &[ + "+---+---+-------------------------------+", + "| a | b | c |", + "+---+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | d | 1970-01-01T00:00:00.000000005 |", + "| 3 | e | 1970-01-01T00:00:00.000000008 |", + "+---+---+-------------------------------+", + ], + task_ctx, + ) + .await; + } + + #[tokio::test] + async fn test_fetch_limit_equal_first_batch_size() { + let task_ctx = Arc::new(TaskContext::default()); + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("a"), + Some("b"), + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![70, 90, 30])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![4, 6, 2])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + // [b2, b1] + // b2 has 3 rows. b1 has 5 rows + // Fetch limit is 3 --> return all 3 rows of the first batch (b2) that covers that limit + _test_progressive_eval( + &[vec![b2.clone()], vec![b1.clone()]], + None, + Some(3), + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 70 | c | 1970-01-01T00:00:00.000000004 |", + "| 90 | d | 1970-01-01T00:00:00.000000006 |", + "| 30 | e | 1970-01-01T00:00:00.000000002 |", + "+----+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + + // [b1, b2] + // b1 has 5 rows. b2 has 3 rows + // Fetch limit is 5 --> return all 5 rows of first batch (b1) that covers that limit + _test_progressive_eval( + &[vec![b1], vec![b2]], + None, + Some(5), + &[ + "+---+---+-------------------------------+", + "| a | b | c |", + "+---+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | d | 1970-01-01T00:00:00.000000005 |", + "| 3 | e | 1970-01-01T00:00:00.000000008 |", + "+---+---+-------------------------------+", + ], + task_ctx, + ) + .await; + } + + #[tokio::test] + async fn test_fetch_limit_over_first_batch_size() { + let task_ctx = Arc::new(TaskContext::default()); + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("a"), + Some("b"), + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![70, 90, 30])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![4, 6, 2])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + // [b2, b1] + // b2 has 3 rows. 
b1 has 5 rows + // Fetch limit is 4 --> return all rows of both batches in the order of b2, b1 + _test_progressive_eval( + &[vec![b2.clone()], vec![b1.clone()]], + None, + Some(4), + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 70 | c | 1970-01-01T00:00:00.000000004 |", + "| 90 | d | 1970-01-01T00:00:00.000000006 |", + "| 30 | e | 1970-01-01T00:00:00.000000002 |", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | d | 1970-01-01T00:00:00.000000005 |", + "| 3 | e | 1970-01-01T00:00:00.000000008 |", + "+----+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + + // [b1, b2] + // b1 has 5 rows. b2 has 3 rows + // Fetch limit is 6 --> return all rows of both batches in the order of b1, b2 + _test_progressive_eval( + &[vec![b1], vec![b2]], + None, + Some(6), + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | d | 1970-01-01T00:00:00.000000005 |", + "| 3 | e | 1970-01-01T00:00:00.000000008 |", + "| 70 | c | 1970-01-01T00:00:00.000000004 |", + "| 90 | d | 1970-01-01T00:00:00.000000006 |", + "| 30 | e | 1970-01-01T00:00:00.000000002 |", + "+----+---+-------------------------------+", + ], + task_ctx, + ) + .await; + } + + #[tokio::test] + async fn test_three_partitions_with_nulls() { + let task_ctx = Arc::new(TaskContext::default()); + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("a"), + Some("b"), + Some("c"), + None, + Some("f"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![10, 20, 70])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("e"), + Some("g"), + Some("h"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![40, 60, 20])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![100, 200, 700, 900])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + None, + Some("g"), + Some("h"), + Some("i"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![4, 6, 2, 2])); + let b3 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + // [b1, b2, b3] + // b1 has 5 rows. b2 has 3 rows. b3 has 4 rows + // Fetch limit is 1 --> return all rows of the b1 + _test_progressive_eval( + &[vec![b1.clone()], vec![b2.clone()], vec![b3.clone()]], + None, + Some(1), + &[ + "+---+---+-------------------------------+", + "| a | b | c |", + "+---+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | | 1970-01-01T00:00:00.000000005 |", + "| 3 | f | 1970-01-01T00:00:00.000000008 |", + "+---+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + + // [b1, b2, b3] + // b1 has 5 rows. b2 has 3 rows. 
b3 has 4 rows + // Fetch limit is 7 --> return all rows of the b1 & b2 in the order of b1, b2 + _test_progressive_eval( + &[vec![b1.clone()], vec![b2.clone()], vec![b3.clone()]], + None, + Some(7), + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | | 1970-01-01T00:00:00.000000005 |", + "| 3 | f | 1970-01-01T00:00:00.000000008 |", + "| 10 | e | 1970-01-01T00:00:00.000000040 |", + "| 20 | g | 1970-01-01T00:00:00.000000060 |", + "| 70 | h | 1970-01-01T00:00:00.000000020 |", + "+----+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + + // [b1, b2, b3] + // b1 has 5 rows. b2 has 3 rows. b3 has 4 rows + // Fetch limit is 50 --> return all rows of all batches in the order of b1, b2, b3 + _test_progressive_eval( + &[vec![b1], vec![b2], vec![b3]], + None, + Some(50), + &[ + "+-----+---+-------------------------------+", + "| a | b | c |", + "+-----+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | | 1970-01-01T00:00:00.000000005 |", + "| 3 | f | 1970-01-01T00:00:00.000000008 |", + "| 10 | e | 1970-01-01T00:00:00.000000040 |", + "| 20 | g | 1970-01-01T00:00:00.000000060 |", + "| 70 | h | 1970-01-01T00:00:00.000000020 |", + "| 100 | | 1970-01-01T00:00:00.000000004 |", + "| 200 | g | 1970-01-01T00:00:00.000000006 |", + "| 700 | h | 1970-01-01T00:00:00.000000002 |", + "| 900 | i | 1970-01-01T00:00:00.000000002 |", + "+-----+---+-------------------------------+", + ], + task_ctx, + ) + .await; + } + + async fn _test_progressive_eval( + partitions: &[Vec], + value_ranges: Option>, + fetch: Option, + exp: &[&str], + context: Arc, + ) { + let schema = if partitions.is_empty() { + // just whatwever schema + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2])); + let batch = RecordBatch::try_from_iter(vec![("a", a)]).unwrap(); + batch.schema() + } else { + partitions[0][0].schema() + }; + + let exec = MemoryExec::try_new(partitions, schema, None).unwrap(); + let progressive = Arc::new(ProgressiveEvalExec::new( + Arc::new(exec), + value_ranges, + fetch, + )); + + let collected = collect(progressive, context).await.unwrap(); + assert_batches_eq!(exp, collected.as_slice()); + } + + #[tokio::test] + async fn test_merge_metrics() { + let task_ctx = Arc::new(TaskContext::default()); + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![Some("a"), Some("c")])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![10, 20])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![Some("b"), Some("d")])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b)]).unwrap(); + + let schema = b1.schema(); + let exec = MemoryExec::try_new(&[vec![b1], vec![b2]], schema, None).unwrap(); + let progressive = Arc::new(ProgressiveEvalExec::new(Arc::new(exec), None, None)); + + let collected = collect(Arc::::clone(&progressive), task_ctx) + .await + .unwrap(); + let expected = [ + "+----+---+", + "| a | b |", + "+----+---+", + "| 1 | a |", + "| 2 | c |", + "| 10 | b |", + "| 20 | d |", + "+----+---+", + ]; + assert_batches_eq!(expected, collected.as_slice()); + + // Now, validate metrics + let 
metrics = progressive.metrics().unwrap(); + + assert_eq!(metrics.output_rows().unwrap(), 4); + assert!(metrics.elapsed_compute().unwrap() > 0); + + let mut saw_start = false; + let mut saw_end = false; + metrics.iter().for_each(|m| match m.value() { + MetricValue::StartTimestamp(ts) => { + saw_start = true; + assert!(nanos_from_timestamp(ts) > 0); + } + MetricValue::EndTimestamp(ts) => { + saw_end = true; + assert!(nanos_from_timestamp(ts) > 0); + } + _ => {} + }); + + assert!(saw_start); + assert!(saw_end); + } + + fn nanos_from_timestamp(ts: &Timestamp) -> i64 { + ts.value().unwrap().timestamp_nanos_opt().unwrap() + } + + #[tokio::test] + async fn test_drop_cancel() -> Result<()> { + let task_ctx = Arc::new(TaskContext::default()); + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, true)])); + + let blocking_exec = Arc::new(BlockingExec::new(Arc::clone(&schema), 2)); + let refs = blocking_exec.refs(); + let progressive_exec = Arc::new(ProgressiveEvalExec::new(blocking_exec, None, None)); + + let fut = collect(progressive_exec, task_ctx); + let mut fut = fut.boxed(); + + assert_is_pending(&mut fut); + drop(fut); + assert_strong_count_converges_to_zero(refs).await; + + Ok(()) + } + + // todo: this is copied from DF. When we move ProgressiveEval to DF, this will be removed + /// Asserts that the strong count of the given [`Weak`] pointer converges to zero. + /// + /// This might take a while but has a timeout. + pub async fn assert_strong_count_converges_to_zero(refs: Weak) { + #![allow(clippy::future_not_send)] + tokio::time::timeout(std::time::Duration::from_secs(10), async { + loop { + if Weak::strong_count(&refs) == 0 { + break; + } + tokio::time::sleep(std::time::Duration::from_millis(10)).await; + } + }) + .await + .unwrap(); + } + + // todo: this is copied from DF. When we move ProgressiveEval to DF, this will be removed + /// Asserts that given future is pending. + pub fn assert_is_pending<'a, T>(fut: &mut Pin + Send + 'a>>) { + let waker = futures::task::noop_waker(); + let mut cx = futures::task::Context::from_waker(&waker); + let poll = fut.poll_unpin(&mut cx); + + assert!(poll.is_pending()); + } + + // todo: this is copied from DF. When we move ProgressiveEval to DF, this will be removed + /// Execution plan that emits streams that block forever. + /// + /// This is useful to test shutdown / cancelation behavior of certain execution plans. + #[derive(Debug)] + pub struct BlockingExec { + /// Schema that is mocked by this plan. + schema: SchemaRef, + + /// Number of output partitions. + n_partitions: usize, + + /// Ref-counting helper to check if the plan and the produced stream are still in memory. + refs: Arc<()>, + } + + impl BlockingExec { + /// Create new [`BlockingExec`] with a give schema and number of partitions. + pub fn new(schema: SchemaRef, n_partitions: usize) -> Self { + Self { + schema, + n_partitions, + refs: Default::default(), + } + } + + /// Weak pointer that can be used for ref-counting this execution plan and its streams. + /// + /// Use [`Weak::strong_count`] to determine if the plan itself and its streams are dropped (should be 0 in that + /// case). Note that tokio might take some time to cancel spawned tasks, so you need to wrap this check into a retry + /// loop. Use [`assert_strong_count_converges_to_zero`] to archive this. 
+ pub fn refs(&self) -> Weak<()> { + Arc::downgrade(&self.refs) + } + } + + impl DisplayAs for BlockingExec { + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter<'_>, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!(f, "BlockingExec",) + } + } + } + } + + impl ExecutionPlan for BlockingExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn children(&self) -> Vec> { + // this is a leaf node and has no children + vec![] + } + + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(self.n_partitions) + } + + fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + None + } + + fn with_new_children( + self: Arc, + _: Vec>, + ) -> Result> { + internal_err!("Children cannot be replaced in {self:?}") + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> Result { + Ok(Box::pin(BlockingStream { + schema: Arc::clone(&self.schema), + _refs: Arc::clone(&self.refs), + })) + } + + fn statistics(&self) -> Result { + unimplemented!() + } + } + + /// A [`RecordBatchStream`] that is pending forever. + #[derive(Debug)] + pub struct BlockingStream { + /// Schema mocked by this stream. + schema: SchemaRef, + + /// Ref-counting helper to check if the stream are still in memory. + _refs: Arc<()>, + } + + impl Stream for BlockingStream { + type Item = Result; + + fn poll_next(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll> { + Poll::Pending + } + } + + impl RecordBatchStream for BlockingStream { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + } +} diff --git a/iox_query/src/provider/record_batch_exec.rs b/iox_query/src/provider/record_batch_exec.rs index 9b3c591edae..612228681bc 100644 --- a/iox_query/src/provider/record_batch_exec.rs +++ b/iox_query/src/provider/record_batch_exec.rs @@ -1,17 +1,19 @@ //! 
Implementation of a DataFusion PhysicalPlan node across partition chunks -use crate::{statistics::DFStatsAggregator, QueryChunk, CHUNK_ORDER_COLUMN_NAME}; +use crate::statistics::build_statistics_for_chunks; +use crate::{QueryChunk, CHUNK_ORDER_COLUMN_NAME}; use super::adapter::SchemaAdapterStream; -use arrow::datatypes::{Schema, SchemaRef}; +use arrow::datatypes::SchemaRef; +use datafusion::physical_plan::display::ProjectSchemaDisplay; use datafusion::{ error::DataFusionError, execution::context::TaskContext, physical_plan::{ expressions::{Column, PhysicalSortExpr}, metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}, - ColumnStatistics, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, - SendableRecordBatchStream, Statistics, + DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, SendableRecordBatchStream, + Statistics, }, scalar::ScalarValue, }; @@ -55,40 +57,10 @@ impl RecordBatchesExec { schema: SchemaRef, output_sort_key_memo: Option, ) -> Self { - let chunk_order_field = schema.field_with_name(CHUNK_ORDER_COLUMN_NAME).ok(); - let chunk_order_only_schema = - chunk_order_field.map(|field| Schema::new(vec![field.clone()])); - let chunks: Vec<_> = chunks.into_iter().collect(); + let statistics = build_statistics_for_chunks(&chunks, Arc::clone(&schema)); - let statistics = chunks - .iter() - .fold(DFStatsAggregator::new(&schema), |mut agg, chunk| { - agg.update(&chunk.stats(), chunk.schema().as_arrow().as_ref()); - - if let Some(schema) = chunk_order_only_schema.as_ref() { - let order = chunk.order().get(); - let order = ScalarValue::from(order); - agg.update( - &Statistics { - num_rows: Some(0), - total_byte_size: Some(0), - column_statistics: Some(vec![ColumnStatistics { - null_count: Some(0), - max_value: Some(order.clone()), - min_value: Some(order), - distinct_count: Some(1), - }]), - is_exact: true, - }, - schema, - ); - } - - agg - }) - .build(); - + let chunk_order_field = schema.field_with_name(CHUNK_ORDER_COLUMN_NAME).ok(); let output_ordering = if chunk_order_field.is_some() { Some(vec![ // every chunk gets its own partition, so we can claim that the output is ordered @@ -199,8 +171,8 @@ impl ExecutionPlan for RecordBatchesExec { Some(self.metrics.clone_inner()) } - fn statistics(&self) -> Statistics { - self.statistics.clone() + fn statistics(&self) -> Result { + Ok(self.statistics.clone()) } } @@ -208,7 +180,11 @@ impl DisplayAs for RecordBatchesExec { fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter<'_>) -> fmt::Result { match t { DisplayFormatType::Default | DisplayFormatType::Verbose => { - write!(f, "RecordBatchesExec: chunks={}", self.chunks.len(),) + write!(f, "RecordBatchesExec: chunks={}", self.chunks.len(),)?; + if !self.schema.fields().is_empty() { + write!(f, ", projection={}", ProjectSchemaDisplay(&self.schema))?; + } + Ok(()) } } } diff --git a/iox_query/src/pruning.rs b/iox_query/src/pruning.rs index 4885b2867e3..50f44f1baf3 100644 --- a/iox_query/src/pruning.rs +++ b/iox_query/src/pruning.rs @@ -2,20 +2,21 @@ use crate::QueryChunk; use arrow::{ - array::{ArrayRef, UInt64Array}, + array::{ArrayRef, BooleanArray, UInt64Array}, datatypes::{DataType, SchemaRef}, }; use datafusion::{ physical_expr::execution_props::ExecutionProps, physical_optimizer::pruning::PruningStatistics, physical_plan::{ColumnStatistics, Statistics}, - prelude::{col, lit_timestamp_nano, Column, Expr}, + prelude::{col, Column, Expr}, scalar::ScalarValue, }; -use datafusion_util::create_pruning_predicate; +use datafusion_util::{create_pruning_predicate, 
lit_timestamptz_nano}; use observability_deps::tracing::{debug, trace, warn}; use query_functions::group_by::Aggregate; use schema::{Schema, TIME_COLUMN_NAME}; +use std::collections::HashSet; use std::sync::Arc; /// Reason why a chunk could not be pruned. @@ -82,16 +83,7 @@ pub fn prune_chunks( .iter() .map(|c| (c.stats(), c.schema().as_arrow())) .collect(); - prune_summaries(table_schema, &summaries, filters) -} -/// Given a `Vec` of pruning summaries, return a `Vec` where `false` indicates that the -/// predicate can be proven to evaluate to `false` for every single row. -pub fn prune_summaries( - table_schema: &Schema, - summaries: &[(Arc, SchemaRef)], - filters: &[Expr], -) -> Result, NotPrunedReason> { let filter_expr = match filters.iter().cloned().reduce(|a, b| a.and(b)) { Some(expr) => expr, None => { @@ -99,12 +91,23 @@ pub fn prune_summaries( return Err(NotPrunedReason::NoExpressionOnPredicate); } }; + + prune_summaries(table_schema, &summaries, &filter_expr) +} + +/// Given a `Vec` of pruning summaries, return a `Vec` where `false` indicates that the +/// predicate can be proven to evaluate to `false` for every single row. +pub fn prune_summaries( + table_schema: &Schema, + summaries: &[(Arc, SchemaRef)], + filter_expr: &Expr, +) -> Result, NotPrunedReason> { trace!(%filter_expr, "Filter_expr of pruning chunks"); // no information about the queries here let props = ExecutionProps::new(); let pruning_predicate = - match create_pruning_predicate(&props, &filter_expr, &table_schema.as_arrow()) { + match create_pruning_predicate(&props, filter_expr, &table_schema.as_arrow()) { Ok(p) => p, Err(e) => { warn!(%e, ?filter_expr, "Can not create pruning predicate"); @@ -148,9 +151,8 @@ impl<'a> ChunkPruningStatistics<'a> { column: &'b Column, ) -> impl Iterator> + 'a { self.summaries.iter().map(|(stats, schema)| { - let stats = stats.column_statistics.as_ref()?; let idx = schema.index_of(&column.name).ok()?; - Some(&stats[idx]) + Some(&stats.column_statistics[idx]) }) } } @@ -175,10 +177,19 @@ impl<'a> PruningStatistics for ChunkPruningStatistics<'a> { fn null_counts(&self, column: &Column) -> Option { let null_counts = self .column_summaries(column) - .map(|x| x.and_then(|s| s.null_count.map(|x| x as u64))); + .map(|stats| stats.and_then(|stats| stats.null_count.get_value())) + .map(|x| x.map(|x| *x as u64)); Some(Arc::new(UInt64Array::from_iter(null_counts))) } + + fn contained( + &self, + _column: &datafusion::common::Column, + _values: &HashSet, + ) -> Option { + None + } } /// Collects an [`ArrayRef`] containing the aggregate statistic corresponding to @@ -201,15 +212,15 @@ fn collect_pruning_stats<'a>( /// Returns the aggregate statistic corresponding to `aggregate` from `stats` fn get_aggregate(stats: &ColumnStatistics, aggregate: Aggregate) -> Option<&ScalarValue> { match aggregate { - Aggregate::Min => stats.min_value.as_ref(), - Aggregate::Max => stats.max_value.as_ref(), + Aggregate::Min => stats.min_value.get_value(), + Aggregate::Max => stats.max_value.get_value(), _ => None, } } /// Retention time expression, "time > retention_time". pub fn retention_expr(retention_time: i64) -> Expr { - col(TIME_COLUMN_NAME).gt(lit_timestamp_nano(retention_time)) + col(TIME_COLUMN_NAME).gt(lit_timestamptz_nano(retention_time)) } #[cfg(test)] diff --git a/iox_query/src/query_log.rs b/iox_query/src/query_log.rs new file mode 100644 index 00000000000..e6ae929cb76 --- /dev/null +++ b/iox_query/src/query_log.rs @@ -0,0 +1,704 @@ +//! 
Ring buffer of queries that have been run with some brief information + +use data_types::NamespaceId; +use datafusion::physical_plan::ExecutionPlan; +use iox_time::{Time, TimeProvider}; +use observability_deps::tracing::{info, warn}; +use parking_lot::Mutex; +use std::{ + collections::VecDeque, + fmt::Debug, + sync::{ + atomic::{self, AtomicBool, AtomicI64, AtomicUsize, Ordering}, + Arc, + }, + time::Duration, +}; +use trace::ctx::TraceId; +use uuid::Uuid; + +/// The query duration used for queries still running. +const UNCOMPLETED_DURATION: i64 = -1; + +/// Information about a single query that was executed +pub struct QueryLogEntry { + /// Unique ID. + pub id: Uuid, + + /// Namespace ID. + pub namespace_id: NamespaceId, + + /// Namespace name. + pub namespace_name: Arc, + + /// The type of query + pub query_type: &'static str, + + /// The text of the query (SQL for sql queries, pbjson for storage rpc queries) + pub query_text: QueryText, + + /// The trace ID if any + pub trace_id: Option, + + /// Time at which the query was run + pub issue_time: Time, + + /// Duration it took to acquire a semaphore permit, relative to [`issue_time`](Self::issue_time). + permit_duration: AtomicDuration, + + /// Duration it took to plan the query, relative to [`issue_time`](Self::issue_time) + [`permit_duration`](Self::permit_duration). + plan_duration: AtomicDuration, + + /// Duration it took to execute the query, relative to [`issue_time`](Self::issue_time) + + /// [`permit_duration`](Self::permit_duration) + [`plan_duration`](Self::plan_duration). + execute_duration: AtomicDuration, + + /// Duration from [`issue_time`](Self::issue_time) til the query ended somehow. + end2end_duration: AtomicDuration, + + /// CPU duration spend for computation. + compute_duration: AtomicDuration, + + /// If the query completed successfully + success: AtomicBool, + + /// If the query is currently running (in any state). + running: AtomicBool, +} + +impl Debug for QueryLogEntry { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("QueryLogEntry") + .field("id", &self.id) + .field("namespace_id", &self.namespace_id) + .field("namespace_name", &self.namespace_name) + .field("query_type", &self.query_type) + .field("query_text", &self.query_text.to_string()) + .field("trace_id", &self.trace_id) + .field("issue_time", &self.issue_time) + .field("permit_duration", &self.permit_duration()) + .field("plan_duration", &self.plan_duration()) + .field("execute_duration", &self.execute_duration()) + .field("end2end_duration", &self.end2end_duration()) + .field("compute_duration", &self.compute_duration()) + .field("success", &self.success()) + .field("running", &self.running()) + .finish() + } +} + +impl QueryLogEntry { + /// Duration it took to acquire a semaphore permit, relative to [`issue_time`](Self::issue_time). + pub fn permit_duration(&self) -> Option { + self.permit_duration.get() + } + + /// Duration it took to plan the query, relative to [`issue_time`](Self::issue_time) + [`permit_duration`](Self::permit_duration). + pub fn plan_duration(&self) -> Option { + self.plan_duration.get() + } + + /// Duration it took to execute the query, relative to [`issue_time`](Self::issue_time) + + /// [`permit_duration`](Self::permit_duration) + [`plan_duration`](Self::plan_duration). + pub fn execute_duration(&self) -> Option { + self.execute_duration.get() + } + + /// Duration from [`issue_time`](Self::issue_time) til the query ended somehow. 
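+ // How the duration fields on this entry relate (as recorded by `QueryCompletedToken` below):
+ //   plan_duration    = planned() time        - issue_time
+ //   permit_duration  = permit() time         - (issue_time + plan_duration)
+ //   execute_duration = success()/fail() time - (issue_time + plan_duration + permit_duration)
+ //   end2end_duration = drop time             - issue_time (measured directly, not derived as a sum)
+ //   compute_duration = CPU time summed over the executed plan's `elapsed_compute` metrics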
+ pub fn end2end_duration(&self) -> Option { + self.end2end_duration.get() + } + + /// CPU duration spend for computation. + pub fn compute_duration(&self) -> Option { + self.compute_duration.get() + } + + /// Returns true if `set_completed` was called with `success=true` + pub fn success(&self) -> bool { + self.success.load(Ordering::SeqCst) + } + + /// If the query is currently running (in any state). + pub fn running(&self) -> bool { + self.running.load(Ordering::SeqCst) + } + + /// Log entry. + pub fn log(&self, when: &'static str) { + info!( + when, + id=%self.id, + namespace_id=self.namespace_id.get(), + namespace_name=self.namespace_name.as_ref(), + query_type=self.query_type, + query_text=%self.query_text, + trace_id=self.trace_id.map(|id| format!("{:x}", id.get())), + issue_time=%self.issue_time, + plan_duration_secs=self.plan_duration().map(|d| d.as_secs_f64()), + permit_duration_secs=self.permit_duration().map(|d| d.as_secs_f64()), + execute_duration_secs=self.execute_duration().map(|d| d.as_secs_f64()), + end2end_duration_secs=self.end2end_duration().map(|d| d.as_secs_f64()), + compute_duration_secs=self.compute_duration().map(|d| d.as_secs_f64()), + success=self.success(), + running=self.running(), + "query", + ) + } +} + +/// Snapshot of the entries the [`QueryLog`]. +#[derive(Debug)] +pub struct QueryLogEntries { + /// Entries. + pub entries: VecDeque>, + + /// Maximum number of entries + pub max_size: usize, + + /// Number of evicted entries due to the "max size" constraint. + pub evicted: usize, +} + +/// Stores a fixed number `QueryExecutions` -- handles locking +/// internally so can be shared across multiple +pub struct QueryLog { + log: Mutex>>, + max_size: usize, + evicted: AtomicUsize, + time_provider: Arc, + id_gen: IDGen, +} + +impl QueryLog { + /// Create a new QueryLog that can hold at most `size` items. + /// When the `size+1` item is added, item `0` is evicted. 
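+ // A hypothetical usage sketch of the log together with the completion token; `plan` and
+ // `time_provider` are placeholders here, not values defined by this change:
+ //     let log = QueryLog::new(1_000, time_provider);
+ //     let token = log.push(NamespaceId::new(1), Arc::from("ns"), "sql", Box::new("SELECT 1"), None);
+ //     let token = token.planned(Arc::clone(&plan)); // after the query is planned
+ //     let token = token.permit();                   // after the concurrency semaphore permit is acquired
+ //     token.success();                              // or token.fail(); dropping the token logs "end"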
+ pub fn new(max_size: usize, time_provider: Arc) -> Self { + Self::new_with_id_gen(max_size, time_provider, Box::new(Uuid::new_v4)) + } + + pub fn new_with_id_gen( + max_size: usize, + time_provider: Arc, + id_gen: IDGen, + ) -> Self { + Self { + log: Mutex::new(VecDeque::with_capacity(max_size)), + max_size, + evicted: AtomicUsize::new(0), + time_provider, + id_gen, + } + } + + pub fn push( + &self, + namespace_id: NamespaceId, + namespace_name: Arc, + query_type: &'static str, + query_text: QueryText, + trace_id: Option, + ) -> QueryCompletedToken { + let entry = Arc::new(QueryLogEntry { + id: (self.id_gen)(), + namespace_id, + namespace_name, + query_type, + query_text, + trace_id, + issue_time: self.time_provider.now(), + permit_duration: Default::default(), + plan_duration: Default::default(), + execute_duration: Default::default(), + end2end_duration: Default::default(), + compute_duration: Default::default(), + success: atomic::AtomicBool::new(false), + running: atomic::AtomicBool::new(true), + }); + entry.log("start"); + let token = QueryCompletedToken { + entry: Some(Arc::clone(&entry)), + time_provider: Arc::clone(&self.time_provider), + state: Default::default(), + }; + + if self.max_size == 0 { + return token; + } + + let mut log = self.log.lock(); + + // enforce limit + while log.len() > self.max_size { + log.pop_front(); + self.evicted.fetch_add(1, Ordering::SeqCst); + } + + log.push_back(Arc::clone(&entry)); + token + } + + pub fn entries(&self) -> QueryLogEntries { + let log = self.log.lock(); + QueryLogEntries { + entries: log.clone(), + max_size: self.max_size, + evicted: self.evicted.load(Ordering::SeqCst), + } + } +} + +impl Debug for QueryLog { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("QueryLog") + .field("log", &self.log) + .field("max_size", &self.max_size) + .field("evicted", &self.evicted) + .field("time_provider", &self.time_provider) + .field("id_gen", &"") + .finish() + } +} + +/// State of [`QueryCompletedToken`]. +/// +/// # Done +/// - The query has been received (and potentially authenticated) by the server. +/// +/// # To Do +/// - The concurrency-limiting semaphore has NOT yet issued a permit. +/// - The query is not planned. +/// - The query has not been executed. +#[derive(Debug, Clone, Copy, Default)] +pub struct StateReceived; + +/// State of [`QueryCompletedToken`]. +/// +/// # Done +/// - The query has been received (and potentially authenticated) by the server. +/// - The concurrency-limiting semaphore has issued a permit. +/// - The query was planned. +/// +/// # To Do +/// - The concurrency-limiting semaphore has NOT yet issued a permit. +/// - The query has not been executed. +#[derive(Debug)] +pub struct StatePlanned { + /// Physical execution plan. + plan: Arc, +} + +/// State of [`QueryCompletedToken`]. +/// +/// # Done +/// - The query has been received (and potentially authenticated) by the server. +/// - The concurrency-limiting semaphore has issued a permit. +/// +/// # To Do +/// - The query has not been executed. +#[derive(Debug)] +pub struct StatePermit { + /// Physical execution plan. + plan: Arc, +} + +/// A `QueryCompletedToken` is returned by `record_query` implementations of +/// a `QueryNamespace`. It is used to trigger side-effects (such as query timing) +/// on query completion. +#[derive(Debug)] +pub struct QueryCompletedToken { + /// Entry. + /// + /// This is optional so we can implement type state and [`Drop`] at the same time. 
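+ // Background for the `Option` wrapper: a type that implements `Drop` cannot have fields
+ // moved out of it, yet the state transitions below take `self` by value and hand the entry
+ // to the next state. Wrapping it in `Option` lets a transition `take()` the entry through
+ // `&mut`, so the consumed token's `drop` sees `None` and does nothing. A minimal sketch of
+ // the same pattern, with illustrative names only:
+ //     struct Token<S> { entry: Option<Entry>, state: S }
+ //     impl Token<Received> {
+ //         fn planned(mut self, plan: Plan) -> Token<Planned> {
+ //             let entry = self.entry.take().expect("valid state");
+ //             Token { entry: Some(entry), state: Planned { plan } }
+ //         }
+ //     }
+ //     impl<S> Drop for Token<S> {
+ //         fn drop(&mut self) { if let Some(entry) = self.entry.take() { /* finalize once */ } }
+ //     }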
+ entry: Option>, + + /// Time provider + time_provider: Arc, + + /// Current state. + state: S, +} + +impl QueryCompletedToken { + /// Underlying entry. + pub fn entry(&self) -> &Arc { + self.entry.as_ref().expect("valid state") + } +} + +impl QueryCompletedToken { + /// Record that this query got planned. + pub fn planned(mut self, plan: Arc) -> QueryCompletedToken { + let entry = self.entry.take().expect("valid state"); + + let now = self.time_provider.now(); + let origin = entry.issue_time; + entry.plan_duration.set_relative(origin, now); + + QueryCompletedToken { + entry: Some(entry), + time_provider: Arc::clone(&self.time_provider), + state: StatePlanned { plan }, + } + } +} + +impl QueryCompletedToken { + /// Record that this query got a semaphore permit. + pub fn permit(mut self) -> QueryCompletedToken { + let entry = self.entry.take().expect("valid state"); + + let now = self.time_provider.now(); + let origin = entry.issue_time + entry.plan_duration().expect("valid state"); + entry.permit_duration.set_relative(origin, now); + + QueryCompletedToken { + entry: Some(entry), + time_provider: Arc::clone(&self.time_provider), + state: StatePermit { + plan: Arc::clone(&self.state.plan), + }, + } + } +} + +impl QueryCompletedToken { + /// Record that this query completed successfully + pub fn success(self) { + let entry = self.entry.as_ref().expect("valid state"); + entry.success.store(true, Ordering::SeqCst); + + self.finish() + } + + /// Record that the query finished execution with an error. + pub fn fail(self) { + self.finish() + } + + fn finish(&self) { + let entry = self.entry.as_ref().expect("valid state"); + + let now = self.time_provider.now(); + let origin = entry.issue_time + + entry.permit_duration().expect("valid state") + + entry.plan_duration().expect("valid state"); + entry.execute_duration.set_relative(origin, now); + + entry + .compute_duration + .set_absolute(collect_compute_duration(self.state.plan.as_ref())); + } +} + +impl Drop for QueryCompletedToken { + fn drop(&mut self) { + if let Some(entry) = self.entry.take() { + let now = self.time_provider.now(); + entry.end2end_duration.set_relative(entry.issue_time, now); + entry.running.store(false, Ordering::SeqCst); + + entry.log("end"); + } + } +} + +/// Boxed description of a query that knows how to render to a string +/// +/// This avoids storing potentially large strings +pub type QueryText = Box; + +/// Method that generated [`Uuid`]s. +pub type IDGen = Box Uuid + Send + Sync>; + +struct AtomicDuration(AtomicI64); + +impl AtomicDuration { + fn get(&self) -> Option { + match self.0.load(Ordering::Relaxed) { + UNCOMPLETED_DURATION => None, + d => Some(Duration::from_nanos(d as u64)), + } + } + + fn set_relative(&self, origin: Time, now: Time) { + match now.checked_duration_since(origin) { + Some(dur) => { + self.0.store(dur.as_nanos() as i64, Ordering::Relaxed); + } + None => { + warn!("Clock went backwards, not query duration") + } + } + } + + fn set_absolute(&self, d: Duration) { + self.0.store(d.as_nanos() as i64, Ordering::Relaxed); + } +} + +impl Default for AtomicDuration { + fn default() -> Self { + Self(AtomicI64::new(UNCOMPLETED_DURATION)) + } +} + +/// Collect compute duration from [`ExecutionPlan`]. 
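Editorial sketch: taken together, the states above form a linear type-state machine. The function below shows the call order a query handler is expected to follow; the handler itself is illustrative only and assumes the types defined in this file plus DataFusion's `ExecutionPlan`.

fn drive_token_sketch(
    log: &QueryLog,
    namespace_id: NamespaceId,
    namespace_name: Arc<str>,
    query_text: QueryText,
    plan: Arc<dyn ExecutionPlan>,
) {
    // StateReceived: `push` records issue_time and logs "start".
    let token = log.push(namespace_id, namespace_name, "sql", query_text, None);

    // StatePlanned: plan_duration = now - issue_time.
    let token = token.planned(Arc::clone(&plan));

    // StatePermit: permit_duration = now - (issue_time + plan_duration).
    let token = token.permit();

    // ... run the plan to completion here ...

    // Records execute_duration and compute_duration; dropping the token then
    // records end2end_duration and logs "end". Calling `fail()` instead would
    // record the same durations but leave `success == false`.
    token.success();
}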
+fn collect_compute_duration(plan: &dyn ExecutionPlan) -> Duration { + let mut total = Duration::ZERO; + + if let Some(metrics) = plan.metrics() { + if let Some(nanos) = metrics.elapsed_compute() { + total += Duration::from_nanos(nanos as u64); + } + } + + for child in plan.children() { + total += collect_compute_duration(child.as_ref()); + } + + total +} + +#[cfg(test)] +mod test_super { + use datafusion::error::DataFusionError; + use std::sync::atomic::AtomicU64; + + use datafusion::physical_plan::{ + metrics::{MetricValue, MetricsSet}, + DisplayAs, Metric, + }; + use iox_time::MockProvider; + use test_helpers::tracing::TracingCapture; + + use super::*; + + #[test] + fn test_token_end2end_success() { + let capture = TracingCapture::new(); + + let Test { + time_provider, + token, + entry, + } = Test::default(); + + assert!(!entry.success()); + assert!(entry.running()); + assert_eq!(entry.permit_duration(), None,); + assert_eq!(entry.plan_duration(), None,); + assert_eq!(entry.execute_duration(), None,); + assert_eq!(entry.end2end_duration(), None,); + assert_eq!(entry.compute_duration(), None,); + + time_provider.inc(Duration::from_millis(1)); + let token = token.planned(plan()); + + assert!(!entry.success()); + assert!(entry.running()); + assert_eq!(entry.plan_duration(), Some(Duration::from_millis(1)),); + assert_eq!(entry.permit_duration(), None,); + assert_eq!(entry.execute_duration(), None,); + assert_eq!(entry.end2end_duration(), None,); + assert_eq!(entry.compute_duration(), None,); + + time_provider.inc(Duration::from_millis(10)); + let token = token.permit(); + + assert!(!entry.success()); + assert!(entry.running()); + assert_eq!(entry.plan_duration(), Some(Duration::from_millis(1)),); + assert_eq!(entry.permit_duration(), Some(Duration::from_millis(10)),); + assert_eq!(entry.execute_duration(), None,); + assert_eq!(entry.end2end_duration(), None,); + assert_eq!(entry.compute_duration(), None,); + + time_provider.inc(Duration::from_millis(100)); + token.success(); + + assert!(entry.success()); + assert!(!entry.running()); + assert_eq!(entry.plan_duration(), Some(Duration::from_millis(1)),); + assert_eq!(entry.permit_duration(), Some(Duration::from_millis(10)),); + assert_eq!(entry.execute_duration(), Some(Duration::from_millis(100)),); + assert_eq!(entry.end2end_duration(), Some(Duration::from_millis(111)),); + assert_eq!(entry.compute_duration(), Some(Duration::from_millis(1_337)),); + + assert_eq!( + capture.to_string().trim(), + [ + r#"level = INFO; message = query; when = "start"; id = 00000000-0000-0000-0000-000000000001; namespace_id = 1; namespace_name = "ns"; query_type = "sql"; query_text = SELECT 1; issue_time = 1970-01-01T00:00:00.100+00:00; success = false; running = true;"#, + r#"level = INFO; message = query; when = "end"; id = 00000000-0000-0000-0000-000000000001; namespace_id = 1; namespace_name = "ns"; query_type = "sql"; query_text = SELECT 1; issue_time = 1970-01-01T00:00:00.100+00:00; plan_duration_secs = 0.001; permit_duration_secs = 0.01; execute_duration_secs = 0.1; end2end_duration_secs = 0.111; compute_duration_secs = 1.337; success = true; running = false;"#, + ].join(" \n") + ); + } + + #[test] + fn test_token_execution_fail() { + let capture = TracingCapture::new(); + + let Test { + time_provider, + token, + entry, + } = Test::default(); + + time_provider.inc(Duration::from_millis(1)); + let token = token.planned(plan()); + time_provider.inc(Duration::from_millis(10)); + let token = token.permit(); + time_provider.inc(Duration::from_millis(100)); + 
token.fail(); + + assert!(!entry.success()); + assert!(!entry.running()); + assert_eq!(entry.plan_duration(), Some(Duration::from_millis(1)),); + assert_eq!(entry.permit_duration(), Some(Duration::from_millis(10)),); + assert_eq!(entry.execute_duration(), Some(Duration::from_millis(100)),); + assert_eq!(entry.end2end_duration(), Some(Duration::from_millis(111)),); + assert_eq!(entry.compute_duration(), Some(Duration::from_millis(1_337)),); + + assert_eq!( + capture.to_string().trim(), + [ + r#"level = INFO; message = query; when = "start"; id = 00000000-0000-0000-0000-000000000001; namespace_id = 1; namespace_name = "ns"; query_type = "sql"; query_text = SELECT 1; issue_time = 1970-01-01T00:00:00.100+00:00; success = false; running = true;"#, + r#"level = INFO; message = query; when = "end"; id = 00000000-0000-0000-0000-000000000001; namespace_id = 1; namespace_name = "ns"; query_type = "sql"; query_text = SELECT 1; issue_time = 1970-01-01T00:00:00.100+00:00; plan_duration_secs = 0.001; permit_duration_secs = 0.01; execute_duration_secs = 0.1; end2end_duration_secs = 0.111; compute_duration_secs = 1.337; success = false; running = false;"#, + ].join(" \n") + ); + } + + #[test] + fn test_token_drop_before_acquire() { + let capture = TracingCapture::new(); + + let Test { + time_provider, + token, + entry, + } = Test::default(); + + time_provider.inc(Duration::from_millis(100)); + drop(token); + + assert!(!entry.success()); + assert!(!entry.running()); + assert_eq!(entry.permit_duration(), None,); + assert_eq!(entry.plan_duration(), None,); + assert_eq!(entry.execute_duration(), None,); + assert_eq!(entry.end2end_duration(), Some(Duration::from_millis(100)),); + assert_eq!(entry.compute_duration(), None,); + + assert_eq!( + capture.to_string().trim(), + [ + r#"level = INFO; message = query; when = "start"; id = 00000000-0000-0000-0000-000000000001; namespace_id = 1; namespace_name = "ns"; query_type = "sql"; query_text = SELECT 1; issue_time = 1970-01-01T00:00:00.100+00:00; success = false; running = true;"#, + r#"level = INFO; message = query; when = "end"; id = 00000000-0000-0000-0000-000000000001; namespace_id = 1; namespace_name = "ns"; query_type = "sql"; query_text = SELECT 1; issue_time = 1970-01-01T00:00:00.100+00:00; end2end_duration_secs = 0.1; success = false; running = false;"#, + ].join(" \n") + ); + } + + struct Test { + time_provider: Arc, + token: QueryCompletedToken, + entry: Arc, + } + + impl Default for Test { + fn default() -> Self { + let time_provider = + Arc::new(MockProvider::new(Time::from_timestamp_millis(100).unwrap())); + let id_counter = AtomicU64::new(1); + let log = QueryLog::new_with_id_gen( + 1_000, + Arc::clone(&time_provider) as _, + Box::new(move || Uuid::from_u128(id_counter.fetch_add(1, Ordering::SeqCst) as _)), + ); + + let token = log.push( + NamespaceId::new(1), + Arc::from("ns"), + "sql", + Box::new("SELECT 1"), + None, + ); + + let entry = Arc::clone(token.entry()); + + Self { + time_provider, + token, + entry, + } + } + } + + fn plan() -> Arc { + Arc::new(TestExec) + } + + #[derive(Debug)] + struct TestExec; + + impl DisplayAs for TestExec { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + _f: &mut std::fmt::Formatter<'_>, + ) -> std::fmt::Result { + unimplemented!() + } + } + + impl ExecutionPlan for TestExec { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn schema(&self) -> arrow::datatypes::SchemaRef { + unimplemented!() + } + + fn output_partitioning(&self) -> datafusion::physical_plan::Partitioning { + 
unimplemented!() + } + + fn output_ordering(&self) -> Option<&[datafusion::physical_expr::PhysicalSortExpr]> { + unimplemented!() + } + + fn children(&self) -> Vec> { + vec![] + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> datafusion::error::Result> { + unimplemented!() + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> datafusion::error::Result + { + unimplemented!() + } + + fn statistics(&self) -> Result { + unimplemented!() + } + + fn metrics(&self) -> Option { + let mut metrics = MetricsSet::default(); + + let t = datafusion::physical_plan::metrics::Time::default(); + t.add_duration(Duration::from_millis(1_337)); + metrics.push(Arc::new(Metric::new(MetricValue::ElapsedCompute(t), None))); + + Some(metrics) + } + } +} diff --git a/iox_query/src/statistics.rs b/iox_query/src/statistics.rs index fd5f98cfaa9..3fc4d540543 100644 --- a/iox_query/src/statistics.rs +++ b/iox_query/src/statistics.rs @@ -1,20 +1,41 @@ //! Code to translate IOx statistics to DataFusion statistics -use std::{cmp::Ordering, collections::HashMap}; +use std::collections::{HashMap, VecDeque}; +use std::sync::Arc; -use arrow::datatypes::Schema; +use arrow::compute::rank; +use arrow::datatypes::{Schema, SchemaRef}; +use datafusion::common::stats::Precision; +use datafusion::datasource::physical_plan::ParquetExec; +use datafusion::error::DataFusionError; +use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; +use datafusion::physical_plan::empty::EmptyExec; +use datafusion::physical_plan::expressions::Column; +use datafusion::physical_plan::filter::FilterExec; +use datafusion::physical_plan::placeholder_row::PlaceholderRowExec; +use datafusion::physical_plan::projection::ProjectionExec; +use datafusion::physical_plan::repartition::RepartitionExec; +use datafusion::physical_plan::sorts::sort::SortExec; +use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; +use datafusion::physical_plan::union::UnionExec; +use datafusion::physical_plan::{visit_execution_plan, ExecutionPlan, ExecutionPlanVisitor}; use datafusion::{ physical_plan::{ColumnStatistics, Statistics as DFStatistics}, scalar::ScalarValue, }; +use observability_deps::tracing::trace; + +use crate::provider::{DeduplicateExec, RecordBatchesExec}; +use crate::{QueryChunk, CHUNK_ORDER_COLUMN_NAME}; /// Aggregates DataFusion [statistics](DFStatistics). 
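// Editorial note on the shape change in this rewrite: DataFusion 49 models every
// statistic as `Precision<T>` (`Exact`, `Inexact`, or `Absent`) instead of
// `Option<T>` plus a plan-wide `is_exact` flag, so exactness now travels with each
// individual value and the aggregator below no longer tracks `is_exact` itself.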
#[derive(Debug)] pub struct DFStatsAggregator<'a> { - num_rows: Option, - total_byte_size: Option, - column_statistics: Option>, - is_exact: bool, + num_rows: Precision, + total_byte_size: Precision, + column_statistics: Vec, + // Maps column name to index in column_statistics for all columns we are + // aggregating col_idx_map: HashMap<&'a str, usize>, } @@ -39,18 +60,16 @@ impl<'a> DFStatsAggregator<'a> { .collect::>(); Self { - num_rows: Some(0), - total_byte_size: Some(0), - column_statistics: Some( - (0..col_idx_map.len()) - .map(|_| DFStatsAggregatorCol { - null_count: Some(0), - max_value: TriStateScalar::Uninit, - min_value: TriStateScalar::Uninit, - }) - .collect(), - ), - is_exact: true, + num_rows: Precision::Exact(0), + total_byte_size: Precision::Exact(0), + column_statistics: (0..col_idx_map.len()) + .map(|_| DFStatsAggregatorCol { + null_count: Precision::Exact(0), + max_value: None, + min_value: None, + }) + .collect(), + col_idx_map, } } @@ -70,81 +89,70 @@ impl<'a> DFStatsAggregator<'a> { num_rows: update_num_rows, total_byte_size: update_total_byte_size, column_statistics: update_column_statistics, - is_exact: update_is_exact, } = update_stats; - self.num_rows = self - .num_rows - .zip(*update_num_rows) - .map(|(base, update)| base + update); - self.total_byte_size = self - .total_byte_size - .zip(*update_total_byte_size) - .map(|(base, update)| base + update); - self.column_statistics = self - .column_statistics - .take() - .zip(update_column_statistics.as_ref()) - .map(|(mut base_cols, update_cols)| { - assert_eq!(base_cols.len(), self.col_idx_map.len()); - assert!( - update_cols.len() == update_schema.fields().len(), - "stats ({}) and schema ({}) have different column count", - update_cols.len(), - update_schema.fields().len(), - ); + self.num_rows = self.num_rows.add(update_num_rows); + self.total_byte_size = self.total_byte_size.add(update_total_byte_size); - let mut used_cols = vec![false; self.col_idx_map.len()]; - - for (update_field, update_col) in update_schema.fields().iter().zip(update_cols) { - let Some(idx) = self.col_idx_map.get(update_field.name().as_str()) else { - continue; - }; - let base_col = &mut base_cols[*idx]; - used_cols[*idx] = true; - - // decompose structs so we don't forget new fields - let DFStatsAggregatorCol { - null_count: base_null_count, - max_value: base_max_value, - min_value: base_min_value, - } = base_col; - let ColumnStatistics { - null_count: update_null_count, - max_value: update_max_value, - min_value: update_min_value, - distinct_count: _update_distinct_count, - } = update_col; - - *base_null_count = base_null_count - .zip(*update_null_count) - .map(|(base, update)| base + update); - base_max_value.update(update_max_value, |base, update| { - match base.partial_cmp(update) { - None => None, - Some(Ordering::Less) => Some(update.clone()), - Some(Ordering::Equal | Ordering::Greater) => Some(base), - } - }); - base_min_value.update(update_min_value, |base, update| { - match base.partial_cmp(update) { - None => None, - Some(Ordering::Less | Ordering::Equal) => Some(base), - Some(Ordering::Greater) => Some(update.clone()), - } - }); - } + assert_eq!(self.column_statistics.len(), self.col_idx_map.len()); + assert_eq!( + update_column_statistics.len(), + update_schema.fields().len(), + "stats ({}) and schema ({}) have different column count", + update_column_statistics.len(), + update_schema.fields().len(), + ); - // for unused cols, we need to assume all-NULL and hence invalidate the null counters - for (used, base_col) in 
used_cols.into_iter().zip(&mut base_cols) { - if !used { - base_col.null_count = None; - } - } + let mut used_cols = vec![false; self.col_idx_map.len()]; - base_cols - }); - self.is_exact &= update_is_exact; + for (update_field, update_col) in update_schema + .fields() + .iter() + .zip(update_column_statistics.iter()) + { + // Skip if not aggregating statitics for this field + let Some(idx) = self.col_idx_map.get(update_field.name().as_str()) else { + continue; + }; + let base_col = &mut self.column_statistics[*idx]; + used_cols[*idx] = true; + + // decompose structs so we don't forget new fields + let DFStatsAggregatorCol { + null_count: base_null_count, + max_value: base_max_value, + min_value: base_min_value, + } = base_col; + let ColumnStatistics { + null_count: update_null_count, + max_value: update_max_value, + min_value: update_min_value, + distinct_count: _update_distinct_count, + } = update_col; + + *base_null_count = base_null_count.add(update_null_count); + + *base_max_value = Some( + base_max_value + .take() + .map(|base_max_value| base_max_value.max(update_max_value)) + .unwrap_or(update_max_value.clone()), + ); + + *base_min_value = Some( + base_min_value + .take() + .map(|base_min_value| base_min_value.min(update_min_value)) + .unwrap_or(update_min_value.clone()), + ); + } + + // for unused cols, we need to assume all-NULL and hence invalidate the null counters + for (used, base_col) in used_cols.into_iter().zip(&mut self.column_statistics) { + if !used { + base_col.null_count = Precision::Absent; + } + } } /// Build aggregated statistics. @@ -152,87 +160,336 @@ impl<'a> DFStatsAggregator<'a> { DFStatistics { num_rows: self.num_rows, total_byte_size: self.total_byte_size, - column_statistics: self.column_statistics.map(|cols| { - cols.into_iter() - .map(|col| ColumnStatistics { - null_count: col.null_count, - max_value: col.max_value.collapse(), - min_value: col.min_value.collapse(), - distinct_count: None, - }) - .collect() - }), - is_exact: self.is_exact, + column_statistics: self + .column_statistics + .into_iter() + .map(|col| ColumnStatistics { + null_count: col.null_count, + max_value: col.max_value.unwrap_or(Precision::Absent), + min_value: col.min_value.unwrap_or(Precision::Absent), + distinct_count: Precision::Absent, + }) + .collect(), } } } -/// Similar to [`ColumnStatistics`] but has a tri-state for the min/max values so we can differentiate between -/// ["uninitialized"](TriStateScalar::Uninit) and ["invalid"](TriStateScalar::Invalid). +/// Similar to [`ColumnStatistics`] but uses `Option` to track min/max values so +/// we can differentiate between +/// +/// 1. "uninitialized" (`None`) +/// 1. "initialized" (`Some(Precision::Exact(...))`) +/// 2. "initialized but invalid" (`Some(Precision::Absent)`). /// /// It also does NOT contain a distinct count because we cannot aggregate these. #[derive(Debug)] struct DFStatsAggregatorCol { - null_count: Option, - max_value: TriStateScalar, - min_value: TriStateScalar, + null_count: Precision, + max_value: Option>, + min_value: Option>, } -#[derive(Debug)] -enum TriStateScalar { - /// Scalar has valid state. - Valid(ScalarValue), +/// build DF statitics for given chunks and a schema +pub fn build_statistics_for_chunks( + chunks: &[Arc], + schema: SchemaRef, +) -> DFStatistics { + let chunk_order_field = schema.field_with_name(CHUNK_ORDER_COLUMN_NAME).ok(); + let chunk_order_only_schema = chunk_order_field.map(|field| Schema::new(vec![field.clone()])); - /// Scalar was not yet initialized. 
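Editorial sketch: the aggregation above leans on `Precision`'s combinators rather than hand-rolled `Option` math. A small standalone sketch of the semantics it relies on (values are made up; only `datafusion::common::stats::Precision` is assumed):

use datafusion::common::stats::Precision;

fn precision_semantics_sketch() {
    // sums stay exact only while both sides are exact
    let a = Precision::Exact(10_usize);
    let b = Precision::Exact(32_usize);
    assert_eq!(a.add(&b), Precision::Exact(42_usize));
    assert_eq!(a.add(&Precision::Absent), Precision::Absent);

    // min/max combine the same way, which is what the per-column fold above uses
    let lo = Precision::Exact(3_i64);
    let hi = Precision::Exact(5_i64);
    assert_eq!(lo.max(&hi), Precision::Exact(5_i64));
    assert_eq!(lo.min(&hi), Precision::Exact(3_i64));
}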
- Uninit, + let chunks: Vec<_> = chunks.iter().collect(); - /// Scalar was poisoned and is invalid. - Invalid, + let statistics = chunks + .iter() + .fold(DFStatsAggregator::new(&schema), |mut agg, chunk| { + agg.update(&chunk.stats(), chunk.schema().as_arrow().as_ref()); + + if let Some(schema) = chunk_order_only_schema.as_ref() { + let order = chunk.order().get(); + let order = ScalarValue::from(order); + + agg.update( + &DFStatistics { + num_rows: Precision::Exact(0), + total_byte_size: Precision::Exact(0), + column_statistics: vec![ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(order.clone()), + min_value: Precision::Exact(order), + distinct_count: Precision::Exact(1), + }], + }, + schema, + ); + } + + agg + }) + .build(); + + statistics +} + +/// Traverse the execution plan and build statistics min max for the given column +pub fn compute_stats_column_min_max( + plan: &dyn ExecutionPlan, + column_name: &str, +) -> Result { + let mut visitor = StatisticsVisitor::new(column_name); + visit_execution_plan(plan, &mut visitor)?; + + // there must be only one statistics left in the stack + if visitor.statistics.len() != 1 { + return Err(DataFusionError::Internal(format!( + "There must be only one statistics left in the stack, but find {}", + visitor.statistics.len() + ))); + } + + Ok(visitor.statistics.pop_back().unwrap()) +} + +/// Traverse the physical plan and build statistics min max for the given column each node +/// Note: This is a temproray solution until DF's statistics is more mature +/// +struct StatisticsVisitor<'a> { + column_name: &'a str, //String, // todo: not sure enough + statistics: VecDeque, +} + +impl<'a> StatisticsVisitor<'a> { + fn new(column_name: &'a str) -> Self { + Self { + column_name, + statistics: VecDeque::new(), + } + } } -impl TriStateScalar { - fn update<'a, F>(&mut self, update: &'a Option, f: F) - where - F: FnOnce(ScalarValue, &'a ScalarValue) -> Option, - { - match (self, update.as_ref()) { - // invalid acts as a poison value - (Self::Invalid, _) => {} - // update w/o invalid invalidates aggregate - (this, None) => { - *this = Self::Invalid; +impl ExecutionPlanVisitor for StatisticsVisitor<'_> { + type Error = DataFusionError; + + fn pre_visit(&mut self, _plan: &dyn ExecutionPlan) -> Result { + Ok(false) + } + + fn post_visit(&mut self, plan: &dyn ExecutionPlan) -> Result { + // If this is an EmptyExec / PlaceholderRowExec, we don't know about it + if plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + { + self.statistics.push_back(ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Absent, + min_value: Precision::Absent, + distinct_count: Precision::Absent, + }); + } + // If this is leaf node (ParquetExec or RecordBatchExec), compute its statistics and push it to the stack + else if plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + { + // get index of the given column in the schema + let statistics = match plan.schema().index_of(self.column_name) { + Ok(col_index) => plan.statistics()?.column_statistics[col_index].clone(), + // This is the case of alias, do not optimize by returning no statistics + Err(_) => { + trace!( + " ------------------- No statistics for column {} in PQ/RB", + self.column_name + ); + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Absent, + min_value: Precision::Absent, + distinct_count: Precision::Absent, + } + } + }; + self.statistics.push_back(statistics); + } + // 
Non leaf node + else { + // These are cases the stats will be unioned of their children's + // Sort, Dediplicate, Filter, Repartition, Union, SortPreservingMerge, CoalesceBatches + let union_stats = if plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + || plan + .as_any() + .downcast_ref::() + .is_some() + || plan + .as_any() + .downcast_ref::() + .is_some() + { + true + } else if plan.as_any().downcast_ref::().is_some() { + // ProjectionExec is a special case. Only union stats if it includes pure columns + projection_includes_pure_columns( + plan.as_any().downcast_ref::().unwrap(), + ) + } else { + false + }; + + // pop statistics of all inputs from the stack + let num_inputs = plan.children().len(); + // num_input must > 0. Pop the first one + let mut statistics = self + .statistics + .pop_back() + .expect("No statistics for input plan"); + // pop the rest and update the min and max + for _ in 1..num_inputs { + let input_statistics = self + .statistics + .pop_back() + .expect("No statistics for input plan"); + + if union_stats { + // Convervatively union min max + statistics.null_count = statistics.null_count.add(&input_statistics.null_count); + statistics.max_value = statistics.max_value.max(&input_statistics.max_value); + statistics.min_value = statistics.min_value.min(&input_statistics.min_value); + statistics.distinct_count = Precision::Absent; + }; } - // uninit w/ first value just clones the value - (this @ Self::Uninit, Some(update)) => { - *this = Self::Valid(update.clone()); + + if union_stats { + self.statistics.push_back(statistics); + } else { + trace!( + " ------ No statistics for column {} in non-leaf node", + self.column_name + ); + // Make them absent for other cases + self.statistics.push_back(ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Absent, + min_value: Precision::Absent, + distinct_count: Precision::Absent, + }); } - // updating a valid value with something requires a folding function - (this @ Self::Valid(_), Some(update)) => { - let mut base = Self::Invalid; - std::mem::swap(this, &mut base); - let Self::Valid(base) = base else { - unreachable!() - }; - *this = match f(base, update) { - Some(val) => Self::Valid(val), - None => Self::Invalid, + } + + Ok(true) + } +} + +fn projection_includes_pure_columns(projection: &ProjectionExec) -> bool { + projection + .expr() + .iter() + .all(|(expr, _col_name)| expr.as_any().downcast_ref::().is_some()) +} + +/// Return min max of a ColumnStatistics with precise values +pub fn column_statistics_min_max( + column_statistics: &ColumnStatistics, +) -> Option<(ScalarValue, ScalarValue)> { + match (&column_statistics.min_value, &column_statistics.max_value) { + (Precision::Exact(min), Precision::Exact(max)) => Some((min.clone(), max.clone())), + // the statistics values are absent or imprecise + _ => None, + } +} + +/// Get statsistics min max of given column name on given plans +/// Return None if one of the inputs does not have statistics or does not include the column +pub fn statistics_min_max( + plans: &[Arc], + column_name: &str, +) -> Option> { + // Get statistics for each plan + let plans_schema_and_stats = plans + .iter() + .map(|plan| Ok((Arc::clone(plan), plan.schema(), plan.statistics()?))) + .collect::, DataFusionError>>(); + + // If any without statistics, return none + let Ok(plans_schema_and_stats) = 
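Editorial sketch: the two helpers above are designed to compose -- `compute_stats_column_min_max` walks a plan, and `column_statistics_min_max` only yields a usable pair when both bounds are exact. The wrapper below is illustrative, not part of this patch.

fn exact_min_max_sketch(
    plan: &dyn ExecutionPlan,
    column_name: &str,
) -> Result<Option<(ScalarValue, ScalarValue)>, DataFusionError> {
    // post-order visit of the plan; errors out unless exactly one ColumnStatistics survives
    let stats = compute_stats_column_min_max(plan, column_name)?;
    // None whenever either bound is Inexact or Absent
    Ok(column_statistics_min_max(&stats))
}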
plans_schema_and_stats else { + return None; + }; + + // get value range of the sorted column for each input + let mut min_max_ranges = Vec::with_capacity(plans_schema_and_stats.len()); + for (input, input_schema, input_stats) in plans_schema_and_stats { + // get index of the sorted column in the schema + let Ok(sorted_col_index) = input_schema.index_of(column_name) else { + // panic that the sorted column is not in the schema + panic!("sorted column {} is not in the schema", column_name); + }; + + let column_stats = input_stats.column_statistics; + let sorted_col_stats = column_stats[sorted_col_index].clone(); + match (sorted_col_stats.min_value, sorted_col_stats.max_value) { + (Precision::Exact(min), Precision::Exact(max)) => { + min_max_ranges.push((min, max)); + } + // WARNING: this may produce incorrect results until we use more precision + // as `Inexact` is not guaranteed to cover the actual min and max values + // https://github.com/apache/arrow-datafusion/issues/8078 + (Precision::Inexact(min), Precision::Inexact(max)) => { + if let Some(_deduplicate_exec) = input.as_any().downcast_ref::() { + min_max_ranges.push((min, max)); + } else { + return None; }; } + // the statistics values are absent + _ => return None, } } - fn collapse(self) -> Option { - match self { - Self::Invalid | Self::Uninit => None, - Self::Valid(val) => Some(val), + Some(min_max_ranges) +} + +/// Return true if at least 2 min_max ranges in the given array overlap +pub fn overlap(value_ranges: &[(ScalarValue, ScalarValue)]) -> Result { + // interleave min and max into one iterator + let value_ranges_iter = value_ranges.iter().flat_map(|(min, max)| { + // panics if min > max + if min > max { + panic!("min ({:?}) > max ({:?})", min, max); + } + vec![min.clone(), max.clone()] + }); + + let value_ranges = ScalarValue::iter_to_array(value_ranges_iter)?; + + // rank it + let ranks = rank(&*value_ranges, None)?; + + // check overlap by checking if the max is rank right behind its corresponding min + // . non-overlap example: values of min-max pairs [3, 5, 9, 12, 1, 1, 6, 8] + // ranks: [3, 4, 7, 8, 2, 2, 5, 6] : max (even index) = its correspnding min (odd index) for same min max OR min + 1 + // . 
overlap example: [3, 5, 9, 12, 1, 1, 4, 6] : pair [3, 5] interleaves with pair [4, 6] + // ranks: [3, 5, 7, 8, 2, 2, 4, 6] + for i in (0..ranks.len()).step_by(2) { + if !((ranks[i] == ranks[i + 1]) || (ranks[i + 1] == ranks[i] + 1)) { + return Ok(true); } } + + Ok(false) } #[cfg(test)] mod test { + use crate::{ + provider::chunks_to_physical_nodes, + test::{format_execution_plan, TestChunk}, + }; + use super::*; use arrow::datatypes::{DataType, Field}; + use datafusion::{common::Statistics, error::DataFusionError}; + use itertools::Itertools; + use schema::{InfluxFieldType, SchemaBuilder}; #[test] fn test_df_stats_agg_no_cols_no_updates() { @@ -241,10 +498,9 @@ mod test { let actual = agg.build(); let expected = DFStatistics { - num_rows: Some(0), - total_byte_size: Some(0), - column_statistics: Some(vec![]), - is_exact: true, + num_rows: Precision::Exact(0), + total_byte_size: Precision::Exact(0), + column_statistics: Statistics::unknown_column(&schema), }; assert_eq!(actual, expected); } @@ -259,23 +515,22 @@ mod test { let actual = agg.build(); let expected = DFStatistics { - num_rows: Some(0), - total_byte_size: Some(0), - column_statistics: Some(vec![ + num_rows: Precision::Exact(0), + total_byte_size: Precision::Exact(0), + column_statistics: vec![ ColumnStatistics { - null_count: Some(0), - max_value: None, - min_value: None, - distinct_count: None, + null_count: Precision::Exact(0), + max_value: Precision::Absent, + min_value: Precision::Absent, + distinct_count: Precision::Absent, }, ColumnStatistics { - null_count: Some(0), - max_value: None, - min_value: None, - distinct_count: None, + null_count: Precision::Exact(0), + max_value: Precision::Absent, + min_value: Precision::Absent, + distinct_count: Precision::Absent, }, - ]), - is_exact: true, + ], }; assert_eq!(actual, expected); } @@ -293,59 +548,56 @@ mod test { Field::new("col2", DataType::Utf8, false), ]); let update_stats = DFStatistics { - num_rows: Some(1), - total_byte_size: Some(10), - column_statistics: Some(vec![ + num_rows: Precision::Exact(1), + total_byte_size: Precision::Exact(10), + column_statistics: vec![ ColumnStatistics { - null_count: Some(100), - max_value: Some(ScalarValue::UInt64(Some(100))), - min_value: Some(ScalarValue::UInt64(Some(50))), - distinct_count: Some(42), + null_count: Precision::Exact(100), + max_value: Precision::Exact(ScalarValue::UInt64(Some(100))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(50))), + distinct_count: Precision::Exact(42), }, ColumnStatistics { - null_count: Some(1_000), - max_value: Some(ScalarValue::Utf8(Some("e".to_owned()))), - min_value: Some(ScalarValue::Utf8(Some("b".to_owned()))), - distinct_count: Some(42), + null_count: Precision::Exact(1_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("e".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("b".to_owned()))), + distinct_count: Precision::Exact(42), }, - ]), - is_exact: true, + ], }; agg.update(&update_stats, &update_schema); let update_schema = Schema::new(vec![Field::new("col2", DataType::Utf8, false)]); let update_stats = DFStatistics { - num_rows: Some(10_000), - total_byte_size: Some(100_000), - column_statistics: Some(vec![ColumnStatistics { - null_count: Some(1_000_000), - max_value: Some(ScalarValue::Utf8(Some("g".to_owned()))), - min_value: Some(ScalarValue::Utf8(Some("c".to_owned()))), - distinct_count: Some(42), - }]), - is_exact: true, + num_rows: Precision::Exact(10_000), + total_byte_size: Precision::Exact(100_000), + column_statistics: vec![ColumnStatistics { 
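Editorial sketch: to make the rank trick above concrete, here is a small pure-Rust re-derivation of the same invariant (no arrow, illustrative only). With rank(v) defined as the number of interleaved boundary values <= v, the ranges are disjoint exactly when every max ranks at, or immediately after, its own min.

fn overlap_sketch(ranges: &[(i64, i64)]) -> bool {
    // interleave all mins and maxes, mirroring `overlap` above
    let values: Vec<i64> = ranges.iter().flat_map(|&(lo, hi)| [lo, hi]).collect();
    // "max" tie handling: rank(v) = how many values are <= v
    let rank = |v: i64| values.iter().filter(|&&x| x <= v).count();
    ranges
        .iter()
        .any(|&(lo, hi)| !(rank(hi) == rank(lo) || rank(hi) == rank(lo) + 1))
}

// overlap_sketch(&[(3, 5), (9, 12), (1, 1), (6, 8)]) == false  (disjoint, as in the comment above)
// overlap_sketch(&[(3, 5), (9, 12), (1, 1), (4, 6)]) == true   ([3, 5] and [4, 6] intersect)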
+ null_count: Precision::Exact(1_000_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("g".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("c".to_owned()))), + distinct_count: Precision::Exact(42), + }], }; agg.update(&update_stats, &update_schema); let actual = agg.build(); let expected = DFStatistics { - num_rows: Some(10_001), - total_byte_size: Some(100_010), - column_statistics: Some(vec![ + num_rows: Precision::Exact(10_001), + total_byte_size: Precision::Exact(100_010), + column_statistics: vec![ ColumnStatistics { - null_count: None, - max_value: Some(ScalarValue::UInt64(Some(100))), - min_value: Some(ScalarValue::UInt64(Some(50))), - distinct_count: None, + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::UInt64(Some(100))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(50))), + distinct_count: Precision::Absent, }, ColumnStatistics { - null_count: Some(1_001_000), - max_value: Some(ScalarValue::Utf8(Some("g".to_owned()))), - min_value: Some(ScalarValue::Utf8(Some("b".to_owned()))), - distinct_count: None, + null_count: Precision::Exact(1_001_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("g".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("b".to_owned()))), + distinct_count: Precision::Absent, }, - ]), - is_exact: true, + ], }; assert_eq!(actual, expected); } @@ -363,23 +615,22 @@ mod test { Field::new("col2", DataType::Utf8, false), ]); let update_stats = DFStatistics { - num_rows: Some(1), - total_byte_size: Some(10), - column_statistics: Some(vec![ + num_rows: Precision::Exact(1), + total_byte_size: Precision::Exact(10), + column_statistics: vec![ ColumnStatistics { - null_count: Some(100), - max_value: Some(ScalarValue::UInt64(Some(100))), - min_value: Some(ScalarValue::UInt64(Some(50))), - distinct_count: Some(42), + null_count: Precision::Exact(100), + max_value: Precision::Exact(ScalarValue::UInt64(Some(100))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(50))), + distinct_count: Precision::Exact(42), }, ColumnStatistics { - null_count: Some(1_000), - max_value: Some(ScalarValue::Utf8(Some("e".to_owned()))), - min_value: Some(ScalarValue::Utf8(Some("b".to_owned()))), - distinct_count: Some(42), + null_count: Precision::Exact(1_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("e".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("b".to_owned()))), + distinct_count: Precision::Exact(42), }, - ]), - is_exact: true, + ], }; agg.update(&update_stats, &update_schema); @@ -388,45 +639,43 @@ mod test { Field::new("col1", DataType::UInt64, true), ]); let update_stats = DFStatistics { - num_rows: Some(10_000), - total_byte_size: Some(100_000), - column_statistics: Some(vec![ + num_rows: Precision::Exact(10_000), + total_byte_size: Precision::Exact(100_000), + column_statistics: vec![ ColumnStatistics { - null_count: Some(1_000_000), - max_value: Some(ScalarValue::Utf8(Some("g".to_owned()))), - min_value: Some(ScalarValue::Utf8(Some("c".to_owned()))), - distinct_count: Some(42), + null_count: Precision::Exact(1_000_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("g".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("c".to_owned()))), + distinct_count: Precision::Exact(42), }, ColumnStatistics { - null_count: Some(10_000_000), - max_value: Some(ScalarValue::UInt64(Some(99))), - min_value: Some(ScalarValue::UInt64(Some(40))), - distinct_count: Some(42), + null_count: Precision::Exact(10_000_000), + max_value: 
Precision::Exact(ScalarValue::UInt64(Some(99))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(40))), + distinct_count: Precision::Exact(42), }, - ]), - is_exact: true, + ], }; agg.update(&update_stats, &update_schema); let actual = agg.build(); let expected = DFStatistics { - num_rows: Some(10_001), - total_byte_size: Some(100_010), - column_statistics: Some(vec![ + num_rows: Precision::Exact(10_001), + total_byte_size: Precision::Exact(100_010), + column_statistics: vec![ ColumnStatistics { - null_count: Some(10_000_100), - max_value: Some(ScalarValue::UInt64(Some(100))), - min_value: Some(ScalarValue::UInt64(Some(40))), - distinct_count: None, + null_count: Precision::Exact(10_000_100), + max_value: Precision::Exact(ScalarValue::UInt64(Some(100))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(40))), + distinct_count: Precision::Absent, }, ColumnStatistics { - null_count: Some(1_001_000), - max_value: Some(ScalarValue::Utf8(Some("g".to_owned()))), - min_value: Some(ScalarValue::Utf8(Some("b".to_owned()))), - distinct_count: None, + null_count: Precision::Exact(1_001_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("g".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("b".to_owned()))), + distinct_count: Precision::Absent, }, - ]), - is_exact: true, + ], }; assert_eq!(actual, expected); } @@ -444,45 +693,43 @@ mod test { Field::new("col3", DataType::Utf8, false), ]); let update_stats = DFStatistics { - num_rows: Some(1), - total_byte_size: Some(10), - column_statistics: Some(vec![ + num_rows: Precision::Exact(1), + total_byte_size: Precision::Exact(10), + column_statistics: vec![ ColumnStatistics { - null_count: Some(100), - max_value: Some(ScalarValue::UInt64(Some(100))), - min_value: Some(ScalarValue::UInt64(Some(50))), - distinct_count: Some(42), + null_count: Precision::Exact(100), + max_value: Precision::Exact(ScalarValue::UInt64(Some(100))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(50))), + distinct_count: Precision::Exact(42), }, ColumnStatistics { - null_count: Some(1_000), - max_value: Some(ScalarValue::Utf8(Some("e".to_owned()))), - min_value: Some(ScalarValue::Utf8(Some("b".to_owned()))), - distinct_count: Some(42), + null_count: Precision::Exact(1_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("e".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("b".to_owned()))), + distinct_count: Precision::Exact(42), }, - ]), - is_exact: true, + ], }; agg.update(&update_stats, &update_schema); let actual = agg.build(); let expected = DFStatistics { - num_rows: Some(1), - total_byte_size: Some(10), - column_statistics: Some(vec![ + num_rows: Precision::Exact(1), + total_byte_size: Precision::Exact(10), + column_statistics: vec![ ColumnStatistics { - null_count: Some(100), - max_value: Some(ScalarValue::UInt64(Some(100))), - min_value: Some(ScalarValue::UInt64(Some(50))), - distinct_count: None, + null_count: Precision::Exact(100), + max_value: Precision::Exact(ScalarValue::UInt64(Some(100))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(50))), + distinct_count: Precision::Absent, }, ColumnStatistics { - null_count: None, - max_value: None, - min_value: None, - distinct_count: None, + null_count: Precision::Absent, + max_value: Precision::Absent, + min_value: Precision::Absent, + distinct_count: Precision::Absent, }, - ]), - is_exact: true, + ], }; assert_eq!(actual, expected); } @@ -495,42 +742,40 @@ mod test { ]); let update_stats = DFStatistics { - num_rows: Some(1), - total_byte_size: 
Some(10), - column_statistics: Some(vec![ + num_rows: Precision::Exact(1), + total_byte_size: Precision::Exact(10), + column_statistics: vec![ ColumnStatistics { - null_count: Some(100), - max_value: Some(ScalarValue::UInt64(Some(100))), - min_value: Some(ScalarValue::UInt64(Some(50))), - distinct_count: Some(42), + null_count: Precision::Exact(100), + max_value: Precision::Exact(ScalarValue::UInt64(Some(100))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(50))), + distinct_count: Precision::Exact(42), }, ColumnStatistics { - null_count: Some(1_000), - max_value: Some(ScalarValue::Utf8(Some("e".to_owned()))), - min_value: Some(ScalarValue::Utf8(Some("b".to_owned()))), - distinct_count: Some(42), + null_count: Precision::Exact(1_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("e".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("b".to_owned()))), + distinct_count: Precision::Exact(42), }, - ]), - is_exact: true, + ], }; let agg_stats = DFStatistics { - num_rows: Some(2), - total_byte_size: Some(20), - column_statistics: Some(vec![ + num_rows: Precision::Exact(2), + total_byte_size: Precision::Exact(20), + column_statistics: vec![ ColumnStatistics { - null_count: Some(200), - max_value: Some(ScalarValue::UInt64(Some(100))), - min_value: Some(ScalarValue::UInt64(Some(50))), - distinct_count: None, + null_count: Precision::Exact(200), + max_value: Precision::Exact(ScalarValue::UInt64(Some(100))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(50))), + distinct_count: Precision::Absent, }, ColumnStatistics { - null_count: Some(2_000), - max_value: Some(ScalarValue::Utf8(Some("e".to_owned()))), - min_value: Some(ScalarValue::Utf8(Some("b".to_owned()))), - distinct_count: None, + null_count: Precision::Exact(2_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("e".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("b".to_owned()))), + distinct_count: Precision::Absent, }, - ]), - is_exact: true, + ], }; #[derive(Debug, Clone, Copy)] @@ -546,41 +791,36 @@ mod test { TotalByteSize, ColumnStatistics, Col(usize, ColMode), - IsExact, } impl Mode { fn mask(&self, mut stats: DFStatistics) -> DFStatistics { match self { Self::NumRows => { - stats.num_rows = None; + stats.num_rows = Precision::Absent; } Self::TotalByteSize => { - stats.total_byte_size = None; + stats.total_byte_size = Precision::Absent; } Self::ColumnStatistics => { - stats.column_statistics = None; + let num_cols = stats.column_statistics.len(); + stats.column_statistics = vec![ColumnStatistics::new_unknown(); num_cols] } Self::Col(idx, mode) => { - if let Some(stats) = stats.column_statistics.as_mut() { - let stats = &mut stats[*idx]; - - match mode { - ColMode::NullCount => { - stats.null_count = None; - } - ColMode::MaxValue => { - stats.max_value = None; - } - ColMode::MinValue => { - stats.min_value = None; - } + let stats = &mut stats.column_statistics[*idx]; + + match mode { + ColMode::NullCount => { + stats.null_count = Precision::Absent; + } + ColMode::MaxValue => { + stats.max_value = Precision::Absent; + } + ColMode::MinValue => { + stats.min_value = Precision::Absent; } } } - Self::IsExact => { - stats.is_exact = false; - } } stats } @@ -594,7 +834,6 @@ mod test { Mode::Col(0, ColMode::MaxValue), Mode::Col(0, ColMode::MinValue), Mode::Col(1, ColMode::NullCount), - Mode::IsExact, ] { println!("mode: {mode:?}"); @@ -626,11 +865,583 @@ mod test { let update_schema = Schema::new(vec![Field::new("col1", DataType::UInt64, true)]); let update_stats = 
DFStatistics { - num_rows: Some(1), - total_byte_size: Some(10), - column_statistics: Some(vec![]), - is_exact: true, + num_rows: Precision::Exact(1), + total_byte_size: Precision::Exact(10), + column_statistics: vec![], }; agg.update(&update_stats, &update_schema); } + + #[test] + fn test_stats_for_one_chunk() { + // schema with one tag, one field, time and CHUNK_ORDER_COLUMN_NAME + let schema: SchemaRef = SchemaBuilder::new() + .tag("tag") + .influx_field("field", InfluxFieldType::Float) + .timestamp() + .influx_field(CHUNK_ORDER_COLUMN_NAME, InfluxFieldType::Integer) + .build() + .unwrap() + .into(); + + // create a test chunk with one tag, one filed, time and CHUNK_ORDER_COLUMN_NAME + let record_batch_chunk = Arc::new( + TestChunk::new("t") + .with_tag_column_with_stats("tag", Some("AL"), Some("MT")) + .with_time_column_with_stats(Some(10), Some(20)) + .with_i64_field_column_with_stats("field", Some(0), Some(100)) + .with_i64_field_column_with_stats(CHUNK_ORDER_COLUMN_NAME, Some(5), Some(6)), + ); + + // create them same test chunk but with a parquet file + let parquet_chunk = Arc::new( + TestChunk::new("t") + .with_tag_column_with_stats("tag", Some("AL"), Some("MT")) + .with_i64_field_column_with_stats("field", Some(0), Some(100)) + .with_time_column_with_stats(Some(10), Some(20)) + .with_i64_field_column_with_stats(CHUNK_ORDER_COLUMN_NAME, Some(5), Some(6)) + .with_dummy_parquet_file(), + ); + + let expected_stats = [ + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Utf8(Some("MT".to_string()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("AL".to_string()))), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Int64(Some(100))), + min_value: Precision::Exact(ScalarValue::Int64(Some(0))), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::TimestampNanosecond(Some(20), None)), + min_value: Precision::Exact(ScalarValue::TimestampNanosecond(Some(10), None)), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Int64(Some(6))), + min_value: Precision::Exact(ScalarValue::Int64(Some(0))), + distinct_count: Precision::Absent, + }, + ]; + + let record_batch_stats = + build_statistics_for_chunks(&[record_batch_chunk], Arc::clone(&schema)); + assert_eq!(record_batch_stats.column_statistics, expected_stats); + + let parquet_stats = build_statistics_for_chunks(&[parquet_chunk], schema); + assert_eq!(parquet_stats.column_statistics, expected_stats); + } + + #[test] + fn test_stats_for_two_chunks() { + // schema with one tag, one field, time and CHUNK_ORDER_COLUMN_NAME + let schema: SchemaRef = SchemaBuilder::new() + .tag("tag") + .influx_field("field", InfluxFieldType::Float) + .timestamp() + .influx_field(CHUNK_ORDER_COLUMN_NAME, InfluxFieldType::Integer) + .build() + .unwrap() + .into(); + + // create a test chunk with one tag, one filed, time and CHUNK_ORDER_COLUMN_NAME + let record_batch_chunk_1 = Arc::new( + TestChunk::new("t1") + .with_tag_column_with_stats("tag", Some("AL"), Some("MT")) + .with_time_column_with_stats(Some(10), Some(20)) + .with_i64_field_column_with_stats("field", Some(0), Some(100)) + .with_i64_field_column_with_stats(CHUNK_ORDER_COLUMN_NAME, Some(5), Some(6)), + ); + + let record_batch_chunk_2 = Arc::new( + TestChunk::new("t2") + 
.with_tag_column_with_stats("tag", Some("MI"), Some("WA")) + .with_time_column_with_stats(Some(50), Some(80)) + .with_i64_field_column_with_stats("field", Some(0), Some(70)) + .with_i64_field_column_with_stats(CHUNK_ORDER_COLUMN_NAME, Some(7), Some(15)), + ); + + // create them same test chunk but with a parquet file + let parquet_chunk_1 = Arc::new( + TestChunk::new("t1") + .with_tag_column_with_stats("tag", Some("AL"), Some("MT")) + .with_i64_field_column_with_stats("field", Some(0), Some(100)) + .with_time_column_with_stats(Some(10), Some(20)) + .with_i64_field_column_with_stats(CHUNK_ORDER_COLUMN_NAME, Some(5), Some(6)) + .with_dummy_parquet_file(), + ); + + let parquet_chunk_2 = Arc::new( + TestChunk::new("t2") + .with_tag_column_with_stats("tag", Some("MI"), Some("WA")) + .with_i64_field_column_with_stats("field", Some(0), Some(70)) + .with_time_column_with_stats(Some(50), Some(80)) + .with_i64_field_column_with_stats(CHUNK_ORDER_COLUMN_NAME, Some(7), Some(15)) + .with_dummy_parquet_file(), + ); + + let expected_stats = [ + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Utf8(Some("WA".to_string()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("AL".to_string()))), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Int64(Some(100))), + min_value: Precision::Exact(ScalarValue::Int64(Some(0))), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::TimestampNanosecond(Some(80), None)), + min_value: Precision::Exact(ScalarValue::TimestampNanosecond(Some(10), None)), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Int64(Some(15))), + min_value: Precision::Exact(ScalarValue::Int64(Some(0))), + distinct_count: Precision::Absent, + }, + ]; + + let record_batch_stats = build_statistics_for_chunks( + &[record_batch_chunk_1, record_batch_chunk_2], + Arc::clone(&schema), + ); + assert_eq!(record_batch_stats.column_statistics, expected_stats); + + let parquet_stats = + build_statistics_for_chunks(&[parquet_chunk_1, parquet_chunk_2], schema); + assert_eq!(parquet_stats.column_statistics, expected_stats); + } + + #[test] + fn test_compute_statistics_min_max() { + // schema with one tag, one field, time and CHUNK_ORDER_COLUMN_NAME + let schema: SchemaRef = SchemaBuilder::new() + .tag("tag") + .influx_field("float_field", InfluxFieldType::Float) + .influx_field("int_field", InfluxFieldType::Integer) + .influx_field("string_field", InfluxFieldType::String) + .tag("tag_no_val") // no chunks have values for this + .influx_field("field_no_val", InfluxFieldType::Integer) + .timestamp() + .build() + .unwrap() + .into(); + + let parquet_chunk = Arc::new( + TestChunk::new("t") + .with_time_column_with_stats(Some(10), Some(100)) + .with_tag_column_with_stats("tag", Some("MA"), Some("VT")) + .with_f64_field_column_with_stats("float_field", Some(10.1), Some(100.4)) + .with_i64_field_column_with_stats("int_field", Some(30), Some(50)) + .with_string_field_column_with_stats("string_field", Some("orange"), Some("plum")) + // only this chunk has value for this field + .with_i64_field_column_with_stats("field_no_val", Some(30), Some(50)) + .with_dummy_parquet_file(), + ) as Arc; + + let record_batch_chunk = Arc::new( + TestChunk::new("t") + .with_time_column_with_stats(Some(20), Some(200)) + 
.with_tag_column_with_stats("tag", Some("Boston"), Some("DC")) + .with_f64_field_column_with_stats("float_field", Some(15.6), Some(30.0)) + .with_i64_field_column_with_stats("int_field", Some(1), Some(50)) + .with_string_field_column_with_stats("string_field", Some("banana"), Some("plum")), + ) as Arc; + + let plan_pq = chunks_to_physical_nodes(&schema, None, vec![parquet_chunk], 1); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan_pq), + @r###" + --- + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[tag, float_field, int_field, string_field, tag_no_val, field_no_val, time]" + "### + ); + + let plan_rb = chunks_to_physical_nodes(&schema, None, vec![record_batch_chunk], 1); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan_rb), + @r###" + --- + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[tag, float_field, int_field, string_field, tag_no_val, field_no_val, time]" + "### + ); + + // Stats for time + // parquet + let time_stats = compute_stats_column_min_max(&*plan_pq, "time").unwrap(); + let min_max = column_statistics_min_max(&time_stats).unwrap(); + let expected_time_stats = ( + ScalarValue::TimestampNanosecond(Some(10), None), + ScalarValue::TimestampNanosecond(Some(100), None), + ); + assert_eq!(min_max, expected_time_stats); + // record batch + let time_stats = compute_stats_column_min_max(&*plan_rb, "time").unwrap(); + let min_max = column_statistics_min_max(&time_stats).unwrap(); + let expected_time_stats = ( + ScalarValue::TimestampNanosecond(Some(20), None), + ScalarValue::TimestampNanosecond(Some(200), None), + ); + assert_eq!(min_max, expected_time_stats); + + // Stats for tag + // parquet + let tag_stats = compute_stats_column_min_max(&*plan_pq, "tag").unwrap(); + let min_max = column_statistics_min_max(&tag_stats).unwrap(); + let expected_tag_stats = ( + ScalarValue::Utf8(Some("MA".to_string())), + ScalarValue::Utf8(Some("VT".to_string())), + ); + assert_eq!(min_max, expected_tag_stats); + // record batch + let tag_stats = compute_stats_column_min_max(&*plan_rb, "tag").unwrap(); + let min_max = column_statistics_min_max(&tag_stats).unwrap(); + let expected_tag_stats = ( + ScalarValue::Utf8(Some("Boston".to_string())), + ScalarValue::Utf8(Some("DC".to_string())), + ); + assert_eq!(min_max, expected_tag_stats); + + // Stats for field + // parquet + let float_stats = compute_stats_column_min_max(&*plan_pq, "float_field").unwrap(); + let min_max = column_statistics_min_max(&float_stats).unwrap(); + let expected_float_stats = ( + ScalarValue::Float64(Some(10.1)), + ScalarValue::Float64(Some(100.4)), + ); + assert_eq!(min_max, expected_float_stats); + // record batch + let float_stats = compute_stats_column_min_max(&*plan_rb, "float_field").unwrap(); + let min_max = column_statistics_min_max(&float_stats).unwrap(); + let expected_float_stats = ( + ScalarValue::Float64(Some(15.6)), + ScalarValue::Float64(Some(30.0)), + ); + assert_eq!(min_max, expected_float_stats); + + // Stats for int + // parquet + let int_stats = compute_stats_column_min_max(&*plan_pq, "int_field").unwrap(); + let min_max = column_statistics_min_max(&int_stats).unwrap(); + let expected_int_stats = (ScalarValue::Int64(Some(30)), ScalarValue::Int64(Some(50))); + assert_eq!(min_max, expected_int_stats); + // record batch + let int_stats = compute_stats_column_min_max(&*plan_rb, "int_field").unwrap(); + let min_max = column_statistics_min_max(&int_stats).unwrap(); + let expected_int_stats = (ScalarValue::Int64(Some(1)), 
ScalarValue::Int64(Some(50))); + assert_eq!(min_max, expected_int_stats); + + // Stats for string + // parquet + let string_stats = compute_stats_column_min_max(&*plan_pq, "string_field").unwrap(); + let min_max = column_statistics_min_max(&string_stats).unwrap(); + let expected_string_stats = ( + ScalarValue::Utf8(Some("orange".to_string())), + ScalarValue::Utf8(Some("plum".to_string())), + ); + assert_eq!(min_max, expected_string_stats); + // record batch + let string_stats = compute_stats_column_min_max(&*plan_rb, "string_field").unwrap(); + let min_max = column_statistics_min_max(&string_stats).unwrap(); + let expected_string_stats = ( + ScalarValue::Utf8(Some("banana".to_string())), + ScalarValue::Utf8(Some("plum".to_string())), + ); + assert_eq!(min_max, expected_string_stats); + + // no tats on parquet + let tag_no_stats = compute_stats_column_min_max(&*plan_pq, "tag_no_val").unwrap(); + let min_max = column_statistics_min_max(&tag_no_stats); + assert!(min_max.is_none()); + + // no stats on record batch + let field_no_stats = compute_stats_column_min_max(&*plan_rb, "field_no_val").unwrap(); + let min_max = column_statistics_min_max(&field_no_stats); + assert!(min_max.is_none()); + } + + #[test] + fn test_statistics_min_max() { + // schema with one tag, one field, time and CHUNK_ORDER_COLUMN_NAME + let schema: SchemaRef = SchemaBuilder::new() + .tag("tag") + .influx_field("float_field", InfluxFieldType::Float) + .influx_field("int_field", InfluxFieldType::Integer) + .influx_field("string_field", InfluxFieldType::String) + .tag("tag_no_val") // no chunks have values for this + .influx_field("field_no_val", InfluxFieldType::Integer) + .timestamp() + .build() + .unwrap() + .into(); + + let parquet_chunk = Arc::new( + TestChunk::new("t") + .with_time_column_with_stats(Some(10), Some(100)) + .with_tag_column_with_stats("tag", Some("MA"), Some("VT")) + .with_f64_field_column_with_stats("float_field", Some(10.1), Some(100.4)) + .with_i64_field_column_with_stats("int_field", Some(30), Some(50)) + .with_string_field_column_with_stats("string_field", Some("orange"), Some("plum")) + // only this chunk has value for this field + .with_i64_field_column_with_stats("field_no_val", Some(30), Some(50)) + .with_dummy_parquet_file(), + ) as Arc; + + let record_batch_chunk = Arc::new( + TestChunk::new("t") + .with_time_column_with_stats(Some(20), Some(200)) + .with_tag_column_with_stats("tag", Some("Boston"), Some("DC")) + .with_f64_field_column_with_stats("float_field", Some(15.6), Some(30.0)) + .with_i64_field_column_with_stats("int_field", Some(1), Some(50)) + .with_string_field_column_with_stats("string_field", Some("banana"), Some("plum")), + ) as Arc; + + let plan1 = chunks_to_physical_nodes(&schema, None, vec![parquet_chunk], 1); + let plan2 = chunks_to_physical_nodes(&schema, None, vec![record_batch_chunk], 1); + + let time_stats = + statistics_min_max(&[Arc::clone(&plan1), Arc::clone(&plan2)], "time").unwrap(); + let expected_time_stats = [ + ( + ScalarValue::TimestampNanosecond(Some(10), None), + ScalarValue::TimestampNanosecond(Some(100), None), + ), + ( + ScalarValue::TimestampNanosecond(Some(20), None), + ScalarValue::TimestampNanosecond(Some(200), None), + ), + ]; + assert_eq!(time_stats, expected_time_stats); + + let tag_stats = + statistics_min_max(&[Arc::clone(&plan1), Arc::clone(&plan2)], "tag").unwrap(); + let expected_tag_stats = [ + ( + ScalarValue::Utf8(Some("MA".to_string())), + ScalarValue::Utf8(Some("VT".to_string())), + ), + ( + ScalarValue::Utf8(Some("Boston".to_string())), + 
ScalarValue::Utf8(Some("DC".to_string())), + ), + ]; + assert_eq!(tag_stats, expected_tag_stats); + + let float_stats = + statistics_min_max(&[Arc::clone(&plan1), Arc::clone(&plan2)], "float_field").unwrap(); + let expected_float_stats = [ + ( + ScalarValue::Float64(Some(10.1)), + ScalarValue::Float64(Some(100.4)), + ), + ( + ScalarValue::Float64(Some(15.6)), + ScalarValue::Float64(Some(30.0)), + ), + ]; + assert_eq!(float_stats, expected_float_stats); + + let int_stats = + statistics_min_max(&[Arc::clone(&plan1), Arc::clone(&plan2)], "int_field").unwrap(); + let expected_int_stats = [ + (ScalarValue::Int64(Some(30)), ScalarValue::Int64(Some(50))), + (ScalarValue::Int64(Some(1)), ScalarValue::Int64(Some(50))), + ]; + assert_eq!(int_stats, expected_int_stats); + + let string_stats = + statistics_min_max(&[Arc::clone(&plan1), Arc::clone(&plan2)], "string_field").unwrap(); + let expected_string_stats = [ + ( + ScalarValue::Utf8(Some("orange".to_string())), + ScalarValue::Utf8(Some("plum".to_string())), + ), + ( + ScalarValue::Utf8(Some("banana".to_string())), + ScalarValue::Utf8(Some("plum".to_string())), + ), + ]; + assert_eq!(string_stats, expected_string_stats); + + let tag_no_stat = + statistics_min_max(&[Arc::clone(&plan1), Arc::clone(&plan2)], "tag_no_val"); + assert!(tag_no_stat.is_none()); + + let field_no_stat = + statistics_min_max(&[Arc::clone(&plan1), Arc::clone(&plan2)], "field_no_val"); + assert!(field_no_stat.is_none()); + } + + #[test] + fn test_non_overlap_time() { + let pair_1 = ( + ScalarValue::TimestampNanosecond(Some(10), None), + ScalarValue::TimestampNanosecond(Some(20), None), + ); + let pair_2 = ( + ScalarValue::TimestampNanosecond(Some(100), None), + ScalarValue::TimestampNanosecond(Some(150), None), + ); + let pair_3 = ( + ScalarValue::TimestampNanosecond(Some(60), None), + ScalarValue::TimestampNanosecond(Some(65), None), + ); + + let overlap = overlap_all(&vec![pair_1, pair_2, pair_3]).unwrap(); + assert!(!overlap); + } + + #[test] + fn test_overlap_time() { + let pair_1 = ( + ScalarValue::TimestampNanosecond(Some(10), None), + ScalarValue::TimestampNanosecond(Some(20), None), + ); + let pair_2 = ( + ScalarValue::TimestampNanosecond(Some(100), None), + ScalarValue::TimestampNanosecond(Some(150), None), + ); + let pair_3 = ( + ScalarValue::TimestampNanosecond(Some(8), None), + ScalarValue::TimestampNanosecond(Some(10), None), + ); + + let overlap = overlap_all(&vec![pair_1, pair_2, pair_3]).unwrap(); + assert!(overlap); + } + + #[test] + fn test_non_overlap_integer() { + // [3, 5, 9, 12, 1, 1, 6, 8] + let pair_1 = (ScalarValue::Int16(Some(3)), ScalarValue::Int16(Some(5))); + let pair_2 = (ScalarValue::Int16(Some(9)), ScalarValue::Int16(Some(12))); + let pair_3 = (ScalarValue::Int16(Some(1)), ScalarValue::Int16(Some(1))); + let pair_4 = (ScalarValue::Int16(Some(6)), ScalarValue::Int16(Some(8))); + + let overlap = overlap_all(&vec![pair_1, pair_2, pair_3, pair_4]).unwrap(); + assert!(!overlap); + } + + #[test] + fn test_overlap_integer() { + // [3, 5, 9, 12, 1, 1, 4, 6] + let pair_1 = (ScalarValue::Int16(Some(3)), ScalarValue::Int16(Some(5))); + let pair_2 = (ScalarValue::Int16(Some(9)), ScalarValue::Int16(Some(12))); + let pair_3 = (ScalarValue::Int16(Some(1)), ScalarValue::Int16(Some(1))); + let pair_4 = (ScalarValue::Int16(Some(4)), ScalarValue::Int16(Some(6))); + + let overlap = overlap_all(&vec![pair_1, pair_2, pair_3, pair_4]).unwrap(); + assert!(overlap); + } + + #[test] + fn test_non_overlap_integer_ascending_null_first() { + // [3, 5, null, null, 1, 1, 6, 8] 
+ let pair_1 = (ScalarValue::Int16(Some(3)), ScalarValue::Int16(Some(5))); + let pair_2 = (ScalarValue::Int16(None), ScalarValue::Int16(None)); + let pair_3 = (ScalarValue::Int16(Some(1)), ScalarValue::Int16(Some(2))); + let pair_4 = (ScalarValue::Int16(Some(6)), ScalarValue::Int16(Some(8))); + + let overlap = overlap_all(&vec![pair_1, pair_2, pair_3, pair_4]).unwrap(); + assert!(!overlap); + } + + #[test] + fn test_overlap_integer_ascending_null_first() { + // [3, 5, null, null, 1, 1, 4, 6] + let pair_1 = (ScalarValue::Int16(Some(3)), ScalarValue::Int16(Some(5))); + let pair_2 = (ScalarValue::Int16(None), ScalarValue::Int16(None)); + let pair_3 = (ScalarValue::Int16(Some(1)), ScalarValue::Int16(Some(2))); + let pair_4 = (ScalarValue::Int16(Some(4)), ScalarValue::Int16(Some(6))); + + let overlap = overlap_all(&vec![pair_1, pair_2, pair_3, pair_4]).unwrap(); + assert!(overlap); + } + + #[test] + fn test_non_overlap_string_ascending_null_first() { + // ['e', 'h', null, null, 'a', 'a', 'k', 'q'] + let pair_1 = ( + ScalarValue::Utf8(Some('e'.to_string())), + ScalarValue::Utf8(Some('h'.to_string())), + ); + let pair_2 = (ScalarValue::Utf8(None), ScalarValue::Utf8(None)); + let pair_3 = ( + ScalarValue::Utf8(Some('a'.to_string())), + ScalarValue::Utf8(Some('a'.to_string())), + ); + + let overlap = overlap_all(&vec![pair_1, pair_2, pair_3]).unwrap(); + assert!(!overlap); + } + + #[test] + fn test_overlap_string_ascending_null_first() { + // ['e', 'h', null, null, 'a', 'f', 'k', 'q'] + let pair_1 = ( + ScalarValue::Utf8(Some('e'.to_string())), + ScalarValue::Utf8(Some('h'.to_string())), + ); + let pair_2 = (ScalarValue::Utf8(None), ScalarValue::Utf8(None)); + let pair_3 = ( + ScalarValue::Utf8(Some('a'.to_string())), + ScalarValue::Utf8(Some('f'.to_string())), + ); + + let overlap = overlap_all(&vec![pair_1, pair_2, pair_3]).unwrap(); + assert!(overlap); + } + + #[test] + #[should_panic(expected = "Internal(\"Empty iterator passed to ScalarValue::iter_to_array\")")] + fn test_overlap_empty() { + let _overlap = overlap_all(&[]); + } + + #[should_panic(expected = "min (Int16(3)) > max (Int16(2))")] + #[test] + fn test_overlap_panic() { + // max < min + let pair_1 = (ScalarValue::Int16(Some(3)), ScalarValue::Int16(Some(2))); + let _overlap = overlap_all(&[pair_1]); + } + + /// Runs `overlap` on all permutations of the given `value_range`es and asserts that the result is + /// the same. 
Returns that result + fn overlap_all(value_ranges: &[(ScalarValue, ScalarValue)]) -> Result { + let n = value_ranges.len(); + + let mut overlaps_all_permutations = value_ranges + .iter() + .cloned() + .permutations(n) + .map(|v| overlap(&v)); + + let Some(first) = overlaps_all_permutations.next() else { + return overlap(value_ranges); + }; + + let first = first.unwrap(); + + for result in overlaps_all_permutations { + assert_eq!(&result.unwrap(), &first); + } + + Ok(first) + } } diff --git a/iox_query/src/test.rs b/iox_query/src/test.rs index 8b11291a601..e9697760f61 100644 --- a/iox_query/src/test.rs +++ b/iox_query/src/test.rs @@ -8,7 +8,9 @@ use crate::{ Executor, ExecutorType, IOxSessionContext, }, pruning::prune_chunks, - QueryChunk, QueryChunkData, QueryCompletedToken, QueryNamespace, QueryText, + query_log::{QueryLog, StateReceived}, + QueryChunk, QueryChunkData, QueryCompletedToken, QueryNamespace, QueryNamespaceProvider, + QueryText, }; use arrow::array::{BooleanArray, Float64Array}; use arrow::datatypes::SchemaRef; @@ -20,7 +22,8 @@ use arrow::{ record_batch::RecordBatch, }; use async_trait::async_trait; -use data_types::{ChunkId, ChunkOrder, PartitionKey, TableId, TransitionPartitionId}; +use data_types::{ChunkId, ChunkOrder, NamespaceId, PartitionKey, TableId, TransitionPartitionId}; +use datafusion::common::stats::Precision; use datafusion::error::DataFusionError; use datafusion::execution::context::SessionState; use datafusion::logical_expr::Expr; @@ -32,7 +35,8 @@ use datafusion::{ physical_plan::{ColumnStatistics, Statistics as DataFusionStatistics}, scalar::ScalarValue, }; -use datafusion_util::config::DEFAULT_SCHEMA; +use datafusion_util::{config::DEFAULT_SCHEMA, option_to_precision, timestamptz_nano}; +use iox_time::SystemProvider; use itertools::Itertools; use object_store::{path::Path, ObjectMeta}; use parking_lot::Mutex; @@ -47,7 +51,76 @@ use std::{ num::NonZeroU64, sync::Arc, }; -use trace::ctx::SpanContext; +use trace::{ctx::SpanContext, span::Span}; +use tracker::{AsyncSemaphoreMetrics, InstrumentedAsyncOwnedSemaphorePermit}; + +#[derive(Debug)] +pub struct TestDatabaseStore { + databases: Mutex>>, + executor: Arc, + pub metric_registry: Arc, + pub query_semaphore: Arc, +} + +impl TestDatabaseStore { + pub fn new() -> Self { + Self::default() + } + + pub fn new_with_semaphore_size(semaphore_size: usize) -> Self { + let metric_registry = Arc::new(metric::Registry::default()); + let semaphore_metrics = Arc::new(AsyncSemaphoreMetrics::new( + &metric_registry, + &[("semaphore", "query_execution")], + )); + Self { + databases: Mutex::new(BTreeMap::new()), + executor: Arc::new(Executor::new_testing()), + metric_registry, + query_semaphore: Arc::new(semaphore_metrics.new_semaphore(semaphore_size)), + } + } + + pub async fn db_or_create(&self, name: &str) -> Arc { + let mut databases = self.databases.lock(); + + if let Some(db) = databases.get(name) { + Arc::clone(db) + } else { + let new_db = Arc::new(TestDatabase::new(Arc::clone(&self.executor))); + databases.insert(name.to_string(), Arc::clone(&new_db)); + new_db + } + } +} + +impl Default for TestDatabaseStore { + fn default() -> Self { + Self::new_with_semaphore_size(u16::MAX as usize) + } +} + +#[async_trait] +impl QueryNamespaceProvider for TestDatabaseStore { + /// Retrieve the database specified name + async fn db( + &self, + name: &str, + _span: Option, + _include_debug_info_tables: bool, + ) -> Option> { + let databases = self.databases.lock(); + + databases.get(name).cloned().map(|ns| ns as _) + } + + async fn 
acquire_semaphore(&self, span: Option) -> InstrumentedAsyncOwnedSemaphorePermit { + Arc::clone(&self.query_semaphore) + .acquire_owned(span) + .await + .unwrap() + } +} #[derive(Debug)] pub struct TestDatabase { @@ -160,11 +233,17 @@ impl QueryNamespace for TestDatabase { fn record_query( &self, - _span_ctx: Option<&SpanContext>, - _query_type: &'static str, - _query_text: QueryText, - ) -> QueryCompletedToken { - QueryCompletedToken::new(|_| {}) + span_ctx: Option<&SpanContext>, + query_type: &'static str, + query_text: QueryText, + ) -> QueryCompletedToken { + QueryLog::new(0, Arc::new(SystemProvider::new())).push( + NamespaceId::new(1), + Arc::from("ns"), + query_type, + query_text, + span_ctx.map(|s| s.trace_id), + ) } fn new_query_context(&self, span_ctx: Option) -> IOxSessionContext { @@ -280,13 +359,13 @@ impl TableProvider for TestDatabaseTableProvider { } } -#[derive(Debug)] +#[derive(Debug, Clone)] enum TestChunkData { RecordBatches(Vec), Parquet(ParquetExecInput), } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct TestChunk { /// Table name table_name: String, @@ -355,10 +434,10 @@ macro_rules! impl_with_column_with_stats { .unwrap(); let stats = ColumnStatistics { - null_count: None, - max_value: max.map(|s| ScalarValue::from(s)), - min_value: min.map(|s| ScalarValue::from(s)), - distinct_count: None, + null_count: Precision::Absent, + max_value: option_to_precision(max.map(|s| ScalarValue::from(s))), + min_value: option_to_precision(min.map(|s| ScalarValue::from(s))), + distinct_count: Precision::Absent, }; self.add_schema_to_table(new_column_schema, Some(stats)) @@ -405,7 +484,15 @@ impl TestChunk { self.with_dummy_parquet_file_and_store("iox://store") } + pub fn with_dummy_parquet_file_and_size(self, size: usize) -> Self { + self.with_dummy_parquet_file_and_store_and_size("iox://store", size) + } + pub fn with_dummy_parquet_file_and_store(self, store: &str) -> Self { + self.with_dummy_parquet_file_and_store_and_size(store, 1) + } + + pub fn with_dummy_parquet_file_and_store_and_size(self, store: &str, size: usize) -> Self { match self.table_data { TestChunkData::RecordBatches(batches) => { assert!(batches.is_empty(), "chunk already has record batches"); @@ -419,8 +506,9 @@ impl TestChunk { object_meta: ObjectMeta { location: Self::parquet_location(self.id), last_modified: Default::default(), - size: 1, + size, e_tag: None, + version: None, }, }), ..self @@ -546,10 +634,10 @@ impl TestChunk { // Construct stats let stats = ColumnStatistics { - null_count: Some(null_count as usize), - max_value: max.map(ScalarValue::from), - min_value: min.map(ScalarValue::from), - distinct_count: distinct_count.map(|c| c.get() as usize), + null_count: Precision::Exact(null_count as usize), + max_value: option_to_precision(max.map(ScalarValue::from)), + min_value: option_to_precision(min.map(ScalarValue::from)), + distinct_count: option_to_precision(distinct_count.map(|c| c.get() as usize)), }; self.update_count(count as usize); @@ -585,10 +673,10 @@ impl TestChunk { // Construct stats let stats = ColumnStatistics { - null_count: Some(null_count as usize), - max_value: max.map(|v| ScalarValue::TimestampNanosecond(Some(v), None)), - min_value: min.map(|v| ScalarValue::TimestampNanosecond(Some(v), None)), - distinct_count: distinct_count.map(|c| c.get() as usize), + null_count: Precision::Exact(null_count as usize), + max_value: option_to_precision(max.map(timestamptz_nano)), + min_value: option_to_precision(min.map(timestamptz_nano)), + distinct_count: 
option_to_precision(distinct_count.map(|c| c.get() as usize)), }; self.update_count(count as usize); @@ -601,8 +689,8 @@ impl TestChunk { .get_mut(TIME_COLUMN_NAME) .expect("stats in sync w/ columns"); - stats.min_value = Some(ScalarValue::TimestampNanosecond(Some(min), None)); - stats.max_value = Some(ScalarValue::TimestampNanosecond(Some(max), None)); + stats.min_value = Precision::Exact(timestamptz_nano(min)); + stats.max_value = Precision::Exact(timestamptz_nano(max)); self } @@ -638,10 +726,10 @@ impl TestChunk { // Construct stats let stats = ColumnStatistics { - null_count: None, - max_value: max.map(ScalarValue::from), - min_value: min.map(ScalarValue::from), - distinct_count: None, + null_count: Precision::Absent, + max_value: option_to_precision(max.map(ScalarValue::from)), + min_value: option_to_precision(min.map(ScalarValue::from)), + distinct_count: Precision::Absent, }; self.add_schema_to_table(new_column_schema, Some(stats)) @@ -682,9 +770,9 @@ impl TestChunk { DataType::Int64 => Arc::new(Int64Array::from(vec![1000])) as ArrayRef, DataType::UInt64 => Arc::new(UInt64Array::from(vec![1000])) as ArrayRef, DataType::Utf8 => Arc::new(StringArray::from(vec!["MA"])) as ArrayRef, - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - Arc::new(TimestampNanosecondArray::from(vec![1000])) as ArrayRef - } + DataType::Timestamp(TimeUnit::Nanosecond, tz) => Arc::new( + TimestampNanosecondArray::from(vec![1000]).with_timezone_opt(tz.clone()), + ) as ArrayRef, DataType::Dictionary(key, value) if key.as_ref() == &DataType::Int32 && value.as_ref() == &DataType::Utf8 => { @@ -723,9 +811,9 @@ impl TestChunk { .iter() .map(|(_influxdb_column_type, field)| match field.data_type() { DataType::Int64 => Arc::new(Int64Array::from(vec![field_val])) as ArrayRef, - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - Arc::new(TimestampNanosecondArray::from(vec![ts_val])) as ArrayRef - } + DataType::Timestamp(TimeUnit::Nanosecond, tz) => Arc::new( + TimestampNanosecondArray::from(vec![ts_val]).with_timezone_opt(tz.clone()), + ) as ArrayRef, DataType::Dictionary(key, value) if key.as_ref() == &DataType::Int32 && value.as_ref() == &DataType::Utf8 => { @@ -773,9 +861,10 @@ impl TestChunk { "tag2" => Arc::new(StringArray::from(vec!["SC", "NC", "RI"])) as ArrayRef, _ => Arc::new(StringArray::from(vec!["TX", "PR", "OR"])) as ArrayRef, }, - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - Arc::new(TimestampNanosecondArray::from(vec![8000, 10000, 20000])) as ArrayRef - } + DataType::Timestamp(TimeUnit::Nanosecond, tz) => Arc::new( + TimestampNanosecondArray::from(vec![8000, 10000, 20000]) + .with_timezone_opt(tz.clone()), + ) as ArrayRef, DataType::Dictionary(key, value) if key.as_ref() == &DataType::Int32 && value.as_ref() == &DataType::Utf8 => { @@ -834,11 +923,10 @@ impl TestChunk { "tag2" => Arc::new(StringArray::from(vec!["SC", "NC", "RI", "NC"])) as ArrayRef, _ => Arc::new(StringArray::from(vec!["TX", "PR", "OR", "AL"])) as ArrayRef, }, - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - Arc::new(TimestampNanosecondArray::from(vec![ - 28000, 210000, 220000, 210000, - ])) as ArrayRef - } + DataType::Timestamp(TimeUnit::Nanosecond, tz) => Arc::new( + TimestampNanosecondArray::from(vec![28000, 210000, 220000, 210000]) + .with_timezone_opt(tz.clone()), + ) as ArrayRef, DataType::Dictionary(key, value) if key.as_ref() == &DataType::Int32 && value.as_ref() == &DataType::Utf8 => { @@ -888,54 +976,54 @@ impl TestChunk { /// Stats(min, max) : tag1(AL, MT), tag2(AL, MA), time(5, 7000) pub fn 
with_five_rows_of_data(mut self) -> Self { // create arrays - let columns = - self.schema - .iter() - .map(|(_influxdb_column_type, field)| match field.data_type() { - DataType::Int64 => { - Arc::new(Int64Array::from(vec![1000, 10, 70, 100, 5])) as ArrayRef - } - DataType::Utf8 => match field.name().as_str() { + let columns = self + .schema + .iter() + .map(|(_influxdb_column_type, field)| match field.data_type() { + DataType::Int64 => { + Arc::new(Int64Array::from(vec![1000, 10, 70, 100, 5])) as ArrayRef + } + DataType::Utf8 => { + match field.name().as_str() { "tag1" => Arc::new(StringArray::from(vec!["MT", "MT", "CT", "AL", "MT"])) as ArrayRef, "tag2" => Arc::new(StringArray::from(vec!["CT", "AL", "CT", "MA", "AL"])) as ArrayRef, _ => Arc::new(StringArray::from(vec!["CT", "MT", "AL", "AL", "MT"])) as ArrayRef, - }, - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - Arc::new(TimestampNanosecondArray::from(vec![ - 1000, 7000, 100, 50, 5000, - ])) as ArrayRef } - DataType::Dictionary(key, value) - if key.as_ref() == &DataType::Int32 - && value.as_ref() == &DataType::Utf8 => - { - match field.name().as_str() { - "tag1" => Arc::new( - vec!["MT", "MT", "CT", "AL", "MT"] - .into_iter() - .collect::>(), - ) as ArrayRef, - "tag2" => Arc::new( - vec!["CT", "AL", "CT", "MA", "AL"] - .into_iter() - .collect::>(), - ) as ArrayRef, - _ => Arc::new( - vec!["CT", "MT", "AL", "AL", "MT"] - .into_iter() - .collect::>(), - ) as ArrayRef, - } + } + DataType::Timestamp(TimeUnit::Nanosecond, tz) => Arc::new( + TimestampNanosecondArray::from(vec![1000, 7000, 100, 50, 5000]) + .with_timezone_opt(tz.clone()), + ) as ArrayRef, + DataType::Dictionary(key, value) + if key.as_ref() == &DataType::Int32 && value.as_ref() == &DataType::Utf8 => + { + match field.name().as_str() { + "tag1" => Arc::new( + vec!["MT", "MT", "CT", "AL", "MT"] + .into_iter() + .collect::>(), + ) as ArrayRef, + "tag2" => Arc::new( + vec!["CT", "AL", "CT", "MA", "AL"] + .into_iter() + .collect::>(), + ) as ArrayRef, + _ => Arc::new( + vec!["CT", "MT", "AL", "AL", "MT"] + .into_iter() + .collect::>(), + ) as ArrayRef, } - _ => unimplemented!( - "Unimplemented data type for test database: {:?}", - field.data_type() - ), - }) - .collect::>(); + } + _ => unimplemented!( + "Unimplemented data type for test database: {:?}", + field.data_type() + ), + }) + .collect::>(); let batch = RecordBatch::try_new(self.schema.as_arrow(), columns).expect("made record batch"); @@ -981,11 +1069,12 @@ impl TestChunk { "CT", "MT", "AL", "AL", "MT", "CT", "MT", "AL", "AL", "MT", ])) as ArrayRef, }, - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - Arc::new(TimestampNanosecondArray::from(vec![ + DataType::Timestamp(TimeUnit::Nanosecond, tz) => Arc::new( + TimestampNanosecondArray::from(vec![ 1000, 7000, 100, 50, 5, 2000, 7000, 500, 50, 5, - ])) as ArrayRef - } + ]) + .with_timezone_opt(tz.clone()), + ) as ArrayRef, DataType::Dictionary(key, value) if key.as_ref() == &DataType::Int32 && value.as_ref() == &DataType::Utf8 => { @@ -1045,17 +1134,15 @@ impl QueryChunk for TestChunk { self.check_error().unwrap(); Arc::new(DataFusionStatistics { - num_rows: self.num_rows, - total_byte_size: None, - column_statistics: Some( - self.schema - .inner() - .fields() - .iter() - .map(|f| self.column_stats.get(f.name()).cloned().unwrap_or_default()) - .collect(), - ), - is_exact: true, + num_rows: option_to_precision(self.num_rows), + total_byte_size: Precision::Absent, + column_statistics: self + .schema + .inner() + .fields() + .iter() + .map(|f| 
self.column_stats.get(f.name()).cloned().unwrap_or_default()) + .collect(), }) } diff --git a/iox_query/src/util.rs b/iox_query/src/util.rs index 28371db745c..7cd92a46c9d 100644 --- a/iox_query/src/util.rs +++ b/iox_query/src/util.rs @@ -6,31 +6,34 @@ use std::{ }; use arrow::{ - array::TimestampNanosecondArray, compute::SortOptions, datatypes::Schema as ArrowSchema, + array::TimestampNanosecondArray, + compute::SortOptions, + datatypes::{Schema as ArrowSchema, SchemaRef as ArrowSchemaRef}, record_batch::RecordBatch, }; use data_types::TimestampMinMax; +use datafusion::common::stats::Precision; +use datafusion::physical_expr::{analyze, AnalysisContext, ExprBoundaries}; use datafusion::{ self, common::ToDFSchema, datasource::{provider_as_source, MemTable}, error::DataFusionError, execution::context::ExecutionProps, - logical_expr::{LogicalPlan, LogicalPlanBuilder}, + logical_expr::{interval_arithmetic::Interval, LogicalPlan, LogicalPlanBuilder}, optimizer::simplify_expressions::{ExprSimplifier, SimplifyContext}, physical_expr::create_physical_expr, physical_plan::{ expressions::{col as physical_col, PhysicalSortExpr}, - ColumnStatistics, ExecutionPlan, PhysicalExpr, Statistics, + PhysicalExpr, }, prelude::{Column, Expr}, - scalar::ScalarValue, }; use itertools::Itertools; use observability_deps::tracing::trace; -use schema::{sort::SortKey, InfluxColumnType, Schema, TIME_COLUMN_NAME}; +use schema::{sort::SortKey, TIME_COLUMN_NAME}; use snafu::{ensure, OptionExt, ResultExt, Snafu}; #[derive(Debug, Snafu)] @@ -68,26 +71,6 @@ pub fn make_scan_plan(batch: RecordBatch) -> std::result::Result, - input_schema: &ArrowSchema, -) -> Vec { - let mut sort_exprs = vec![]; - for key in key_columns { - let expr = physical_col(key, input_schema).expect("pk in schema"); - sort_exprs.push(PhysicalSortExpr { - expr, - options: SortOptions { - descending: false, - nulls_first: false, - }, - }); - } - - sort_exprs -} - pub fn logical_sort_key_exprs(sort_key: &SortKey) -> Vec { sort_key .iter() @@ -120,11 +103,9 @@ pub fn arrow_sort_key_exprs( /// Build a datafusion physical expression from a logical one pub fn df_physical_expr( - input: &dyn ExecutionPlan, + schema: ArrowSchemaRef, expr: Expr, ) -> std::result::Result, DataFusionError> { - let schema = input.schema(); - let df_schema = Arc::clone(&schema).to_dfschema_ref()?; let props = ExecutionProps::new(); @@ -139,7 +120,8 @@ pub fn df_physical_expr( create_physical_expr(&expr, df_schema.as_ref(), schema.as_ref(), &props) } -/// Return min and max for column `time` of the given set of record batches +/// Return min and max for column `time` of the given set of record batches by +/// performing an `O(n)` scan of all provided batches. pub fn compute_timenanosecond_min_max<'a, I>(batches: I) -> Result where I: IntoIterator, @@ -157,7 +139,8 @@ where }) } -/// Return min and max for column `time` in the given record batch +/// Return min and max for column `time` in the given record batch by performing +/// an `O(n)` scan of `batch`. pub fn compute_timenanosecond_min_max_for_one_record_batch( batch: &RecordBatch, ) -> Result<(i64, i64)> { @@ -188,136 +171,155 @@ pub fn compute_timenanosecond_min_max_for_one_record_batch( Ok((min, max)) } -/// Create basic table summary. 
-///
-/// This contains:
-/// - correct column types
-/// - [total count](Statistics::num_rows)
-/// - [min](ColumnStatistics::min_value)/[max](ColumnStatistics::max_value) for the timestamp column
-pub fn create_basic_summary(
-    row_count: u64,
-    schema: &Schema,
-    ts_min_max: Option<TimestampMinMax>,
-) -> Statistics {
-    let mut columns = Vec::with_capacity(schema.len());
-
-    for (t, _field) in schema.iter() {
-        let stats = match t {
-            InfluxColumnType::Timestamp => ColumnStatistics {
-                null_count: Some(0),
-                max_value: Some(ScalarValue::TimestampNanosecond(
-                    ts_min_max.map(|v| v.max),
-                    None,
-                )),
-                min_value: Some(ScalarValue::TimestampNanosecond(
-                    ts_min_max.map(|v| v.min),
-                    None,
-                )),
-                distinct_count: None,
-            },
-            _ => ColumnStatistics::default(),
-        };
-        columns.push(stats)
-    }
+/// Determine the possible maximum range for each of the fields in an
+/// [`ArrowSchema`] once the [`Expr`] has been applied. The returned
+/// Vec includes an Interval for every field in the schema in the same
+/// order. Any fields that are not constrained by the expression will
+/// have an unbounded interval.
+pub fn calculate_field_intervals(
+    schema: ArrowSchemaRef,
+    expr: Expr,
+) -> Result<Vec<Interval>, DataFusionError> {
+    // make unknown boundaries for each column
+    // TODO use upstream code when https://github.com/apache/arrow-datafusion/pull/8377 is merged
+    let fields = schema.fields();
+    let boundaries = fields
+        .iter()
+        .enumerate()
+        .map(|(i, field)| {
+            let column = datafusion::physical_expr::expressions::Column::new(field.name(), i);
+            let interval = Interval::make_unbounded(field.data_type())?;
+            Ok(ExprBoundaries {
+                column,
+                interval,
+                distinct_count: Precision::Absent,
+            })
+        })
+        .collect::<Result<Vec<_>, DataFusionError>>()?;
+
+    let context = AnalysisContext::new(boundaries);
+    let analysis_result = analyze(
+        &df_physical_expr(Arc::clone(&schema), expr)?,
+        context,
+        &schema,
+    )?;
+
+    let intervals = analysis_result
+        .boundaries
+        .into_iter()
+        .map(|b| b.interval)
+        .collect::<Vec<_>>();
+
+    Ok(intervals)
+}
-    Statistics {
-        num_rows: Some(row_count as usize),
-        total_byte_size: None,
-        column_statistics: Some(columns),
-        is_exact: true,
-    }
+/// Determine the possible maximum range for the named field in the
+/// [`ArrowSchema`] once the [`Expr`] has been applied.
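+///
+/// A rough usage sketch mirroring `test_calculate_field_interval` below; the
+/// `schema` value is assumed to be an `ArrowSchemaRef` that contains a float
+/// field named `"a"`:
+///
+/// ```ignore
+/// use datafusion::logical_expr::{col, lit};
+///
+/// // Constrain "a" to the half-open range [1.0, 10.0) and ask for its interval.
+/// let expr = col("a").gt_eq(lit(1.0)).and(col("a").lt(lit(10.0)));
+/// let interval = calculate_field_interval(Arc::clone(&schema), expr, "a")?;
+/// ```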
+pub fn calculate_field_interval( + schema: ArrowSchemaRef, + expr: Expr, + name: &str, +) -> Result { + let idx = schema.index_of(name)?; + let mut intervals = calculate_field_intervals(Arc::clone(&schema), expr)?; + Ok(intervals.swap_remove(idx)) } #[cfg(test)] mod tests { - use datafusion::scalar::ScalarValue; - use schema::{builder::SchemaBuilder, InfluxFieldType}; + use datafusion::common::rounding::next_down; + use datafusion::common::ScalarValue; + use datafusion::logical_expr::{col, lit}; + use schema::{builder::SchemaBuilder, InfluxFieldType, TIME_DATA_TIMEZONE}; use super::*; - #[test] - fn test_create_basic_summary_no_columns_no_rows() { - let schema = SchemaBuilder::new().build().unwrap(); - let row_count = 0; - - let actual = create_basic_summary(row_count, &schema, None); - let expected = Statistics { - num_rows: Some(row_count as usize), - total_byte_size: None, - column_statistics: Some(vec![]), - is_exact: true, - }; - assert_eq!(actual, expected); + fn time_interval(lower: Option, upper: Option) -> Interval { + let lower = ScalarValue::TimestampNanosecond(lower, TIME_DATA_TIMEZONE()); + let upper = ScalarValue::TimestampNanosecond(upper, TIME_DATA_TIMEZONE()); + Interval::try_new(lower, upper).unwrap() + } + + fn f64_interval(lower: Option, upper: Option) -> Interval { + let lower = ScalarValue::Float64(lower); + let upper = ScalarValue::Float64(upper); + Interval::try_new(lower, upper).unwrap() } #[test] - fn test_create_basic_summary_no_rows() { - let schema = full_schema(); - let row_count = 0; - let ts_min_max = TimestampMinMax { min: 10, max: 20 }; - - let actual = create_basic_summary(row_count, &schema, Some(ts_min_max)); - let expected = Statistics { - num_rows: Some(0), - total_byte_size: None, - column_statistics: Some(vec![ - ColumnStatistics::default(), - ColumnStatistics::default(), - ColumnStatistics::default(), - ColumnStatistics::default(), - ColumnStatistics::default(), - ColumnStatistics::default(), - ColumnStatistics { - null_count: Some(0), - min_value: Some(ScalarValue::TimestampNanosecond(Some(10), None)), - max_value: Some(ScalarValue::TimestampNanosecond(Some(20), None)), - distinct_count: None, - }, - ]), - is_exact: true, - }; - assert_eq!(actual, expected); + fn test_calculate_field_intervals() { + let schema = SchemaBuilder::new() + .timestamp() + .influx_field("a", InfluxFieldType::Float) + .build() + .unwrap() + .as_arrow(); + let expr = col("time") + .gt_eq(lit("2020-01-01T00:00:00Z")) + .and(col("time").lt(lit("2020-01-02T00:00:00Z"))) + .and(col("a").gt_eq(lit(1000000.0))) + .and(col("a").lt(lit(2000000.0))); + let intervals = calculate_field_intervals(schema, expr).unwrap(); + // 2020-01-01T00:00:00Z == 1577836800000000000 + // 2020-01-02T00:00:00Z == 1577923200000000000 + assert_eq!( + vec![ + time_interval(Some(1577836800000000000), Some(1577923200000000000i64 - 1),), + f64_interval(Some(1000000.0), Some(next_down(2000000.0))) + ], + intervals + ); } #[test] - fn test_create_basic_summary() { - let schema = full_schema(); - let row_count = 3; - let ts_min_max = TimestampMinMax { min: 42, max: 42 }; - - let actual = create_basic_summary(row_count, &schema, Some(ts_min_max)); - let expected = Statistics { - num_rows: Some(3), - total_byte_size: None, - column_statistics: Some(vec![ - ColumnStatistics::default(), - ColumnStatistics::default(), - ColumnStatistics::default(), - ColumnStatistics::default(), - ColumnStatistics::default(), - ColumnStatistics::default(), - ColumnStatistics { - null_count: Some(0), - min_value: 
Some(ScalarValue::TimestampNanosecond(Some(42), None)), - max_value: Some(ScalarValue::TimestampNanosecond(Some(42), None)), - distinct_count: None, - }, - ]), - is_exact: true, - }; - assert_eq!(actual, expected); + fn test_calculate_field_intervals_no_constraints() { + let schema = SchemaBuilder::new() + .timestamp() + .influx_field("a", InfluxFieldType::Float) + .build() + .unwrap() + .as_arrow(); + // must be a predicate (boolean expression) + let expr = lit("test").eq(lit("foo")); + let intervals = calculate_field_intervals(schema, expr).unwrap(); + assert_eq!( + vec![time_interval(None, None), f64_interval(None, None)], + intervals + ); } - fn full_schema() -> Schema { - SchemaBuilder::new() - .tag("tag") - .influx_field("field_bool", InfluxFieldType::Boolean) - .influx_field("field_float", InfluxFieldType::Float) - .influx_field("field_integer", InfluxFieldType::Integer) - .influx_field("field_string", InfluxFieldType::String) - .influx_field("field_uinteger", InfluxFieldType::UInteger) + #[test] + fn test_calculate_field_interval() { + let schema = SchemaBuilder::new() .timestamp() + .influx_field("a", InfluxFieldType::Float) .build() .unwrap() + .as_arrow(); + let expr = col("time") + .gt_eq(lit("2020-01-01T00:00:00Z")) + .and(col("time").lt(lit("2020-01-02T00:00:00Z"))) + .and(col("a").gt_eq(lit(1000000.0))) + .and(col("a").lt(lit(2000000.0))); + + // Note + // 2020-01-01T00:00:00Z == 1577836800000000000 + // 2020-01-02T00:00:00Z == 1577923200000000000 + let interval = calculate_field_interval(Arc::clone(&schema), expr.clone(), "time").unwrap(); + assert_eq!( + time_interval(Some(1577836800000000000), Some(1577923200000000000 - 1),), + interval + ); + + let interval = calculate_field_interval(Arc::clone(&schema), expr.clone(), "a").unwrap(); + assert_eq!( + f64_interval(Some(1000000.0), Some(next_down(2000000.0))), + interval + ); + + assert_eq!( + "Arrow error: Schema error: Unable to get field named \"b\". 
Valid fields: [\"time\", \"a\"]", + calculate_field_interval(Arc::clone(&schema), expr.clone(), "b").unwrap_err().to_string(), + ); } } diff --git a/iox_query_influxql/Cargo.toml b/iox_query_influxql/Cargo.toml index 50a96373dd4..0c116125510 100644 --- a/iox_query_influxql/Cargo.toml +++ b/iox_query_influxql/Cargo.toml @@ -5,22 +5,25 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] -arrow = { workspace = true, features = ["prettyprint"] } +arrow = { workspace = true } chrono-tz = { version = "0.8" } datafusion = { workspace = true } datafusion_util = { path = "../datafusion_util" } generated_types = { path = "../generated_types" } influxdb_influxql_parser = { path = "../influxdb_influxql_parser" } iox_query = { path = "../iox_query" } -itertools = "0.11.0" +itertools = "0.12.0" observability_deps = { path = "../observability_deps" } once_cell = "1" predicate = { path = "../predicate" } query_functions = { path = "../query_functions" } regex = "1" schema = { path = "../schema" } -serde_json = "1.0.107" +serde_json = "1.0.111" thiserror = "1.0" workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/iox_query_influxql/src/aggregate/percentile.rs b/iox_query_influxql/src/aggregate/percentile.rs index 4b88e9eb967..dda8659e45c 100644 --- a/iox_query_influxql/src/aggregate/percentile.rs +++ b/iox_query_influxql/src/aggregate/percentile.rs @@ -120,8 +120,9 @@ impl Accumulator for PercentileAccumulator { } fn state(&self) -> Result> { + let arr = ScalarValue::new_list(&self.data, &self.data_type); Ok(vec![ - ScalarValue::new_list(Some(self.data.clone()), self.data_type.clone()), + ScalarValue::List(arr), ScalarValue::Float64(self.percentile), ]) } diff --git a/iox_query_influxql/src/frontend/planner.rs b/iox_query_influxql/src/frontend/planner.rs index e8a311bdb6d..f8f6ff019fd 100644 --- a/iox_query_influxql/src/frontend/planner.rs +++ b/iox_query_influxql/src/frontend/planner.rs @@ -1,4 +1,5 @@ use arrow::datatypes::SchemaRef; +use datafusion::common::ParamValues; use datafusion::physical_expr::execution_props::ExecutionProps; use influxdb_influxql_parser::show_field_keys::ShowFieldKeysStatement; use influxdb_influxql_parser::show_measurements::ShowMeasurementsStatement; @@ -12,7 +13,6 @@ use std::ops::Deref; use std::sync::Arc; use crate::plan::{parse_regex, InfluxQLToLogicalPlan, SchemaProvider}; -use datafusion::common::Statistics; use datafusion::datasource::provider_as_source; use datafusion::execution::context::{SessionState, TaskContext}; use datafusion::logical_expr::{AggregateUDF, LogicalPlan, ScalarUDF, TableSource}; @@ -120,8 +120,10 @@ impl ExecutionPlan for SchemaExec { self.input.execute(partition, context) } - fn statistics(&self) -> Statistics { - self.input.statistics() + fn statistics(&self) -> Result { + Ok(datafusion::physical_plan::Statistics::new_unknown( + &self.schema(), + )) } } @@ -136,7 +138,7 @@ impl DisplayAs for SchemaExec { } /// Create plans for running InfluxQL queries against databases -#[derive(Debug, Default)] +#[derive(Debug, Default, Copy, Clone)] pub struct InfluxQLQueryPlanner {} impl InfluxQLQueryPlanner { @@ -149,13 +151,20 @@ impl InfluxQLQueryPlanner { pub async fn query( &self, query: &str, + params: impl Into + Send, ctx: &IOxSessionContext, ) -> Result> { + let ctx = ctx.child_ctx("InfluxQLQueryPlanner::query"); debug!(text=%query, "planning InfluxQL query"); let statement = self.query_to_statement(query)?; - let logical_plan = 
self.statement_to_plan(statement, ctx).await?; - + let logical_plan = self.statement_to_plan(statement, &ctx).await?; + // add params to plan only when they're non-empty + let logical_plan = match params.into() { + ParamValues::List(v) if !v.is_empty() => logical_plan.with_param_values(v)?, + ParamValues::Map(m) if !m.is_empty() => logical_plan.with_param_values(m)?, + _ => logical_plan, + }; let input = ctx.create_physical_plan(&logical_plan).await?; // Merge schema-level metadata from the logical plan with the @@ -179,6 +188,7 @@ impl InfluxQLQueryPlanner { ) -> Result { use std::collections::hash_map::Entry; + let ctx = ctx.child_ctx("statement_to_plan"); let session_cfg = ctx.inner().copied_config(); let cfg = session_cfg.options(); let schema = ctx @@ -207,6 +217,9 @@ impl InfluxQLQueryPlanner { for table_name in &query_tables { if let Entry::Vacant(v) = sp.tables.entry(table_name.to_string()) { + let mut ctx = ctx.child_ctx("get table schema"); + ctx.set_metadata("table", table_name.to_owned()); + if let Some(table) = schema.table(table_name).await { let schema = Schema::try_from(table.schema()) .map_err(|err| { @@ -217,7 +230,7 @@ impl InfluxQLQueryPlanner { } } - let planner = InfluxQLToLogicalPlan::new(&sp, ctx); + let planner = InfluxQLToLogicalPlan::new(&sp, &ctx); let logical_plan = planner.statement_to_plan(statement)?; debug!(plan=%logical_plan.display_graphviz(), "logical plan"); Ok(logical_plan) diff --git a/iox_query_influxql/src/plan/ir.rs b/iox_query_influxql/src/plan/ir.rs index 336bf4675fa..7ee811d154e 100644 --- a/iox_query_influxql/src/plan/ir.rs +++ b/iox_query_influxql/src/plan/ir.rs @@ -228,8 +228,8 @@ impl Display for Field { #[derive(Debug, Clone, Copy)] pub(super) struct Interval { /// The nanosecond duration of the interval - pub duration: i64, + pub(super) duration: i64, /// The nanosecond offset of the interval. 
- pub offset: Option, + pub(super) offset: Option, } diff --git a/iox_query_influxql/src/plan/planner.rs b/iox_query_influxql/src/plan/planner.rs index cf8d72f1802..14ff70a3d4e 100644 --- a/iox_query_influxql/src/plan/planner.rs +++ b/iox_query_influxql/src/plan/planner.rs @@ -31,22 +31,22 @@ use datafusion::catalog::TableReference; use datafusion::common::tree_node::{Transformed, TreeNode, VisitRecursion}; use datafusion::common::{DFSchema, DFSchemaRef, DataFusionError, Result, ScalarValue, ToDFSchema}; use datafusion::datasource::{provider_as_source, MemTable}; -use datafusion::logical_expr::expr::{Alias, ScalarFunction}; +use datafusion::logical_expr::expr::{AggregateFunctionDefinition, Alias, ScalarFunction}; use datafusion::logical_expr::expr_rewriter::normalize_col; use datafusion::logical_expr::logical_plan::builder::project; use datafusion::logical_expr::logical_plan::Analyze; use datafusion::logical_expr::utils::{expr_as_column_expr, find_aggregate_exprs}; use datafusion::logical_expr::{ - binary_expr, col, date_bin, expr, expr::WindowFunction, lit, lit_timestamp_nano, now, union, - window_function, AggregateFunction, AggregateUDF, Between, BuiltInWindowFunction, - BuiltinScalarFunction, EmptyRelation, Explain, Expr, ExprSchemable, Extension, LogicalPlan, - LogicalPlanBuilder, Operator, PlanType, Projection, ScalarUDF, TableSource, ToStringifiedPlan, - WindowFrame, WindowFrameBound, WindowFrameUnits, + binary_expr, col, date_bin, expr, expr::WindowFunction, lit, now, union, utils::conjunction, + AggregateFunction, AggregateUDF, Between, BuiltInWindowFunction, BuiltinScalarFunction, + Distinct, EmptyRelation, Explain, Expr, ExprSchemable, Extension, LogicalPlan, + LogicalPlanBuilder, Operator, PlanType, Projection, ScalarFunctionDefinition, ScalarUDF, + TableSource, ToStringifiedPlan, WindowFrame, WindowFrameBound, WindowFrameUnits, + WindowFunctionDefinition, }; -use datafusion::optimizer::utils::conjunction; use datafusion::physical_expr::execution_props::ExecutionProps; use datafusion::prelude::{cast, sum, when, Column}; -use datafusion_util::{lit_dict, AsExpr}; +use datafusion_util::{lit_dict, lit_timestamptz_nano, AsExpr}; use generated_types::influxdata::iox::querier::v1::InfluxQlMetadata; use influxdb_influxql_parser::common::{LimitClause, OffsetClause, OrderByClause}; use influxdb_influxql_parser::explain::{ExplainOption, ExplainStatement}; @@ -433,12 +433,15 @@ impl<'a> Context<'a> { /// InfluxQL query planner pub struct InfluxQLToLogicalPlan<'a> { s: &'a dyn SchemaProvider, - iox_ctx: &'a IOxSessionContext, + iox_ctx: IOxSessionContext, } impl<'a> InfluxQLToLogicalPlan<'a> { pub fn new(s: &'a dyn SchemaProvider, iox_ctx: &'a IOxSessionContext) -> Self { - Self { s, iox_ctx } + Self { + s, + iox_ctx: iox_ctx.child_ctx("InfluxQLToLogicalPlan"), + } } pub fn statement_to_plan(&self, statement: Statement) -> Result { @@ -447,9 +450,11 @@ impl<'a> InfluxQLToLogicalPlan<'a> { Statement::Delete(_) => error::not_implemented("DELETE"), Statement::DropMeasurement(_) => error::not_implemented("DROP MEASUREMENT"), Statement::Explain(explain) => self.explain_statement_to_plan(*explain), - Statement::Select(select) => { - self.select_query_to_plan(&self.rewrite_select_statement(*select)?) 
- } + Statement::Select(select) => self.select_query_to_plan( + &self + .rewrite_select_statement(*select) + .map_err(|e| e.context("rewriting statement"))?, + ), Statement::ShowDatabases(_) => error::not_implemented("SHOW DATABASES"), Statement::ShowMeasurements(show_measurements) => { self.show_measurements_to_plan(*show_measurements) @@ -468,7 +473,7 @@ impl<'a> InfluxQLToLogicalPlan<'a> { } fn explain_statement_to_plan(&self, explain: ExplainStatement) -> Result { - let plan = self.select_query_to_plan(&self.rewrite_select_statement(*explain.select)?)?; + let plan = self.statement_to_plan(*explain.statement)?; let plan = Arc::new(plan); let schema = LogicalPlan::explain_schema(); let schema = schema.to_dfschema_ref()?; @@ -783,13 +788,25 @@ impl<'a> InfluxQLToLogicalPlan<'a> { ) -> Result { match ctx.projection_type { ProjectionType::Raw => self.project_select_raw(input, fields), - ProjectionType::RawDistinct => self.project_select_raw_distinct(input, fields), - ProjectionType::Aggregate => self.project_select_aggregate(ctx, input, fields, group_by_tag_set), - ProjectionType::Window => self.project_select_window(ctx, input, fields, group_by_tag_set), - ProjectionType::WindowAggregate => self.project_select_window_aggregate(ctx, input, fields, group_by_tag_set), - ProjectionType::WindowAggregateMixed => error::not_implemented("mixed window-aggregate and aggregate columns, such as DIFFERENCE(MEAN(col)), MEAN(col)"), - ProjectionType::Selector{..} => self.project_select_selector(ctx, input, fields, group_by_tag_set), - ProjectionType::TopBottomSelector => self.project_select_top_bottom_selector(ctx, input, fields, group_by_tag_set), + ProjectionType::RawDistinct => self.project_select_raw_distinct(ctx, input, fields), + ProjectionType::Aggregate => { + self.project_select_aggregate(ctx, input, fields, group_by_tag_set) + } + ProjectionType::Window => { + self.project_select_window(ctx, input, fields, group_by_tag_set) + } + ProjectionType::WindowAggregate => { + self.project_select_window_aggregate(ctx, input, fields, group_by_tag_set) + } + ProjectionType::WindowAggregateMixed => { + self.project_select_window_aggregate_mixed(ctx, input, fields, group_by_tag_set) + } + ProjectionType::Selector { .. } => { + self.project_select_selector(ctx, input, fields, group_by_tag_set) + } + ProjectionType::TopBottomSelector => { + self.project_select_top_bottom_selector(ctx, input, fields, group_by_tag_set) + } } } @@ -809,6 +826,7 @@ impl<'a> InfluxQLToLogicalPlan<'a> { /// and call only scalar functions, but output only distinct rows. fn project_select_raw_distinct( &self, + ctx: &Context<'_>, input: LogicalPlan, fields: &[Field], ) -> Result { @@ -834,10 +852,32 @@ impl<'a> InfluxQLToLogicalPlan<'a> { return error::internal("time column is not an alias"); }; - select_exprs[time_column_index] = lit_timestamp_nano(0).alias(alias); + select_exprs[time_column_index] = if let Some(i) = ctx.interval { + let stride = lit(ScalarValue::new_interval_mdn(0, 0, i.duration)); + let offset = i.offset.unwrap_or_default(); + + date_bin(stride, "time".as_expr(), lit_timestamptz_nano(offset)).alias(alias) + } else { + lit_timestamptz_nano(0).alias(alias) + }; // Wrap the plan in a `LogicalPlan::Projection` from the select expressions - let plan = project(input, select_exprs)?; + let mut plan = project(input, select_exprs)?; + + // generate a predicate to filter out all rows where all field values are `NULL`, + // like: + // + // NOT (field1 IS NULL AND field2 IS NULL AND ...) 
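+        //
+        // InfluxQL only returns a row when at least one of its fields is non-NULL,
+        // so with two (hypothetical) fields `f0` and `f1` the plan gains:
+        //
+        //   Filter: NOT (f0 IS NULL AND f1 IS NULL)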
+ plan = match conjunction(fields.iter().filter_map(|f| { + if matches!(f.data_type, Some(InfluxColumnType::Field(_))) { + Some(f.name.as_expr().is_null()) + } else { + None + } + })) { + Some(expr) => LogicalPlanBuilder::from(plan).filter(expr.not())?.build()?, + None => plan, + }; LogicalPlanBuilder::from(plan).distinct()?.build() } @@ -852,7 +892,6 @@ impl<'a> InfluxQLToLogicalPlan<'a> { group_by_tag_set: &[&str], ) -> Result { let schema = IQLSchema::new_from_fields(input.schema(), fields)?; - // Transform InfluxQL AST field expressions to a list of DataFusion expressions. let select_exprs = self.field_list_to_exprs(&input, fields, &schema)?; @@ -923,8 +962,6 @@ impl<'a> InfluxQLToLogicalPlan<'a> { // Wrap the plan in a `LogicalPlan::Projection` from the select expressions let plan = project(plan, select_exprs)?; - // InfluxQL OG physical operators for - // generate a predicate to filter rows where all field values of the row are `NULL`, // like: // @@ -941,6 +978,34 @@ impl<'a> InfluxQLToLogicalPlan<'a> { } } + /// Plan "WindowAggregateMixed" SELECT queries. These are queries that use + /// a combination of window and nested aggregate functions, along with + /// additional aggregate functions. + /// + /// N.B. The plans produced here can output incorrect results when using the + /// `FILL(0)` directive. See [#9706](https://github.com/influxdata/influxdb_iox/issues/9706) + /// for details. + fn project_select_window_aggregate_mixed( + &self, + ctx: &Context<'_>, + input: LogicalPlan, + fields: &[Field], + group_by_tag_set: &[&str], + ) -> Result { + let schema = IQLSchema::new_from_fields(input.schema(), fields)?; + + // Transform InfluxQL AST field expressions to a list of DataFusion expressions. + let select_exprs = self.field_list_to_exprs(&input, fields, &schema)?; + + let (plan, select_exprs) = + self.select_aggregate(ctx, input, fields, select_exprs, group_by_tag_set)?; + + let (plan, select_exprs) = self.select_window(ctx, plan, select_exprs, group_by_tag_set)?; + + // Wrap the plan in a `LogicalPlan::Projection` from the select expressions + project(plan, select_exprs) + } + /// Plan the execution of SELECT queries that have the Selector projection /// type. 
These a queries that include a single FIRST, LAST, MAX, MIN, /// PERCENTILE, or SAMPLE function call, possibly requesting additional @@ -980,8 +1045,8 @@ impl<'a> InfluxQLToLogicalPlan<'a> { let perc_row_column_name = window_perc_row.display_name()?; let window_row = Expr::WindowFunction(WindowFunction::new( - window_function::WindowFunction::BuiltInWindowFunction( - window_function::BuiltInWindowFunction::RowNumber, + WindowFunctionDefinition::BuiltInWindowFunction( + BuiltInWindowFunction::RowNumber, ), vec![], window_partition_by(ctx, input.schema(), group_by_tag_set), @@ -1148,8 +1213,8 @@ impl<'a> InfluxQLToLogicalPlan<'a> { if aggr_exprs.len() == 1 { let selector = aggr_exprs[0].clone(); - if let Expr::AggregateUDF(mut udf) = selector.clone() { - if udf.fun.name.starts_with("selector_") { + if let Expr::AggregateFunction(mut agg) = selector.clone() { + if agg.func_def.name().starts_with("selector_") { let selector_index = select_exprs .iter() .enumerate() @@ -1168,6 +1233,7 @@ impl<'a> InfluxQLToLogicalPlan<'a> { let (expr, out_name) = match expr.clone() { Expr::Alias(Alias { expr, + relation: None, name: out_name, }) => (*expr, out_name), _ => { @@ -1185,8 +1251,8 @@ impl<'a> InfluxQLToLogicalPlan<'a> { )); } - udf.args.append(&mut additional_args); - let selector_new = Expr::AggregateUDF(udf); + agg.args.append(&mut additional_args); + let selector_new = Expr::AggregateFunction(agg); select_exprs[selector_index] = select_exprs[selector_index] .clone() .transform_up(&|expr| { @@ -1229,11 +1295,7 @@ impl<'a> InfluxQLToLogicalPlan<'a> { let stride = lit(ScalarValue::new_interval_mdn(0, 0, i.duration)); let offset = i.offset.unwrap_or_default(); - date_bin( - stride, - "time".as_expr(), - lit(ScalarValue::TimestampNanosecond(Some(offset), None)), - ) + date_bin(stride, "time".as_expr(), lit_timestamptz_nano(offset)) } else if let ProjectionType::Selector { has_fields: _ } = ctx.projection_type { let selector = match aggr_exprs.len() { 1 => aggr_exprs[0].clone(), @@ -1247,7 +1309,7 @@ impl<'a> InfluxQLToLogicalPlan<'a> { selector.field("time") } else { - lit_timestamp_nano(0) + lit_timestamptz_nano(0) } .alias(alias); @@ -1308,7 +1370,7 @@ impl<'a> InfluxQLToLogicalPlan<'a> { FillClause::None => unreachable!(), }; - build_gap_fill_node(plan, time_column, fill_strategy)? + build_gap_fill_node(plan, time_column, fill_strategy, &ctx.projection_type)? 
} else { plan }; @@ -1418,9 +1480,7 @@ impl<'a> InfluxQLToLogicalPlan<'a> { }; let window_expr = Expr::WindowFunction(WindowFunction::new( - window_function::WindowFunction::BuiltInWindowFunction( - window_function::BuiltInWindowFunction::RowNumber, - ), + WindowFunctionDefinition::BuiltInWindowFunction(BuiltInWindowFunction::RowNumber), Vec::::new(), window_partition_by(ctx, input.schema(), group_by_tags), order_by_exprs, @@ -1451,9 +1511,12 @@ impl<'a> InfluxQLToLogicalPlan<'a> { // neither of which should be passed to udf_to_expr .map_err(|err| error::map::internal(format!("display_name: {err}")))?; - let Expr::ScalarUDF(expr::ScalarUDF { fun, args }) = e else { + let Expr::ScalarFunction(ScalarFunction { func_def, args }) = e else { return error::internal(format!("udf_to_expr: unexpected expression: {e}")); }; + let ScalarFunctionDefinition::UDF(udf) = func_def else { + return error::internal(format!("udf_to_expr: unexpected function: {func_def:?}")); + }; fn derivative_unit(ctx: &Context<'_>, args: &Vec) -> Result { if args.len() > 1 { @@ -1469,7 +1532,7 @@ impl<'a> InfluxQLToLogicalPlan<'a> { } } - match udf::WindowFunction::try_from_scalar_udf(Arc::clone(&fun)) { + match udf::WindowFunction::try_from_scalar_udf(Arc::clone(&udf)) { Some(udf::WindowFunction::MovingAverage) => Ok(Expr::WindowFunction(WindowFunction { fun: MOVING_AVERAGE.clone(), args, @@ -1556,7 +1619,7 @@ impl<'a> InfluxQLToLogicalPlan<'a> { .alias(alias)), None => error::internal(format!( "unexpected user-defined window function: {}", - fun.name + udf.name() )), } } @@ -1622,19 +1685,17 @@ impl<'a> InfluxQLToLogicalPlan<'a> { fields_to_exprs_no_nulls(input.schema(), group_by_tag_set).collect::>() }; - let window_func_exprs = vec![Expr::WindowFunction(WindowFunction { - fun: window_function::WindowFunction::BuiltInWindowFunction( - BuiltInWindowFunction::RowNumber, - ), - args: vec![], + let window_func_exprs = vec![Expr::WindowFunction(WindowFunction::new( + WindowFunctionDefinition::BuiltInWindowFunction(BuiltInWindowFunction::RowNumber), + vec![], partition_by, order_by, - window_frame: WindowFrame { + WindowFrame { units: WindowFrameUnits::Rows, start_bound: WindowFrameBound::Preceding(ScalarValue::Null), end_bound: WindowFrameBound::CurrentRow, }, - }) + )) .alias(IOX_ROW_ALIAS)]; // Prepare new projection. 
@@ -1846,7 +1907,7 @@ impl<'a> InfluxQLToLogicalPlan<'a> { Literal::Timestamp(v) => v .timestamp_nanos_opt() .ok_or_else(|| error::map::query("timestamp out of range")) - .map(|ts| lit(ScalarValue::TimestampNanosecond(Some(ts), None))), + .map(lit_timestamptz_nano), Literal::Duration(v) => { Ok(lit(ScalarValue::IntervalMonthDayNano(Some((**v).into())))) } @@ -1989,12 +2050,13 @@ impl<'a> InfluxQLToLogicalPlan<'a> { check_arg_count(name, args, 2)?; let nexpr = self.expr_to_df_expr(scope, &args[1], schema)?; - Ok(Expr::AggregateUDF(expr::AggregateUDF::new( - PERCENTILE.clone(), - vec![expr, nexpr], - None, - None, - ))) + Ok(Expr::AggregateFunction(expr::AggregateFunction { + func_def: AggregateFunctionDefinition::UDF(PERCENTILE.clone()), + args: vec![expr, nexpr], + distinct: false, + filter: None, + order_by: None, + })) } name @ ("first" | "last" | "min" | "max") => { let expr = self.expr_to_df_expr(scope, &args[0], schema)?; @@ -2118,7 +2180,7 @@ impl<'a> InfluxQLToLogicalPlan<'a> { call: &Call, schema: &IQLSchema<'a>, ) -> Result { - let args = call + let mut args = call .args .iter() .map(|e| self.expr_to_df_expr(scope, e, schema)) @@ -2129,13 +2191,16 @@ impl<'a> InfluxQLToLogicalPlan<'a> { if args.len() != 2 { error::query("invalid number of arguments for log, expected 2, got 1") } else { - Ok(Expr::ScalarFunction(ScalarFunction { - fun: BuiltinScalarFunction::Log, - args: args.into_iter().rev().collect(), - })) + let arg1 = args.pop().unwrap(); + let arg0 = args.pop().unwrap(); + // reverse args + Ok(datafusion::prelude::log(arg1, arg0)) } } - fun => Ok(Expr::ScalarFunction(ScalarFunction { fun, args })), + fun => Ok(Expr::ScalarFunction(ScalarFunction { + func_def: ScalarFunctionDefinition::BuiltIn(fun), + args, + })), } } @@ -2437,17 +2502,15 @@ impl<'a> InfluxQLToLogicalPlan<'a> { // - not null if it had any non-null values // // note that since we only have a single row, this is efficient - .project([Expr::ScalarFunction(ScalarFunction { - fun: BuiltinScalarFunction::MakeArray, - args: tags - .iter() + .project([datafusion::prelude::array( + tags.iter() .map(|tag| { let tag_col = Expr::Column(Column::from_name(*tag)); when(tag_col.gt(lit(0)), lit(*tag)).end() }) .collect::, _>>()?, - }) + ) .alias(tag_key_col)])? // roll our single array row into one row per tag key .unnest_column(tag_key_df_col)? @@ -2941,15 +3004,20 @@ fn build_gap_fill_node( input: LogicalPlan, time_column: &Expr, fill_strategy: FillStrategy, + projection_type: &ProjectionType, ) -> Result { let (expr, alias) = match time_column { - Expr::Alias(Alias { expr, name: alias }) => (expr.as_ref(), alias), + Expr::Alias(Alias { + expr, + relation: None, + name: alias, + }) => (expr.as_ref(), alias), _ => return error::internal("expected time column to have an alias function"), }; let date_bin_args = match expr { Expr::ScalarFunction(ScalarFunction { - fun: BuiltinScalarFunction::DateBin, + func_def: ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::DateBin), args, }) => args, _ => { @@ -2996,6 +3064,16 @@ fn build_gap_fill_node( } _ => Ok(VisitRecursion::Continue), }); + time_range = if projection_type == &ProjectionType::WindowAggregateMixed { + // For WindowAggregateMixed queries do not gap fill before the first + // iterator value. 
+ time_range.map(|Range { start: _, end }| Range { + start: Bound::Unbounded, + end, + }) + } else { + time_range + }; time_range .ok_or_else(|| error::map::internal("expected to find a Filter or TableScan")) }?; @@ -3029,7 +3107,7 @@ fn build_gap_fill_node( let fill_strategy = aggr_expr .iter() .cloned() - .map(|e| (e, fill_strategy.clone())) + .map(|e| (e, fill_strategy)) .collect(); let time_column = col(input @@ -3148,8 +3226,13 @@ fn plan_with_metadata(plan: LogicalPlan, metadata: &InfluxQlMetadata) -> Result< LogicalPlan::Analyze(v) } LogicalPlan::Distinct(src) => { - let mut v = src.clone(); - v.input = Arc::new(set_schema(&src.input, metadata)?); + let v = match src.clone() { + Distinct::All(input) => Distinct::All(Arc::new(set_schema(&input, metadata)?)), + Distinct::On(mut on) => { + on.input = Arc::new(set_schema(&on.input, metadata)?); + Distinct::On(on) + } + }; LogicalPlan::Distinct(v) } LogicalPlan::Unnest(src) => { @@ -3309,7 +3392,7 @@ fn window_partition_by( parition_by.push(date_bin( stride, "time".as_expr(), - lit(ScalarValue::TimestampNanosecond(Some(offset), None)), + lit_timestamptz_nano(offset), )); } parition_by @@ -3482,7 +3565,7 @@ mod test { } #[test] - fn test_snow_measurements() { + fn test_show_measurements() { assert_snapshot!(plan("SHOW MEASUREMENTS"), @"TableScan: measurements [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)]"); assert_snapshot!(plan("SHOW MEASUREMENTS LIMIT 1 OFFSET 2"), @r###" Sort: measurements.iox::measurement ASC NULLS LAST, measurements.name ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] @@ -3494,46 +3577,55 @@ mod test { assert_snapshot!(plan("SHOW MEASUREMENTS WHERE foo = 'some_foo'"), @r###" Sort: iox::measurement ASC NULLS LAST, name ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("all_types")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] - Filter: all_types.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] - TableScan: all_types [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("cpu")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - Filter: cpu.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, 
usage_system:Float64;N, usage_user:Float64;N] - TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("data")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - Filter: data.time >= TimestampNanosecond(1672444800000000000, None) AND data.foo = Dictionary(Int32, Utf8("some_foo")) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("disk")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Filter: disk.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: disk [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("diskio")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] - Filter: diskio.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] - TableScan: diskio [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("merge_00")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, 
time:Timestamp(Nanosecond, None)] - Filter: merge_00.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] - TableScan: merge_00 [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("merge_01")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] - Filter: merge_01.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] - TableScan: merge_01 [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("name_clash")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Filter: name_clash.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: name_clash [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("temp_01")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Filter: temp_01.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: temp_01 [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("temp_02")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Filter: temp_02.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: temp_02 [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), 
name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("all_types")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] + Filter: all_types.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] + TableScan: all_types [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("cpu")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Filter: cpu.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("data")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + Filter: data.time >= TimestampNanosecond(1672444800000000000, None) AND data.foo = Dictionary(Int32, Utf8("some_foo")) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("disk")) AS name 
[iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Filter: disk.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: disk [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("diskio")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] + Filter: diskio.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] + TableScan: diskio [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("merge_00")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] + Filter: merge_00.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] + TableScan: merge_00 [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("merge_01")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] + Filter: merge_01.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] + TableScan: merge_01 [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("name_clash")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Filter: name_clash.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) 
[f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: name_clash [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("temp_01")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Filter: temp_01.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: temp_01 [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("temp_02")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Filter: temp_02.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: temp_02 [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("temp_03")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] Limit: skip=0, fetch=1 [shared_field0:Utf8;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] Filter: temp_03.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [shared_field0:Utf8;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] @@ -3542,46 +3634,55 @@ mod test { assert_snapshot!(plan("SHOW MEASUREMENTS WHERE time > 1337"), @r###" Sort: iox::measurement ASC NULLS LAST, name ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("all_types")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] - Filter: all_types.time >= TimestampNanosecond(1338, None) [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] - TableScan: all_types [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, 
str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("cpu")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - Filter: cpu.time >= TimestampNanosecond(1338, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("data")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - Filter: data.time >= TimestampNanosecond(1338, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("disk")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Filter: disk.time >= TimestampNanosecond(1338, None) [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: disk [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("diskio")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] - Filter: diskio.time >= TimestampNanosecond(1338, None) [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, 
time:Timestamp(Nanosecond, None), write_utilization:Float64;N] - TableScan: diskio [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("merge_00")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] - Filter: merge_00.time >= TimestampNanosecond(1338, None) [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] - TableScan: merge_00 [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("merge_01")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] - Filter: merge_01.time >= TimestampNanosecond(1338, None) [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] - TableScan: merge_01 [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("name_clash")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Filter: name_clash.time >= TimestampNanosecond(1338, None) [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: name_clash [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("temp_01")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Filter: temp_01.time >= TimestampNanosecond(1338, None) [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: temp_01 [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("temp_02")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Filter: temp_02.time >= TimestampNanosecond(1338, None) [shared_field0:Int64;N, 
shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: temp_02 [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("all_types")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] + Filter: all_types.time >= TimestampNanosecond(1338, None) [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] + TableScan: all_types [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("cpu")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Filter: cpu.time >= TimestampNanosecond(1338, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("data")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + Filter: data.time >= TimestampNanosecond(1338, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + TableScan: data [TIME:Boolean;N, 
bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("disk")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Filter: disk.time >= TimestampNanosecond(1338, None) [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: disk [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("diskio")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] + Filter: diskio.time >= TimestampNanosecond(1338, None) [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] + TableScan: diskio [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("merge_00")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] + Filter: merge_00.time >= TimestampNanosecond(1338, None) [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] + TableScan: merge_00 [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("merge_01")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] + Filter: merge_01.time >= TimestampNanosecond(1338, None) [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] + TableScan: merge_01 [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("name_clash")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + 
Limit: skip=0, fetch=1 [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Filter: name_clash.time >= TimestampNanosecond(1338, None) [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: name_clash [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("temp_01")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Filter: temp_01.time >= TimestampNanosecond(1338, None) [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: temp_01 [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("temp_02")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Filter: temp_02.time >= TimestampNanosecond(1338, None) [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: temp_02 [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("temp_03")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] Limit: skip=0, fetch=1 [shared_field0:Utf8;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] Filter: temp_03.time >= TimestampNanosecond(1338, None) [shared_field0:Utf8;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] @@ -3606,76 +3707,85 @@ mod test { assert_snapshot!(plan("SHOW TAG KEYS WHERE foo = 'some_foo'"), @r###" Sort: iox::measurement ASC NULLS LAST, tagKey ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Projection: Dictionary(Int32, Utf8("all_types")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN tag0 > Int32(0) THEN Utf8("tag0") END, CASE WHEN tag1 > Int32(0) THEN Utf8("tag1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(all_types.tag0 IS NOT NULL AS UInt64)) AS tag0, SUM(CAST(all_types.tag1 IS NOT NULL AS UInt64)) AS tag1]] [tag0:UInt64;N, tag1:UInt64;N] - Filter: all_types.time >= TimestampNanosecond(1672444800000000000, None) AND 
Boolean(false) [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] - TableScan: all_types [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] - Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN cpu > Int32(0) THEN Utf8("cpu") END, CASE WHEN host > Int32(0) THEN Utf8("host") END, CASE WHEN region > Int32(0) THEN Utf8("region") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(cpu.cpu IS NOT NULL AS UInt64)) AS cpu, SUM(CAST(cpu.host IS NOT NULL AS UInt64)) AS host, SUM(CAST(cpu.region IS NOT NULL AS UInt64)) AS region]] [cpu:UInt64;N, host:UInt64;N, region:UInt64;N] - Filter: cpu.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN bar > Int32(0) THEN Utf8("bar") END, CASE WHEN foo > Int32(0) THEN Utf8("foo") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(data.bar IS NOT NULL AS UInt64)) AS bar, SUM(CAST(data.foo IS NOT NULL AS UInt64)) AS foo]] [bar:UInt64;N, foo:UInt64;N] - Filter: data.time >= TimestampNanosecond(1672444800000000000, None) AND data.foo = Dictionary(Int32, Utf8("some_foo")) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - Projection: Dictionary(Int32, Utf8("disk")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN device > Int32(0) THEN Utf8("device") END, CASE WHEN host > Int32(0) THEN Utf8("host") END, CASE WHEN region > Int32(0) THEN Utf8("region") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(disk.device IS NOT NULL AS UInt64)) AS device, 
SUM(CAST(disk.host IS NOT NULL AS UInt64)) AS host, SUM(CAST(disk.region IS NOT NULL AS UInt64)) AS region]] [device:UInt64;N, host:UInt64;N, region:UInt64;N] - Filter: disk.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: disk [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("diskio")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN host > Int32(0) THEN Utf8("host") END, CASE WHEN region > Int32(0) THEN Utf8("region") END, CASE WHEN status > Int32(0) THEN Utf8("status") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(diskio.host IS NOT NULL AS UInt64)) AS host, SUM(CAST(diskio.region IS NOT NULL AS UInt64)) AS region, SUM(CAST(diskio.status IS NOT NULL AS UInt64)) AS status]] [host:UInt64;N, region:UInt64;N, status:UInt64;N] - Filter: diskio.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] - TableScan: diskio [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] - Projection: Dictionary(Int32, Utf8("merge_00")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN col0 > Int32(0) THEN Utf8("col0") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(merge_00.col0 IS NOT NULL AS UInt64)) AS col0]] [col0:UInt64;N] - Filter: merge_00.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] - TableScan: merge_00 [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("merge_01")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN col1 > Int32(0) THEN Utf8("col1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(merge_01.col1 IS NOT NULL AS UInt64)) AS col1]] [col1:UInt64;N] - Filter: merge_01.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, 
col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] - TableScan: merge_01 [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("name_clash")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN first > Int32(0) THEN Utf8("first") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(name_clash.first IS NOT NULL AS UInt64)) AS first]] [first:UInt64;N] - Filter: name_clash.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: name_clash [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("temp_01")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN shared_tag0 > Int32(0) THEN Utf8("shared_tag0") END, CASE WHEN shared_tag1 > Int32(0) THEN Utf8("shared_tag1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(temp_01.shared_tag0 IS NOT NULL AS UInt64)) AS shared_tag0, SUM(CAST(temp_01.shared_tag1 IS NOT NULL AS UInt64)) AS shared_tag1]] [shared_tag0:UInt64;N, shared_tag1:UInt64;N] - Filter: temp_01.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: temp_01 [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("temp_02")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN shared_tag0 > Int32(0) THEN Utf8("shared_tag0") END, CASE WHEN shared_tag1 > Int32(0) THEN Utf8("shared_tag1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(temp_02.shared_tag0 IS NOT NULL AS UInt64)) AS shared_tag0, SUM(CAST(temp_02.shared_tag1 IS NOT NULL AS UInt64)) AS shared_tag1]] [shared_tag0:UInt64;N, shared_tag1:UInt64;N] - Filter: temp_02.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: temp_02 [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, 
Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Projection: Dictionary(Int32, Utf8("all_types")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN tag0 > Int32(0) THEN Utf8("tag0") END, CASE WHEN tag1 > Int32(0) THEN Utf8("tag1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(all_types.tag0 IS NOT NULL AS UInt64)) AS tag0, SUM(CAST(all_types.tag1 IS NOT NULL AS UInt64)) AS tag1]] [tag0:UInt64;N, tag1:UInt64;N] + Filter: all_types.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] + TableScan: all_types [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] + Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN cpu > Int32(0) THEN Utf8("cpu") END, CASE WHEN host > Int32(0) THEN Utf8("host") END, CASE WHEN region > Int32(0) THEN Utf8("region") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(cpu.cpu IS NOT NULL AS UInt64)) AS cpu, SUM(CAST(cpu.host IS NOT NULL AS UInt64)) AS host, SUM(CAST(cpu.region IS NOT NULL AS UInt64)) AS region]] [cpu:UInt64;N, host:UInt64;N, region:UInt64;N] + Filter: cpu.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN bar > Int32(0) THEN Utf8("bar") END, CASE WHEN foo > Int32(0) THEN Utf8("foo") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(data.bar IS NOT NULL AS UInt64)) AS bar, SUM(CAST(data.foo IS NOT NULL AS UInt64)) AS foo]] [bar:UInt64;N, foo:UInt64;N] + Filter: data.time >= TimestampNanosecond(1672444800000000000, None) AND data.foo = 
Dictionary(Int32, Utf8("some_foo")) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + Projection: Dictionary(Int32, Utf8("disk")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN device > Int32(0) THEN Utf8("device") END, CASE WHEN host > Int32(0) THEN Utf8("host") END, CASE WHEN region > Int32(0) THEN Utf8("region") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(disk.device IS NOT NULL AS UInt64)) AS device, SUM(CAST(disk.host IS NOT NULL AS UInt64)) AS host, SUM(CAST(disk.region IS NOT NULL AS UInt64)) AS region]] [device:UInt64;N, host:UInt64;N, region:UInt64;N] + Filter: disk.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: disk [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("diskio")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN host > Int32(0) THEN Utf8("host") END, CASE WHEN region > Int32(0) THEN Utf8("region") END, CASE WHEN status > Int32(0) THEN Utf8("status") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(diskio.host IS NOT NULL AS UInt64)) AS host, SUM(CAST(diskio.region IS NOT NULL AS UInt64)) AS region, SUM(CAST(diskio.status IS NOT NULL AS UInt64)) AS status]] [host:UInt64;N, region:UInt64;N, status:UInt64;N] + Filter: diskio.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] + TableScan: diskio [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] + Projection: Dictionary(Int32, Utf8("merge_00")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN col0 > Int32(0) THEN Utf8("col0") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: 
false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(merge_00.col0 IS NOT NULL AS UInt64)) AS col0]] [col0:UInt64;N] + Filter: merge_00.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] + TableScan: merge_00 [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("merge_01")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN col1 > Int32(0) THEN Utf8("col1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(merge_01.col1 IS NOT NULL AS UInt64)) AS col1]] [col1:UInt64;N] + Filter: merge_01.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] + TableScan: merge_01 [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("name_clash")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN first > Int32(0) THEN Utf8("first") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(name_clash.first IS NOT NULL AS UInt64)) AS first]] [first:UInt64;N] + Filter: name_clash.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: name_clash [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("temp_01")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN shared_tag0 > Int32(0) THEN Utf8("shared_tag0") END, CASE WHEN shared_tag1 > Int32(0) THEN Utf8("shared_tag1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(temp_01.shared_tag0 IS NOT NULL AS UInt64)) AS shared_tag0, SUM(CAST(temp_01.shared_tag1 IS NOT NULL AS UInt64)) AS shared_tag1]] [shared_tag0:UInt64;N, shared_tag1:UInt64;N] + Filter: temp_01.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: temp_01 [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("temp_02")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + 
Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN shared_tag0 > Int32(0) THEN Utf8("shared_tag0") END, CASE WHEN shared_tag1 > Int32(0) THEN Utf8("shared_tag1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(temp_02.shared_tag0 IS NOT NULL AS UInt64)) AS shared_tag0, SUM(CAST(temp_02.shared_tag1 IS NOT NULL AS UInt64)) AS shared_tag1]] [shared_tag0:UInt64;N, shared_tag1:UInt64;N] + Filter: temp_02.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: temp_02 [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] Projection: Dictionary(Int32, Utf8("temp_03")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] Filter: tagKey IS NOT NULL [tagKey:Utf8;N] Unnest: tagKey [tagKey:Utf8;N] @@ -3691,76 +3801,85 @@ mod test { assert_snapshot!(plan("SHOW TAG KEYS WHERE time > 1337"), @r###" Sort: iox::measurement ASC NULLS LAST, tagKey ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Projection: Dictionary(Int32, Utf8("all_types")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN tag0 > Int32(0) THEN Utf8("tag0") END, CASE WHEN tag1 > Int32(0) THEN Utf8("tag1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(all_types.tag0 IS NOT NULL AS UInt64)) AS tag0, SUM(CAST(all_types.tag1 IS NOT NULL AS UInt64)) AS tag1]] [tag0:UInt64;N, tag1:UInt64;N] - Filter: all_types.time >= TimestampNanosecond(1338, None) [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] - TableScan: all_types [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] - Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN cpu > Int32(0) THEN Utf8("cpu") END, CASE WHEN host > Int32(0) THEN Utf8("host") END, CASE WHEN region > Int32(0) THEN Utf8("region") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(cpu.cpu IS NOT NULL AS UInt64)) AS cpu, SUM(CAST(cpu.host IS NOT NULL AS UInt64)) AS host, SUM(CAST(cpu.region IS NOT NULL AS UInt64)) AS region]] [cpu:UInt64;N, host:UInt64;N, region:UInt64;N] - Filter: cpu.time >= TimestampNanosecond(1338, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, 
usage_system:Float64;N, usage_user:Float64;N] - TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN bar > Int32(0) THEN Utf8("bar") END, CASE WHEN foo > Int32(0) THEN Utf8("foo") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(data.bar IS NOT NULL AS UInt64)) AS bar, SUM(CAST(data.foo IS NOT NULL AS UInt64)) AS foo]] [bar:UInt64;N, foo:UInt64;N] - Filter: data.time >= TimestampNanosecond(1338, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - Projection: Dictionary(Int32, Utf8("disk")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN device > Int32(0) THEN Utf8("device") END, CASE WHEN host > Int32(0) THEN Utf8("host") END, CASE WHEN region > Int32(0) THEN Utf8("region") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(disk.device IS NOT NULL AS UInt64)) AS device, SUM(CAST(disk.host IS NOT NULL AS UInt64)) AS host, SUM(CAST(disk.region IS NOT NULL AS UInt64)) AS region]] [device:UInt64;N, host:UInt64;N, region:UInt64;N] - Filter: disk.time >= TimestampNanosecond(1338, None) [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: disk [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("diskio")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN host > Int32(0) THEN Utf8("host") END, CASE WHEN region > Int32(0) THEN Utf8("region") END, CASE WHEN status > Int32(0) THEN Utf8("status") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(diskio.host IS NOT NULL AS UInt64)) AS host, SUM(CAST(diskio.region IS NOT NULL AS UInt64)) AS region, SUM(CAST(diskio.status IS NOT NULL AS UInt64)) AS status]] [host:UInt64;N, region:UInt64;N, status:UInt64;N] - Filter: diskio.time >= TimestampNanosecond(1338, None) [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, 
is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] - TableScan: diskio [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] - Projection: Dictionary(Int32, Utf8("merge_00")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN col0 > Int32(0) THEN Utf8("col0") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(merge_00.col0 IS NOT NULL AS UInt64)) AS col0]] [col0:UInt64;N] - Filter: merge_00.time >= TimestampNanosecond(1338, None) [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] - TableScan: merge_00 [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("merge_01")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN col1 > Int32(0) THEN Utf8("col1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(merge_01.col1 IS NOT NULL AS UInt64)) AS col1]] [col1:UInt64;N] - Filter: merge_01.time >= TimestampNanosecond(1338, None) [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] - TableScan: merge_01 [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("name_clash")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN first > Int32(0) THEN Utf8("first") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(name_clash.first IS NOT NULL AS UInt64)) AS first]] [first:UInt64;N] - Filter: name_clash.time >= TimestampNanosecond(1338, None) [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: name_clash [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("temp_01")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN shared_tag0 > Int32(0) THEN Utf8("shared_tag0") END, CASE WHEN shared_tag1 > Int32(0) THEN Utf8("shared_tag1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(temp_01.shared_tag0 IS NOT NULL AS UInt64)) AS shared_tag0, SUM(CAST(temp_01.shared_tag1 IS NOT NULL AS 
UInt64)) AS shared_tag1]] [shared_tag0:UInt64;N, shared_tag1:UInt64;N] - Filter: temp_01.time >= TimestampNanosecond(1338, None) [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: temp_01 [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("temp_02")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN shared_tag0 > Int32(0) THEN Utf8("shared_tag0") END, CASE WHEN shared_tag1 > Int32(0) THEN Utf8("shared_tag1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(temp_02.shared_tag0 IS NOT NULL AS UInt64)) AS shared_tag0, SUM(CAST(temp_02.shared_tag1 IS NOT NULL AS UInt64)) AS shared_tag1]] [shared_tag0:UInt64;N, shared_tag1:UInt64;N] - Filter: temp_02.time >= TimestampNanosecond(1338, None) [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: temp_02 [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Projection: Dictionary(Int32, Utf8("all_types")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN tag0 > Int32(0) THEN Utf8("tag0") END, CASE WHEN tag1 > Int32(0) THEN Utf8("tag1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(all_types.tag0 IS NOT NULL AS UInt64)) AS tag0, SUM(CAST(all_types.tag1 IS NOT NULL AS UInt64)) AS tag1]] [tag0:UInt64;N, tag1:UInt64;N] + Filter: all_types.time >= TimestampNanosecond(1338, None) [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] + TableScan: all_types [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] + Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + 
Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN cpu > Int32(0) THEN Utf8("cpu") END, CASE WHEN host > Int32(0) THEN Utf8("host") END, CASE WHEN region > Int32(0) THEN Utf8("region") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(cpu.cpu IS NOT NULL AS UInt64)) AS cpu, SUM(CAST(cpu.host IS NOT NULL AS UInt64)) AS host, SUM(CAST(cpu.region IS NOT NULL AS UInt64)) AS region]] [cpu:UInt64;N, host:UInt64;N, region:UInt64;N] + Filter: cpu.time >= TimestampNanosecond(1338, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN bar > Int32(0) THEN Utf8("bar") END, CASE WHEN foo > Int32(0) THEN Utf8("foo") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(data.bar IS NOT NULL AS UInt64)) AS bar, SUM(CAST(data.foo IS NOT NULL AS UInt64)) AS foo]] [bar:UInt64;N, foo:UInt64;N] + Filter: data.time >= TimestampNanosecond(1338, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + Projection: Dictionary(Int32, Utf8("disk")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN device > Int32(0) THEN Utf8("device") END, CASE WHEN host > Int32(0) THEN Utf8("host") END, CASE WHEN region > Int32(0) THEN Utf8("region") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(disk.device IS NOT NULL AS UInt64)) AS device, SUM(CAST(disk.host IS NOT NULL AS UInt64)) AS host, SUM(CAST(disk.region IS NOT NULL AS UInt64)) AS region]] [device:UInt64;N, host:UInt64;N, region:UInt64;N] + Filter: disk.time >= TimestampNanosecond(1338, None) [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: disk [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("diskio")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + 
Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN host > Int32(0) THEN Utf8("host") END, CASE WHEN region > Int32(0) THEN Utf8("region") END, CASE WHEN status > Int32(0) THEN Utf8("status") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(diskio.host IS NOT NULL AS UInt64)) AS host, SUM(CAST(diskio.region IS NOT NULL AS UInt64)) AS region, SUM(CAST(diskio.status IS NOT NULL AS UInt64)) AS status]] [host:UInt64;N, region:UInt64;N, status:UInt64;N] + Filter: diskio.time >= TimestampNanosecond(1338, None) [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] + TableScan: diskio [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] + Projection: Dictionary(Int32, Utf8("merge_00")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN col0 > Int32(0) THEN Utf8("col0") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(merge_00.col0 IS NOT NULL AS UInt64)) AS col0]] [col0:UInt64;N] + Filter: merge_00.time >= TimestampNanosecond(1338, None) [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] + TableScan: merge_00 [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("merge_01")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN col1 > Int32(0) THEN Utf8("col1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(merge_01.col1 IS NOT NULL AS UInt64)) AS col1]] [col1:UInt64;N] + Filter: merge_01.time >= TimestampNanosecond(1338, None) [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] + TableScan: merge_01 [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("name_clash")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN first > Int32(0) THEN Utf8("first") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(name_clash.first IS NOT NULL AS UInt64)) AS first]] [first:UInt64;N] + Filter: name_clash.time >= TimestampNanosecond(1338, None) [f:Float64;N, first:Dictionary(Int32, Utf8);N, 
time:Timestamp(Nanosecond, None)] + TableScan: name_clash [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("temp_01")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN shared_tag0 > Int32(0) THEN Utf8("shared_tag0") END, CASE WHEN shared_tag1 > Int32(0) THEN Utf8("shared_tag1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(temp_01.shared_tag0 IS NOT NULL AS UInt64)) AS shared_tag0, SUM(CAST(temp_01.shared_tag1 IS NOT NULL AS UInt64)) AS shared_tag1]] [shared_tag0:UInt64;N, shared_tag1:UInt64;N] + Filter: temp_01.time >= TimestampNanosecond(1338, None) [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: temp_01 [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("temp_02")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN shared_tag0 > Int32(0) THEN Utf8("shared_tag0") END, CASE WHEN shared_tag1 > Int32(0) THEN Utf8("shared_tag1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(temp_02.shared_tag0 IS NOT NULL AS UInt64)) AS shared_tag0, SUM(CAST(temp_02.shared_tag1 IS NOT NULL AS UInt64)) AS shared_tag1]] [shared_tag0:UInt64;N, shared_tag1:UInt64;N] + Filter: temp_02.time >= TimestampNanosecond(1338, None) [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: temp_02 [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] Projection: Dictionary(Int32, Utf8("temp_03")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] Filter: tagKey IS NOT NULL [tagKey:Utf8;N] Unnest: tagKey [tagKey:Utf8;N] @@ -3897,8 +4016,9 @@ mod test { Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time AS time, value AS value [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), value:Float64;N] Sort: time ASC NULLS LAST [time:Timestamp(Nanosecond, None), value:Float64;N] Distinct: [time:Timestamp(Nanosecond, None), value:Float64;N] - Projection: TimestampNanosecond(0, None) AS time, cpu.usage_idle AS value [time:Timestamp(Nanosecond, None), value:Float64;N] - TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Filter: NOT value IS NULL [time:Timestamp(Nanosecond, None), value:Float64;N] + Projection: TimestampNanosecond(0, None) AS time, cpu.usage_idle AS value [time:Timestamp(Nanosecond, None), value:Float64;N] + 
TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] "###); // Outer query projects subquery with binary expressions @@ -3907,8 +4027,9 @@ mod test { Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time AS time, value * Float64(0.99) AS value [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), value:Float64;N] Sort: time ASC NULLS LAST [time:Timestamp(Nanosecond, None), value:Float64;N] Distinct: [time:Timestamp(Nanosecond, None), value:Float64;N] - Projection: TimestampNanosecond(0, None) AS time, cpu.usage_idle AS value [time:Timestamp(Nanosecond, None), value:Float64;N] - TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Filter: NOT value IS NULL [time:Timestamp(Nanosecond, None), value:Float64;N] + Projection: TimestampNanosecond(0, None) AS time, cpu.usage_idle AS value [time:Timestamp(Nanosecond, None), value:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] "###); // Outer query groups by the `cpu` tag, which should be pushed all the way to inner-most subquery @@ -3920,8 +4041,9 @@ mod test { Aggregate: groupBy=[[cpu]], aggr=[[selector_max(value, time)]] [cpu:Dictionary(Int32, Utf8);N, selector_max(value,time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Sort: cpu ASC NULLS LAST, time ASC NULLS LAST [time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, value:Float64;N] Distinct: [time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, value:Float64;N] - Projection: TimestampNanosecond(0, None) AS time, cpu.cpu AS cpu, cpu.usage_idle AS value [time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, value:Float64;N] - TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Filter: NOT value IS NULL [time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, value:Float64;N] + Projection: TimestampNanosecond(0, None) AS time, cpu.cpu AS cpu, cpu.usage_idle AS value [time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, value:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] "###); } } @@ -4083,7 +4205,13 @@ mod test { "###); // Invariant: second argument is always a constant - assert_snapshot!(plan("SELECT MOVING_AVERAGE(MEAN(usage_idle), usage_system) FROM cpu GROUP BY TIME(10s)"), @"Error during planning: expected integer argument in moving_average()"); + assert_snapshot!(plan("SELECT MOVING_AVERAGE(MEAN(usage_idle), usage_system) FROM cpu GROUP BY TIME(10s)"), @r###" + rewriting statement + caused by + gather information about select statement + caused by + Error 
during planning: expected integer argument in moving_average() + "###); } #[test] @@ -4178,8 +4306,16 @@ mod test { } #[test] - fn test_not_implemented() { - assert_snapshot!(plan("SELECT DIFFERENCE(MEAN(usage_idle)), MEAN(usage_idle) FROM cpu GROUP BY TIME(10s)"), @"This feature is not implemented: mixed window-aggregate and aggregate columns, such as DIFFERENCE(MEAN(col)), MEAN(col)"); + fn test_mixed_aggregate() { + assert_snapshot!(plan("SELECT DIFFERENCE(MEAN(usage_idle)), MEAN(usage_idle) FROM cpu GROUP BY TIME(10s)"), @r###" + Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, difference:Float64;N, mean:Float64;N] + Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, difference(AVG(cpu.usage_idle)) AS difference, AVG(cpu.usage_idle) AS mean [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, difference:Float64;N, mean:Float64;N] + WindowAggr: windowExpr=[[difference(AVG(cpu.usage_idle)) ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS difference(AVG(cpu.usage_idle))]] [time:Timestamp(Nanosecond, None);N, AVG(cpu.usage_idle):Float64;N, difference(AVG(cpu.usage_idle)):Float64;N] + GapFill: groupBy=[time], aggr=[[AVG(cpu.usage_idle)]], time_column=time, stride=IntervalMonthDayNano("10000000000"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None))) [time:Timestamp(Nanosecond, None);N, AVG(cpu.usage_idle):Float64;N] + Aggregate: groupBy=[[date_bin(IntervalMonthDayNano("10000000000"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[AVG(cpu.usage_idle)]] [time:Timestamp(Nanosecond, None);N, AVG(cpu.usage_idle):Float64;N] + Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + "###); } } @@ -4190,22 +4326,25 @@ mod test { Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), distinct:Float64;N] Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, distinct [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), distinct:Float64;N] Distinct: [time:Timestamp(Nanosecond, None), distinct:Float64;N] - Projection: TimestampNanosecond(0, None) AS time, cpu.usage_idle AS distinct [time:Timestamp(Nanosecond, None), distinct:Float64;N] - TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Filter: NOT distinct IS NULL [time:Timestamp(Nanosecond, None), distinct:Float64;N] + Projection: TimestampNanosecond(0, None) AS time, cpu.usage_idle AS distinct [time:Timestamp(Nanosecond, None), distinct:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] "###); assert_snapshot!(plan("SELECT DISTINCT(usage_idle) FROM cpu"), @r###" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, 
Utf8), time:Timestamp(Nanosecond, None), distinct:Float64;N] Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, distinct [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), distinct:Float64;N] Distinct: [time:Timestamp(Nanosecond, None), distinct:Float64;N] - Projection: TimestampNanosecond(0, None) AS time, cpu.usage_idle AS distinct [time:Timestamp(Nanosecond, None), distinct:Float64;N] - TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Filter: NOT distinct IS NULL [time:Timestamp(Nanosecond, None), distinct:Float64;N] + Projection: TimestampNanosecond(0, None) AS time, cpu.usage_idle AS distinct [time:Timestamp(Nanosecond, None), distinct:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] "###); assert_snapshot!(plan("SELECT DISTINCT usage_idle FROM cpu GROUP BY cpu"), @r###" Sort: cpu ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, distinct:Float64;N] Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, cpu, distinct [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, distinct:Float64;N] Distinct: [time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, distinct:Float64;N] - Projection: TimestampNanosecond(0, None) AS time, cpu.cpu AS cpu, cpu.usage_idle AS distinct [time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, distinct:Float64;N] - TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Filter: NOT distinct IS NULL [time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, distinct:Float64;N] + Projection: TimestampNanosecond(0, None) AS time, cpu.cpu AS cpu, cpu.usage_idle AS distinct [time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, distinct:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] "###); assert_snapshot!(plan("SELECT COUNT(DISTINCT usage_idle) FROM cpu"), @r###" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), count:Int64;N] @@ -4213,10 +4352,40 @@ mod test { Aggregate: groupBy=[[]], aggr=[[COUNT(DISTINCT cpu.usage_idle)]] [COUNT(DISTINCT cpu.usage_idle):Int64;N] TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] "###); + assert_snapshot!(plan("SELECT DISTINCT(usage_idle) FROM cpu GROUP BY time(1s)"), @r###" + Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, distinct:Float64;N] + Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, distinct [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, distinct:Float64;N] + Distinct: 
[time:Timestamp(Nanosecond, None);N, distinct:Float64;N] + Filter: NOT distinct IS NULL [time:Timestamp(Nanosecond, None);N, distinct:Float64;N] + Projection: date_bin(IntervalMonthDayNano("1000000000"), cpu.time, TimestampNanosecond(0, None)) AS time, cpu.usage_idle AS distinct [time:Timestamp(Nanosecond, None);N, distinct:Float64;N] + Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + "###); + assert_snapshot!(plan("SELECT DISTINCT(usage_idle) FROM cpu GROUP BY time(1s), cpu"), @r###" + Sort: cpu ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, cpu:Dictionary(Int32, Utf8);N, distinct:Float64;N] + Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, cpu, distinct [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, cpu:Dictionary(Int32, Utf8);N, distinct:Float64;N] + Distinct: [time:Timestamp(Nanosecond, None);N, cpu:Dictionary(Int32, Utf8);N, distinct:Float64;N] + Filter: NOT distinct IS NULL [time:Timestamp(Nanosecond, None);N, cpu:Dictionary(Int32, Utf8);N, distinct:Float64;N] + Projection: date_bin(IntervalMonthDayNano("1000000000"), cpu.time, TimestampNanosecond(0, None)) AS time, cpu.cpu AS cpu, cpu.usage_idle AS distinct [time:Timestamp(Nanosecond, None);N, cpu:Dictionary(Int32, Utf8);N, distinct:Float64;N] + Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + "###); // fallible - assert_snapshot!(plan("SELECT DISTINCT(usage_idle), DISTINCT(usage_system) FROM cpu"), @"Error during planning: aggregate function distinct() cannot be combined with other functions or fields"); - assert_snapshot!(plan("SELECT DISTINCT(usage_idle), usage_system FROM cpu"), @"Error during planning: aggregate function distinct() cannot be combined with other functions or fields"); + assert_snapshot!(plan("SELECT DISTINCT(usage_idle), DISTINCT(usage_system) FROM cpu"), @r###" + rewriting statement + caused by + gather information about select statement + caused by + Error during planning: aggregate function distinct() cannot be combined with other functions or fields + "###); + assert_snapshot!(plan("SELECT DISTINCT(usage_idle), usage_system FROM cpu"), @r###" + rewriting statement + caused by + gather information about select statement + caused by + Error during planning: aggregate function distinct() cannot be combined with other functions or fields + "###); } mod functions { @@ -4358,7 +4527,13 @@ mod test { #[test] fn test_selectors_invalid_arguments_3() { // Invalid number of arguments - assert_snapshot!(plan("SELECT MIN(usage_idle, usage_idle) FROM cpu"), @"Error during planning: invalid number of arguments for min, expected 1, got 2"); + 
assert_snapshot!(plan("SELECT MIN(usage_idle, usage_idle) FROM cpu"), @r###" + rewriting statement + caused by + gather information about select statement + caused by + Error during planning: invalid number of arguments for min, expected 1, got 2 + "###); } } @@ -4422,7 +4597,7 @@ mod test { Filter: ROW_NUMBER() ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW <= Int64(10) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] WindowAggr: windowExpr=[[ROW_NUMBER() ORDER BY [cpu.usage_idle DESC NULLS LAST, cpu.time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS ROW_NUMBER() ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - "###); + "###); assert_snapshot!(plan("SELECT top(usage_idle,10),cpu FROM cpu"), @r###" Sort: time ASC NULLS LAST, cpu ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), top:Float64;N, cpu:Dictionary(Int32, Utf8);N] @@ -4430,7 +4605,7 @@ mod test { Filter: ROW_NUMBER() ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW <= Int64(10) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] WindowAggr: windowExpr=[[ROW_NUMBER() ORDER BY [cpu.usage_idle DESC NULLS LAST, cpu.time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS ROW_NUMBER() ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - "###); + "###); assert_snapshot!(plan("SELECT top(usage_idle,10) FROM cpu GROUP BY cpu"), @r###" Sort: cpu ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, top:Float64;N] @@ -4438,7 +4613,7 @@ mod test { Filter: ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle DESC 
NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW <= Int64(10) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] WindowAggr: windowExpr=[[ROW_NUMBER() PARTITION BY [cpu.cpu] ORDER BY [cpu.usage_idle DESC NULLS LAST, cpu.time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - "###); + "###); assert_snapshot!(plan("SELECT top(usage_idle,cpu,10) FROM cpu"), @r###" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), top:Float64;N, cpu:Dictionary(Int32, Utf8);N] @@ -4448,7 +4623,7 @@ mod test { Filter: ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW <= Int64(1) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] WindowAggr: windowExpr=[[ROW_NUMBER() PARTITION BY [cpu.cpu] ORDER BY [cpu.usage_idle DESC NULLS LAST, cpu.time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - "###); + "###); } #[test] @@ -4459,7 +4634,7 @@ mod test { Filter: ROW_NUMBER() ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW <= Int64(10) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] WindowAggr: 
windowExpr=[[ROW_NUMBER() ORDER BY [cpu.usage_idle ASC NULLS LAST, cpu.time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS ROW_NUMBER() ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - "###); + "###); assert_snapshot!(plan("SELECT bottom(usage_idle,10),cpu FROM cpu"), @r###" Sort: time ASC NULLS LAST, cpu ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), bottom:Float64;N, cpu:Dictionary(Int32, Utf8);N] @@ -4467,7 +4642,7 @@ mod test { Filter: ROW_NUMBER() ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW <= Int64(10) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] WindowAggr: windowExpr=[[ROW_NUMBER() ORDER BY [cpu.usage_idle ASC NULLS LAST, cpu.time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS ROW_NUMBER() ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - "###); + "###); assert_snapshot!(plan("SELECT bottom(usage_idle,10) FROM cpu GROUP BY cpu"), @r###" Sort: cpu ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, bottom:Float64;N] @@ -4475,7 +4650,7 @@ mod test { Filter: ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW <= Int64(10) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] WindowAggr: windowExpr=[[ROW_NUMBER() PARTITION BY [cpu.cpu] ORDER BY [cpu.usage_idle ASC NULLS LAST, cpu.time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] 
[cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - "###); + "###); assert_snapshot!(plan("SELECT bottom(usage_idle,cpu,10) FROM cpu"), @r###" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), bottom:Float64;N, cpu:Dictionary(Int32, Utf8);N] @@ -4485,7 +4660,7 @@ mod test { Filter: ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW <= Int64(1) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] WindowAggr: windowExpr=[[ROW_NUMBER() PARTITION BY [cpu.cpu] ORDER BY [cpu.usage_idle ASC NULLS LAST, cpu.time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - "###); + "###); } /// Test InfluxQL-specific behaviour of scalar functions that differ @@ -4502,7 +4677,13 @@ mod test { // Fallible // LOG requires two arguments - assert_snapshot!(plan("SELECT LOG(usage_idle) FROM cpu"), @"Error during planning: invalid number of arguments for log, expected 2, got 1"); + assert_snapshot!(plan("SELECT LOG(usage_idle) FROM cpu"), @r###" + rewriting statement + caused by + gather information about select statement + caused by + Error during planning: invalid number of arguments for log, expected 2, got 1 + "###); } /// Validate the metadata is correctly encoded in the schema. 
@@ -4631,7 +4812,13 @@ mod test { "### ); assert_snapshot!( - plan("SELECT foo, f64_field FROM data where time > '2004-04-09T'"), @r###"Error during planning: invalid expression "'2004-04-09T'": '2004-04-09T' is not a valid timestamp"### + plan("SELECT foo, f64_field FROM data where time > '2004-04-09T'"), @r###" + rewriting statement + caused by + split condition + caused by + Error during planning: invalid expression "'2004-04-09T'": '2004-04-09T' is not a valid timestamp + "### ); // time on the right-hand side @@ -4647,7 +4834,13 @@ mod test { // fallible // Unsupported operator - assert_snapshot!(plan("SELECT foo, f64_field FROM data where time != 0"), @"Error during planning: invalid time comparison operator: !=") + assert_snapshot!(plan("SELECT foo, f64_field FROM data where time != 0"), @r###" + rewriting statement + caused by + split condition + caused by + Error during planning: invalid time comparison operator: != + "###) } #[test] @@ -4777,6 +4970,33 @@ mod test { Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, data.time AS time, data.foo AS foo, data.f64_field AS f64_field [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, f64_field:Float64;N] TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] "###); + assert_snapshot!(plan("EXPLAIN SHOW MEASUREMENTS"), @r###" + Explain [plan_type:Utf8, plan:Utf8] + TableScan: measurements [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + "###); + assert_snapshot!(plan("EXPLAIN SHOW TAG KEYS"), @r###" + Explain [plan_type:Utf8, plan:Utf8] + TableScan: tag_keys [iox::measurement:Dictionary(Int32, Utf8), tagKey:Dictionary(Int32, Utf8)] + "###); + + assert_snapshot!(plan("EXPLAIN SHOW FIELD KEYS"), @r###" + Explain [plan_type:Utf8, plan:Utf8] + TableScan: field_keys [iox::measurement:Utf8, fieldKey:Utf8, fieldType:Utf8] + "###); + + assert_snapshot!(plan("EXPLAIN SHOW RETENTION POLICIES"), @r###" + Explain [plan_type:Utf8, plan:Utf8] + TableScan: retention policies [iox::measurement:Dictionary(Int32, Utf8), name:Utf8, duration:Utf8, shardGroupDuration:Utf8, replicaN:Int64, default:Boolean] + "###); + + assert_snapshot!(plan("EXPLAIN SHOW DATABASES"), @"This feature is not implemented: SHOW DATABASES"); + assert_snapshot!(plan("EXPLAIN EXPLAIN SELECT f64_field::string FROM data"), @r###" + Explain [plan_type:Utf8, plan:Utf8] + Explain [plan_type:Utf8, plan:Utf8] + Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), f64_field:Null;N] + Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, data.time AS time, NULL AS f64_field [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), f64_field:Null;N] + TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + "###); } #[test] @@ -4946,6 +5166,17 @@ mod test { TableScan: all_types [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] "###); } + + /// See + #[test] + fn 
test_true_and_time_pred() { + assert_snapshot!(plan("SELECT f64_field FROM data WHERE true AND time < '2022-10-31T02:02:00Z'"), @r###" + Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), f64_field:Float64;N] + Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, data.time AS time, data.f64_field AS f64_field [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), f64_field:Float64;N] + Filter: data.time <= TimestampNanosecond(1667181719999999999, None) AND Boolean(true) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + "###); + } } /// Tests to validate InfluxQL `SELECT` statements that project aggregate functions, such as `COUNT` or `SUM`. @@ -4958,24 +5189,24 @@ mod test { #[test] fn no_group_by() { assert_snapshot!(plan("SELECT COUNT(f64_field) FROM data"), @r###" - Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), count:Int64;N] - Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), count:Int64;N] - Aggregate: groupBy=[[]], aggr=[[COUNT(data.f64_field)]] [COUNT(data.f64_field):Int64;N] - TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - "###); + Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), count:Int64;N] + Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), count:Int64;N] + Aggregate: groupBy=[[]], aggr=[[COUNT(data.f64_field)]] [COUNT(data.f64_field):Int64;N] + TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + "###); assert_snapshot!(plan("SELECT COUNT(f64_field) FROM data GROUP BY non_existent"), @r###" - Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), non_existent:Null;N, count:Int64;N] - Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, NULL AS non_existent, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), non_existent:Null;N, count:Int64;N] - Aggregate: groupBy=[[]], aggr=[[COUNT(data.f64_field)]] [COUNT(data.f64_field):Int64;N] - TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, 
mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - "###); + Sort: foo ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, count:Int64;N] + Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, data.foo AS foo, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, count:Int64;N] + Aggregate: groupBy=[[data.foo]], aggr=[[COUNT(data.f64_field)]] [foo:Dictionary(Int32, Utf8);N, COUNT(data.f64_field):Int64;N] + TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + "###); // The `COUNT(f64_field)` aggregate is only projected once in the Aggregate and reused in the projection assert_snapshot!(plan("SELECT COUNT(f64_field), COUNT(f64_field) + COUNT(f64_field), COUNT(f64_field) * 3 FROM data"), @r###" @@ -4987,11 +5218,11 @@ mod test { // non-existent tags are excluded from the Aggregate groupBy and Sort operators assert_snapshot!(plan("SELECT COUNT(f64_field) FROM data GROUP BY foo, non_existent"), @r###" - Sort: foo ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, non_existent:Null;N, count:Int64;N] - Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, data.foo AS foo, NULL AS non_existent, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count 
[iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, non_existent:Null;N, count:Int64;N] - Aggregate: groupBy=[[data.foo]], aggr=[[COUNT(data.f64_field)]] [foo:Dictionary(Int32, Utf8);N, COUNT(data.f64_field):Int64;N] - TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - "###); + Sort: foo ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, non_existent:Null;N, count:Int64;N] + Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, data.foo AS foo, NULL AS non_existent, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, non_existent:Null;N, count:Int64;N] + Aggregate: groupBy=[[data.foo]], aggr=[[COUNT(data.f64_field)]] [foo:Dictionary(Int32, Utf8);N, COUNT(data.f64_field):Int64;N] + TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + "###); // Aggregate expression is projected once and reused in final projection assert_snapshot!(plan("SELECT COUNT(f64_field), COUNT(f64_field) * 2 FROM data"), @r###" @@ -5027,8 +5258,20 @@ mod test { // Fallible // Cannot combine aggregate and non-aggregate columns in the projection - assert_snapshot!(plan("SELECT COUNT(f64_field), f64_field FROM data"), @"Error during planning: mixing aggregate and non-aggregate columns is not supported"); - assert_snapshot!(plan("SELECT COUNT(f64_field) + f64_field FROM data"), @"Error during planning: mixing aggregate and non-aggregate columns is not supported"); + assert_snapshot!(plan("SELECT COUNT(f64_field), f64_field FROM data"), @r###" + rewriting statement + caused by + gather information about select statement + caused by + Error during planning: mixing aggregate and non-aggregate columns is not supported + "###); + assert_snapshot!(plan("SELECT COUNT(f64_field) + f64_field FROM data"), @r###" + rewriting statement + caused by + gather information about select statement + caused by + Error during planning: mixing aggregate and non-aggregate columns is not supported + "###); } #[test] diff --git a/iox_query_influxql/src/plan/planner/select.rs b/iox_query_influxql/src/plan/planner/select.rs index bfafb455424..97f1fdb08a3 100644 --- a/iox_query_influxql/src/plan/planner/select.rs +++ b/iox_query_influxql/src/plan/planner/select.rs @@ -288,7 +288,7 @@ impl<'a> Selector<'a> { )); } Ok(Self::First { - field_key: Self::identifier(call.args.first().unwrap())?, + field_key: Self::identifier(&call.args[0])?, }) } @@ -300,7 +300,7 @@ impl<'a> Selector<'a> { )); } Ok(Self::Last { - field_key: Self::identifier(call.args.first().unwrap())?, + field_key: Self::identifier(&call.args[0])?, }) } @@ -312,7 +312,7 @@ impl<'a> Selector<'a> { )); } Ok(Self::Max { - field_key: Self::identifier(call.args.first().unwrap())?, + field_key: Self::identifier(&call.args[0])?, }) } @@ -324,7 +324,7 @@ impl<'a> Selector<'a> { )); } Ok(Self::Min { - field_key: Self::identifier(call.args.first().unwrap())?, + field_key: 
Self::identifier(&call.args[0])?, }) } @@ -336,8 +336,8 @@ impl<'a> Selector<'a> { )); } Ok(Self::Percentile { - field_key: Self::identifier(call.args.first().unwrap())?, - n: Self::literal_num(call.args.get(1).unwrap())?, + field_key: Self::identifier(&call.args[0])?, + n: Self::literal_num(&call.args[1])?, }) } @@ -349,8 +349,8 @@ impl<'a> Selector<'a> { )); } Ok(Self::Sample { - field_key: Self::identifier(call.args.first().unwrap())?, - n: Self::literal_int(call.args.get(1).unwrap())?, + field_key: Self::identifier(&call.args[0])?, + n: Self::literal_int(&call.args[1])?, }) } diff --git a/iox_query_influxql/src/plan/planner_rewrite_expression.rs b/iox_query_influxql/src/plan/planner_rewrite_expression.rs index e9afe3f5a99..9fd50d5f995 100644 --- a/iox_query_influxql/src/plan/planner_rewrite_expression.rs +++ b/iox_query_influxql/src/plan/planner_rewrite_expression.rs @@ -127,7 +127,7 @@ use crate::plan::util::IQLSchema; use arrow::datatypes::DataType; use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRewriter}; use datafusion::common::{Result, ScalarValue}; -use datafusion::logical_expr::expr::{AggregateFunction, AggregateUDF, WindowFunction}; +use datafusion::logical_expr::expr::{AggregateFunction, WindowFunction}; use datafusion::logical_expr::{ binary_expr, cast, coalesce, lit, BinaryExpr, Expr, ExprSchemable, GetIndexedField, Operator, }; @@ -424,7 +424,6 @@ fn rewrite_expr(expr: Expr, schema: &IQLSchema<'_>) -> Result { // Invoking an aggregate or window function on a tag column should return `NULL` // to be consistent with OG. Expr::AggregateFunction(AggregateFunction { ref args, .. } ) - | Expr::AggregateUDF(AggregateUDF { ref args, .. } ) | Expr::WindowFunction(WindowFunction { ref args, .. } ) => match &args[0] { Expr::Column(Column { ref name, .. 
}) if schema.is_tag_field(name) => yes(lit(ScalarValue::Null)), _ => no(expr), @@ -546,9 +545,8 @@ mod test { use crate::plan::ir::DataSourceSchema; use super::*; - use datafusion::logical_expr::lit_timestamp_nano; use datafusion::prelude::col; - use datafusion_util::AsExpr; + use datafusion_util::{lit_timestamptz_nano, AsExpr}; use chrono::{DateTime, NaiveDate, Utc}; use datafusion::common::{DFSchemaRef, ToDFSchema}; @@ -796,15 +794,15 @@ mod test { let schemas = new_schema(); let rewrite = |expr| rewrite_expr(expr, &schemas).unwrap().to_string(); - let expr = "time".as_expr().gt_eq(lit_timestamp_nano(1000)); + let expr = "time".as_expr().gt_eq(lit_timestamptz_nano(1000)); assert_eq!(rewrite(expr), "time >= TimestampNanosecond(1000, None)"); - let expr = lit_timestamp_nano(1000).lt_eq("time".as_expr()); + let expr = lit_timestamptz_nano(1000).lt_eq("time".as_expr()); assert_eq!(rewrite(expr), "TimestampNanosecond(1000, None) <= time"); let expr = "time" .as_expr() - .gt_eq(lit_timestamp_nano(1000)) + .gt_eq(lit_timestamptz_nano(1000)) .and("tag0".as_expr().eq(lit("foo"))); assert_eq!( rewrite(expr), @@ -813,7 +811,7 @@ mod test { let expr = "time" .as_expr() - .gt_eq(lit_timestamp_nano(1000)) + .gt_eq(lit_timestamptz_nano(1000)) .and("float_field".as_expr().eq(lit(false))); assert_eq!( rewrite(expr), diff --git a/iox_query_influxql/src/plan/rewriter.rs b/iox_query_influxql/src/plan/rewriter.rs index be451229590..dc4fcc7b37b 100644 --- a/iox_query_influxql/src/plan/rewriter.rs +++ b/iox_query_influxql/src/plan/rewriter.rs @@ -100,8 +100,12 @@ impl RewriteSelect { let from = self.expand_from(s, stmt)?; let tag_set = from_tag_set(s, &from); - let (fields, group_by) = self.expand_projection(s, stmt, &from, &tag_set)?; - let condition = self.condition_resolve_types(s, stmt, &from)?; + let (fields, group_by) = self + .expand_projection(s, stmt, &from, &tag_set) + .map_err(|e| e.context("expand projection"))?; + let condition = self + .condition_resolve_types(s, stmt, &from) + .map_err(|e| e.context("resolve types in condition"))?; let now = Timestamp::from(s.execution_props().query_execution_start_time); let rc = ReduceContext { @@ -109,10 +113,14 @@ impl RewriteSelect { tz: stmt.timezone.map(|tz| *tz), }; - let interval = self.find_interval_offset(&rc, group_by.as_ref())?; + let interval = self + .find_interval_offset(&rc, group_by.as_ref()) + .map_err(|e| e.context("find interval offset"))?; let (condition, time_range) = match condition { - Some(where_clause) => split_cond(&rc, &where_clause).map_err(error::map::expr_error)?, + Some(where_clause) => split_cond(&rc, &where_clause) + .map_err(error::map::expr_error) + .map_err(|e| e.context("split condition"))?, None => (None, TimeRange::default()), }; @@ -131,7 +139,8 @@ impl RewriteSelect { let SelectStatementInfo { projection_type, extra_intervals, - } = select_statement_info(&fields, &group_by, stmt.fill)?; + } = select_statement_info(&fields, &group_by, stmt.fill) + .map_err(|e| e.context("gather information about select statement"))?; // Following InfluxQL OG behaviour, if this is a subquery, and the fill strategy equates // to `FILL(null)`, switch to `FILL(none)`. @@ -1042,6 +1051,8 @@ impl FieldChecker { } else { ProjectionType::WindowAggregateMixed } + } else if self.has_distinct { + ProjectionType::RawDistinct } else { ProjectionType::Aggregate } @@ -1566,7 +1577,7 @@ pub(crate) enum ProjectionType { /// A query that projects no aggregate or selector functions. 
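The rewriter changes in this hunk attach a context message at each planning stage ("expand projection", "resolve types in condition", "find interval offset", "split condition", "gather information about select statement"), which is what produces the layered "caused by" output in the updated error snapshots earlier in this patch, and why the wildcard and count() tests just below switch from assert_eq! to assert_contains!. As a rough illustration of how that chaining renders, here is a minimal stand-in error type; it is not the crate's actual error module, only a sketch of the behaviour.

// Simplified stand-in, not the planner's real error type: shows how wrapping
// an error with successive contexts yields the "caused by" chains captured in
// the snapshot tests above.
#[derive(Debug)]
struct PlanError {
    msg: String,
    source: Option<Box<PlanError>>,
}

impl PlanError {
    fn new(msg: impl Into<String>) -> Self {
        Self { msg: msg.into(), source: None }
    }

    // Mirrors the `e.context("...")` calls added in the rewrite method above.
    fn context(self, msg: impl Into<String>) -> Self {
        Self { msg: msg.into(), source: Some(Box::new(self)) }
    }
}

impl std::fmt::Display for PlanError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.msg)?;
        if let Some(source) = &self.source {
            write!(f, "\ncaused by\n{source}")?;
        }
        Ok(())
    }
}

fn main() {
    let err = PlanError::new(
        "Error during planning: mixing aggregate and non-aggregate columns is not supported",
    )
    .context("gather information about select statement")
    .context("rewriting statement");
    // Prints the same three-stage chain shown in the snapshots above.
    println!("{err}");
}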
#[default] Raw, - /// A query that projects a single DISTINCT(field) + /// A query that projects a single DISTINCT(field). RawDistinct, /// A query that projects one or more aggregate functions or /// two or more selector functions. @@ -2432,21 +2443,21 @@ mod test { let stmt = parse_select("SELECT *::field + *::tag FROM cpu"); let err = rewrite_select_statement(&namespace, &stmt).unwrap_err(); - assert_eq!( + assert_contains!( err.to_string(), "Error during planning: unsupported binary expression: contains a wildcard or regular expression" ); let stmt = parse_select("SELECT COUNT(*) + SUM(usage_idle) FROM cpu"); let err = rewrite_select_statement(&namespace, &stmt).unwrap_err(); - assert_eq!( + assert_contains!( err.to_string(), "Error during planning: unsupported binary expression: contains a wildcard or regular expression" ); let stmt = parse_select("SELECT COUNT(*::tag) FROM cpu"); let err = rewrite_select_statement(&namespace, &stmt).unwrap_err(); - assert_eq!( + assert_contains!( err.to_string(), "Error during planning: unable to use tag as wildcard in count()" ); diff --git a/iox_query_influxql/src/plan/udf.rs b/iox_query_influxql/src/plan/udf.rs index 437bfda68e7..fdf8a2b5c1a 100644 --- a/iox_query_influxql/src/plan/udf.rs +++ b/iox_query_influxql/src/plan/udf.rs @@ -8,12 +8,16 @@ use crate::plan::util::find_exprs_in_exprs; use crate::{error, NUMERICS}; use arrow::datatypes::{DataType, TimeUnit}; -use datafusion::logical_expr::{ - Expr, ReturnTypeFunction, ScalarFunctionImplementation, ScalarUDF, Signature, TypeSignature, - Volatility, +use datafusion::{ + error::{DataFusionError, Result}, + logical_expr::{ + Expr, ScalarFunctionDefinition, ScalarUDF, ScalarUDFImpl, Signature, TypeSignature, + Volatility, + }, + physical_plan::ColumnarValue, }; use once_cell::sync::Lazy; -use std::sync::Arc; +use std::{any::Any, sync::Arc}; pub(super) enum WindowFunction { MovingAverage, @@ -27,7 +31,7 @@ pub(super) enum WindowFunction { impl WindowFunction { /// Try to return the equivalent [`WindowFunction`] for `fun`. pub(super) fn try_from_scalar_udf(fun: Arc) -> Option { - match fun.name.as_str() { + match fun.name() { MOVING_AVERAGE_UDF_NAME => Some(Self::MovingAverage), DIFFERENCE_UDF_NAME => Some(Self::Difference), NON_NEGATIVE_DIFFERENCE_UDF_NAME => Some(Self::NonNegativeDifference), @@ -39,17 +43,51 @@ impl WindowFunction { } } -/// Find all [`Expr::ScalarUDF`] expressions that match one of the supported +/// Find all [`ScalarUDF`] expressions that match one of the supported /// window UDF functions. 
pub(super) fn find_window_udfs(exprs: &[Expr]) -> Vec { - find_exprs_in_exprs( - exprs, - &|nested_expr| matches!(nested_expr, Expr::ScalarUDF(s) if WindowFunction::try_from_scalar_udf(Arc::clone(&s.fun)).is_some()), - ) + find_exprs_in_exprs(exprs, &|nested_expr| { + let Expr::ScalarFunction(fun) = nested_expr else { + return false; + }; + let ScalarFunctionDefinition::UDF(udf) = &fun.func_def else { + return false; + }; + WindowFunction::try_from_scalar_udf(Arc::clone(udf)).is_some() + }) } const MOVING_AVERAGE_UDF_NAME: &str = "moving_average"; +#[derive(Debug)] +struct MovingAverageUDF { + signature: Signature, +} + +impl ScalarUDFImpl for MovingAverageUDF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + MOVING_AVERAGE_UDF_NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Float64) + } + + fn invoke(&self, _args: &[ColumnarValue]) -> Result { + error::internal(format!( + "{MOVING_AVERAGE_UDF_NAME} should not exist in the final logical plan" + )) + } +} + /// Create an expression to represent the `MOVING_AVERAGE` function. pub(crate) fn moving_average(args: Vec) -> Expr { MOVING_AVERAGE.call(args) @@ -57,25 +95,53 @@ pub(crate) fn moving_average(args: Vec) -> Expr { /// Definition of the `MOVING_AVERAGE` function. static MOVING_AVERAGE: Lazy> = Lazy::new(|| { - static RETURN_TYPE: Lazy> = Lazy::new(|| Arc::new(DataType::Float64)); - - let return_type_fn: ReturnTypeFunction = Arc::new(|_| Ok(RETURN_TYPE.clone())); - Arc::new(ScalarUDF::new( - MOVING_AVERAGE_UDF_NAME, - &Signature::one_of( + Arc::new(ScalarUDF::from(MovingAverageUDF { + signature: Signature::one_of( NUMERICS .iter() .map(|dt| TypeSignature::Exact(vec![dt.clone(), DataType::Int64])) .collect(), Volatility::Immutable, ), - &return_type_fn, - &stand_in_impl(MOVING_AVERAGE_UDF_NAME), - )) + })) }); const DIFFERENCE_UDF_NAME: &str = "difference"; +#[derive(Debug)] +struct DifferenceUDF { + signature: Signature, +} + +impl ScalarUDFImpl for DifferenceUDF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + DIFFERENCE_UDF_NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + if arg_types.is_empty() { + return Err(DataFusionError::Plan(format!( + "{DIFFERENCE_UDF_NAME} expects at least 1 argument" + ))); + } + Ok(arg_types[0].clone()) + } + + fn invoke(&self, _args: &[ColumnarValue]) -> Result { + error::internal(format!( + "{DIFFERENCE_UDF_NAME} should not exist in the final logical plan" + )) + } +} + /// Create an expression to represent the `DIFFERENCE` function. pub(crate) fn difference(args: Vec) -> Expr { DIFFERENCE.call(args) @@ -83,23 +149,53 @@ pub(crate) fn difference(args: Vec) -> Expr { /// Definition of the `DIFFERENCE` function. 
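With the upgraded datafusion API there is no dedicated Expr::ScalarUDF variant any more: a UDF call is an Expr::ScalarFunction whose func_def is ScalarFunctionDefinition::UDF(Arc<ScalarUDF>), which is why find_window_udfs above now destructures in two steps. The same pattern in isolation, as a small sketch (the helper name is illustrative only):

use std::sync::Arc;

use datafusion::logical_expr::{Expr, ScalarFunctionDefinition, ScalarUDF};

/// Return the user-defined function invoked by `expr`, if any.
fn as_udf_call(expr: &Expr) -> Option<Arc<ScalarUDF>> {
    let Expr::ScalarFunction(fun) = expr else {
        return None;
    };
    let ScalarFunctionDefinition::UDF(udf) = &fun.func_def else {
        return None;
    };
    Some(Arc::clone(udf))
}

For example, as_udf_call applied to an expression built by the moving_average(...) helper below should yield the MOVING_AVERAGE definition, while a plain column reference yields None.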
static DIFFERENCE: Lazy> = Lazy::new(|| { - let return_type_fn: ReturnTypeFunction = Arc::new(|args| Ok(Arc::new(args[0].clone()))); - Arc::new(ScalarUDF::new( - DIFFERENCE_UDF_NAME, - &Signature::one_of( + Arc::new(ScalarUDF::from(DifferenceUDF { + signature: Signature::one_of( NUMERICS .iter() .map(|dt| TypeSignature::Exact(vec![dt.clone()])) .collect(), Volatility::Immutable, ), - &return_type_fn, - &stand_in_impl(DIFFERENCE_UDF_NAME), - )) + })) }); const NON_NEGATIVE_DIFFERENCE_UDF_NAME: &str = "non_negative_difference"; +#[derive(Debug)] +struct NonNegativeDifferenceUDF { + signature: Signature, +} + +impl ScalarUDFImpl for NonNegativeDifferenceUDF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + NON_NEGATIVE_DIFFERENCE_UDF_NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + if arg_types.is_empty() { + return Err(DataFusionError::Plan(format!( + "{NON_NEGATIVE_DIFFERENCE_UDF_NAME} expects at least 1 argument" + ))); + } + Ok(arg_types[0].clone()) + } + + fn invoke(&self, _args: &[ColumnarValue]) -> Result { + error::internal(format!( + "{NON_NEGATIVE_DIFFERENCE_UDF_NAME} should not exist in the final logical plan" + )) + } +} + /// Create an expression to represent the `NON_NEGATIVE_DIFFERENCE` function. pub(crate) fn non_negative_difference(args: Vec) -> Expr { NON_NEGATIVE_DIFFERENCE.call(args) @@ -107,23 +203,48 @@ pub(crate) fn non_negative_difference(args: Vec) -> Expr { /// Definition of the `NON_NEGATIVE_DIFFERENCE` function. static NON_NEGATIVE_DIFFERENCE: Lazy> = Lazy::new(|| { - let return_type_fn: ReturnTypeFunction = Arc::new(|args| Ok(Arc::new(args[0].clone()))); - Arc::new(ScalarUDF::new( - NON_NEGATIVE_DIFFERENCE_UDF_NAME, - &Signature::one_of( + Arc::new(ScalarUDF::from(NonNegativeDifferenceUDF { + signature: Signature::one_of( NUMERICS .iter() .map(|dt| TypeSignature::Exact(vec![dt.clone()])) .collect(), Volatility::Immutable, ), - &return_type_fn, - &stand_in_impl(NON_NEGATIVE_DIFFERENCE_UDF_NAME), - )) + })) }); const DERIVATIVE_UDF_NAME: &str = "derivative"; +#[derive(Debug)] +struct DerivativeUDF { + signature: Signature, +} + +impl ScalarUDFImpl for DerivativeUDF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + DERIVATIVE_UDF_NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Float64) + } + + fn invoke(&self, _args: &[ColumnarValue]) -> Result { + error::internal(format!( + "{DERIVATIVE_UDF_NAME} should not exist in the final logical plan" + )) + } +} + /// Create an expression to represent the `DERIVATIVE` function. pub(crate) fn derivative(args: Vec) -> Expr { DERIVATIVE.call(args) @@ -131,10 +252,8 @@ pub(crate) fn derivative(args: Vec) -> Expr { /// Definition of the `DERIVATIVE` function. 
static DERIVATIVE: Lazy> = Lazy::new(|| { - let return_type_fn: ReturnTypeFunction = Arc::new(|_| Ok(Arc::new(DataType::Float64))); - Arc::new(ScalarUDF::new( - DERIVATIVE_UDF_NAME, - &Signature::one_of( + Arc::new(ScalarUDF::from(DerivativeUDF { + signature: Signature::one_of( NUMERICS .iter() .flat_map(|dt| { @@ -149,13 +268,39 @@ static DERIVATIVE: Lazy> = Lazy::new(|| { .collect(), Volatility::Immutable, ), - &return_type_fn, - &stand_in_impl(DERIVATIVE_UDF_NAME), - )) + })) }); const NON_NEGATIVE_DERIVATIVE_UDF_NAME: &str = "non_negative_derivative"; +#[derive(Debug)] +struct NonNegativeDerivativeUDF { + signature: Signature, +} + +impl ScalarUDFImpl for NonNegativeDerivativeUDF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + NON_NEGATIVE_DERIVATIVE_UDF_NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Float64) + } + + fn invoke(&self, _args: &[ColumnarValue]) -> Result { + error::internal(format!( + "{NON_NEGATIVE_DERIVATIVE_UDF_NAME} should not exist in the final logical plan" + )) + } +} /// Create an expression to represent the `NON_NEGATIVE_DERIVATIVE` function. pub(crate) fn non_negative_derivative(args: Vec) -> Expr { NON_NEGATIVE_DERIVATIVE.call(args) @@ -163,10 +308,8 @@ pub(crate) fn non_negative_derivative(args: Vec) -> Expr { /// Definition of the `NON_NEGATIVE_DERIVATIVE` function. static NON_NEGATIVE_DERIVATIVE: Lazy> = Lazy::new(|| { - let return_type_fn: ReturnTypeFunction = Arc::new(|_| Ok(Arc::new(DataType::Float64))); - Arc::new(ScalarUDF::new( - NON_NEGATIVE_DERIVATIVE_UDF_NAME, - &Signature::one_of( + Arc::new(ScalarUDF::from(NonNegativeDerivativeUDF { + signature: Signature::one_of( NUMERICS .iter() .flat_map(|dt| { @@ -181,35 +324,58 @@ static NON_NEGATIVE_DERIVATIVE: Lazy> = Lazy::new(|| { .collect(), Volatility::Immutable, ), - &return_type_fn, - &stand_in_impl(NON_NEGATIVE_DERIVATIVE_UDF_NAME), - )) + })) }); const CUMULATIVE_SUM_UDF_NAME: &str = "cumulative_sum"; +#[derive(Debug)] +struct CumulativeSumUDF { + signature: Signature, +} + +impl ScalarUDFImpl for CumulativeSumUDF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + CUMULATIVE_SUM_UDF_NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + if arg_types.is_empty() { + return Err(DataFusionError::Plan(format!( + "{CUMULATIVE_SUM_UDF_NAME} expects at least 1 argument" + ))); + } + Ok(arg_types[0].clone()) + } + + fn invoke(&self, _args: &[ColumnarValue]) -> Result { + error::internal(format!( + "{CUMULATIVE_SUM_UDF_NAME} should not exist in the final logical plan" + )) + } +} + /// Create an expression to represent the `CUMULATIVE_SUM` function. pub(crate) fn cumulative_sum(args: Vec) -> Expr { CUMULATIVE_SUM.call(args) } /// Definition of the `CUMULATIVE_SUM` function. static CUMULATIVE_SUM: Lazy> = Lazy::new(|| { - let return_type_fn: ReturnTypeFunction = Arc::new(|args| Ok(Arc::new(args[0].clone()))); - Arc::new(ScalarUDF::new( - CUMULATIVE_SUM_UDF_NAME, - &Signature::one_of( + Arc::new(ScalarUDF::from(CumulativeSumUDF { + signature: Signature::one_of( NUMERICS .iter() .map(|dt| TypeSignature::Exact(vec![dt.clone()])) .collect(), Volatility::Immutable, ), - &return_type_fn, - &stand_in_impl(CUMULATIVE_SUM_UDF_NAME), - )) + })) }); - -/// Returns an implementation that always returns an error. 
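The pattern in this file is the same for every function: the old ScalarUDF::new(name, signature, return-type closure, stand-in implementation) constructor is replaced by a small struct implementing ScalarUDFImpl, wrapped with ScalarUDF::from. For readers unfamiliar with the trait form, a minimal self-contained sketch of the same shape follows; the always_zero function is hypothetical and not part of this patch.

use std::any::Any;

use arrow::datatypes::DataType;
use datafusion::error::Result;
use datafusion::logical_expr::{lit, Expr, ScalarUDF, ScalarUDFImpl, Signature, Volatility};
use datafusion::physical_plan::ColumnarValue;
use datafusion::scalar::ScalarValue;

/// Hypothetical UDF used only to illustrate the trait-based definition style.
#[derive(Debug)]
struct AlwaysZeroUDF {
    signature: Signature,
}

impl ScalarUDFImpl for AlwaysZeroUDF {
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn name(&self) -> &str {
        "always_zero"
    }

    fn signature(&self) -> &Signature {
        &self.signature
    }

    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
        Ok(DataType::Float64)
    }

    fn invoke(&self, _args: &[ColumnarValue]) -> Result<ColumnarValue> {
        // Unlike the planner-only stand-ins above, this one actually evaluates.
        Ok(ColumnarValue::Scalar(ScalarValue::Float64(Some(0.0))))
    }
}

fn main() {
    let udf = ScalarUDF::from(AlwaysZeroUDF {
        signature: Signature::exact(vec![DataType::Float64], Volatility::Immutable),
    });
    // `call` builds the logical expression, as moving_average(...) etc. do above.
    let expr: Expr = udf.call(vec![lit(1.0_f64)]);
    println!("{expr}");
}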
-fn stand_in_impl(name: &'static str) -> ScalarFunctionImplementation { - Arc::new(move |_| error::internal(format!("{name} should not exist in the final logical plan"))) -} diff --git a/iox_query_influxql/src/plan/util.rs b/iox_query_influxql/src/plan/util.rs index 719fcab1eb8..d63f8919f54 100644 --- a/iox_query_influxql/src/plan/util.rs +++ b/iox_query_influxql/src/plan/util.rs @@ -61,7 +61,7 @@ impl<'a> IQLSchema<'a> { } /// Returns `true` if the schema contains a tag column with the specified name. - pub fn is_tag_field(&self, name: &str) -> bool { + pub(crate) fn is_tag_field(&self, name: &str) -> bool { match self.tag_info { TagInfo::DataSourceSchema(ref ds_schema) => ds_schema.is_tag_field(name), TagInfo::FieldList(fields) => fields @@ -73,7 +73,7 @@ impl<'a> IQLSchema<'a> { /// Returns `true` if the schema contains a tag column with the specified name. /// If the underlying data source is a subquery, it will apply any aliases in the /// projection that represents the SELECT list. - pub fn is_projected_tag_field(&self, name: &str) -> bool { + pub(crate) fn is_projected_tag_field(&self, name: &str) -> bool { match self.tag_info { TagInfo::DataSourceSchema(ref ds_schema) => ds_schema.is_projected_tag_field(name), _ => self.is_tag_field(name), diff --git a/iox_query_influxql/src/window.rs b/iox_query_influxql/src/window.rs index ced7f04ce1b..32d9586b5e9 100644 --- a/iox_query_influxql/src/window.rs +++ b/iox_query_influxql/src/window.rs @@ -1,8 +1,6 @@ //! User defined window functions implementing influxQL features. -use datafusion::logical_expr::{ - PartitionEvaluatorFactory, ReturnTypeFunction, WindowFunction, WindowUDF, -}; +use datafusion::logical_expr::{WindowFunctionDefinition, WindowUDF}; use once_cell::sync::Lazy; use std::sync::Arc; @@ -14,109 +12,55 @@ mod non_negative; mod percent_row_number; /// Definition of the `CUMULATIVE_SUM` user-defined window function. -pub(crate) static CUMULATIVE_SUM: Lazy = Lazy::new(|| { - let return_type: ReturnTypeFunction = Arc::new(cumulative_sum::return_type); - let partition_evaluator_factory: PartitionEvaluatorFactory = - Arc::new(cumulative_sum::partition_evaluator_factory); - - WindowFunction::WindowUDF(Arc::new(WindowUDF::new( - cumulative_sum::NAME, - &cumulative_sum::SIGNATURE, - &return_type, - &partition_evaluator_factory, +pub(crate) static CUMULATIVE_SUM: Lazy = Lazy::new(|| { + WindowFunctionDefinition::WindowUDF(Arc::new(WindowUDF::new_from_impl( + cumulative_sum::CumulativeSumUDWF::new(), ))) }); /// Definition of the `DERIVATIVE` user-defined window function. -pub(crate) static DERIVATIVE: Lazy = Lazy::new(|| { - let return_type: ReturnTypeFunction = Arc::new(derivative::return_type); - let partition_evaluator_factory: PartitionEvaluatorFactory = - Arc::new(derivative::partition_evaluator_factory); - - WindowFunction::WindowUDF(Arc::new(WindowUDF::new( - derivative::NAME, - &derivative::SIGNATURE, - &return_type, - &partition_evaluator_factory, +pub(crate) static DERIVATIVE: Lazy = Lazy::new(|| { + WindowFunctionDefinition::WindowUDF(Arc::new(WindowUDF::new_from_impl( + derivative::DerivativeUDWF::new(), ))) }); /// Definition of the `DIFFERENCE` user-defined window function. 
-pub(crate) static DIFFERENCE: Lazy = Lazy::new(|| { - let return_type: ReturnTypeFunction = Arc::new(difference::return_type); - let partition_evaluator_factory: PartitionEvaluatorFactory = - Arc::new(difference::partition_evaluator_factory); - - WindowFunction::WindowUDF(Arc::new(WindowUDF::new( - difference::NAME, - &difference::SIGNATURE, - &return_type, - &partition_evaluator_factory, +pub(crate) static DIFFERENCE: Lazy = Lazy::new(|| { + WindowFunctionDefinition::WindowUDF(Arc::new(WindowUDF::new_from_impl( + difference::DifferenceUDWF::new(), ))) }); /// Definition of the `MOVING_AVERAGE` user-defined window function. -pub(crate) static MOVING_AVERAGE: Lazy = Lazy::new(|| { - let return_type: ReturnTypeFunction = Arc::new(moving_average::return_type); - let partition_evaluator_factory: PartitionEvaluatorFactory = - Arc::new(moving_average::partition_evaluator_factory); - - WindowFunction::WindowUDF(Arc::new(WindowUDF::new( - moving_average::NAME, - &moving_average::SIGNATURE, - &return_type, - &partition_evaluator_factory, +pub(crate) static MOVING_AVERAGE: Lazy = Lazy::new(|| { + WindowFunctionDefinition::WindowUDF(Arc::new(WindowUDF::new_from_impl( + moving_average::MovingAverageUDWF::new(), ))) }); -const NON_NEGATIVE_DERIVATIVE_NAME: &str = "non_negative_derivative"; - /// Definition of the `NON_NEGATIVE_DERIVATIVE` user-defined window function. -pub(crate) static NON_NEGATIVE_DERIVATIVE: Lazy = Lazy::new(|| { - let return_type: ReturnTypeFunction = Arc::new(derivative::return_type); - let partition_evaluator_factory: PartitionEvaluatorFactory = Arc::new(|| { - Ok(non_negative::wrapper( - derivative::partition_evaluator_factory()?, - )) - }); - - WindowFunction::WindowUDF(Arc::new(WindowUDF::new( - NON_NEGATIVE_DERIVATIVE_NAME, - &derivative::SIGNATURE, - &return_type, - &partition_evaluator_factory, +pub(crate) static NON_NEGATIVE_DERIVATIVE: Lazy = Lazy::new(|| { + WindowFunctionDefinition::WindowUDF(Arc::new(WindowUDF::new_from_impl( + non_negative::NonNegativeUDWF::new( + "non_negative_derivative", + derivative::DerivativeUDWF::new(), + ), ))) }); - -const NON_NEGATIVE_DIFFERENCE_NAME: &str = "non_negative_difference"; - /// Definition of the `NON_NEGATIVE_DIFFERENCE` user-defined window function. -pub(crate) static NON_NEGATIVE_DIFFERENCE: Lazy = Lazy::new(|| { - let return_type: ReturnTypeFunction = Arc::new(difference::return_type); - let partition_evaluator_factory: PartitionEvaluatorFactory = Arc::new(|| { - Ok(non_negative::wrapper( - difference::partition_evaluator_factory()?, - )) - }); - - WindowFunction::WindowUDF(Arc::new(WindowUDF::new( - NON_NEGATIVE_DIFFERENCE_NAME, - &difference::SIGNATURE, - &return_type, - &partition_evaluator_factory, +pub(crate) static NON_NEGATIVE_DIFFERENCE: Lazy = Lazy::new(|| { + WindowFunctionDefinition::WindowUDF(Arc::new(WindowUDF::new_from_impl( + non_negative::NonNegativeUDWF::new( + "non_negative_difference", + difference::DifferenceUDWF::new(), + ), ))) }); /// Definition of the `PERCENT_ROW_NUMBER` user-defined window function. 
-pub(crate) static PERCENT_ROW_NUMBER: Lazy = Lazy::new(|| { - let return_type: ReturnTypeFunction = Arc::new(percent_row_number::return_type); - let partition_evaluator_factory: PartitionEvaluatorFactory = - Arc::new(percent_row_number::partition_evaluator_factory); - - WindowFunction::WindowUDF(Arc::new(WindowUDF::new( - percent_row_number::NAME, - &percent_row_number::SIGNATURE, - &return_type, - &partition_evaluator_factory, +pub(crate) static PERCENT_ROW_NUMBER: Lazy = Lazy::new(|| { + WindowFunctionDefinition::WindowUDF(Arc::new(WindowUDF::new_from_impl( + percent_row_number::PercentRowNumberUDWF::new(), ))) }); diff --git a/iox_query_influxql/src/window/cumulative_sum.rs b/iox_query_influxql/src/window/cumulative_sum.rs index b6acc4c3097..8153a9246aa 100644 --- a/iox_query_influxql/src/window/cumulative_sum.rs +++ b/iox_query_influxql/src/window/cumulative_sum.rs @@ -2,32 +2,51 @@ use crate::NUMERICS; use arrow::array::{Array, ArrayRef}; use arrow::datatypes::DataType; use datafusion::common::{Result, ScalarValue}; -use datafusion::logical_expr::{PartitionEvaluator, Signature, TypeSignature, Volatility}; -use once_cell::sync::Lazy; +use datafusion::logical_expr::{ + PartitionEvaluator, Signature, TypeSignature, Volatility, WindowUDFImpl, +}; +use std::any::Any; use std::sync::Arc; -/// The name of the cumulative_sum window function. -pub(super) const NAME: &str = "cumumlative_sum"; - -/// Valid signatures for the cumulative_sum window function. -pub(super) static SIGNATURE: Lazy = Lazy::new(|| { - Signature::one_of( - NUMERICS - .iter() - .map(|dt| TypeSignature::Exact(vec![dt.clone()])) - .collect(), - Volatility::Immutable, - ) -}); +#[derive(Debug)] +pub(super) struct CumulativeSumUDWF { + signature: Signature, +} -/// Calculate the return type given the function signature. -pub(super) fn return_type(sig: &[DataType]) -> Result> { - Ok(Arc::new(sig[0].clone())) +impl CumulativeSumUDWF { + pub(super) fn new() -> Self { + Self { + signature: Signature::one_of( + NUMERICS + .iter() + .map(|dt| TypeSignature::Exact(vec![dt.clone()])) + .collect(), + Volatility::Immutable, + ), + } + } } -/// Create a new partition_evaluator_factory. -pub(super) fn partition_evaluator_factory() -> Result> { - Ok(Box::new(CumulativeSumPartitionEvaluator {})) +impl WindowUDFImpl for CumulativeSumUDWF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "cumumlative_sum" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + Ok(arg_types[0].clone()) + } + + fn partition_evaluator(&self) -> Result> { + Ok(Box::new(CumulativeSumPartitionEvaluator {})) + } } /// PartitionEvaluator which returns the cumulative sum of the input. diff --git a/iox_query_influxql/src/window/derivative.rs b/iox_query_influxql/src/window/derivative.rs index 42730d532c5..019bc4ab23a 100644 --- a/iox_query_influxql/src/window/derivative.rs +++ b/iox_query_influxql/src/window/derivative.rs @@ -2,39 +2,66 @@ use crate::{error, NUMERICS}; use arrow::array::{Array, ArrayRef}; use arrow::datatypes::{DataType, TimeUnit}; use datafusion::common::{Result, ScalarValue}; -use datafusion::logical_expr::{PartitionEvaluator, Signature, TypeSignature, Volatility}; -use once_cell::sync::Lazy; - +use datafusion::logical_expr::{ + PartitionEvaluator, Signature, TypeSignature, Volatility, WindowUDFImpl, TIMEZONE_WILDCARD, +}; use std::sync::Arc; -/// The name of the derivative window function. 
-pub(super) const NAME: &str = "derivative"; - -/// Valid signatures for the derivative window function. -pub(super) static SIGNATURE: Lazy = Lazy::new(|| { - Signature::one_of( - NUMERICS - .iter() - .map(|dt| { - TypeSignature::Exact(vec![ - dt.clone(), - DataType::Duration(TimeUnit::Nanosecond), - DataType::Timestamp(TimeUnit::Nanosecond, None), - ]) - }) - .collect(), - Volatility::Immutable, - ) -}); - -/// Calculate the return type given the function signature. -pub(super) fn return_type(_: &[DataType]) -> Result> { - Ok(Arc::new(DataType::Float64)) +#[derive(Debug)] +pub(super) struct DerivativeUDWF { + signature: Signature, +} + +impl DerivativeUDWF { + pub(super) fn new() -> Self { + Self { + signature: Signature::one_of( + NUMERICS + .iter() + .flat_map(|dt| { + [ + TypeSignature::Exact(vec![ + dt.clone(), + DataType::Duration(TimeUnit::Nanosecond), + DataType::Timestamp(TimeUnit::Nanosecond, None), + ]), + TypeSignature::Exact(vec![ + dt.clone(), + DataType::Duration(TimeUnit::Nanosecond), + DataType::Timestamp( + TimeUnit::Nanosecond, + Some(TIMEZONE_WILDCARD.into()), + ), + ]), + ] + }) + .collect(), + Volatility::Immutable, + ), + } + } } -/// Create a new partition_evaluator_factory. -pub(super) fn partition_evaluator_factory() -> Result> { - Ok(Box::new(DifferencePartitionEvaluator {})) +impl WindowUDFImpl for DerivativeUDWF { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn name(&self) -> &str { + "derivative" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Float64) + } + + fn partition_evaluator(&self) -> Result> { + Ok(Box::new(DifferencePartitionEvaluator {})) + } } /// PartitionEvaluator which returns the derivative between input values, diff --git a/iox_query_influxql/src/window/difference.rs b/iox_query_influxql/src/window/difference.rs index d4c8adbb9fa..1618d72d07c 100644 --- a/iox_query_influxql/src/window/difference.rs +++ b/iox_query_influxql/src/window/difference.rs @@ -4,32 +4,50 @@ use arrow::compute::kernels::numeric::sub_wrapping; use arrow::compute::shift; use arrow::datatypes::DataType; use datafusion::common::{Result, ScalarValue}; -use datafusion::logical_expr::{PartitionEvaluator, Signature, TypeSignature, Volatility}; -use once_cell::sync::Lazy; +use datafusion::logical_expr::{ + PartitionEvaluator, Signature, TypeSignature, Volatility, WindowUDFImpl, +}; use std::sync::Arc; -/// The name of the difference window function. -pub(super) const NAME: &str = "difference"; - -/// Valid signatures for the difference window function. -pub(super) static SIGNATURE: Lazy = Lazy::new(|| { - Signature::one_of( - NUMERICS - .iter() - .map(|dt| TypeSignature::Exact(vec![dt.clone()])) - .collect(), - Volatility::Immutable, - ) -}); +#[derive(Debug)] +pub(super) struct DifferenceUDWF { + signature: Signature, +} -/// Calculate the return type given the function signature. -pub(super) fn return_type(sig: &[DataType]) -> Result> { - Ok(Arc::new(sig[0].clone())) +impl DifferenceUDWF { + pub(super) fn new() -> Self { + Self { + signature: Signature::one_of( + NUMERICS + .iter() + .map(|dt| TypeSignature::Exact(vec![dt.clone()])) + .collect(), + Volatility::Immutable, + ), + } + } } -/// Create a new partition_evaluator_factory. 
-pub(super) fn partition_evaluator_factory() -> Result> { - Ok(Box::new(DifferencePartitionEvaluator {})) +impl WindowUDFImpl for DifferenceUDWF { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn name(&self) -> &str { + "difference" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + Ok(arg_types[0].clone()) + } + + fn partition_evaluator(&self) -> Result> { + Ok(Box::new(DifferencePartitionEvaluator {})) + } } /// PartitionEvaluator which returns the difference between input values. diff --git a/iox_query_influxql/src/window/moving_average.rs b/iox_query_influxql/src/window/moving_average.rs index 3702e691f48..e61129158e8 100644 --- a/iox_query_influxql/src/window/moving_average.rs +++ b/iox_query_influxql/src/window/moving_average.rs @@ -2,33 +2,51 @@ use crate::{error, NUMERICS}; use arrow::array::{Array, ArrayRef, Int64Array}; use arrow::datatypes::DataType; use datafusion::common::{downcast_value, DataFusionError, Result, ScalarValue}; -use datafusion::logical_expr::{PartitionEvaluator, Signature, TypeSignature, Volatility}; -use once_cell::sync::Lazy; +use datafusion::logical_expr::{ + PartitionEvaluator, Signature, TypeSignature, Volatility, WindowUDFImpl, +}; use std::collections::VecDeque; use std::sync::Arc; -/// The name of the moving average window function. -pub(super) const NAME: &str = "moving_average"; - -/// Valid signatures for the moving average window function. -pub(super) static SIGNATURE: Lazy = Lazy::new(|| { - Signature::one_of( - NUMERICS - .iter() - .map(|dt| TypeSignature::Exact(vec![dt.clone(), DataType::Int64])) - .collect(), - Volatility::Immutable, - ) -}); +#[derive(Debug)] +pub(super) struct MovingAverageUDWF { + signature: Signature, +} -/// Calculate the return type given the function signature. -pub(super) fn return_type(_: &[DataType]) -> Result> { - Ok(Arc::new(DataType::Float64)) +impl MovingAverageUDWF { + pub(super) fn new() -> Self { + Self { + signature: Signature::one_of( + NUMERICS + .iter() + .map(|dt| TypeSignature::Exact(vec![dt.clone(), DataType::Int64])) + .collect(), + Volatility::Immutable, + ), + } + } } -/// Create a new partition_evaluator_factory. -pub(super) fn partition_evaluator_factory() -> Result> { - Ok(Box::new(AvgNPartitionEvaluator {})) +impl WindowUDFImpl for MovingAverageUDWF { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn name(&self) -> &str { + "moving_average" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Float64) + } + + fn partition_evaluator(&self) -> Result> { + Ok(Box::new(AvgNPartitionEvaluator {})) + } } /// PartitionEvaluator which returns a moving average of the input data.. diff --git a/iox_query_influxql/src/window/non_negative.rs b/iox_query_influxql/src/window/non_negative.rs index 504b4b743cb..d97187963dc 100644 --- a/iox_query_influxql/src/window/non_negative.rs +++ b/iox_query_influxql/src/window/non_negative.rs @@ -1,26 +1,61 @@ use arrow::array::Array; use arrow::compute::kernels::cmp::lt; use arrow::compute::nullif; +use arrow::datatypes::DataType; use datafusion::common::{Result, ScalarValue}; use datafusion::logical_expr::window_state::WindowAggState; -use datafusion::logical_expr::PartitionEvaluator; +use datafusion::logical_expr::{PartitionEvaluator, Signature, WindowUDFImpl}; +use std::any::Any; use std::ops::Range; use std::sync::Arc; -/// Wrap a PartitionEvaluator in a non-negative filter. 
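The rewritten non_negative module below wraps an inner WindowUDFImpl instead of a bare evaluator factory, but the evaluator-level behaviour is unchanged: results below zero are masked to NULL with arrow's lt and nullif kernels (note that to_scalar() is now fallible, hence the added ?). A standalone sketch of that masking step, with illustrative data:

use arrow::array::{ArrayRef, Float64Array, Scalar};
use arrow::compute::kernels::cmp::lt;
use arrow::compute::nullif;
use arrow::error::ArrowError;

/// Replace negative values with NULL, the same lt + nullif combination the
/// non-negative partition evaluator applies to its inner evaluator's output.
fn mask_negative(values: &Float64Array) -> Result<ArrayRef, ArrowError> {
    let zero = Scalar::new(Float64Array::from(vec![0.0_f64]));
    let is_negative = lt(values, &zero)?;
    Ok(nullif(values, &is_negative)?)
}

fn main() -> Result<(), ArrowError> {
    let input = Float64Array::from(vec![Some(1.5), Some(-2.0), None, Some(0.0)]);
    let masked = mask_negative(&input)?;
    // -2.0 is masked; the existing NULL and the non-negative values pass through.
    assert_eq!(masked.null_count(), 2);
    Ok(())
}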
-pub(super) fn wrapper( - partition_evaluator: Box, -) -> Box { - Box::new(NonNegative { - partition_evaluator, - }) +/// Wrap a WindowUDF so that all values are non-negative. + +#[derive(Debug)] +pub(super) struct NonNegativeUDWF { + name: String, + inner: U, +} + +impl NonNegativeUDWF { + pub(super) fn new(name: impl Into, inner: U) -> Self { + Self { + name: name.into(), + inner, + } + } +} + +impl WindowUDFImpl for NonNegativeUDWF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + &self.name + } + + fn signature(&self) -> &Signature { + self.inner.signature() + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + self.inner.return_type(arg_types) + } + + fn partition_evaluator(&self) -> Result> { + Ok(Box::new(NonNegative { + partition_evaluator: self.inner.partition_evaluator()?, + })) + } } +/// Wraps an existing [`PartitionEvaluator`] and ensures that all values are +/// non-negative. #[derive(Debug)] struct NonNegative { partition_evaluator: Box, } - impl PartitionEvaluator for NonNegative { fn memoize(&mut self, state: &mut WindowAggState) -> Result<()> { self.partition_evaluator.memoize(state) @@ -37,7 +72,7 @@ impl PartitionEvaluator for NonNegative { ) -> Result> { let array = self.partition_evaluator.evaluate_all(values, num_rows)?; let zero = ScalarValue::new_zero(array.data_type())?; - let predicate = lt(&array, &zero.to_scalar())?; + let predicate = lt(&array, &zero.to_scalar()?)?; Ok(nullif(&array, &predicate)?) } @@ -60,7 +95,7 @@ impl PartitionEvaluator for NonNegative { .evaluate_all_with_rank(num_rows, ranks_in_partition)?; let zero = ScalarValue::new_zero(array.data_type())?; - let predicate = lt(&array, &zero.to_scalar())?; + let predicate = lt(&array, &zero.to_scalar()?)?; Ok(nullif(&array, &predicate)?) } diff --git a/iox_query_influxql/src/window/percent_row_number.rs b/iox_query_influxql/src/window/percent_row_number.rs index 91df0587ae0..7d1714e1225 100644 --- a/iox_query_influxql/src/window/percent_row_number.rs +++ b/iox_query_influxql/src/window/percent_row_number.rs @@ -2,33 +2,50 @@ use crate::error; use arrow::array::{Array, ArrayRef, Float64Array, Int64Array, UInt64Array}; use arrow::datatypes::DataType; use datafusion::common::{downcast_value, DataFusionError, Result}; -use datafusion::logical_expr::{PartitionEvaluator, Signature, TypeSignature, Volatility}; -use once_cell::sync::Lazy; +use datafusion::logical_expr::{ + PartitionEvaluator, Signature, TypeSignature, Volatility, WindowUDFImpl, +}; use std::sync::Arc; -/// The name of the percent_row_number window function. -pub(super) const NAME: &str = "percent_row_number"; - -/// Valid signatures for the percent_row_number window function. -pub(super) static SIGNATURE: Lazy = Lazy::new(|| { - Signature::one_of( - vec![ - TypeSignature::Exact(vec![DataType::Int64]), - TypeSignature::Exact(vec![DataType::Float64]), - ], - Volatility::Immutable, - ) -}); +#[derive(Debug)] +pub(super) struct PercentRowNumberUDWF { + signature: Signature, +} -/// Calculate the return type given the function signature. Percent_row_number -/// always returns a UInt64. -pub(super) fn return_type(_: &[DataType]) -> Result> { - Ok(Arc::new(DataType::UInt64)) +impl PercentRowNumberUDWF { + pub(super) fn new() -> Self { + Self { + signature: Signature::one_of( + vec![ + TypeSignature::Exact(vec![DataType::Int64]), + TypeSignature::Exact(vec![DataType::Float64]), + ], + Volatility::Immutable, + ), + } + } } -/// Create a new partition_evaluator_factory. 
-pub(super) fn partition_evaluator_factory() -> Result> { - Ok(Box::new(PercentRowNumberPartitionEvaluator {})) +impl WindowUDFImpl for PercentRowNumberUDWF { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn name(&self) -> &str { + "percent_row_number" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::UInt64) + } + + fn partition_evaluator(&self) -> Result> { + Ok(Box::new(PercentRowNumberPartitionEvaluator {})) + } } /// PartitionEvaluator which returns the row number at which the nth diff --git a/iox_query_influxrpc/Cargo.toml b/iox_query_influxrpc/Cargo.toml index bf8cd300c8b..73996fdf444 100644 --- a/iox_query_influxrpc/Cargo.toml +++ b/iox_query_influxrpc/Cargo.toml @@ -5,8 +5,11 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] -arrow = { workspace = true, features = ["prettyprint"] } +arrow = { workspace = true } data_types = { path = "../data_types" } datafusion = { workspace = true } datafusion_util = { path = "../datafusion_util" } @@ -17,11 +20,11 @@ observability_deps = { path = "../observability_deps" } query_functions = { path = "../query_functions"} schema = { path = "../schema" } predicate = { path = "../predicate" } -snafu = "0.7" +snafu = "0.8" workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] # In alphabetical order arrow_util = { path = "../arrow_util" } test_helpers = { path = "../test_helpers" } insta = { version = "1", features = ["yaml"] } -tokio = { version = "1.32", features = ["macros", "parking_lot"] } +tokio = { version = "1.35", features = ["macros", "parking_lot"] } diff --git a/iox_query_influxrpc/src/lib.rs b/iox_query_influxrpc/src/lib.rs index 261df170774..ea4808fc24c 100644 --- a/iox_query_influxrpc/src/lib.rs +++ b/iox_query_influxrpc/src/lib.rs @@ -1602,7 +1602,7 @@ fn filtered_fields_iter<'a>( impl AggExprs { /// Create the appropriate aggregate expressions, based on the type of the /// field for a `read_group` plan. - pub fn try_new_for_read_group( + pub(crate) fn try_new_for_read_group( agg: Aggregate, schema: &Schema, predicate: &Predicate, @@ -1620,7 +1620,7 @@ impl AggExprs { /// Create the appropriate aggregate expressions, based on the type of the /// field for a `read_window_aggregate` plan. - pub fn try_new_for_read_window_aggregate( + pub(crate) fn try_new_for_read_window_aggregate( agg: Aggregate, schema: &Schema, predicate: &Predicate, @@ -1816,7 +1816,7 @@ fn prune_chunks( } fn chunk_column_names( - chunk: &dyn QueryChunk, + chunk: &Arc, predicate: &Predicate, columns: Projection<'_>, ) -> Option { @@ -2575,6 +2575,7 @@ mod tests { TestChunk::new("h2o") .with_id(0) .with_tag_column("foo") + .with_f64_field_column("my_field") .with_time_column(), ); diff --git a/iox_query_influxrpc/src/missing_columns.rs b/iox_query_influxrpc/src/missing_columns.rs index 0254e6e4b55..d79d5601482 100644 --- a/iox_query_influxrpc/src/missing_columns.rs +++ b/iox_query_influxrpc/src/missing_columns.rs @@ -27,13 +27,13 @@ use schema::Schema; /// parts of the predicate make sense. 
/// See comments on 'is_null_column' #[derive(Debug)] -pub struct MissingColumnsToNull<'a> { +pub(crate) struct MissingColumnsToNull<'a> { schema: &'a Schema, df_schema: DFSchema, } impl<'a> MissingColumnsToNull<'a> { - pub fn new(schema: &'a Schema) -> Self { + pub(crate) fn new(schema: &'a Schema) -> Self { let df_schema: DFSchema = schema .as_arrow() .as_ref() diff --git a/iox_query_influxrpc/src/scan_plan.rs b/iox_query_influxrpc/src/scan_plan.rs index 661a4ee94c7..d2d58393b4e 100644 --- a/iox_query_influxrpc/src/scan_plan.rs +++ b/iox_query_influxrpc/src/scan_plan.rs @@ -44,9 +44,9 @@ pub enum Error { pub(crate) type Result = std::result::Result; /// Represents scanning one or more [`QueryChunk`]s. -pub struct ScanPlan { - pub plan_builder: LogicalPlanBuilder, - pub provider: Arc, +pub(crate) struct ScanPlan { + pub(crate) plan_builder: LogicalPlanBuilder, + pub(crate) provider: Arc, } impl std::fmt::Debug for ScanPlan { @@ -60,7 +60,7 @@ impl std::fmt::Debug for ScanPlan { impl ScanPlan { /// Return the schema of the source (the merged schema across all tables) - pub fn schema(&self) -> &Schema { + pub(crate) fn schema(&self) -> &Schema { self.provider.iox_schema() } } @@ -82,7 +82,7 @@ impl ScanPlan { /// (and thus prune) their own chunklist. #[derive(Debug)] -pub struct ScanPlanBuilder<'a> { +pub(crate) struct ScanPlanBuilder<'a> { table_name: Arc, /// The schema of the resulting table (any chunks that don't have /// all the necessary columns will be extended appropriately) @@ -92,7 +92,7 @@ pub struct ScanPlanBuilder<'a> { } impl<'a> ScanPlanBuilder<'a> { - pub fn new(table_name: Arc, table_schema: &'a Schema) -> Self { + pub(crate) fn new(table_name: Arc, table_schema: &'a Schema) -> Self { Self { table_name, table_schema, @@ -102,20 +102,23 @@ impl<'a> ScanPlanBuilder<'a> { } /// Adds `chunks` to the list of Chunks to scan - pub fn with_chunks(mut self, chunks: impl IntoIterator>) -> Self { + pub(crate) fn with_chunks( + mut self, + chunks: impl IntoIterator>, + ) -> Self { self.chunks.extend(chunks); self } /// Sets the predicate - pub fn with_predicate(mut self, predicate: &'a Predicate) -> Self { + pub(crate) fn with_predicate(mut self, predicate: &'a Predicate) -> Self { assert!(self.predicate.is_none()); self.predicate = Some(predicate); self } /// Creates a `ScanPlan` from the specified chunks - pub fn build(self) -> Result { + pub(crate) fn build(self) -> Result { let Self { table_name, chunks, @@ -212,7 +215,7 @@ mod tests { - " DeduplicateExec: [tag1@3 ASC,time@4 ASC]" - " SortPreservingMergeExec: [tag1@3 ASC,time@4 ASC,__chunk_order@0 ASC]" - " SortExec: expr=[tag1@3 ASC,time@4 ASC,__chunk_order@0 ASC]" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[__chunk_order, field_int, field_int2, tag1, time]" "### ); diff --git a/iox_query_params/Cargo.toml b/iox_query_params/Cargo.toml new file mode 100644 index 00000000000..3c0eeb91607 --- /dev/null +++ b/iox_query_params/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "iox_query_params" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] +datafusion = { workspace = true } +generated_types = { path = "../generated_types" } +observability_deps = { path = "../observability_deps" } +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +thiserror = "1.0" +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] +assert_matches = "1" + diff --git 
a/iox_query_params/src/lib.rs b/iox_query_params/src/lib.rs new file mode 100644 index 00000000000..501493565e5 --- /dev/null +++ b/iox_query_params/src/lib.rs @@ -0,0 +1,21 @@ +//! Crate for common types and utilities related to InfluxDB +//! query/statement parameters. +#![deny(rustdoc::broken_intra_doc_links, rustdoc::bare_urls, rust_2018_idioms)] +#![warn( + clippy::explicit_iter_loop, + clippy::use_self, + clippy::clone_on_ref_ptr, + // See https://github.com/influxdata/influxdb_iox/pull/1671 + clippy::future_not_send, + clippy::todo, + clippy::dbg_macro, + unused_crate_dependencies, + missing_debug_implementations, + unreachable_pub +)] + +mod params; + +pub use params::*; + +use workspace_hack as _; diff --git a/iox_query_params/src/params.rs b/iox_query_params/src/params.rs new file mode 100644 index 00000000000..ce2526c105f --- /dev/null +++ b/iox_query_params/src/params.rs @@ -0,0 +1,675 @@ +//! General-purpose data type and utilities for working with +//! values that can be supplied as an InfluxDB bind parameter. +use std::{borrow::Cow, collections::HashMap}; + +use datafusion::scalar::ScalarValue; +use observability_deps::tracing::warn; +use serde::{Deserialize, Serialize}; +use thiserror::Error; + +// remap protobuf types for convenience +mod proto { + pub(super) use generated_types::influxdata::iox::querier::v1::read_info::{ + query_param::{NullValue, Value}, + QueryParam, + }; +} + +#[derive(Debug, Error)] +/// Parameter errors +pub enum Error { + /// Data conversion error + #[error("{}", msg)] + Conversion { msg: String }, +} + +/// A helper macro to construct a `HashMap` over `(String, StatementParam)` pairs. +#[macro_export] +macro_rules! params { + () => ( + std::collections::HashMap::new() + ); + ($($key:expr => $val:expr),+ $(,)?) => ( + std::collections::HashMap::from([$((String::from($key), $crate::StatementParam::from($val))),+]) + ); +} + +/// A collection of statement parameter (name,value) pairs. +/// +/// This is a newtype wrapper to facillitate data conversions. +/// [From] instances can be used to convert to/from protobuf and JSON +/// protocol formats. +/// +/// There is also a [From] instance to convert to +/// [datafusion::common::ParamValues] which makes it possible to pass +/// parameters into a [datafusion::logical_expr::LogicalPlan] +#[repr(transparent)] +#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct StatementParams(HashMap); + +impl StatementParams { + /// Convert to internal representation. 
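Since the new crate's pieces are spread across several impl blocks below, a short usage sketch may help tie them together: build a parameter map with the params! macro, then hand it to DataFusion as ParamValues. The plan variable is a placeholder; everything else is the API as added in this patch.

use iox_query_params::{params, StatementParam, StatementParams};

fn bind_example() {
    // Named parameters, coerced through the From impls defined below.
    let params: StatementParams = params! {
        "min_temp" => 25.0_f64,
        "city" => "Boston",
        "maybe" => StatementParam::Null,
    }
    .into();

    // Convert for execution against a parameterized logical plan.
    let _values = params.into_df_param_values();
    // e.g. (placeholder plan): let plan = logical_plan.with_param_values(_values)?;
}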
+ pub fn into_inner(&self) -> &HashMap { + &self.0 + } + + /// Convert into a HashMap of (name, value) pairs + pub fn into_hashmap>(self) -> HashMap { + self.0 + .into_iter() + .map(|(key, value)| (key, value.into())) + .collect::>() + } + + /// Convert to [datafusion::common::ParamValues] used by [datafusion::logical_expr::LogicalPlan]::with_param_values + pub fn into_df_param_values(self) -> datafusion::common::ParamValues { + self.into() + } +} + +/// From HashMap +impl From> for StatementParams { + fn from(value: HashMap) -> Self { + Self(value) + } +} + +/// To HashMap +impl From for HashMap { + fn from(value: StatementParams) -> Self { + value.0 + } +} + +/// Converting to [datafusion::common::ParamValues] allows for +/// parameters to be passed to DataFusion +impl From for datafusion::common::ParamValues { + fn from(params: StatementParams) -> Self { + Self::Map(params.into_hashmap()) + } +} + +/// Convert from protobuf +impl TryFrom> for StatementParams { + type Error = self::Error; + fn try_from(proto: Vec) -> Result { + let params = proto + .into_iter() + .map(|param| { + match param.value { + Some(value) => Ok((param.name, StatementParam::from(value))), + None => Err(Error::Conversion { + msg: format!( + "Missing value for parameter \"{}\" when decoding query parameters in Flight gRPC ticket.", + param.name) + }) + } + }).collect::, _>>()?; + Ok(Self(params)) + } +} + +/// Convert into protobuf +impl From for Vec { + fn from(params: StatementParams) -> Self { + params + .0 + .into_iter() + .map(|(name, value)| proto::QueryParam { + name, + value: Some(value.into()), + }) + .collect() + } +} + +/// Enum of possible data types that can be used as parameters in an InfluxQL query. +/// +/// # creating values +/// +/// [From] implementations for many builtin types are provided to make creation of parameter values +/// easier from the influxdb client. +/// +/// # protocol formats +/// +/// There are [From]/[TryFrom] implementations to convert to/from +/// protobuf and JSON. These are used for deserialization/serialization of +/// protocol messages across gRPC and the legacy REST API +/// +/// # planning/execution +/// +/// There is a [From] implementation to convert to DataFusion [ScalarValue]s. 
This +/// allows params to be passed into the DataFusion [datafusion::logical_expr::LogicalPlan]
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+#[serde(try_from = "serde_json::Value", into = "serde_json::Value")]
+pub enum StatementParam {
+    /// a NULL value
+    #[default]
+    Null,
+    /// a boolean value
+    Boolean(bool),
+    /// an unsigned integer value
+    UInt64(u64),
+    /// a signed integer value
+    Int64(i64),
+    /// a floating point value
+    Float64(f64),
+    /// a UTF-8 string value
+    String(String),
+}
+
+/// Display as "SQL-like" literals
+impl std::fmt::Display for StatementParam {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Null => write!(f, "NULL"),
+            Self::Boolean(b) => write!(f, "{}", b.to_string().to_uppercase()),
+            Self::UInt64(u) => write!(f, "{}", u),
+            Self::Int64(i) => write!(f, "{}", i),
+            Self::Float64(fl) => write!(f, "{}", fl),
+            Self::String(s) => write!(f, "'{}'", s.replace('\'', "''")),
+        }
+    }
+}
+
+impl PartialEq for StatementParam {
+    fn eq(&self, other: &Self) -> bool {
+        match (self, other) {
+            (Self::Null, Self::Null) => true,
+            (Self::Boolean(b1), Self::Boolean(b2)) => b1 == b2,
+            (Self::UInt64(u1), Self::UInt64(u2)) => u1 == u2,
+            (Self::Int64(i1), Self::Int64(i2)) => i1 == i2,
+            (Self::Float64(f1), Self::Float64(f2)) => f1 == f2,
+            (Self::String(s1), Self::String(s2)) => s1 == s2,
+            // do not use a `_` pattern here because we want the exhaustiveness
+            // check to fail if a new param variant is added
+            (
+                Self::Null
+                | Self::Boolean(_)
+                | Self::UInt64(_)
+                | Self::Int64(_)
+                | Self::Float64(_)
+                | Self::String(_),
+                _,
+            ) => false,
+        }
+    }
+}
+
+impl Eq for StatementParam {}
+
+/// Convert into protobuf representation
+impl From<StatementParam> for proto::Value {
+    fn from(value: StatementParam) -> Self {
+        use proto::NullValue;
+        match value {
+            StatementParam::Null => Self::Null(NullValue::Unspecified.into()),
+            StatementParam::Boolean(b) => Self::Boolean(b),
+            StatementParam::UInt64(u) => Self::UInt64(u),
+            StatementParam::Int64(i) => Self::Int64(i),
+            StatementParam::Float64(f) => Self::Float64(f),
+            StatementParam::String(s) => Self::String(s),
+        }
+    }
+}
+
+/// Convert into JSON representation
+impl From<StatementParam> for serde_json::Value {
+    fn from(param: StatementParam) -> Self {
+        match param {
+            StatementParam::Null => Self::Null,
+            StatementParam::Boolean(b) => Self::Bool(b),
+            StatementParam::Float64(f) => Self::from(f),
+            StatementParam::UInt64(u) => Self::from(u),
+            StatementParam::Int64(i) => Self::from(i),
+            StatementParam::String(s) => Self::String(s),
+        }
+    }
+}
+
+/// Convert to DataFusion [ScalarValue]. This makes it possible to pass parameters
+/// into a datafusion [datafusion::logical_expr::LogicalPlan]
+impl From<StatementParam> for ScalarValue {
+    fn from(value: StatementParam) -> Self {
+        match value {
+            StatementParam::Null => Self::Null,
+            StatementParam::Boolean(b) => Self::Boolean(Some(b)),
+            StatementParam::UInt64(u) => Self::UInt64(Some(u)),
+            StatementParam::Int64(i) => Self::Int64(Some(i)),
+            StatementParam::Float64(f) => Self::Float64(Some(f)),
+            StatementParam::String(s) => Self::Utf8(Some(s)),
+        }
+    }
+}
+
+/// Convert from protobuf representation
+impl From<proto::Value> for StatementParam {
+    fn from(value: proto::Value) -> Self {
+        match value {
+            proto::Value::Null(n) => {
+                const UNSPECIFIED: i32 = proto::NullValue::Unspecified as i32;
+                if n != UNSPECIFIED {
+                    warn!(
+                        "Malformed Null in protobuf when decoding parameter \
+                        value into StatementParam. Expected Null({UNSPECIFIED}) \
+                        but found Null({n}). Possibly mismatched protobuf \
+                        versions.
+                        "
+                    );
+                }
+                Self::Null
+            }
+            proto::Value::Boolean(b) => Self::Boolean(b),
+            proto::Value::Float64(f) => Self::from(f),
+            proto::Value::UInt64(u) => Self::from(u),
+            proto::Value::Int64(i) => Self::from(i),
+            proto::Value::String(s) => Self::String(s),
+        }
+    }
+}
+
+/// Convert from JSON representation
+impl TryFrom<serde_json::Value> for StatementParam {
+    type Error = self::Error;
+    fn try_from(value: serde_json::Value) -> Result<Self, Self::Error> {
+        use serde_json::Value;
+        match value {
+            Value::Null => Ok(Self::Null),
+            Value::Bool(b) => Ok(Self::Boolean(b)),
+            Value::Number(n) => {
+                if let Some(u) = n.as_u64() {
+                    Ok(Self::UInt64(u))
+                } else if let Some(i) = n.as_i64() {
+                    Ok(Self::Int64(i))
+                } else if let Some(f) = n.as_f64() {
+                    Ok(Self::Float64(f))
+                } else {
+                    // NOTE: without the "arbitrary_precision" feature enabled on serde_json,
+                    // deserialization will never encounter this branch
+                    Err(Error::Conversion {
+                        msg: format!("Could not convert JSON number to i64 or f64: {n}"),
+                    })
+                }
+            }
+            Value::String(s) => Ok(Self::String(s)),
+            Value::Array(_) => Err(Error::Conversion {
+                msg: "JSON arrays are not supported as query parameters. Expected null, boolean, number, or string.".to_string(),
+            }),
+            Value::Object(_) => Err(Error::Conversion {
+                msg: "JSON objects are not supported as query parameters. Expected null, boolean, number, or string".to_string(),
+            }),
+        }
+    }
+}
+
+/// [`Option`] values are unwrapped and [`None`] values are converted to NULL
+impl<T> From<Option<T>> for StatementParam
+where
+    Self: From<T>,
+{
+    fn from(value: Option<T>) -> Self {
+        match value {
+            None => Self::Null,
+            Some(value) => value.into(),
+        }
+    }
+}
+
+/// Unit type is converted to NULL
+impl From<()> for StatementParam {
+    fn from(_value: ()) -> Self {
+        Self::Null
+    }
+}
+
+impl From<bool> for StatementParam {
+    fn from(value: bool) -> Self {
+        Self::Boolean(value)
+    }
+}
+
+impl From<u8> for StatementParam {
+    fn from(value: u8) -> Self {
+        Self::UInt64(value as u64)
+    }
+}
+
+impl From<u16> for StatementParam {
+    fn from(value: u16) -> Self {
+        Self::UInt64(value as u64)
+    }
+}
+
+impl From<u32> for StatementParam {
+    fn from(value: u32) -> Self {
+        Self::UInt64(value as u64)
+    }
+}
+
+impl From<u64> for StatementParam {
+    fn from(value: u64) -> Self {
+        Self::UInt64(value)
+    }
+}
+
+impl From<usize> for StatementParam {
+    fn from(value: usize) -> Self {
+        Self::UInt64(value.try_into().unwrap())
+    }
+}
+
+impl From<i8> for StatementParam {
+    fn from(value: i8) -> Self {
+        Self::Int64(value as i64)
+    }
+}
+
+impl From<i16> for StatementParam {
+    fn from(value: i16) -> Self {
+        Self::Int64(value as i64)
+    }
+}
+
+impl From<i32> for StatementParam {
+    fn from(value: i32) -> Self {
+        Self::Int64(value.into())
+    }
+}
+
+impl From<i64> for StatementParam {
+    fn from(value: i64) -> Self {
+        Self::Int64(value)
+    }
+}
+
+impl From<isize> for StatementParam {
+    fn from(value: isize) -> Self {
+        Self::Int64(value.try_into().unwrap())
+    }
+}
+
+impl From<f32> for StatementParam {
+    fn from(value: f32) -> Self {
+        Self::Float64(value.into())
+    }
+}
+
+impl From<f64> for StatementParam {
+    fn from(value: f64) -> Self {
+        Self::Float64(value)
+    }
+}
+
+impl From<&str> for StatementParam {
+    fn from(value: &str) -> Self {
+        Self::String(value.to_string())
+    }
+}
+
+impl From<String> for StatementParam {
+    fn from(value: String) -> Self {
+        Self::String(value)
+    }
+}
+
+impl<'a> From<Cow<'a, str>> for StatementParam {
+    fn from(value: Cow<'a, str>) -> Self {
+        Self::String(value.into_owned())
+    }
+}
+
+#[cfg(test)]
+#[allow(clippy::approx_constant)] // allow 3.14 >:)
+mod tests {
+    use assert_matches::assert_matches;
+    use serde_json::json;
+
+    use super::*;
+
+    #[test]
+    fn params_from_protobuf_value() {
+        // empty case
+        assert_matches!(StatementParams::try_from(vec![]), Ok(StatementParams(hm)) if hm.is_empty());
+
+        // test happy path with all value types
+        let proto: Vec<proto::QueryParam> = [
+            ("foo", proto::Value::String("Test String".to_string())),
+            ("bar", proto::Value::Float64(3.14)),
+            ("baz", proto::Value::UInt64(1234)),
+            ("int", proto::Value::Int64(-1234)),
+            ("1", proto::Value::Boolean(false)),
+            ("2", proto::Value::Null(0)),
+        ]
+        .map(|(key, value)| proto::QueryParam {
+            name: key.to_string(),
+            value: Some(value),
+        })
+        .into();
+        let result = StatementParams::try_from(proto);
+        let params = result.unwrap().0;
+        assert_eq!(
+            params,
+            params! {
+                "foo" => "Test String",
+                "bar" => 3.14_f64,
+                "baz" => 1234_u64,
+                "int" => -1234_i64,
+                "1" => false,
+                "2" => StatementParam::Null,
+            }
+        );
+    }
+
+    #[test]
+    fn params_from_json_values() {
+        use serde_json::Value;
+        assert_matches!(
+            StatementParam::try_from(Value::from("Test String")),
+            Ok(StatementParam::String(s)) if s == "Test String");
+        assert_matches!(
+            StatementParam::try_from(Value::from(3.14)),
+            Ok(StatementParam::Float64(n)) if n == 3.14
+        );
+        assert_matches!(
+            StatementParam::try_from(Value::from(1234)),
+            Ok(StatementParam::UInt64(1234))
+        );
+        assert_matches!(
+            StatementParam::try_from(Value::from(-1234)),
+            Ok(StatementParam::Int64(-1234))
+        );
+        assert_matches!(
+            StatementParam::try_from(Value::from(false)),
+            Ok(StatementParam::Boolean(false))
+        );
+        assert_matches!(
+            StatementParam::try_from(Value::Null),
+            Ok(StatementParam::Null)
+        );
+        // invalid values
+        assert_matches!(
+            StatementParam::try_from(json!([1, 2, 3])),
+            Err(Error::Conversion { .. })
+        );
+        assert_matches!(
+            StatementParam::try_from(json!({ "a": 1, "b": 2, "c": 3})),
+            Err(Error::Conversion { .. })
+        );
+    }
+
+    #[test]
+    fn params_from_json_str() {
+        let json = r#"
+        {
+            "foo": "Test String",
+            "bar": 3.14,
+            "baz": 1234,
+            "int": -1234,
+            "1": false,
+            "2": null
+        }
+        "#;
+        let result = serde_json::from_str::<StatementParams>(json);
+        let params = result.unwrap().0;
+        assert_eq!(
+            params,
+            params! {
+                "foo" => "Test String",
+                "bar" => 3.14_f64,
+                "baz" => 1234_u64,
+                "int" => -1234_i64,
+                "1" => false,
+                "2" => StatementParam::Null,
+            }
+        );
+    }
+
+    #[test]
+    fn params_from_json_str_invalid() {
+        // invalid top-level values
+        assert_matches!(serde_json::from_str::<StatementParams>("null"), Err(_));
+        assert_matches!(serde_json::from_str::<StatementParams>("100"), Err(_));
+        assert_matches!(serde_json::from_str::<StatementParams>("3.14"), Err(_));
+        assert_matches!(serde_json::from_str::<StatementParams>("true"), Err(_));
+        assert_matches!(serde_json::from_str::<StatementParams>("[\"foo\"]"), Err(_));
+
+        // nested lists are invalid
+        let json = r#"
+        {
+            "foo": [],
+        }
+        "#;
+        let result = serde_json::from_str::<StatementParams>(json);
+        assert_matches!(result, Err(serde_json::Error { .. }));
+
+        // nested objects are invalid
+        let json = r#"
+        {
+            "foo": {},
+        }
+        "#;
+        let result = serde_json::from_str::<StatementParams>(json);
+        assert_matches!(result, Err(serde_json::Error { .. }));
+
+        // nested list with contents
+        let json = r#"
+        {
+            "foo bar": [1, 2, "3", "4 5 6", [null], [[]], {}],
+            "baz": null
+        }
+        "#;
+        let result = serde_json::from_str::<StatementParams>(json);
+        assert_matches!(result, Err(serde_json::Error { .. }));
+
+        // nested object with contents
+        let json = r#"
+        {
+            "fazbar": {
+                "a": 1,
+                "b": 2,
+                "c": null
+            },
+            "baz": null
+        }
+        "#;
+        let result = serde_json::from_str::<StatementParams>(json);
+        assert_matches!(result, Err(serde_json::Error { .. }));
+    }
+
+    // Tests what happens when an integer or float is out of bounds.
+    //
+    // Without the `arbitrary_precision` feature, `serde_json` will always deserialize numbers to
+    // either i64 or f64.
+    //
+    // One potential edge case to be aware of is what happens when `serde_json::Value` deserializes
+    // an integer number that's out-of-bounds for i64, but in-bounds for f64. In this case
+    // it will be interpreted as a float, and rounding errors can be introduced. This case
+    // is unlikely to occur as long as clients are properly validating that their integers
+    // are within 64-bit bounds, but it's possible that a client serializing a bigdecimal could
+    // encounter this case. This has not been tested with the `arbitrary_precision` feature of
+    // `serde_json` enabled, so it's possible that adding that feature would prevent rounding
+    // errors in this case. Supporting bigdecimal parameters would also fix this edge case.
+    #[test]
+    fn params_from_json_str_bignum() {
+        let json = format! {" {{ \"abc\" : {}999 }} ", f64::MAX};
+        let result = serde_json::from_str::<StatementParams>(&json);
+        // NOTE: without the "arbitrary_precision" feature enabled on serde_json, deserialization will never encounter
+        // our out-of-bounds guard
+        let err = result.unwrap_err();
+        assert!(err.to_string().contains("number out of range"));
+    }
+
+    #[test]
+    fn params_conversions() {
+        assert_matches!(StatementParam::from(true), StatementParam::Boolean(true));
+        assert_matches!(StatementParam::from(123_u32), StatementParam::UInt64(123));
+        assert_matches!(StatementParam::from(-123), StatementParam::Int64(-123));
+        assert_matches!(StatementParam::from(1.23), StatementParam::Float64(f) if f == 1.23);
+        assert_matches!(StatementParam::from("a string"), StatementParam::String(s) if s == "a string");
+        assert_matches!(StatementParam::from("a string".to_owned()), StatementParam::String(s) if s == "a string");
+        assert_matches!(StatementParam::from(Cow::from("a string")), StatementParam::String(s) if s == "a string");
+        assert_matches!(StatementParam::from(()), StatementParam::Null);
+        assert_matches!(
+            StatementParam::from(None::>),
+            StatementParam::Null
+        );
+        assert_matches!(
+            StatementParam::from(Some(true)),
+            StatementParam::Boolean(true)
+        );
+        assert_matches!(
+            StatementParam::from(Some(123_u32)),
+            StatementParam::UInt64(123)
+        );
+        assert_matches!(
+            StatementParam::from(Some(-123)),
+            StatementParam::Int64(-123)
+        );
+        assert_matches!(StatementParam::from(Some(1.23)), StatementParam::Float64(f) if f == 1.23);
+        assert_matches!(StatementParam::from(Some("a string")), StatementParam::String(s) if s == "a string");
+        assert_matches!(StatementParam::from(Some("a string".to_owned())), StatementParam::String(s) if s == "a string");
+        assert_matches!(StatementParam::from(Some(Cow::from("a string"))), StatementParam::String(s) if s == "a string");
+        assert_matches!(StatementParam::from(Some(())), StatementParam::Null);
+        assert_matches!(
+            StatementParam::from(Some(None::>)),
+            StatementParam::Null
+        );
+        assert_matches!(
+            StatementParam::from(Some(Some(true))),
+            StatementParam::Boolean(true)
+        );
+    }
+
+    // test equality comparisons for StatementParams
+    #[test]
+    fn params_equality() {
+        let values = [
+            StatementParam::Null,
+            StatementParam::from(true),
+            StatementParam::from(32_u32),
+            StatementParam::from(-23),
+            StatementParam::from(32.23),
+            StatementParam::from("a string"),
+        ];
+        for (i, value1) in values.iter().enumerate() {
+            for (j, value2) in values.iter().enumerate() {
+                if i == j {
+                    assert_eq!(value1, value2);
+                } else {
assert_ne!(value1, value2); + } + } + } + assert_ne!(StatementParam::from(true), StatementParam::from(false)); + assert_ne!( + StatementParam::from(1984_u32), + StatementParam::from(2077_u32) + ); + assert_ne!(StatementParam::from(-100), StatementParam::from(100)); + assert_ne!(StatementParam::from(-1.23), StatementParam::from(1.23)); + assert_ne!( + StatementParam::from("string1"), + StatementParam::from("string2") + ); + } +} diff --git a/iox_tests/Cargo.toml b/iox_tests/Cargo.toml index 8fd924838bb..ae46c10d98b 100644 --- a/iox_tests/Cargo.toml +++ b/iox_tests/Cargo.toml @@ -6,6 +6,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] arrow = { workspace = true } data_types = { path = "../data_types" } diff --git a/iox_tests/src/builders.rs b/iox_tests/src/builders.rs index 9e86366e5f7..f1eb5baf728 100644 --- a/iox_tests/src/builders.rs +++ b/iox_tests/src/builders.rs @@ -1,7 +1,7 @@ use data_types::{ - Column, ColumnId, ColumnSet, ColumnType, CompactionLevel, NamespaceId, ParquetFile, - ParquetFileId, Partition, PartitionId, PartitionKey, SkippedCompaction, Table, TableId, - Timestamp, TransitionPartitionId, + Column, ColumnId, ColumnSet, ColumnType, CompactionLevel, NamespaceId, ObjectStoreId, + ParquetFile, ParquetFileId, ParquetFileParams, Partition, PartitionHashId, PartitionId, + PartitionKey, SkippedCompaction, Table, TableId, Timestamp, }; use uuid::Uuid; @@ -21,11 +21,14 @@ impl ParquetFileBuilder { id: ParquetFileId::new(id), namespace_id: NamespaceId::new(0), table_id, - partition_id: TransitionPartitionId::new( + partition_id: PartitionId::new(0), + partition_hash_id: Some(PartitionHashId::new( table_id, &PartitionKey::from("arbitrary"), - ), - object_store_id: Uuid::from_u128(id.try_into().expect("invalid id")), + )), + object_store_id: ObjectStoreId::from_uuid(Uuid::from_u128( + id.try_into().expect("invalid id"), + )), min_time: Timestamp::new(0), max_time: Timestamp::new(0), to_delete: None, @@ -40,7 +43,7 @@ impl ParquetFileBuilder { } /// Set the partition identifier - pub fn with_partition(self, partition_id: TransitionPartitionId) -> Self { + pub fn with_partition(self, partition_id: PartitionId) -> Self { Self { file: ParquetFile { partition_id, @@ -104,6 +107,27 @@ impl ParquetFileBuilder { pub fn build(self) -> ParquetFile { self.file } + + /// Construct [`ParquetFileParams`] and the corresponding [`ParquetFile`] + pub fn params(self) -> (ParquetFileParams, ParquetFile) { + let file = self.clone().build(); + let params = ParquetFileParams { + partition_id: self.file.partition_id, + partition_hash_id: self.file.partition_hash_id, + namespace_id: self.file.namespace_id, + table_id: self.file.table_id, + object_store_id: self.file.object_store_id, + min_time: self.file.min_time, + max_time: self.file.max_time, + file_size_bytes: self.file.file_size_bytes, + row_count: self.file.row_count, + compaction_level: self.file.compaction_level, + created_at: self.file.created_at, + column_set: self.file.column_set, + max_l0_created_at: self.file.max_l0_created_at, + }; + (params, file) + } } impl From for ParquetFileBuilder { @@ -201,12 +225,16 @@ pub struct PartitionBuilder { impl PartitionBuilder { /// Create a builder to create a partition with `partition_id` `id` pub fn new(id: i64) -> Self { + let table_id = TableId::new(0); + let key = PartitionKey::from("key"); + let hash_id = PartitionHashId::new(table_id, &key); + Self { - partition: Partition::new_in_memory_only( + partition: 
Partition::new_catalog_only( PartitionId::new(id), - TableId::new(0), - PartitionKey::from("key"), - vec![], + Some(hash_id), + table_id, + key, Default::default(), None, ), diff --git a/iox_tests/src/catalog.rs b/iox_tests/src/catalog.rs index 88ab9b6695b..507ef090bb9 100644 --- a/iox_tests/src/catalog.rs +++ b/iox_tests/src/catalog.rs @@ -7,19 +7,18 @@ use arrow::{ use data_types::{ partition_template::TablePartitionTemplateOverride, Column, ColumnSet, ColumnType, ColumnsByName, CompactionLevel, MaxColumnsPerTable, MaxTables, Namespace, NamespaceName, - NamespaceSchema, ParquetFile, ParquetFileParams, Partition, PartitionId, SortedColumnSet, - Table, TableId, TableSchema, Timestamp, TransitionPartitionId, + NamespaceSchema, ObjectStoreId, ParquetFile, ParquetFileParams, Partition, PartitionId, + SortKeyIds, Table, TableSchema, Timestamp, TransitionPartitionId, }; use datafusion::physical_plan::metrics::Count; use datafusion_util::{unbounded_memory_pool, MemoryStream}; use generated_types::influxdata::iox::partition_template::v1::PartitionTemplate; +use iox_catalog::interface::PartitionRepoExt; use iox_catalog::{ - interface::{ - get_schema_by_id, get_table_columns_by_id, Catalog, RepoCollection, SoftDeletedRows, - }, + interface::{Catalog, ParquetFileRepoExt, RepoCollection, SoftDeletedRows}, mem::MemCatalog, - partition_lookup, test_helpers::arbitrary_table, + util::{get_schema_by_id, get_table_columns_by_id}, }; use iox_query::{ exec::{DedicatedExecutors, Executor, ExecutorConfig}, @@ -40,10 +39,9 @@ use schema::{ Projection, Schema, }; use std::{collections::HashMap, num::NonZeroUsize, sync::Arc}; -use uuid::Uuid; /// Common retention period used throughout tests -pub const TEST_RETENTION_PERIOD_NS: Option = Some(3_600 * 1_000_000_000); +pub(crate) const TEST_RETENTION_PERIOD_NS: Option = Some(3_600 * 1_000_000_000); /// Catalog for tests #[derive(Debug)] @@ -79,11 +77,14 @@ impl TestCatalog { target_query_partitions: NonZeroUsize, ) -> Arc { let metric_registry = Arc::new(metric::Registry::new()); - let catalog: Arc = Arc::new(MemCatalog::new(Arc::clone(&metric_registry))); + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp(0, 0).unwrap())); + let catalog: Arc = Arc::new(MemCatalog::new( + Arc::clone(&metric_registry), + Arc::clone(&time_provider) as _, + )); let object_store = Arc::new(InMemory::new()); let parquet_store = ParquetStorage::new(Arc::clone(&object_store) as _, StorageId::from("iox")); - let time_provider = Arc::new(MockProvider::new(Time::from_timestamp(0, 0).unwrap())); let exec = Arc::new(Executor::new_with_config_and_executors( ExecutorConfig { num_threads: exec.num_threads(), @@ -148,7 +149,7 @@ impl TestCatalog { name: &str, retention_period_ns: Option, ) -> Arc { - let mut repos = self.catalog.repositories().await; + let mut repos = self.catalog.repositories(); let namespace_name = NamespaceName::new(name).unwrap(); let namespace = repos .namespaces() @@ -171,27 +172,13 @@ impl TestCatalog { .await } - /// List all non-deleted files - pub async fn list_by_table_not_to_delete( - self: &Arc, - table_id: TableId, - ) -> Vec { - self.catalog - .repositories() - .await - .parquet_files() - .list_by_table_not_to_delete(table_id) - .await - .unwrap() - } - /// Add a partition into skipped compaction pub async fn add_to_skipped_compaction( self: &Arc, partition_id: PartitionId, reason: &str, ) { - let mut repos = self.catalog.repositories().await; + let mut repos = self.catalog.repositories(); repos .partitions() @@ -212,7 +199,7 @@ pub struct 
TestNamespace { impl TestNamespace { /// Create a table in this namespace pub async fn create_table(self: &Arc, name: &str) -> Arc { - let mut repos = self.catalog.catalog.repositories().await; + let mut repos = self.catalog.catalog.repositories(); let table = arbitrary_table(&mut *repos, name, &self.namespace).await; @@ -229,7 +216,7 @@ impl TestNamespace { name: &str, template: Option, ) -> Arc { - let mut repos = self.catalog.catalog.repositories().await; + let mut repos = self.catalog.catalog.repositories(); let table = repos .tables() @@ -254,32 +241,36 @@ impl TestNamespace { /// Get namespace schema for this namespace. pub async fn schema(&self) -> NamespaceSchema { - let mut repos = self.catalog.catalog.repositories().await; + let mut repos = self.catalog.catalog.repositories(); get_schema_by_id( self.namespace.id, repos.as_mut(), SoftDeletedRows::ExcludeDeleted, ) .await - .unwrap() + .expect("no catalog error") + .expect("namespace exists") } /// Set the number of tables allowed in this namespace. - pub async fn update_table_limit(&self, new_max: i32) { - let mut repos = self.catalog.catalog.repositories().await; + pub async fn update_table_limit(&self, new_max: usize) { + let mut repos = self.catalog.catalog.repositories(); repos .namespaces() - .update_table_limit(&self.namespace.name, MaxTables::new(new_max)) + .update_table_limit(&self.namespace.name, MaxTables::try_from(new_max).unwrap()) .await .unwrap(); } /// Set the number of columns per table allowed in this namespace. - pub async fn update_column_limit(&self, new_max: i32) { - let mut repos = self.catalog.catalog.repositories().await; + pub async fn update_column_limit(&self, new_max: usize) { + let mut repos = self.catalog.catalog.repositories(); repos .namespaces() - .update_column_limit(&self.namespace.name, MaxColumnsPerTable::new(new_max)) + .update_column_limit( + &self.namespace.name, + MaxColumnsPerTable::try_from(new_max).unwrap(), + ) .await .unwrap(); } @@ -297,7 +288,7 @@ pub struct TestTable { impl TestTable { /// Creat a partition for the table pub async fn create_partition(self: &Arc, key: &str) -> Arc { - let mut repos = self.catalog.catalog.repositories().await; + let mut repos = self.catalog.catalog.repositories(); let partition = repos .partitions() @@ -317,10 +308,9 @@ impl TestTable { pub async fn create_partition_with_sort_key( self: &Arc, key: &str, - sort_key: &[&str], sort_key_ids: &[i64], ) -> Arc { - let mut repos = self.catalog.catalog.repositories().await; + let mut repos = self.catalog.catalog.repositories(); let partition = repos .partitions() @@ -331,11 +321,9 @@ impl TestTable { let partition = repos .partitions() .cas_sort_key( - &TransitionPartitionId::Deprecated(partition.id), - None, + partition.id, None, - sort_key, - &SortedColumnSet::from(sort_key_ids.iter().cloned()), + &SortKeyIds::from(sort_key_ids.iter().cloned()), ) .await .unwrap(); @@ -354,7 +342,7 @@ impl TestTable { name: &str, column_type: ColumnType, ) -> Arc { - let mut repos = self.catalog.catalog.repositories().await; + let mut repos = self.catalog.catalog.repositories(); let column = repos .columns() @@ -381,7 +369,7 @@ impl TestTable { /// Get columns from the catalog. 
pub async fn catalog_columns(&self) -> ColumnsByName { - let mut repos = self.catalog.catalog.repositories().await; + let mut repos = self.catalog.catalog.repositories(); get_table_columns_by_id(self.table.id, repos.as_mut()) .await @@ -402,9 +390,9 @@ impl TestTable { let selection: Vec<_> = file .column_set .iter() - .map(|id| *column_id_lookup.get(id).unwrap()) + .map(|id| column_id_lookup.get(id).unwrap().as_ref()) .collect(); - let schema = table_schema.select_by_names(&selection).unwrap(); + let schema = table_schema.select_by_names(&selection[..]).unwrap(); let chunk = ParquetChunk::new(Arc::new(file), schema, self.catalog.parquet_store.clone()); chunk @@ -421,6 +409,7 @@ impl TestTable { /// A test column. #[allow(missing_docs)] +#[derive(Debug)] pub struct TestColumn { pub catalog: Arc, pub namespace: Arc, @@ -446,35 +435,20 @@ pub struct TestPartition { impl TestPartition { /// Update sort key. - pub async fn update_sort_key( - self: &Arc, - sort_key: SortKey, - sort_key_ids: &SortedColumnSet, - ) -> Arc { - let partition = partition_lookup( - self.catalog.catalog.repositories().await.as_mut(), - &self.partition.transition_partition_id(), - ) - .await - .unwrap() - .unwrap(); + pub async fn update_sort_key(self: &Arc, sort_key_ids: &SortKeyIds) -> Arc { + let mut repos = self.catalog.catalog.repositories(); + let partition = repos + .partitions() + .get_by_id(self.partition.id) + .await + .unwrap() + .unwrap(); - let old_sort_key = partition.sort_key; - let old_sort_key_ids = partition.sort_key_ids; + let old_sort_key_ids = partition.sort_key_ids(); - let partition = self - .catalog - .catalog - .repositories() - .await + let partition = repos .partitions() - .cas_sort_key( - &self.partition.transition_partition_id(), - Some(old_sort_key), - Some(old_sort_key_ids), - &sort_key.to_columns().collect::>(), - sort_key_ids, - ) + .cas_sort_key(self.partition.id, old_sort_key_ids, sort_key_ids) .await .unwrap(); @@ -525,7 +499,7 @@ impl TestPartition { let (record_batch, sort_key) = sort_batch(record_batch, &schema); let record_batch = dedup_batch(record_batch, &sort_key); - let object_store_id = object_store_id.unwrap_or_else(Uuid::new_v4); + let object_store_id = object_store_id.unwrap_or_else(ObjectStoreId::new); let metadata = IoxMetadata { object_store_id, @@ -567,13 +541,8 @@ impl TestPartition { }; let result = self.create_parquet_file_catalog_record(builder).await; - let mut repos = self.catalog.catalog.repositories().await; - update_catalog_sort_key_if_needed( - repos.as_mut(), - &self.partition.transition_partition_id(), - sort_key, - ) - .await; + let mut repos = self.catalog.catalog.repositories(); + update_catalog_sort_key_if_needed(repos.as_mut(), self.partition.id, sort_key).await; result } @@ -622,8 +591,9 @@ impl TestPartition { let parquet_file_params = ParquetFileParams { namespace_id: self.namespace.namespace.id, table_id: self.table.table.id, - partition_id: self.partition.transition_partition_id(), - object_store_id: object_store_id.unwrap_or_else(Uuid::new_v4), + partition_id: self.partition.id, + partition_hash_id: self.partition.hash_id().cloned(), + object_store_id: object_store_id.unwrap_or_else(ObjectStoreId::new), min_time: Timestamp::new(min_time), max_time: Timestamp::new(max_time), file_size_bytes: file_size_bytes.unwrap_or(0) as i64, @@ -634,7 +604,7 @@ impl TestPartition { max_l0_created_at: Timestamp::new(max_l0_created_at), }; - let mut repos = self.catalog.catalog.repositories().await; + let mut repos = self.catalog.catalog.repositories(); let 
parquet_file = repos .parquet_files() .create(parquet_file_params) @@ -644,7 +614,13 @@ impl TestPartition { if to_delete { repos .parquet_files() - .create_upgrade_delete(&[parquet_file.id], &[], &[], CompactionLevel::Initial) + .create_upgrade_delete( + parquet_file.partition_id, + &[parquet_file.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) .await .unwrap(); } @@ -673,7 +649,7 @@ pub struct TestParquetFileBuilder { creation_time: i64, compaction_level: CompactionLevel, to_delete: bool, - object_store_id: Option, + object_store_id: Option, row_count: Option, max_l0_created_at: i64, } @@ -711,6 +687,12 @@ impl TestParquetFileBuilder { .with_schema(schema) } + /// Specify an object store id for this parquet file. + pub fn with_object_store_id(mut self, object_store_id: ObjectStoreId) -> Self { + self.object_store_id = Some(object_store_id); + self + } + fn with_record_batch(mut self, record_batch: RecordBatch) -> Self { self.record_batch = Some(record_batch); self @@ -782,15 +764,12 @@ impl TestParquetFileBuilder { } } -async fn update_catalog_sort_key_if_needed( - repos: &mut R, - id: &TransitionPartitionId, - sort_key: SortKey, -) where +async fn update_catalog_sort_key_if_needed(repos: &mut R, id: PartitionId, sort_key: SortKey) +where R: RepoCollection + ?Sized, { // Fetch the latest partition info from the catalog - let partition = partition_lookup(repos, id).await.unwrap().unwrap(); + let partition = repos.partitions().get_by_id(id).await.unwrap().unwrap(); // fecth column ids from catalog let columns = get_table_columns_by_id(partition.table_id, repos) @@ -799,9 +778,8 @@ async fn update_catalog_sort_key_if_needed( // Similarly to what the ingester does, if there's an existing sort key in the catalog, add new // columns onto the end - - match (partition.sort_key(), partition.sort_key_ids_none_if_empty()) { - (Some(catalog_sort_key), Some(catalog_sort_key_ids)) => { + match partition.sort_key(&columns) { + Some(catalog_sort_key) => { let new_sort_key = sort_key.to_columns().collect::>(); let (_metadata, update) = adjust_sort_key_columns(&catalog_sort_key, &new_sort_key); if let Some(new_sort_key) = update { @@ -811,44 +789,28 @@ async fn update_catalog_sort_key_if_needed( debug!( "Updating (sort_key, sort_key_ids) from ({:?}, {:?}) to ({:?}, {:?})", catalog_sort_key.to_columns().collect::>(), - catalog_sort_key_ids, + partition.sort_key_ids(), &new_sort_key, &new_sort_key_ids, ); repos .partitions() - .cas_sort_key( - id, - Some( - catalog_sort_key - .to_columns() - .map(ToString::to_string) - .collect::>(), - ), - Some(partition.sort_key_ids), - &new_sort_key, - &new_sort_key_ids, - ) + .cas_sort_key(partition.id, partition.sort_key_ids(), &new_sort_key_ids) .await .unwrap(); } } - (None, None) => { + None => { let new_columns = sort_key.to_columns().collect::>(); debug!("Updating sort key from None to {:?}", &new_columns); let column_ids = columns.ids_for_names(&new_columns); repos .partitions() - .cas_sort_key(id, None, None, &new_columns, &column_ids) + .cas_sort_key(partition.id, None, &column_ids) .await .unwrap(); } - _ => panic!( - "sort_key {:?} and sort_key_ids {:?} should be both None or both Some", - partition.sort_key(), - partition.sort_key_ids_none_if_empty() - ), } } @@ -869,6 +831,7 @@ async fn create_parquet_file( /// A test parquet file of the catalog #[allow(missing_docs)] +#[derive(Debug)] pub struct TestParquetFile { pub catalog: Arc, pub namespace: Arc, @@ -889,11 +852,17 @@ impl From for ParquetFile { impl TestParquetFile { /// Make the 
parquet file deletable pub async fn flag_for_delete(&self) { - let mut repos = self.catalog.catalog.repositories().await; + let mut repos = self.catalog.catalog.repositories(); repos .parquet_files() - .create_upgrade_delete(&[self.parquet_file.id], &[], &[], CompactionLevel::Initial) + .create_upgrade_delete( + self.parquet_file.partition_id, + &[self.parquet_file.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) .await .unwrap(); } @@ -906,15 +875,15 @@ impl TestParquetFile { .parquet_file .column_set .iter() - .map(|id| *column_id_lookup.get(id).unwrap()) + .map(|id| column_id_lookup.get(id).unwrap().as_ref()) .collect(); let table_schema: Schema = table_columns.clone().try_into().unwrap(); - table_schema.select_by_names(&selection).unwrap() + table_schema.select_by_names(&selection[..]).unwrap() } } /// Return the current time -pub fn now() -> Time { +pub(crate) fn now() -> Time { Time::from_timestamp(0, 0).unwrap() } diff --git a/iox_tests/src/lib.rs b/iox_tests/src/lib.rs index 98f65aefa1c..b5e4a283872 100644 --- a/iox_tests/src/lib.rs +++ b/iox_tests/src/lib.rs @@ -17,8 +17,6 @@ // Workaround for "unused crate" lint false positives. use workspace_hack as _; -use data_types::{PartitionKey, TableId, TransitionPartitionId}; - mod catalog; pub use catalog::{ TestCatalog, TestNamespace, TestParquetFile, TestParquetFileBuilder, TestPartition, TestTable, @@ -28,11 +26,3 @@ mod builders; pub use builders::{ ColumnBuilder, ParquetFileBuilder, PartitionBuilder, SkippedCompactionBuilder, TableBuilder, }; - -/// Create a partition identifier from an int (which gets used as the table ID) and a partition key -/// with the string "arbitrary". Most useful in cases where there isn't any actual catalog -/// interaction (that is, in mocks) and when the important property of the partition identifiers is -/// that they're either the same or different than other partition identifiers. 
-pub fn partition_identifier(table_id: i64) -> TransitionPartitionId { - TransitionPartitionId::new(TableId::new(table_id), &PartitionKey::from("arbitrary")) -} diff --git a/iox_time/Cargo.toml b/iox_time/Cargo.toml index d8ef58e829c..c8a8398d0f2 100644 --- a/iox_time/Cargo.toml +++ b/iox_time/Cargo.toml @@ -6,10 +6,13 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] chrono = { version = "0.4.31", default-features = false, features = ["clock", "std"] } parking_lot = "0.12" -tokio = { version = "1.32", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } +tokio = { version = "1.35", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] diff --git a/iox_time/src/lib.rs b/iox_time/src/lib.rs index 853155d2312..3a2cf836029 100644 --- a/iox_time/src/lib.rs +++ b/iox_time/src/lib.rs @@ -204,7 +204,7 @@ pub trait TimeProvider: Debug + Display + Send + Sync + 'static { } /// A [`TimeProvider`] that uses [`Utc::now`] as a clock source -#[derive(Debug, Default, Clone)] +#[derive(Debug, Default, Clone, Copy)] pub struct SystemProvider {} impl SystemProvider { diff --git a/ioxd_common/Cargo.toml b/ioxd_common/Cargo.toml index 0fce9b58904..c52b49ffd89 100644 --- a/ioxd_common/Cargo.toml +++ b/ioxd_common/Cargo.toml @@ -5,6 +5,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + # Optional feature 'pprof' enables http://localhost:8080/debug/pprof/profile support support [dependencies] @@ -12,15 +15,16 @@ license.workspace = true authz = { path = "../authz", features = ["http"] } clap_blocks = { path = "../clap_blocks" } generated_types = { path = "../generated_types" } -heappy = { git = "https://github.com/mkmik/heappy", rev = "1de977a241cdd768acc5b6c82c0728b30c7db7b4", features = ["enable_heap_profiler", "jemalloc_shim", "measure_free"], optional = true } +heappy = { git = "https://github.com/mkmik/heappy", rev = "01a1f88e1b404c5894f89eb1a57f813f713d7ad1", features = ["enable_heap_profiler", "jemalloc_shim", "measure_free"], optional = true } metric = { path = "../metric" } metric_exporters = { path = "../metric_exporters" } observability_deps = { path = "../observability_deps" } # NOTE: we may not notice that we need the "backtrace-rs" feature if we also build with the heappy feature, which depends on backtrace-rs. 
# (honestly I thought that cargo dependencies were isolated on a per crate basis so I'm a bit surprised that pprof accidentally builds # successfully just because another crate happens to depend on backtrace-rs) -pprof = { version = "0.12", default-features = false, features = ["flamegraph", "prost-codec"], optional = true } +pprof = { version = "0.13", default-features = false, features = ["flamegraph", "prost-codec"], optional = true } service_grpc_testing = { path = "../service_grpc_testing" } +tower_trailer = { path = "../tower_trailer" } trace = { path = "../trace" } trace_exporters = { path = "../trace_exporters" } trace_http = { path = "../trace_http" } @@ -32,18 +36,18 @@ clap = { version = "4", features = ["derive", "env"] } flate2 = "1.0" futures = "0.3" hashbrown = { workspace = true } -http = "0.2.9" +http = "0.2.11" hyper = "0.14" log = "0.4" parking_lot = "0.12" -reqwest = { version = "0.11", default-features = false, features = ["json", "rustls-tls"] } +reqwest = { version = "0.11", default-features = false, features = ["json", "rustls-tls-native-roots"] } serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.107" +serde_json = "1.0.111" serde_urlencoded = "0.7.0" -snafu = "0.7" -tokio = { version = "1.32", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] } +snafu = "0.8" +tokio = { version = "1.35", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] } tokio-stream = { version = "0.1", features = ["net"] } -tokio-util = { version = "0.7.9" } +tokio-util = { version = "0.7.10" } tonic = { workspace = true } tonic-health = { workspace = true } tonic-reflection = { workspace = true } @@ -51,7 +55,6 @@ tower = "0.4" tower-http = { version = "0.4", features = ["catch-panic"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } - [dev-dependencies] # Workspace dependencies, in alphabetical order # Crates.io dependencies, in alphabetical order diff --git a/ioxd_common/src/http/error.rs b/ioxd_common/src/http/error.rs index 64538477676..c08146c677c 100644 --- a/ioxd_common/src/http/error.rs +++ b/ioxd_common/src/http/error.rs @@ -1,5 +1,6 @@ use hyper::{Body, Response, StatusCode}; use observability_deps::tracing::warn; +use serde::Serialize; /// Constants used in API error codes. 
/// @@ -13,6 +14,7 @@ pub enum HttpApiErrorCode { Invalid, UnprocessableEntity, EmptyValue, + PartialWrite, Unavailable, Forbidden, TooManyRequests, @@ -32,6 +34,7 @@ impl HttpApiErrorCode { Self::Invalid => "invalid", Self::UnprocessableEntity => "unprocessable entity", Self::EmptyValue => "empty value", + Self::PartialWrite => "created with partial errors found", Self::Unavailable => "unavailable", Self::Forbidden => "forbidden", Self::TooManyRequests => "too many requests", @@ -51,6 +54,7 @@ impl HttpApiErrorCode { Self::Invalid => StatusCode::BAD_REQUEST, Self::UnprocessableEntity => StatusCode::UNPROCESSABLE_ENTITY, Self::EmptyValue => StatusCode::NO_CONTENT, + Self::PartialWrite => StatusCode::CREATED, Self::Unavailable => StatusCode::SERVICE_UNAVAILABLE, Self::Forbidden => StatusCode::FORBIDDEN, Self::TooManyRequests => StatusCode::TOO_MANY_REQUESTS, @@ -76,6 +80,7 @@ impl From for HttpApiErrorCode { StatusCode::BAD_REQUEST => Self::Invalid, StatusCode::UNPROCESSABLE_ENTITY => Self::UnprocessableEntity, StatusCode::NO_CONTENT => Self::EmptyValue, + StatusCode::CREATED => Self::PartialWrite, StatusCode::SERVICE_UNAVAILABLE => Self::Unavailable, StatusCode::FORBIDDEN => Self::Forbidden, StatusCode::TOO_MANY_REQUESTS => Self::TooManyRequests, @@ -91,16 +96,27 @@ impl From for HttpApiErrorCode { } } +impl Serialize for HttpApiErrorCode { + fn serialize(&self, serializer: S) -> Result { + serializer.serialize_str(self.as_text()) + } +} + /// Error that is compatible with the Influxdata Cloud 2 HTTP API. /// /// See . -#[derive(Debug)] +#[derive(Debug, Serialize)] pub struct HttpApiError { /// Machine-readable error code. code: HttpApiErrorCode, /// Human-readable message. + #[serde(rename = "message")] msg: String, + + /// Optional error line (for line protocol errors). + #[serde(skip_serializing_if = "Option::is_none")] + line: Option, } impl HttpApiError { @@ -109,18 +125,18 @@ impl HttpApiError { Self { code: code.into(), msg: msg.into(), + line: None, } } + /// Add body to error. + pub fn with_line(self, line: Option) -> Self { + Self { line, ..self } + } + /// Generate response body for this error. fn body(&self) -> Body { - let json = serde_json::json!({ - "code": self.code.as_text().to_string(), - "message": self.msg.clone(), - }) - .to_string(); - - Body::from(json) + Body::from(serde_json::to_string(&self).expect("must serialise to json")) } /// Generate response for this error. 
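Note on the `HttpApiError` change above: deriving `Serialize` means the response body is now produced directly from the struct rather than a hand-built `serde_json::json!` map. As an illustrative sketch (the message text here is made up, not from the diff), an error built with `HttpApiError::new(HttpApiErrorCode::PartialWrite, "partial write of line protocol occurred").with_line(Some(2))` would serialize to `{"code":"created with partial errors found","message":"partial write of line protocol occurred","line":2}`, with `line` omitted entirely when it is `None`.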
diff --git a/ioxd_common/src/http/mod.rs b/ioxd_common/src/http/mod.rs index 6b59c416c71..21a770d3d3a 100644 --- a/ioxd_common/src/http/mod.rs +++ b/ioxd_common/src/http/mod.rs @@ -1,3 +1,4 @@ +use http::StatusCode; use std::{convert::Infallible, num::NonZeroI32, sync::Arc}; use authz::http::AuthorizationHeaderExtension; @@ -94,14 +95,11 @@ pub async fn serve( shutdown: CancellationToken, trace_header_parser: TraceHeaderParser, ) -> Result<(), hyper::Error> { - let metric_registry = server_type.metric_registry(); let trace_collector = server_type.trace_collector(); - let trace_layer = TraceLayer::new( trace_header_parser, - metric_registry, + Arc::new(server_type.http_request_metrics()), trace_collector, - false, server_type.name(), ); @@ -136,7 +134,7 @@ async fn route_request( let content_length = req.headers().get("content-length").cloned(); let response = match (method.clone(), uri.path()) { - (Method::GET, "/health") => health(), + (Method::GET, "/health") => Ok(health(server_type.as_ref())), (Method::GET, "/metrics") => handle_metrics(server_type.as_ref()), (Method::GET, "/debug/pprof") => pprof_home(req).await, (Method::GET, "/debug/pprof/profile") => pprof_profile(req).await, @@ -165,9 +163,18 @@ async fn route_request( } } -fn health() -> Result, ApplicationError> { - let response_body = "OK"; - Ok(Response::new(Body::from(response_body.to_string()))) +fn health(server_type: &dyn ServerType) -> Response { + match server_type.is_healthy() { + true => { + let response_body = "OK"; + Response::new(Body::from(response_body.to_string())) + } + false => { + let mut resp = Response::new(Body::empty()); + *resp.status_mut() = StatusCode::SERVICE_UNAVAILABLE; + resp + } + } } fn handle_metrics(server_type: &dyn ServerType) -> Result, ApplicationError> { diff --git a/ioxd_common/src/http/pprof.rs b/ioxd_common/src/http/pprof.rs index c49992f596f..c15a62da04b 100644 --- a/ioxd_common/src/http/pprof.rs +++ b/ioxd_common/src/http/pprof.rs @@ -1,7 +1,7 @@ use observability_deps::tracing::info; use tokio::time::Duration; -pub async fn dump_rsprof(seconds: u64, frequency: i32) -> pprof::Result { +pub(crate) async fn dump_rsprof(seconds: u64, frequency: i32) -> pprof::Result { let guard = pprof::ProfilerGuard::new(frequency)?; info!( "start profiling {} seconds with frequency {} /s", diff --git a/ioxd_common/src/lib.rs b/ioxd_common/src/lib.rs index cbc82c63dc5..4326c322d7e 100644 --- a/ioxd_common/src/lib.rs +++ b/ioxd_common/src/lib.rs @@ -29,6 +29,7 @@ pub mod reexport { pub use tonic_health; pub use tonic_reflection; pub use tower_http; + pub use tower_trailer; pub use trace_http; } @@ -45,6 +46,9 @@ use trace_http::ctx::TraceHeaderParser; #[derive(Debug, Snafu)] pub enum Error { + #[snafu(display("Neither grpc nor http listeners are available"))] + MissingListener, + #[snafu(display("Unable to bind to listen for HTTP requests on {}: {}", addr, source))] StartListeningHttp { addr: SocketAddr, @@ -121,10 +125,14 @@ pub async fn http_listener(addr: SocketAddr) -> Result { pub async fn serve( common_state: CommonServerState, frontend_shutdown: CancellationToken, - grpc_listener: tokio::net::TcpListener, + grpc_listener: Option, http_listener: Option, server_type: Arc, ) -> Result<()> { + if grpc_listener.is_none() && http_listener.is_none() { + return Err(Error::MissingListener); + } + let trace_header_parser = TraceHeaderParser::new() .with_jaeger_trace_context_header_name( &common_state @@ -140,14 +148,26 @@ pub async fn serve( ); // Construct and start up gRPC server - let grpc_server = 
rpc::serve( - grpc_listener, - Arc::clone(&server_type), - trace_header_parser.clone(), - frontend_shutdown.clone(), - ) + let captured_server_type = Arc::clone(&server_type); + let captured_shutdown = frontend_shutdown.clone(); + let captured_trace_header_parser = trace_header_parser.clone(); + let grpc_server = async move { + if let Some(grpc_listener) = grpc_listener { + info!(?captured_server_type, "gRPC server listening"); + rpc::serve( + grpc_listener, + captured_server_type, + captured_trace_header_parser, + captured_shutdown, + ) + .await? + } else { + // don't resolve otherwise will cause server to shutdown + captured_shutdown.cancelled().await + } + Ok(()) + } .fuse(); - info!(?server_type, "gRPC server listening"); let captured_server_type = Arc::clone(&server_type); let captured_shutdown = frontend_shutdown.clone(); @@ -218,7 +238,7 @@ pub async fn serve( // // This is important to ensure background tasks, such as polling the tracker // registry, don't exit before HTTP and gRPC requests dependent on them - while !grpc_server.is_terminated() && !http_server.is_terminated() { + while !grpc_server.is_terminated() || !http_server.is_terminated() { futures::select! { _ = signal => info!(?server_type, "shutdown requested"), _ = server_handle => { diff --git a/ioxd_common/src/rpc.rs b/ioxd_common/src/rpc.rs index 8f25f326c45..1185e5b6363 100644 --- a/ioxd_common/src/rpc.rs +++ b/ioxd_common/src/rpc.rs @@ -34,6 +34,9 @@ pub struct RpcBuilder { #[macro_export] macro_rules! add_service { ($builder:ident, $svc:expr) => { + $crate::add_service!($builder, $svc, Serving) + }; + ($builder:ident, $svc:expr, $status:ident) => { let $builder = { // `inner` might be required to be `mut` or not depending if we're acting on: // - a `Server`, no service added yet, no `mut` required @@ -50,7 +53,7 @@ macro_rules! add_service { } = $builder; let service = $svc; - let status = $crate::reexport::tonic_health::ServingStatus::Serving; + let status = $crate::reexport::tonic_health::ServingStatus::$status; health_reporter .set_service_status(service_name(&service), status) .await; @@ -97,16 +100,19 @@ macro_rules! 
setup_builder { let builder = builder .layer($crate::reexport::trace_http::tower::TraceLayer::new( trace_header_parser, - $server_type.metric_registry(), + Arc::new($crate::reexport::trace_http::metrics::RequestMetrics::new( + $server_type.metric_registry(), + $crate::reexport::trace_http::metrics::MetricFamily::GrpcServer, + )), $server_type.trace_collector(), - true, $server_type.name(), )) .layer( $crate::reexport::tower_http::catch_panic::CatchPanicLayer::custom( $crate::rpc::handle_panic, ), - ); + ) + .layer($crate::reexport::tower_trailer::TrailerLayer::default()); let builder = RpcBuilder { inner: builder, diff --git a/ioxd_common/src/server_type.rs b/ioxd_common/src/server_type.rs index 767dacd9ea4..519b2c443f2 100644 --- a/ioxd_common/src/server_type.rs +++ b/ioxd_common/src/server_type.rs @@ -10,6 +10,7 @@ use tokio_util::sync::CancellationToken; use trace::TraceCollector; pub use common_state::{CommonServerState, CommonServerStateError}; +use trace_http::metrics::{MetricFamily, RequestMetrics}; use crate::{http::error::HttpApiErrorSource, rpc::RpcBuilderInput}; @@ -20,6 +21,9 @@ pub enum RpcError { source: tonic::transport::Error, details: String, }, + + #[snafu(display("gRPC endpoint is not implemented"))] + UnImplemented, } // Custom impl to include underlying source (not included in tonic @@ -47,6 +51,11 @@ pub trait ServerType: std::fmt::Debug + Send + Sync + 'static { /// Trace collector associated with the server, if any. fn trace_collector(&self) -> Option>; + /// Returns the `RequestMetrics` for instrumenting HTTP requests + fn http_request_metrics(&self) -> RequestMetrics { + RequestMetrics::new(self.metric_registry(), MetricFamily::HttpServer) + } + /// Route given HTTP request. /// /// Note that this is only called if none of the shared, common routes (e.g. `/health`) match. @@ -69,4 +78,9 @@ pub trait ServerType: std::fmt::Debug + Send + Sync + 'static { /// to shutdown the "frontend" (HTTP & RPC servers) when appropriate - this /// should happen before [`Self::join()`] returns. 
fn shutdown(&self, frontend: CancellationToken); + + /// Return `true` if the service is healthy + fn is_healthy(&self) -> bool { + true + } } diff --git a/ioxd_common/src/service.rs b/ioxd_common/src/service.rs index 6f1bf840357..b7513030853 100644 --- a/ioxd_common/src/service.rs +++ b/ioxd_common/src/service.rs @@ -8,7 +8,7 @@ use crate::server_type::ServerType; #[derive(Debug)] pub struct Service { pub http_bind_address: Option, - pub grpc_bind_address: SocketAddr, + pub grpc_bind_address: Option, pub server_type: Arc, } @@ -16,7 +16,7 @@ impl Service { pub fn create(server_type: Arc, run_config: &RunConfig) -> Self { Self { http_bind_address: Some(run_config.http_bind_address), - grpc_bind_address: run_config.grpc_bind_address, + grpc_bind_address: Some(run_config.grpc_bind_address), server_type, } } @@ -24,7 +24,15 @@ impl Service { pub fn create_grpc_only(server_type: Arc, run_config: &RunConfig) -> Self { Self { http_bind_address: None, - grpc_bind_address: run_config.grpc_bind_address, + grpc_bind_address: Some(run_config.grpc_bind_address), + server_type, + } + } + + pub fn create_http_only(server_type: Arc, run_config: &RunConfig) -> Self { + Self { + http_bind_address: Some(run_config.http_bind_address), + grpc_bind_address: None, server_type, } } diff --git a/ioxd_test/Cargo.toml b/ioxd_test/Cargo.toml index 7483189b2be..488efafd9e7 100644 --- a/ioxd_test/Cargo.toml +++ b/ioxd_test/Cargo.toml @@ -5,6 +5,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # Workspace dependencies, in alphabetical order ioxd_common = { path = "../ioxd_common" } @@ -15,6 +18,6 @@ trace = { path = "../trace" } async-trait = "0.1" clap = { version = "4", features = ["derive", "env"] } hyper = "0.14" -snafu = "0.7" -tokio-util = "0.7.9" +snafu = "0.8" +tokio-util = "0.7.10" workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/ioxd_test/src/lib.rs b/ioxd_test/src/lib.rs index fe652e5dc93..b32dcdabbf3 100644 --- a/ioxd_test/src/lib.rs +++ b/ioxd_test/src/lib.rs @@ -44,7 +44,7 @@ impl HttpApiErrorSource for ApplicationError { } } -#[derive(Debug, Clone, PartialEq, Eq, clap::ValueEnum)] +#[derive(Debug, Clone, PartialEq, Eq, clap::ValueEnum, Copy)] pub enum TestAction { None, EarlyReturnFromGrpcWorker, diff --git a/kube_test/Cargo.toml b/kube_test/Cargo.toml new file mode 100644 index 00000000000..f7da1fe14de --- /dev/null +++ b/kube_test/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "kube_test" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] +http = "0.2.9" +hyper = "0.14.27" +kube-core = "0.87.1" +k8s-openapi = { version = "0.20.0", features = ["earliest"] } +rand = "0.8.5" +serde = "1.0.195" +serde_json = "1.0.111" +serde_yaml = "0.9.30" +tower = "0.4.13" +workspace-hack = { version = "0.1", path = "../workspace-hack" } + diff --git a/kube_test/src/call.rs b/kube_test/src/call.rs new file mode 100644 index 00000000000..6ed31a03bd8 --- /dev/null +++ b/kube_test/src/call.rs @@ -0,0 +1,70 @@ +use super::{request::Request, Handler, Result}; +use http::{HeaderMap, Response, StatusCode}; +use hyper::body::HttpBody; +use hyper::Body; +use std::future::Future; +use std::mem; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{ready, Context, Poll}; + +#[derive(Debug)] +pub struct Call { + handler: Option>, + request: Request, + header: HeaderMap, + body: Body, + buf: Vec, +} + +impl Call { + pub(crate) fn 
new( + handler: Option>, + request: Request, + header: HeaderMap, + body: Body, + ) -> Self { + Self { + handler, + request, + header, + body, + buf: vec![], + } + } +} + +impl Future for Call { + type Output = Result>; + + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.get_mut(); + match &this.handler { + None => { + let data = serde_json::to_vec(&super::status::resource_not_found( + &this.request.api_plural(), + )) + .unwrap(); + Poll::Ready( + Response::builder() + .status(StatusCode::NOT_FOUND) + .body(data.into()) + .map_err(super::Error::from), + ) + } + Some(handler) => { + while !&this.body.is_end_stream() { + match ready!(Pin::new(&mut this.body).poll_data(cx)).transpose()? { + Some(buf) => this.buf.extend_from_slice(buf.as_ref()), + None => break, + } + } + Poll::Ready(handler.handle( + mem::take(&mut this.request), + mem::take(&mut this.header), + mem::take(&mut this.buf), + )) + } + } + } +} diff --git a/kube_test/src/error.rs b/kube_test/src/error.rs new file mode 100644 index 00000000000..e3c4f9139ba --- /dev/null +++ b/kube_test/src/error.rs @@ -0,0 +1,57 @@ +use std::fmt::{Display, Formatter}; + +#[derive(Debug)] +pub enum Error { + Serialization(serde_json::Error), + Yaml(serde_yaml::Error), + Http(http::Error), + Hyper(hyper::Error), +} + +pub type Result = std::result::Result; + +impl Display for Error { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Serialization(e) => e.fmt(f), + Self::Yaml(e) => e.fmt(f), + Self::Http(e) => e.fmt(f), + Self::Hyper(e) => e.fmt(f), + } + } +} + +impl std::error::Error for Error { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + Self::Serialization(e) => Some(e), + Self::Yaml(e) => Some(e), + Self::Http(e) => Some(e), + Self::Hyper(e) => Some(e), + } + } +} + +impl From for Error { + fn from(value: serde_json::Error) -> Self { + Self::Serialization(value) + } +} + +impl From for Error { + fn from(value: serde_yaml::Error) -> Self { + Self::Yaml(value) + } +} + +impl From for Error { + fn from(value: http::Error) -> Self { + Self::Http(value) + } +} + +impl From for Error { + fn from(value: hyper::Error) -> Self { + Self::Hyper(value) + } +} diff --git a/kube_test/src/handler.rs b/kube_test/src/handler.rs new file mode 100644 index 00000000000..464797baa25 --- /dev/null +++ b/kube_test/src/handler.rs @@ -0,0 +1,25 @@ +use super::{request::Request, Result}; +use http::{HeaderMap, Response}; +use hyper::Body; +use kube_core::ApiResource; +use std::fmt::Debug; +use std::sync::Arc; + +pub trait Handler: Debug { + fn api_resource(&self) -> ApiResource; + + fn handle(&self, req: Request, header: HeaderMap, body: Vec) -> Result>; +} + +pub trait AsHandler { + fn as_handler(self: &Arc) -> Arc; +} + +impl AsHandler for T +where + T: Handler + Send + Sync + 'static, +{ + fn as_handler(self: &Arc) -> Arc { + Arc::clone(self) as Arc + } +} diff --git a/kube_test/src/lib.rs b/kube_test/src/lib.rs new file mode 100644 index 00000000000..3c72ce2c1f3 --- /dev/null +++ b/kube_test/src/lib.rs @@ -0,0 +1,31 @@ +//! Kube_test provides a fake kubernetes service that can be used to test a kubernetes controller. +//! The Service class provides a [tower::Service] that can be used with a kubernetes Client to +//! behave sufficiently like a kubernetes controller to simplify testing controller reconcile loops. 
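+//!
+//! # Example
+//!
+//! A minimal sketch of the intended usage, using only the APIs added in this crate. The
+//! resource type is illustrative; any `k8s-openapi` type that implements
+//! [`kube_core::Resource`] with the default dynamic type should work the same way.
+//!
+//! ```ignore
+//! use std::sync::Arc;
+//!
+//! use k8s_openapi::api::core::v1::ConfigMap;
+//! use kube_test::{AsHandler, ResourceHandler, Service};
+//!
+//! // A fake API server plus a handler that stores ConfigMap objects in memory.
+//! let service = Service::new();
+//! let config_maps = Arc::new(ResourceHandler::<ConfigMap>::new());
+//! service.add_handler(config_maps.as_handler());
+//!
+//! // Seed state directly through the handler, then read it back after the
+//! // controller under test has run against `service`.
+//! config_maps.set("default", "my-config", ConfigMap::default());
+//! assert!(config_maps.get("default", "my-config").is_some());
+//! ```
+//!
+//! The resulting `Service` is what the Kubernetes client under test uses as its HTTP transport.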
+#![deny(rustdoc::broken_intra_doc_links, rustdoc::bare_urls, rust_2018_idioms)] +#![warn( +missing_debug_implementations, +clippy::explicit_iter_loop, +clippy::use_self, +clippy::clone_on_ref_ptr, +// See https://github.com/influxdata/influxdb_iox/pull/1671 +clippy::future_not_send +)] +#![allow(unreachable_pub)] + +// Workaround for "unused crate" lint false positives. +use workspace_hack as _; + +mod call; +mod error; +mod handler; +mod object_map; +mod request; +mod resource_handler; +mod service; +mod status; + +pub use call::Call; +pub use error::{Error, Result}; +pub use handler::{AsHandler, Handler}; +pub use resource_handler::ResourceHandler; +pub use service::Service; diff --git a/kube_test/src/object_map.rs b/kube_test/src/object_map.rs new file mode 100644 index 00000000000..55807b8edb5 --- /dev/null +++ b/kube_test/src/object_map.rs @@ -0,0 +1,178 @@ +use super::status; +use kube_core::{ApiResource, DynamicObject, Status}; +use std::collections::{hash_map, HashMap}; +use std::mem; + +#[derive(Debug)] +pub struct ObjectMap { + api_resource: ApiResource, + objects: HashMap, +} + +#[derive(Debug, Clone, Hash, PartialEq, Eq)] +struct Key { + ns: Option, + name: String, +} + +impl ObjectMap { + pub fn new(api_resource: ApiResource) -> Self { + Self { + api_resource, + objects: HashMap::new(), + } + } + + pub fn entry(&mut self, ns: Option, name: String) -> Entry<'_> { + let key = Key { ns, name }; + let inner = self.objects.entry(key); + Entry { + api_resource: &self.api_resource, + inner, + } + } + + pub fn values(&self, ns: Option) -> Values<'_> { + Values { + ns, + inner: self.objects.values(), + } + } +} + +#[derive(Debug)] +pub struct Entry<'a> { + api_resource: &'a ApiResource, + inner: hash_map::Entry<'a, Key, DynamicObject>, +} + +impl<'a> Entry<'a> { + pub fn create(self, obj: DynamicObject) -> Result<&'a DynamicObject, Box> { + match self.inner { + hash_map::Entry::Occupied(entry) => Err(Box::new(status::already_exists( + self.api_resource, + Some(entry.key().name.as_str()), + ))), + hash_map::Entry::Vacant(entry) => { + let Key { ns, name } = entry.key().clone(); + let obj = entry.insert(obj); + obj.metadata.namespace = ns; + obj.metadata.name = Some(name); + if obj.metadata.uid.is_none() { + obj.metadata.uid = Some(format!("{}", rand::random::())); + } + Ok(obj) + } + } + } + + pub fn get(&mut self) -> Result<&DynamicObject, Box> { + match &self.inner { + hash_map::Entry::Occupied(entry) => Ok(entry.get()), + hash_map::Entry::Vacant(entry) => { + let name = entry.key().name.as_str(); + Err(Box::new(status::not_found(self.api_resource, Some(name)))) + } + } + } + + pub fn delete(self) -> Result> { + match self.inner { + hash_map::Entry::Occupied(entry) => { + let obj = entry.remove(); + Ok(obj) + } + hash_map::Entry::Vacant(entry) => { + let name = entry.key().name.as_str(); + Err(Box::new(status::not_found(self.api_resource, Some(name)))) + } + } + } + + pub fn update(self, mut obj: DynamicObject) -> Result<(bool, DynamicObject), Box> { + match self.inner { + hash_map::Entry::Occupied(mut entry) => { + let Key { ns, name } = entry.key().clone(); + obj.metadata.namespace = ns; + obj.metadata.name = Some(name); + let _ = entry.insert(obj.clone()); + Ok((false, obj)) + } + hash_map::Entry::Vacant(entry) => { + let Key { ns, name } = entry.key().clone(); + let obj = entry.insert(obj); + obj.metadata.namespace = ns; + obj.metadata.name = Some(name); + if obj.metadata.uid.is_none() { + obj.metadata.uid = Some(format!("{}", rand::random::())); + } + Ok((true, obj.clone())) + } 
+ } + } + + pub fn apply(self, patch: DynamicObject) -> Result> { + let Key { ns, name } = self.inner.key().clone(); + + let obj = self.inner.or_insert_with(|| { + let obj = DynamicObject::new(name.as_str(), self.api_resource); + if let Some(ns) = ns { + obj.within(ns.as_str()) + } else { + obj + } + }); + let _ = mem::replace(&mut obj.data, patch.data); + Ok(obj.clone()) + } + + pub fn update_subresource( + self, + subresource: String, + obj: DynamicObject, + ) -> Result<(bool, DynamicObject), Box> { + match self.inner { + hash_map::Entry::Occupied(mut entry) => { + if let Some(value) = obj.data.as_object().and_then(|v| v.get(&subresource)) { + if let Some(data) = entry.get_mut().data.as_object_mut() { + data.insert(subresource, value.clone()); + } + } + Ok((false, entry.get().clone())) + } + hash_map::Entry::Vacant(entry) => { + let Key { ns, name } = entry.key().clone(); + let obj = entry.insert(obj); + obj.metadata.namespace = ns; + obj.metadata.name = Some(name); + if obj.metadata.uid.is_none() { + obj.metadata.uid = Some(format!("{}", rand::random::())); + } + Ok((true, obj.clone())) + } + } + } +} + +pub struct Values<'a> { + ns: Option, + inner: hash_map::Values<'a, Key, DynamicObject>, +} + +impl<'a> Iterator for Values<'a> { + type Item = &'a DynamicObject; + fn next(&mut self) -> Option { + match &self.ns { + None => self.inner.next(), + Some(ns) => loop { + match self.inner.next() { + None => return None, + Some(v) => match &v.metadata.namespace { + Some(ns2) if ns2 == ns => return Some(v), + _ => continue, + }, + }; + }, + } + } +} diff --git a/kube_test/src/request.rs b/kube_test/src/request.rs new file mode 100644 index 00000000000..6a741ab6b07 --- /dev/null +++ b/kube_test/src/request.rs @@ -0,0 +1,115 @@ +use http::request::Parts; +use kube_core::ApiResource; +use std::fmt::{Display, Formatter}; + +#[derive(Debug, Default, Clone)] +pub struct Request { + pub verb: String, + pub group: String, + pub version: String, + pub plural: String, + pub ns: Option, + pub name: Option, + pub subresource: Option, +} + +impl Request { + pub(crate) fn parse(parts: &Parts) -> Self { + let verb = parts.method.as_str().to_lowercase(); + let (group, version, plural, ns, name, subresource) = match parts + .uri + .path() + .split('/') + .skip(1) + .collect::>() + .as_slice() + { + ["api", "v1", plural] => ("", "v1", *plural, "", "", ""), + ["api", "v1", plural, name] => ("", "v1", *plural, "", *name, ""), + ["api", "v1", "namespaces", ns, plural] => ("", "v1", *plural, *ns, "", ""), + ["api", "v1", "namespaces", ns, plural, name] => ("", "v1", *plural, *ns, *name, ""), + ["api", "v1", "namespaces", ns, plural, name, subresource] => { + ("", "v1", *plural, *ns, *name, *subresource) + } + ["api", "v1", plural, name, subresource] => { + ("", "v1", *plural, "", *name, *subresource) + } + ["apis", group, version, "namespaces", ns, plural] => { + (*group, *version, *plural, *ns, "", "") + } + ["apis", group, version, "namespaces", ns, plural, name] => { + (*group, *version, *plural, *ns, *name, "") + } + ["apis", group, version, "namespaces", ns, plural, name, subresource] => { + (*group, *version, *plural, *ns, *name, *subresource) + } + ["apis", group, version, plural] => (*group, *version, *plural, "", "", ""), + ["apis", group, version, plural, name] => (*group, *version, *plural, "", *name, ""), + ["apis", group, version, plural, name, subresource] => { + (*group, *version, *plural, "", *name, *subresource) + } + _ => ("", "", "", "", "", ""), + }; + + let verb = match (verb.as_str(), 
name.len()) { + ("get", 0) => String::from("list"), + ("delete", 0) => String::from("deletecollection"), + ("post", _) => String::from("create"), + ("put", _) => String::from("update"), + _ => verb, + }; + + Self { + verb, + group: String::from(group), + version: String::from(version), + plural: String::from(plural), + ns: if ns.is_empty() { + None + } else { + Some(String::from(ns)) + }, + name: if name.is_empty() { + None + } else { + Some(String::from(name)) + }, + subresource: if subresource.is_empty() { + None + } else { + Some(String::from(subresource)) + }, + } + } + + pub fn api_plural(&self) -> ApiPlural { + ApiPlural::new(self.group.clone(), self.plural.clone()) + } +} +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct ApiPlural { + group: String, + plural: String, +} + +impl ApiPlural { + pub fn new(group: String, plural: String) -> Self { + Self { group, plural } + } +} + +impl Display for ApiPlural { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + if self.group.is_empty() { + self.plural.fmt(f) + } else { + write!(f, "{}/{}", self.group, self.plural) + } + } +} + +impl From for ApiPlural { + fn from(value: ApiResource) -> Self { + Self::new(value.group, value.plural) + } +} diff --git a/kube_test/src/resource_handler.rs b/kube_test/src/resource_handler.rs new file mode 100644 index 00000000000..9b76ad8aa6e --- /dev/null +++ b/kube_test/src/resource_handler.rs @@ -0,0 +1,267 @@ +use super::{object_map::ObjectMap, request::Request, status, Handler, Result}; +use http::{HeaderMap, HeaderValue, Response, StatusCode}; +use hyper::Body; +use kube_core::{ApiResource, DynamicObject, ObjectList, ObjectMeta, Resource}; +use serde::de::DeserializeOwned; +use serde::Serialize; +use std::fmt::Debug; +use std::marker::PhantomData; +use std::sync::atomic::{AtomicI16, Ordering}; +use std::sync::{Arc, Mutex}; + +#[derive(Debug)] +pub struct ResourceHandler { + api_resource: ApiResource, + objects: Arc>, + gen_id: AtomicI16, + phantom: PhantomData, +} + +impl ResourceHandler +where + R: Resource + DeserializeOwned + Serialize, +{ + /// Create a new handler for a kubernetes resource type. + pub fn new() -> Self { + let api_resource = ApiResource::erase::(&()); + Self { + api_resource: api_resource.clone(), + objects: Arc::new(Mutex::new(ObjectMap::new(api_resource))), + gen_id: AtomicI16::new(0), + phantom: Default::default(), + } + } + + /// Retrieve a stored kubernetes resource, if available. + pub fn get(&self, ns: impl Into, name: impl Into) -> Option { + let ns = ns.into(); + let ns = if ns.is_empty() { None } else { Some(ns) }; + let name = name.into(); + match Arc::clone(&self.objects) + .lock() + .unwrap() + .entry(ns, name) + .get() + { + Ok(obj) => obj.clone().try_parse::().ok(), + _ => None, + } + } + + /// Store, or overwrite, the resource with the given name. + pub fn set(&self, ns: impl Into, name: impl Into, resource: R) -> R { + let ns = ns.into(); + let ns = if ns.is_empty() { None } else { Some(ns) }; + let name = name.into(); + let obj = serde_json::from_value::(serde_json::to_value(resource).unwrap()) + .unwrap(); + let (_, obj) = Arc::clone(&self.objects) + .lock() + .unwrap() + .entry(ns, name) + .update(obj) + .unwrap(); + obj.try_parse::().unwrap() + } + + /// Retrieve all the stored resources. if the resource is namespaced and ns is not None then + /// only resources in that namespace will be returned. 
+ pub fn all(&self, ns: impl Into) -> Vec { + let ns = ns.into(); + let ns = if ns.is_empty() { None } else { Some(ns) }; + Arc::clone(&self.objects) + .lock() + .unwrap() + .values(ns) + .cloned() + .filter_map(|v| v.try_parse::().ok()) + .collect::>() + } +} + +impl Default for ResourceHandler +where + R: Resource + DeserializeOwned + Serialize, +{ + fn default() -> Self { + Self::new() + } +} + +impl ResourceHandler { + fn maybe_generate_name(&self, meta: &mut ObjectMeta) { + if meta.name.is_none() { + if let Some(prefix) = &meta.generate_name { + meta.name = Some(format!( + "{prefix}{:05}", + self.gen_id.fetch_add(1, Ordering::SeqCst) + )); + } + } + } + + fn create(&self, body: Vec) -> Result> { + let mut obj = serde_json::from_reader::<&[u8], DynamicObject>(body.as_ref())?; + self.maybe_generate_name(&mut obj.metadata); + let ns = obj.metadata.namespace.clone(); + let name = obj.metadata.name.clone().unwrap(); + match Arc::clone(&self.objects) + .lock() + .unwrap() + .entry(ns, name) + .create(obj) + { + Ok(obj) => response(StatusCode::CREATED, obj), + Err(status) => response(StatusCode::from_u16(status.code).unwrap(), &status), + } + } + + fn retrieve(&self, ns: Option, name: String) -> Result> { + match Arc::clone(&self.objects) + .lock() + .unwrap() + .entry(ns, name) + .get() + { + Ok(obj) => response(StatusCode::OK, obj), + Err(status) => response(StatusCode::from_u16(status.code).unwrap(), &status), + } + } + + fn list(&self, ns: Option) -> Result> { + let list = ObjectList { + metadata: Default::default(), + items: Arc::clone(&self.objects) + .lock() + .unwrap() + .values(ns) + .cloned() + .collect(), + }; + response(StatusCode::OK, &list) + } + + fn update(&self, ns: Option, name: String, body: Vec) -> Result> { + let obj = serde_json::from_reader::<&[u8], DynamicObject>(body.as_ref())?; + match Arc::clone(&self.objects) + .lock() + .unwrap() + .entry(ns, name) + .update(obj) + { + Ok((true, obj)) => response(StatusCode::CREATED, &obj), + Ok((false, obj)) => response(StatusCode::OK, &obj), + Err(status) => response(StatusCode::from_u16(status.code).unwrap(), &status), + } + } + + fn update_subresource( + &self, + ns: Option, + name: String, + subresource: String, + body: Vec, + ) -> Result> { + let obj = serde_json::from_reader::<&[u8], DynamicObject>(body.as_ref())?; + match Arc::clone(&self.objects) + .lock() + .unwrap() + .entry(ns, name) + .update_subresource(subresource, obj) + { + Ok((true, obj)) => response(StatusCode::CREATED, &obj), + Ok((false, obj)) => response(StatusCode::OK, &obj), + Err(status) => response(StatusCode::from_u16(status.code).unwrap(), &status), + } + } + + fn delete(&self, ns: Option, name: String) -> Result> { + match Arc::clone(&self.objects) + .lock() + .unwrap() + .entry(ns, name) + .delete() + { + Ok(obj) => response(StatusCode::OK, &obj), + Err(status) => response(StatusCode::from_u16(status.code).unwrap(), &status), + } + } + + fn patch( + &self, + ns: Option, + name: String, + header: HeaderMap, + body: Vec, + ) -> Result> { + let content_type = match header.get("Content-Type") { + Some(v) => v.to_str().unwrap(), + None => "", + }; + match content_type { + "application/apply-patch+yaml" => { + let obj = serde_yaml::from_reader::<&[u8], DynamicObject>(body.as_ref())?; + match Arc::clone(&self.objects) + .lock() + .unwrap() + .entry(ns, name) + .apply(obj) + { + Ok(obj) => response(StatusCode::OK, &obj), + Err(status) => response(StatusCode::from_u16(status.code).unwrap(), &status), + } + } + ct => { + let status = 
status::invalid(&format!("unsupported patch type \"{ct}\"")); + response(StatusCode::from_u16(status.code).unwrap(), &status) + } + } + } +} + +fn response(status: StatusCode, data: &T) -> Result> { + let buf = serde_json::to_vec(data)?; + Ok(Response::builder().status(status).body(buf.into())?) +} + +impl Handler for ResourceHandler +where + R: Debug, +{ + fn api_resource(&self) -> ApiResource { + self.api_resource.clone() + } + + fn handle( + &self, + req: Request, + header: HeaderMap, + body: Vec, + ) -> Result> { + let Request { + verb, + ns, + name, + subresource, + .. + } = req; + match verb.as_str() { + "create" => self.create(body), + "delete" => self.delete(ns, name.unwrap()), + "get" => self.retrieve(ns, name.unwrap()), + "list" => self.list(ns), + "patch" => self.patch(ns, name.unwrap(), header, body), + "update" => { + if let Some(subresource) = subresource { + self.update_subresource(ns, name.unwrap(), subresource, body) + } else { + self.update(ns, name.unwrap(), body) + } + } + v => { + let api_resource = self.api_resource(); + super::status::method_not_allowed(&api_resource, name, v) + } + } + } +} diff --git a/kube_test/src/service.rs b/kube_test/src/service.rs new file mode 100644 index 00000000000..ffc4ef8641a --- /dev/null +++ b/kube_test/src/service.rs @@ -0,0 +1,54 @@ +use super::{request::ApiPlural, Call, Handler, Result}; +use http::{Request, Response}; +use hyper::Body; +use std::collections::HashMap; +use std::ops::DerefMut; +use std::sync::{Arc, Mutex}; +use std::task::{Context, Poll}; + +/// Service provides a [tower::Service] that acts like a kubernetes API server. +#[derive(Debug)] +pub struct Service { + handlers: Arc>>>, +} + +impl Service { + pub fn new() -> Self { + let handlers = Arc::new(Mutex::new(HashMap::new())); + Self { handlers } + } + + pub fn add_handler(&self, handler: Arc) { + let key = handler.api_resource().into(); + self.handlers + .lock() + .unwrap() + .deref_mut() + .insert(key, handler); + } +} + +impl Default for Service { + fn default() -> Self { + Self::new() + } +} + +impl tower::Service> for Service { + type Response = Response; + type Error = super::Error; + type Future = Call; + + fn poll_ready(&mut self, _cx: &mut Context<'_>) -> Poll> { + Poll::Ready(Ok(())) + } + + fn call(&mut self, req: Request) -> Self::Future { + let (parts, body) = req.into_parts(); + let req = super::request::Request::parse(&parts); + match self.handlers.lock().unwrap().get(&req.api_plural()) { + Some(handler) => Call::new(Some(Arc::clone(handler)), req, parts.headers, body), + None => Call::new(None, req, parts.headers, body), + } + } +} diff --git a/kube_test/src/status.rs b/kube_test/src/status.rs new file mode 100644 index 00000000000..cca92b63a78 --- /dev/null +++ b/kube_test/src/status.rs @@ -0,0 +1,61 @@ +use super::{request::ApiPlural, Result}; +use http::{Response, StatusCode}; +use hyper::Body; +use kube_core::{ApiResource, Status}; + +/// Generate an "Invalid" kubernetes status response. +pub(crate) fn invalid(message: &str) -> Status { + Status::failure(message, "Invalid").with_code(422) +} + +/// Generate an "AlreadyExists" kubernetes status response. +pub(crate) fn already_exists(api_resource: &ApiResource, name: Option<&str>) -> Status { + let resource_id = resource_id(&api_resource.group, &api_resource.kind, name); + Status::failure( + format!("{resource_id} already exists",).as_str(), + "AlreadyExists", + ) + .with_code(StatusCode::CONFLICT.as_u16()) +} + +/// Generate a "NotFound" kubernetes status response for a resource. 
+pub(crate) fn resource_not_found(api_plural: &ApiPlural) -> Status { + Status::failure(&format!("resource {api_plural} not found"), "NotFound") + .with_code(StatusCode::NOT_FOUND.as_u16()) +} + +/// Generate a "NotFound" kubernetes status response. +pub(crate) fn not_found(api_resource: &ApiResource, name: Option<&str>) -> Status { + let resource_id = resource_id(&api_resource.group, &api_resource.kind, name); + Status::failure(&format!("{resource_id} not found"), "NotFound") + .with_code(StatusCode::NOT_FOUND.as_u16()) +} + +/// Generate a "MethodNotAllowed" kubernetes status response. +pub(crate) fn method_not_allowed( + api_resource: &ApiResource, + name: Option, + method: &str, +) -> Result> { + let resource_id = resource_id(&api_resource.group, &api_resource.kind, name.as_deref()); + let status = Status::failure( + format!("method {method} not allowed for {resource_id}").as_str(), + "MethodNotAllowed", + ) + .with_code(StatusCode::METHOD_NOT_ALLOWED.as_u16()); + response(&status) +} + +fn response(status: &Status) -> Result> { + let buf = serde_json::to_vec(status)?; + Ok(Response::builder().status(status.code).body(buf.into())?) +} + +fn resource_id(group: &str, kind: &str, name: Option<&str>) -> String { + match (name, group.is_empty()) { + (None, true) => format!("resource {kind}"), + (None, false) => format!("resource {group}.{kind}"), + (Some(name), true) => format!("{kind} {name}"), + (Some(name), false) => format!("{group}.{kind} {name}"), + } +} diff --git a/logfmt/Cargo.toml b/logfmt/Cargo.toml index e7eceb04764..c194cba7f84 100644 --- a/logfmt/Cargo.toml +++ b/logfmt/Cargo.toml @@ -6,13 +6,16 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # In alphabetical order observability_deps = { path = "../observability_deps" } tracing-subscriber = "0.3" workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] # In alphabetical order -once_cell = { version = "1.18", features = ["parking_lot"] } +once_cell = { version = "1.19", features = ["parking_lot"] } parking_lot = "0.12" regex = "1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } diff --git a/metric/Cargo.toml b/metric/Cargo.toml index d7ced7e0e67..b177d090e5a 100644 --- a/metric/Cargo.toml +++ b/metric/Cargo.toml @@ -5,6 +5,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # In alphabetical order parking_lot = "0.12" workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/metric/src/counter.rs b/metric/src/counter.rs index 6e31585f4e8..b8c1cd41442 100644 --- a/metric/src/counter.rs +++ b/metric/src/counter.rs @@ -2,7 +2,10 @@ use crate::{MetricKind, MetricObserver, Observation}; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; -/// A monotonic counter +/// A monotonic counter. +/// +/// A [`U64Counter`]` is an internally reference counted type, and all mutations +/// to cloned instances mutate the same underlying counter. 
#[derive(Debug, Clone, Default)] pub struct U64Counter { state: Arc, diff --git a/metric/src/duration.rs b/metric/src/duration.rs index 7a0728e1e37..6fd9750ffbc 100644 --- a/metric/src/duration.rs +++ b/metric/src/duration.rs @@ -126,6 +126,14 @@ impl DurationHistogram { count, ) } + + pub fn reset(&self) { + self.inner.reset(); + } + + pub fn percentile(&self, percentile: u64) -> Duration { + Duration::from_nanos(self.inner.percentile(percentile)) + } } /// `DurationHistogramOptions` allows configuring the buckets used by `DurationHistogram` diff --git a/metric/src/histogram.rs b/metric/src/histogram.rs index 4416c849780..099ef66b494 100644 --- a/metric/src/histogram.rs +++ b/metric/src/histogram.rs @@ -65,6 +65,41 @@ impl U64Histogram { state.total = state.total.wrapping_add(value * count); } } + + pub fn reset(&self) { + let mut state = self.shared.lock(); + for bucket in &mut state.buckets { + bucket.count = 0; + } + state.total = 0; + } + + /// percentile returns the bucket threshold for the given percentile. + /// For example, if you want the median value, percentile(50) will return the 'le' threshold + /// for the histogram bucket that contains the median sample. + /// + /// A use case for for this function is: + /// Use a histogram tracks the load placed on a system. + /// Set the buckets so they represent load levels of idle/low/medium/high/overloaded. + /// Then use percentile to determine how much of the time is spent at various load levels. + /// e.g. if percentile(50) comes come back with the low load threshold, the median load on the system is low + pub fn percentile(&self, percentile: u64) -> u64 { + let state = self.shared.lock(); + + // we need the total quantity of samples, not the sum of samples. + let total: u64 = state.buckets.iter().map(|bucket| bucket.count).sum(); + + let target = total * percentile / 100; + + let mut sum = 0; + for bucket in &state.buckets { + sum += bucket.count; + if sum >= target { + return bucket.le; + } + } + 0 + } } impl MakeMetricObserver for U64Histogram { @@ -162,5 +197,61 @@ mod tests { histogram.record(0); assert_eq!(histogram.observe(), buckets(&[2, 1, 1], 80)); + + // Now test the percentile reporting function + let options = U64HistogramOptions::new(vec![0, 1, 2, 4, 8, 16, 32, u64::MAX]); + let histogram = U64Histogram::create(&options); + + histogram.record(0); // bucket 0, le 0 + histogram.record(2); // bucket 2, le 2 + histogram.record(3); // bucket 3, le 4 + histogram.record(3); // bucket 3, le 4 + histogram.record(20); // bucket 6, le 32 + histogram.record(20000); // bucket 7, le u64::MAX + histogram.record(20000); // bucket 7, le u64::MAX + histogram.record(20000); // bucket 7, le u64::MAX + histogram.record(20000); // bucket 7, le u64::MAX + histogram.record(20000); // bucket 7, le u64::MAX + + // Of the 10 samples above: + // 1 (10%) is in bucket 0, le 0 + // 1 (10%) is in bucket 2, le 2 + // 2 (20%) are in bucket 3, le 4 + // 1 (10%) is in bucket 6, le 32 + // 5 (50%) are in bucket 7, le u64::MAX + + // request percentiles falling in bucket 0, le 0 + assert_eq!(histogram.percentile(3), 0); + assert_eq!(histogram.percentile(10), 0); + assert_eq!(histogram.percentile(19), 0); + + // request percentiles falling in bucket 2, le 2 + assert_eq!(histogram.percentile(20), 2); + assert_eq!(histogram.percentile(29), 2); + + // requests percentiles falling in bucket 3, le 4 + assert_eq!(histogram.percentile(30), 4); + assert_eq!(histogram.percentile(49), 4); + + // requests percentiles falling in bucket 6, le 32 + 
assert_eq!(histogram.percentile(50), 32); + assert_eq!(histogram.percentile(59), 32); + + // requests percentiles falling in bucket 6, le 32 + assert_eq!(histogram.percentile(60), u64::MAX); + assert_eq!(histogram.percentile(80), u64::MAX); + assert_eq!(histogram.percentile(100), u64::MAX); + + // test reset + histogram.reset(); + assert_eq!(histogram.percentile(100), 0); + histogram.record(1); // bucket 1, le 1 + histogram.record(2); // bucket 2, le 2 + histogram.record(3); // bucket 3, le 4 + histogram.record(3); // bucket 3, le 4 + assert_eq!(histogram.percentile(0), 0); + assert_eq!(histogram.percentile(25), 1); + assert_eq!(histogram.percentile(49), 1); + assert_eq!(histogram.percentile(50), 2); } } diff --git a/metric/src/lib.rs b/metric/src/lib.rs index ccd0ddceb33..23b085eef8c 100644 --- a/metric/src/lib.rs +++ b/metric/src/lib.rs @@ -280,8 +280,7 @@ pub trait Instrument: std::fmt::Debug + Send + Sync { /// - call finish_metric once complete fn report(&self, reporter: &mut dyn Reporter); - /// Returns the type as [`Any`] so that it can be downcast to - /// it underlying type + /// Returns the type as [`Any`] so that it can be downcast to its underlying type fn as_any(&self) -> &dyn Any; } diff --git a/metric/src/metric.rs b/metric/src/metric.rs index cd0582c7b48..04b0a20e7db 100644 --- a/metric/src/metric.rs +++ b/metric/src/metric.rs @@ -265,6 +265,7 @@ pub struct ResultMetric { pub ok: T, pub client_error: T, pub server_error: T, + pub unexpected_response: T, } impl ResultMetric @@ -279,12 +280,16 @@ where let client_error = metric.recorder(attributes.clone()); attributes.insert("status", "server_error"); - let server_error = metric.recorder(attributes); + let server_error = metric.recorder(attributes.clone()); + + attributes.insert("status", "unexpected_response"); + let unexpected_response = metric.recorder(attributes); Self { ok, client_error, server_error, + unexpected_response, } } } diff --git a/metric_exporters/Cargo.toml b/metric_exporters/Cargo.toml index e1edd6fe9a4..dc70a674b8b 100644 --- a/metric_exporters/Cargo.toml +++ b/metric_exporters/Cargo.toml @@ -5,6 +5,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # In alphabetical order observability_deps = { path = "../observability_deps" } metric = { path = "../metric" } diff --git a/mutable_batch/Cargo.toml b/mutable_batch/Cargo.toml index 251c83f85db..21bbf5277dd 100644 --- a/mutable_batch/Cargo.toml +++ b/mutable_batch/Cargo.toml @@ -6,24 +6,24 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] -arrow = { workspace = true, features = ["prettyprint"] } +arrow = { workspace = true } arrow_util = { path = "../arrow_util" } -chrono = { version = "0.4", default-features = false } data_types = { path = "../data_types" } +hashbrown = { workspace = true } iox_time = { path = "../iox_time" } +itertools = "0.12" schema = { path = "../schema" } -snafu = "0.7" -hashbrown = { workspace = true } -itertools = "0.11" +snafu = "0.8" workspace-hack = { version = "0.1", path = "../workspace-hack" } -percent-encoding = "2.2.0" -thiserror = "1.0.48" -unicode-segmentation = "1.10.1" [dev-dependencies] assert_matches = "1.5.0" mutable_batch_lp = { path = "../mutable_batch_lp" } -paste = "1.0.14" -proptest = { version = "1.2.0", default-features = false } +partition = { path = "../partition" } +pretty_assertions = "1.4.0" +proptest = { version = "1.4.0", default-features = false } rand = "0.8" 
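Note on the `percentile()` helper introduced above: the load-level use case sketched in its doc comment is easiest to see in a short, self-contained example. The following is illustrative only and not part of the patch; it assumes `U64Histogram`, `U64HistogramOptions`, and the `MakeMetricObserver` trait are importable from the `metric` crate root, and it constructs the histogram the same way the patch's own tests do (via `U64Histogram::create`). The bucket thresholds and sample values are made up for the example.

use metric::{MakeMetricObserver, U64Histogram, U64HistogramOptions};

fn load_level_sketch() {
    // Hypothetical `le` thresholds modelling load levels:
    // idle (le 0), low (le 10), medium (le 50), high (le 90), overloaded (le u64::MAX).
    let options = U64HistogramOptions::new(vec![0, 10, 50, 90, u64::MAX]);
    let histogram = U64Histogram::create(&options);

    // Record one load sample per observation interval.
    for sample in [0u64, 5, 7, 12, 45, 60, 95, 100] {
        histogram.record(sample);
    }

    // percentile(50) returns the `le` threshold of the bucket containing the
    // median sample; with the samples above that is the "medium" bucket.
    assert_eq!(histogram.percentile(50), 50);
}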
diff --git a/mutable_batch/src/column.rs b/mutable_batch/src/column.rs index 665ee9e7a84..17d43053a26 100644 --- a/mutable_batch/src/column.rs +++ b/mutable_batch/src/column.rs @@ -13,7 +13,7 @@ use arrow_util::{bitset::BitSet, string::PackedStringArray}; use data_types::{StatValues, Statistics}; use schema::{InfluxColumnType, InfluxFieldType, TIME_DATA_TYPE}; use snafu::{ResultExt, Snafu}; -use std::{fmt::Formatter, mem, sync::Arc}; +use std::{fmt::Formatter, iter, mem, num::NonZeroU64, sync::Arc}; /// A "dictionary ID" (DID) is a compact numeric representation of an interned /// string in the dictionary. The same string always maps the same DID. @@ -25,11 +25,38 @@ use std::{fmt::Formatter, mem, sync::Arc}; pub(crate) type DID = i32; /// An invalid DID used for NULL rows -pub(crate) const INVALID_DID: DID = -1; +pub(crate) const NULL_DID: DID = -1; /// The type of the dictionary used type Dictionary = arrow_util::dictionary::StringDictionary; +/// A type-agnostic way of splitting the various [`ColumnData`] arrays. +/// +/// This macro is required because it's not possible to write a generic function +/// that operates on all "data" types across [`ColumnData`] variants.` +macro_rules! split_off_column { + ($self:expr, $data:expr, $n:expr, $stats:expr, $right_nulls:expr, $($ty:tt)+) => {{ + // Compute the new number of nulls in the left side of the split. + let left_nulls = $stats.null_count.map(|v| v - $right_nulls); + + // Update the stats for the left side of the split. + *$stats = StatValues::new(None, None, $self.valid.len() as u64, left_nulls); + + // Generate the right side of the split (with minimal stats). + let right_data = $data.split_off($n); + let right_len = right_data.len(); + $($ty)+( + right_data, + StatValues::new( + None, + None, + right_len as _, + Some($right_nulls), + ), + ) + }}; +} + #[derive(Debug, Snafu)] #[allow(missing_copy_implementations, missing_docs)] pub enum Error { @@ -63,11 +90,28 @@ pub struct Column { #[derive(Debug, Clone)] #[allow(missing_docs)] pub enum ColumnData { - F64(Vec, StatValues), + /// These types contain arrays that contain an element for every logical row + /// (including nulls). + /// + /// Null values are padded with an arbitrary dummy value. + F64(Vec, StatValues), // NaN is ignored when computing statistics. I64(Vec, StatValues), U64(Vec, StatValues), - String(PackedStringArray, StatValues), Bool(BitSet, StatValues), + + /// The String encoding contains an entry for every logical row, and + /// explicitly stores an empty string in the PackedStringArray for NULL + /// values. + String(PackedStringArray, StatValues), + + /// Whereas the dictionary encoding does not store an explicit empty string + /// in the internal PackedStringArray, nor does it create an entry in the + /// dedupe map. A NULL entry is padded into the data vec using the + /// [`NULL_DID`] value. + /// + /// Every distinct, non-null value is stored in the dictionary exactly once, + /// and the data arrays contains the dictionary ID for every logical row + /// (including nulls as described above). Tag(Vec, Dictionary, StatValues), } @@ -97,6 +141,9 @@ impl Column { // Keep track of how many total rows there are let total_count = row_count as u64; + // If there are no values, there are no distinct values. 
+ let distinct_count = if row_count > 0 { Some(1) } else { None }; + let data = match column_type { InfluxColumnType::Field(InfluxFieldType::Boolean) => { let mut data = BitSet::new(); @@ -119,12 +166,12 @@ impl Column { } InfluxColumnType::Field(InfluxFieldType::String) => ColumnData::String( PackedStringArray::new_empty(row_count), - StatValues::new_all_null(total_count, Some(1)), + StatValues::new_all_null(total_count, distinct_count), ), InfluxColumnType::Tag => ColumnData::Tag( - vec![INVALID_DID; row_count], + vec![NULL_DID; row_count], Default::default(), - StatValues::new_all_null(total_count, Some(1)), + StatValues::new_all_null(total_count, distinct_count), ), }; @@ -182,7 +229,7 @@ impl Column { stats.update_for_nulls(delta as u64); } ColumnData::Tag(data, _dict, stats) => { - data.resize(len, INVALID_DID); + data.resize(len, NULL_DID); stats.update_for_nulls(delta as u64); } } @@ -323,4 +370,819 @@ impl Column { Ok(data) } + + /// Split this [`Column`] at the specified row boundary, such that after + /// this call, `self` contains the range of rows indexed from `[0, n)` and + /// the returned value contains `[n, len)`. + /// + /// # Statistics + /// + /// For performance reasons, this operation leaves `self` and the returned + /// [`Column`] with reduced summary statistics available. + /// + /// This allows the caller to selectively reconstruct the statistics that + /// will be useful to the caller, instead of always paying the price of + /// recomputing statistics, even if unused. + /// + /// For the following column types: + /// + /// - [`ColumnData::F64`] + /// - [`ColumnData::I64`] + /// - [`ColumnData::U64`] + /// - [`ColumnData::Bool`] + /// - [`ColumnData::String`] + /// + /// The statistics for both [`Column`] contain only: + /// + /// - Total count + /// - NULL count (see below) + /// + /// The NULL count is always present in the returned [`Column`], and only + /// present in `self` if it had a NULL count statistic prior to the split. + /// + /// For [`ColumnData::Tag`] all the statistics above are included, with the + /// addition of the distinct count. + /// + /// # Performance + /// + /// This call is `O(n)` where `n` is the number of elements in the right + /// side of the split (the `[n, len)` interval) due to the need to copy + /// and process these elements only. + /// + /// The size of the left-side interval (the [0, n) interval) does not affect + /// performance of this call. + pub fn split_off(&mut self, n: usize) -> Self { + if n > self.len() { + return Self::new(0, self.influx_type); + } + + // Split the null mask into [0, n) and [n, len). + let right_bitmap = self.valid.split_off(n); + + // Compute the null count for the right side. + let right_nulls = right_bitmap.count_zeros() as u64; + + // Split the actual data and update/compute the statistics. 
+ let right_data = match &mut self.data { + ColumnData::F64(data, left_stats) => { + split_off_column!(self, data, n, left_stats, right_nulls, ColumnData::F64) + } + ColumnData::I64(data, left_stats) => { + split_off_column!(self, data, n, left_stats, right_nulls, ColumnData::I64) + } + ColumnData::U64(data, left_stats) => { + split_off_column!(self, data, n, left_stats, right_nulls, ColumnData::U64) + } + ColumnData::String(data, left_stats) => { + split_off_column!(self, data, n, left_stats, right_nulls, ColumnData::String) + } + ColumnData::Bool(data, left_stats) => { + split_off_column!(self, data, n, left_stats, right_nulls, ColumnData::Bool) + } + ColumnData::Tag(data, dict, left_stats) => { + // Split the tag data at the value index. + let mut new_data = data.split_off(n); + + // "new_data" now contains values [n, len), and likely no longer + // references all the values in the current dictionary. + // + // Generate a dictionary for "new_data" that contains only the + // values that appear in "new_data", and rewrite the dictionary + // IDs in "new_data" to reflect this new mapping. + let new_dict = rebuild_dictionary(dict, &mut new_data); + + // The original "dict" may now contain references to keys that + // appear only in "new_data", and never in the "data" that + // remains. + // + // Rewrite this dictionary, to shrink it to contain only entries + // that appear in "data". + // + // Note: this may not be required if Arrow can tolerate a + // dictionary with more keys than necessary, but it optimises + // for memory utilisation. + *dict = rebuild_dictionary(dict, data); + + // Compute how many NULLs are left in the left side. + let left_nulls = left_stats.null_count.map(|v| v - right_nulls); + + // It's effectively free to compute the distinct count of a + // column using dictionary encoding - it's simply the length of + // the dictionary, and plus one if a NULL exists - maintain + // distinct counts in the returned statistics. + let make_distinct_count = |dict: &Dictionary, has_null| { + let mut count = dict.values().len(); + if has_null { + count += 1; + } + NonZeroU64::try_from(count as u64).ok() + }; + + let left_distinct = make_distinct_count(dict, left_nulls.unwrap_or_default() > 0); + let right_distinct = make_distinct_count(&new_dict, right_nulls > 0); + + // Update the stats for the left side of the split. + *left_stats = StatValues::new_with_distinct( + None, + None, + self.valid.len() as _, + left_nulls, + left_distinct, + ); + + // Generate the right side of the split. + let new_len = new_data.len(); + ColumnData::Tag( + new_data, + new_dict, + StatValues::new_with_distinct( + None, + None, + new_len as _, + Some(right_nulls), + right_distinct, + ), + ) + } + }; + + Self { + influx_type: self.influx_type, + valid: right_bitmap, + data: right_data, + } + } +} + +/// Constructs a new, minimal dictionary for `data`, rewriting the dictionary +/// IDs in `data` to use the new returned dictionary. +fn rebuild_dictionary(original: &Dictionary, data: &mut [DID]) -> Dictionary { + let mut dict = Dictionary::new(); + + for id in data.iter_mut() { + if *id == NULL_DID { + continue; + } + let value = original + .lookup_id(*id) + .expect("original dictionary does not contain value"); + *id = dict.lookup_value_or_insert(value); + } + + dict +} + +/// Recompute the min/max values for the given [`Column`]. 
+/// +/// This is an `O(n)` operation for: +/// +/// - [`ColumnData::F64`] +/// - [`ColumnData::I64`] +/// - [`ColumnData::U64`] +/// - [`ColumnData::Bool`] +/// - [`ColumnData::String`] +/// +/// This is an `O(distinct(n))` operation for [`ColumnData::Tag`]. +pub fn recompute_min_max(c: &mut Column) { + match &mut c.data { + // A specialised implementation for floats is required to filter out NaN + // values in order to match the behaviour of `StatValues::update()`. + ColumnData::F64(data, stats) => { + data.iter() + .zip(c.valid.iter()) + .filter_map(|(v, valid)| { + if !valid || v.is_nan() { + // NaN are completely ignored in stats. + return None; + } + Some(*v) + }) + .for_each(|v| { + stats.min = Some(stats.min.unwrap_or(v).min(v)); + stats.max = Some(stats.max.unwrap_or(v).max(v)); + }); + } + + // A specialised implementation for boolean values for significantly + // improved performance. + ColumnData::Bool(data, stats) => { + // Process 8 values at a time by evaluating against the underlying + // bytes directly in both the validity and value bitsets. + // + // Invariant: the excess bits beyond "bitset.len()" are always 0. + let iter = c.valid.bytes().iter().zip(data.bytes().iter()); + + let mut contains_false = false; + let mut contains_true = false; + + for (valid, data) in iter { + // Set bits only if they're non-null and 1. + contains_true |= valid & data > 0; + + // Set bits only if they're non-null and 0. + contains_false |= valid & !data > 0; + + // Short circuit if both have been observed. + if contains_false && contains_true { + break; + } + } + + // If all values are NULL, no real values were observed, and the + // stats should be cleared (as the stats ignore NULLs). + if !contains_false && !contains_true { + stats.min = None; + stats.max = None; + return; + } + + stats.min = Some(!contains_false); + stats.max = Some(contains_true); + } + + // The rest of the data types use `recompute_min_max_for()`. + ColumnData::I64(data, stats) => { + if let Some((min, max)) = recompute_min_max_for(data.iter(), c.valid.iter()) { + stats.min = Some(*min); + stats.max = Some(*max); + } + } + ColumnData::U64(data, stats) => { + if let Some((min, max)) = recompute_min_max_for(data.iter(), c.valid.iter()) { + stats.min = Some(*min); + stats.max = Some(*max); + } + } + + // Optimised to avoid cloning the string for every change in min/max + // value, instead this clones the strings at most once for each of + // min/max. + // + // This applies to both the String and Tag data types. + ColumnData::String(data, stats) => { + if let Some((min, max)) = recompute_min_max_for(data.iter(), c.valid.iter()) { + stats.min = Some(min.to_string()); + stats.max = Some(max.to_string()); + } + } + ColumnData::Tag(_, dict, stats) => { + // The dictionary does not store a representation of NULL, so all + // the values in the dictionary are candidates for min/max. + if let Some((min, max)) = + recompute_min_max_for(dict.values().iter(), iter::repeat(true)) + { + stats.min = Some(min.to_string()); + stats.max = Some(max.to_string()); + } + } + } +} + +/// Compute the min/max values of `data`, filtering out any values with +/// corresponding positions in `valid` that are `false`. 
+fn recompute_min_max_for<'a, T>( + data: impl IntoIterator, + valid: impl IntoIterator, +) -> Option<(&'a T, &'a T)> +where + T: Ord + ?Sized, +{ + let (min, max) = data + .into_iter() + .zip(valid.into_iter()) + .filter_map(|(v, valid)| if valid { Some(v) } else { None }) + .fold((None, None), |acc, v| { + ( + Some(acc.0.unwrap_or(v).min(v)), + Some(acc.1.unwrap_or(v).max(v)), + ) + }); + + min.zip(max) +} + +#[cfg(test)] +mod tests { + use std::{borrow::Borrow, collections::HashSet, fmt::Debug, mem::discriminant}; + + use arrow::record_batch::RecordBatch; + use arrow_util::assert_batches_eq; + use assert_matches::assert_matches; + use data_types::IsNan; + use proptest::prelude::*; + + use super::*; + + fn hydrate(dict: &Dictionary, data: &[DID]) -> Vec { + data.iter() + .map(|&id| dict.lookup_id(id).unwrap().to_string()) + .collect::>() + } + + /// Take an iterator of nullable `T`, and convert it into a vector of + /// non-optional values and a null mask compatible with [`ColumnData`]. + /// + /// Returns the number of nulls in `data`. + fn densify(data: impl IntoIterator>) -> (Vec, BitSet, usize) + where + U: ToOwned, + T: Default, + { + let mut out = Vec::new(); + let mut bitmap = BitSet::new(); + let mut nulls = 0; + for v in data.into_iter() { + match v { + Some(v) => { + bitmap.append_set(1); + out.push(v.to_owned()); + } + None => { + out.push(Default::default()); + bitmap.append_unset(1); + nulls += 1; + } + } + } + + (out, bitmap, nulls) + } + + #[test] + #[allow(clippy::bool_assert_comparison)] + fn test_densify() { + let input = [None, Some(42), None, None, Some(24)]; + + let (got, nulls, count) = densify(input); + assert_eq!(got, [0, 42, 0, 0, 24]); // NULLS are populated with 0 (not sparse representation) + assert_eq!(nulls.get(0), false); + assert_eq!(nulls.get(1), true); + assert_eq!(nulls.get(2), false); + assert_eq!(nulls.get(3), false); + assert_eq!(nulls.get(4), true); + assert_eq!(nulls.len(), 5); + assert_eq!(count, 3); + } + + #[test] + fn test_rewrite_dictionary() { + let mut original = Dictionary::new(); + let mut data = vec![]; + + // Input strings to be dictionary encoded. 
+ let input = [ + "bananas", "platanos", "bananas", "platanos", "ananas", "ananas", "ananas", + ]; + + for v in input { + data.push(original.lookup_value_or_insert(v)); + } + + assert_eq!(data.len(), input.len()); + assert_eq!(original.values().len(), 3); // 3 distinct values + + let mut new_data = data.split_off(3); + let new_dict = rebuild_dictionary(&original, &mut new_data); + let old_dict = rebuild_dictionary(&original, &mut data); + + let new_data_hydrated = hydrate(&new_dict, &new_data); + let old_data_hydrated = hydrate(&old_dict, &data); + + assert_eq!( + new_data_hydrated, + ["platanos", "ananas", "ananas", "ananas"] + ); + assert_eq!(old_data_hydrated, ["bananas", "platanos", "bananas"]); + + assert_eq!(new_dict.values().len(), 2); // 2 distinct values + assert_eq!(old_dict.values().len(), 2); // 2 distinct values + } + + #[test] + fn test_split_off() { + let (data, valid, _) = densify([Some(42), None, None, Some(24)]); + valid.to_arrow(); + + let mut col = Column { + influx_type: InfluxColumnType::Field(InfluxFieldType::UInteger), + valid, + data: ColumnData::U64(data, StatValues::new(None, None, 4, Some(2))), + }; + + let mut schema = schema::SchemaBuilder::new(); + schema.influx_column("bananas", col.influx_type()); + let schema = schema.build().unwrap(); + + // Before the split + let batch = RecordBatch::try_new( + schema.clone().into(), + vec![col.to_arrow().expect("failed to covert column to arrow")], + ) + .expect("failed to build record batch"); + assert_batches_eq!( + [ + "+---------+", + "| bananas |", + "+---------+", + "| 42 |", + "| |", + "| |", + "| 24 |", + "+---------+", + ], + &[batch] + ); + + let col2 = col.split_off(2); + + // After the split, the input column + let batch = RecordBatch::try_new( + schema.clone().into(), + vec![col.to_arrow().expect("failed to covert column to arrow")], + ) + .expect("failed to build record batch"); + assert_batches_eq!( + [ + "+---------+", + "| bananas |", + "+---------+", + "| 42 |", + "| |", + "+---------+", + ], + &[batch] + ); + + // After the split, the split off column + let batch = RecordBatch::try_new( + schema.into(), + vec![col2.to_arrow().expect("failed to covert column to arrow")], + ) + .expect("failed to build record batch"); + assert_batches_eq!( + [ + "+---------+", + "| bananas |", + "+---------+", + "| |", + "| 24 |", + "+---------+", + ], + &[batch] + ); + } + + const MAX_ROWS: usize = 20; + + /// Returns a vector of `Option`. + fn sparse_array(s: impl Strategy) -> impl Strategy>> + where + T: Debug, + { + prop::collection::vec(prop::option::of(s), 0..MAX_ROWS) + } + + /// Produces a valid [`Column`]` of an arbitrary data type and data. + /// + /// The embedded statistics do not contain min/max values but otherwise + /// model a column within a [`MutableBatch`] produced by a [`Writer`]. 
+ /// + /// [`MutableBatch`]: crate::MutableBatch + /// [`Writer`]: crate::writer::Writer + fn arbitrary_column() -> impl Strategy { + prop_oneof![ + sparse_array(any::()).prop_map(|v| { + let (data, valid, null_count) = densify(v.clone()); + Column { + influx_type: InfluxColumnType::Field(InfluxFieldType::Float), + valid, + data: ColumnData::F64( + data, + StatValues::new(None, None, v.len() as _, Some(null_count as _)), + ), + } + }), + sparse_array(any::()).prop_map(|v| { + let (data, valid, null_count) = densify(v.clone()); + Column { + influx_type: InfluxColumnType::Field(InfluxFieldType::Integer), + valid, + data: ColumnData::I64( + data, + StatValues::new(None, None, v.len() as _, Some(null_count as _)), + ), + } + }), + sparse_array(any::()).prop_map(|v| { + let (data, valid, null_count) = densify(v.clone()); + Column { + influx_type: InfluxColumnType::Field(InfluxFieldType::UInteger), + valid, + data: ColumnData::U64( + data, + StatValues::new(None, None, v.len() as _, Some(null_count as _)), + ), + } + }), + sparse_array(any::()).prop_map(|v| { + let (strings, valid, null_count) = densify(v.clone()); + let mut data = PackedStringArray::new(); + for s in strings { + data.append(&s); + } + Column { + influx_type: InfluxColumnType::Field(InfluxFieldType::String), + valid, + data: ColumnData::String( + data, + StatValues::new(None, None, v.len() as _, Some(null_count as _)), + ), + } + }), + sparse_array(any::()).prop_map(|v| { + let (values, valid, null_count) = densify(v.clone()); + let mut data = BitSet::new(); + for v in values { + match v { + true => data.append_set(1), + false => data.append_unset(1), + } + } + Column { + influx_type: InfluxColumnType::Field(InfluxFieldType::Boolean), + valid, + data: ColumnData::Bool( + data, + StatValues::new(None, None, v.len() as _, Some(null_count as _)), + ), + } + }), + // This artificially weights string generation to produce arrays + // with a higher chance of covering both dense and sparse arrays + // where distinct values != array length. + prop_oneof![ + sparse_array( + prop::string::string_regex("[a-b]").expect("invalid repetition regex") + ), + sparse_array(any::()), + ] + .prop_map(|v| { + // The NULL encoding of the dictionary is a bit of a snowflake. + // + // Walk the NULL-able input, and for any NULLs insert NULL_DID + // into the data array without inserting into the dictionary. + let mut data = Vec::new(); + let mut dict = Dictionary::new(); + let mut valid = BitSet::new(); + + let mut nulls = 0; + for v in &v { + match v { + Some(v) => { + valid.append_set(1); + data.push(dict.lookup_value_or_insert(v)); + } + None => { + data.push(NULL_DID); + valid.append_unset(1); + nulls += 1; + } + } + } + + // A NULL is a distinct value, that does not appear in the + // dictionary. + let distinct_count = if nulls > 0 { + dict.values().len() + 1 + } else { + dict.values().len() + }; + + Column { + influx_type: InfluxColumnType::Tag, + valid, + data: ColumnData::Tag( + data, + dict, + StatValues::new_with_distinct( + None, + None, + v.len() as _, + Some(nulls), + NonZeroU64::try_from(distinct_count as u64).ok(), + ), + ), + } + }), + ] + } + // Set the number of test cases higher than the default (256) to ensure better + // coverage of the generated arbitrary columns without compromising too + // much on the input space. + proptest! { + #![proptest_config(ProptestConfig::with_cases(2048))] + + /// Asserts the correctness of the [`Column::split_off()`] method, using + /// the Arrow "Array" slice method as a test oracle. 
+ /// + /// Asserts the following invariants after splitting: + /// + /// - Never panics due to out-of-bounds split position + /// - Data types remain unchanged + /// - Metadata for influx data model unchanged + /// - NULL mask is of the correct length + /// - Data length matches count statistics + /// - NULL value count matches NULL count statistics + /// - Tag distinct values matches distinct count statistics + /// - Tag dictionary contains correct number of entries, with NULLs + /// - Total count statistics are equal to input statistics + /// - NULL count statistics are equal to input statistics + /// - Both sides of the split match equivalent Arrow oracle splits + /// + #[test] + fn prop_split_off( + input in arbitrary_column(), + split_at in 0..=MAX_ROWS, + ) { + // Split the column. + let mut col = input.clone(); + let col2 = col.split_off(split_at); + + // Assert no rows were lost. + assert_eq!(col.len() + col2.len(), input.len()); + + // Because "split_at" may be greater than the number of rows in the + // input column, compute how many rows should remain after the + // split. + let want_remaining_rows = input.len().min(split_at); + assert_eq!(col.len(), want_remaining_rows); + + // And validate the rest of the rows wound up in the col2 half. + assert_eq!(col2.len(), input.len() - want_remaining_rows); + + for c in [&col, &col2] { + // The data type should remain the same. + assert_eq!(c.influx_type(), input.influx_type()); + assert_eq!(discriminant(c.data()), discriminant(input.data())); + + // Inspect the statistics for each. + let data_len = match c.data() { + ColumnData::F64(data, _) => data.len(), + ColumnData::I64(data, _) => data.len(), + ColumnData::U64(data, _) => data.len(), + ColumnData::String(data, _) => data.len(), + ColumnData::Bool(data, _) => data.len(), + ColumnData::Tag(data, dict, stats) => { + // Tags have an additional distinct count statistics + // maintained throughout the split. + let want = stats.distinct_count.map(|v| v.get()).unwrap_or_default(); + let have = data.iter().collect::>().len() as u64; + assert_eq!(have, want); + + // If there are no nulls, the dictionary length must + // match the number of distinct values. If there are + // NULLs, +1 to the dictionary length (it does not + // contain NULLs). + if stats.null_count.unwrap_or_default() == 0 { + assert_eq!(have, dict.values().len() as u64); + } else { + // Otherwise there must be one more distinct value. + assert_eq!(have, dict.values().len() as u64 + 1); + } + + data.len() + }, + }; + + // First check the consistency of the total count: + assert_eq!(c.valid_mask().len(), data_len); + assert_eq!(data_len as u64, c.stats().total_count()); + + // Null counts: + let nulls = c.valid_mask().count_zeros() as u64; + assert_eq!(c.stats().null_count(), Some(nulls)); + } + + // The sum of various statistics must match the input counts. + let count = col.stats().total_count() + col2.stats().total_count(); + assert_eq!(input.stats().total_count(), count); + + // Null counts must sum to the input count + let nulls = col.stats().null_count().unwrap_or_default() + + col2.stats().null_count().unwrap_or_default(); + assert_eq!(input.stats().null_count().unwrap_or_default(), nulls); + + // Generate arrow arrays from both inputs + let col = col.to_arrow().unwrap(); + let col2 = col2.to_arrow().unwrap(); + + // And the test oracle + let input = input.to_arrow().unwrap(); + + // Slice the input data using arrow's slice methods. 
+ let want = input.slice(0, split_at.min(input.len())); + + // And assert the split_off() data is equal. + assert!(col.eq(&want)); + + // Only attempt to slice off and validate the right side if it would + // be non-empty (or arrow panics) + if split_at >= input.len() { + assert_eq!(col2.len(), 0); + } else { + let want2 = input.slice(split_at, input.len() - split_at); + assert!(col2.eq(&want2)); + } + } + + /// Exercise [`recompute_min_max()`] against a [`Column`], asserting the + /// resulting [`StatValues`] match that produced by using the [`Writer`] + /// to populate the [`Column`]. + #[test] + fn prop_recompute_min_max( + mut input in arbitrary_column(), + ) { + // Compute a `StatValues` using the test oracle implementation. + fn stats_oracle(data: S, valid: impl IntoIterator) -> StatValues + where + S: IntoIterator, + T: Borrow, + U: ToOwned + PartialOrd + IsNan, + { + data.into_iter() + .zip(valid.into_iter()) + .filter_map(|(v, valid)| if valid { Some(v) } else { None }) + .fold(StatValues::default(), |mut acc, v| { + acc.update(&v); + acc + }) + } + + match input.clone().data() { + ColumnData::F64(data,_) => { + let want = stats_oracle(data, input.valid.iter()); + + recompute_min_max(&mut input); + let got = assert_matches!(input.stats(), Statistics::F64(v) => v); + + assert_eq!(want.min.cloned(), got.min); + assert_eq!(want.max.cloned(), got.max); + assert!(got.min <= got.max); + }, + ColumnData::I64(data, _) => { + let want = stats_oracle(data, input.valid.iter()); + + recompute_min_max(&mut input); + let got = assert_matches!(input.stats(), Statistics::I64(v) => v); + + assert_eq!(want.min.cloned(), got.min); + assert_eq!(want.max.cloned(), got.max); + assert!(got.min <= got.max); + }, + ColumnData::U64(data, _) => { + let want = stats_oracle(data, input.valid.iter()); + + recompute_min_max(&mut input); + let got = assert_matches!(input.stats(), Statistics::U64(v) => v); + + assert_eq!(want.min.cloned(), got.min); + assert_eq!(want.max.cloned(), got.max); + assert!(got.min <= got.max); + }, + ColumnData::Bool(data, _) => { + let want = stats_oracle(data.iter(), input.valid.iter()); + + recompute_min_max(&mut input); + let got = assert_matches!(input.stats(), Statistics::Bool(v) => v); + + assert_eq!(want.min, got.min); + assert_eq!(want.max, got.max); + assert!(got.min <= got.max); + }, + ColumnData::String(data, _) => { + let want = stats_oracle(data.iter().map(ToString::to_string), input.valid.iter()); + + recompute_min_max(&mut input); + let got = assert_matches!(input.stats(), Statistics::String(v) => v); + + assert_eq!(want.min, got.min); + assert_eq!(want.max, got.max); + assert!(got.min <= got.max); + }, + ColumnData::Tag(_data, dict, _) => { + let want = stats_oracle( + dict.values().iter().map(ToString::to_string), + iter::repeat(true) + ); + + recompute_min_max(&mut input); + let got = assert_matches!(input.stats(), Statistics::String(v) => v); + + assert_eq!(want.min, got.min); + assert_eq!(want.max, got.max); + assert!(got.min <= got.max); + }, + } + } + } } diff --git a/mutable_batch/src/lib.rs b/mutable_batch/src/lib.rs index 681bb5a06b2..62244d6944a 100644 --- a/mutable_batch/src/lib.rs +++ b/mutable_batch/src/lib.rs @@ -20,6 +20,12 @@ //! permitting fast conversion to [`RecordBatch`]. // Workaround for "unused crate" lint false positives. 
+#[cfg(test)] +use partition as _; +#[cfg(test)] +use pretty_assertions as _; +#[cfg(test)] +use rand as _; use workspace_hack as _; use crate::column::{Column, ColumnData}; @@ -156,16 +162,13 @@ impl MutableBatch { /// Returns a summary of the write timestamps in this chunk if a /// time column exists pub fn timestamp_summary(&self) -> Option { - let time = self.column_names.get(TIME_COLUMN_NAME)?; + let col_data = self.time_column().ok()?; let mut summary = TimestampSummary::default(); - match &self.columns[*time].data { - ColumnData::I64(col_data, _) => { - for t in col_data { - summary.record_nanos(*t) - } - } - _ => unreachable!(), + + for t in col_data { + summary.record_nanos(*t) } + Some(summary) } @@ -205,6 +208,27 @@ impl MutableBatch { Ok(&self.columns[*idx]) } + /// Returns a reference to the column at the specified index + pub fn column_by_index(&self, idx: usize) -> Result<&Column> { + self.columns.get(idx).with_context(|| ColumnNotFoundSnafu { + column: format!("index {}", idx), + }) + } + + /// Return the values in the time column in this batch. Returns an error if the batch has no + /// time column. + /// + /// # Panics + /// + /// If a time column exists but its data isn't of type `i64`, this function will panic. + fn time_column(&self) -> Result<&[i64]> { + let time_column = self.column(TIME_COLUMN_NAME)?; + match &time_column.data { + ColumnData::I64(col_data, _) => Ok(col_data), + x => unreachable!("expected i64 got {} for time column", x), + } + } + /// Return the approximate memory size of the batch, in bytes. /// /// This includes `Self`. @@ -222,6 +246,31 @@ impl MutableBatch { pub fn size_data(&self) -> usize { self.columns.iter().map(|c| c.size_data()).sum::() } + + /// Split this [`MutableBatch`] at the specified row boundary, such that + /// after this call, `self` contains the range of rows indexed from `[0, n)` + /// and the returned value contains `[n, len)`. + /// + /// # Panics + /// + /// Panics if `n > self.rows()`. + /// + /// # Performance + /// + /// This implementation is heavily optimised towards splitting `self` at a + /// `n` value skewed towards the high end of the row count - see [`Column`]. + pub fn split_off(&mut self, n: usize) -> Self { + assert!(n <= self.row_count); + + let right_row_count = self.row_count - n; + self.row_count = n; + + Self { + column_names: self.column_names.clone(), + columns: self.columns.iter_mut().map(|v| v.split_off(n)).collect(), + row_count: right_row_count, + } + } } /// A description of the distribution of timestamps in a @@ -262,7 +311,9 @@ impl TimestampSummary { #[cfg(test)] mod tests { + use arrow_util::assert_batches_eq; use mutable_batch_lp::lines_to_batches; + use schema::Projection; #[test] fn size_data_without_nulls() { @@ -298,4 +349,168 @@ mod tests { assert_eq!(batch.size_data(), 124); assert_eq!(batch.columns().len(), 5); } + + /// Assert the correct row index is split off using + /// [`MutableBatch::split_off()`]. + /// + /// Correctness of the [`Column`] splitting is handled by tests against the + /// [`Column`] itself. 
+ #[test] + fn test_split_off() { + let mut batches = lines_to_batches( + "\ + cpu,t1=hello,t2=world f1=1.1 1234\n\ + cpu,t2=w f1=2.2,f2=2i 1234\n\ + ", + 0, + ) + .unwrap(); + let mut batch = batches.remove("cpu").unwrap(); + assert_eq!(batch.rows(), 2); + assert_eq!(batch.column_names().len(), 5); + + let got = batch.split_off(1); + + assert_batches_eq!( + &[ + "+-----+----+-------+-------+--------------------------------+", + "| f1 | f2 | t1 | t2 | time |", + "+-----+----+-------+-------+--------------------------------+", + "| 1.1 | | hello | world | 1970-01-01T00:00:00.000001234Z |", + "+-----+----+-------+-------+--------------------------------+", + ], + &[batch.to_arrow(Projection::All).unwrap()] + ); + assert_batches_eq!( + &[ + "+-----+----+----+----+--------------------------------+", + "| f1 | f2 | t1 | t2 | time |", + "+-----+----+----+----+--------------------------------+", + "| 2.2 | 2 | | w | 1970-01-01T00:00:00.000001234Z |", + "+-----+----+----+----+--------------------------------+", + ], + &[got.to_arrow(Projection::All).unwrap()] + ); + + assert_eq!(batch.rows(), 1); + assert_eq!(got.rows(), 1); + + // Actual Column instances + assert_eq!(got.columns().len(), batch.columns().len()); + + // Column name map + assert_eq!(got.column_names().len(), 5); + assert_eq!(got.column_names(), batch.column_names()); + assert_eq!(got.column_names().len(), got.columns().len()); + + // Schema + assert_eq!( + got.schema(Projection::All).unwrap(), + batch.schema(Projection::All).unwrap() + ); + assert_eq!( + got.schema(Projection::All).unwrap().len(), + got.columns().len() + ); + } + + #[test] + fn test_split_off_n_0() { + let mut batches = lines_to_batches( + "\ + cpu,t1=hello,t2=world f1=1.1 1234\n\ + cpu,t2=w f1=2.2,f2=2i 1234\n\ + ", + 0, + ) + .unwrap(); + let mut batch = batches.remove("cpu").unwrap(); + assert_eq!(batch.rows(), 2); + assert_eq!(batch.column_names().len(), 5); + + let got = batch.split_off(0); + + assert_batches_eq!( + &[ + "+-----+----+-------+-------+--------------------------------+", + "| f1 | f2 | t1 | t2 | time |", + "+-----+----+-------+-------+--------------------------------+", + "| 1.1 | | hello | world | 1970-01-01T00:00:00.000001234Z |", + "| 2.2 | 2 | | w | 1970-01-01T00:00:00.000001234Z |", + "+-----+----+-------+-------+--------------------------------+", + ], + &[got.to_arrow(Projection::All).unwrap()] + ); + + assert_eq!(batch.rows(), 0); + assert_eq!(got.rows(), 2); + + // Actual Column instances + assert_eq!(got.columns().len(), batch.columns().len()); + + // Column name map + assert_eq!(got.column_names().len(), 5); + assert_eq!(got.column_names(), batch.column_names()); + assert_eq!(got.column_names().len(), got.columns().len()); + + // Schema + assert_eq!( + got.schema(Projection::All).unwrap(), + batch.schema(Projection::All).unwrap() + ); + assert_eq!( + got.schema(Projection::All).unwrap().len(), + got.columns().len() + ); + } + + #[test] + fn test_split_off_none() { + let mut batches = lines_to_batches( + "\ + cpu,t1=hello,t2=world f1=1.1 1234\n\ + cpu,t2=w f1=2.2,f2=2i 1234\n\ + ", + 0, + ) + .unwrap(); + let mut batch = batches.remove("cpu").unwrap(); + assert_eq!(batch.rows(), 2); + assert_eq!(batch.column_names().len(), 5); + + let got = batch.split_off(2); + + assert_batches_eq!( + &[ + "+-----+----+-------+-------+--------------------------------+", + "| f1 | f2 | t1 | t2 | time |", + "+-----+----+-------+-------+--------------------------------+", + "| 1.1 | | hello | world | 1970-01-01T00:00:00.000001234Z |", + "| 2.2 | 2 | | w | 
1970-01-01T00:00:00.000001234Z |", + "+-----+----+-------+-------+--------------------------------+", + ], + &[batch.to_arrow(Projection::All).unwrap()] + ); + + assert_eq!(batch.rows(), 2); + assert_eq!(got.rows(), 0); + + // Actual Column instances + assert_eq!(got.columns().len(), batch.columns().len()); + + // Column name map + assert_eq!(got.column_names().len(), 5); + assert_eq!(got.column_names(), batch.column_names()); + assert_eq!(got.column_names().len(), got.columns().len()); + + // Schema + assert_eq!( + got.schema(Projection::All).unwrap(), + batch.schema(Projection::All).unwrap() + ); + assert_eq!( + got.schema(Projection::All).unwrap().len(), + got.columns().len() + ); + } } diff --git a/mutable_batch/src/payload.rs b/mutable_batch/src/payload.rs index 7cb85de5a63..0fb64037639 100644 --- a/mutable_batch/src/payload.rs +++ b/mutable_batch/src/payload.rs @@ -1,15 +1,6 @@ //! Write payload abstractions derived from [`MutableBatch`] -use crate::{column::ColumnData, MutableBatch, Result}; -use data_types::{partition_template::TablePartitionTemplateOverride, PartitionKey}; -use hashbrown::HashMap; -use schema::TIME_COLUMN_NAME; -use std::{num::NonZeroUsize, ops::Range}; - -pub use self::partition::PartitionKeyError; - -mod filter; -mod partition; +use crate::{MutableBatch, Result}; /// A payload that can be written to a mutable batch pub trait WritePayload { @@ -22,142 +13,3 @@ impl WritePayload for MutableBatch { batch.extend_from(self) } } - -/// A [`MutableBatch`] with a non-zero set of row ranges to write -#[derive(Debug)] -pub struct PartitionWrite<'a> { - batch: &'a MutableBatch, - ranges: Vec>, - min_timestamp: i64, - max_timestamp: i64, - row_count: NonZeroUsize, -} - -impl<'a> PartitionWrite<'a> { - /// Create a new [`PartitionWrite`] with the entire range of the provided batch - /// - /// # Panic - /// - /// Panics if the batch has no rows - pub fn new(batch: &'a MutableBatch) -> Self { - let row_count = NonZeroUsize::new(batch.row_count).unwrap(); - let time = get_time_column(batch); - let (min_timestamp, max_timestamp) = min_max_time(time); - - // This `allow` can be removed when this issue is fixed and released: - // - #[allow(clippy::single_range_in_vec_init)] - Self { - batch, - ranges: vec![0..batch.row_count], - min_timestamp, - max_timestamp, - row_count, - } - } - - /// Returns the minimum timestamp in the write - pub fn min_timestamp(&self) -> i64 { - self.min_timestamp - } - - /// Returns the maximum timestamp in the write - pub fn max_timestamp(&self) -> i64 { - self.max_timestamp - } - - /// Returns the number of rows in the write - pub fn rows(&self) -> NonZeroUsize { - self.row_count - } - - /// Returns a [`PartitionWrite`] containing just the rows of `Self` that pass - /// the provided time predicate, or None if no rows - pub fn filter(&self, predicate: impl Fn(i64) -> bool) -> Option> { - let mut min_timestamp = i64::MAX; - let mut max_timestamp = i64::MIN; - let mut row_count = 0_usize; - - // Construct a predicate that lets us inspect the timestamps as they are filtered - let inspect = |t| match predicate(t) { - true => { - min_timestamp = min_timestamp.min(t); - max_timestamp = max_timestamp.max(t); - row_count += 1; - true - } - false => false, - }; - - let ranges: Vec<_> = filter::filter_time(self.batch, &self.ranges, inspect); - let row_count = NonZeroUsize::new(row_count)?; - - Some(PartitionWrite { - batch: self.batch, - ranges, - min_timestamp, - max_timestamp, - row_count, - }) - } - - /// Create a collection of [`PartitionWrite`] indexed by 
partition key - /// from a [`MutableBatch`] and [`TablePartitionTemplateOverride`] - pub fn partition( - batch: &'a MutableBatch, - partition_template: &TablePartitionTemplateOverride, - ) -> Result, PartitionKeyError> { - use hashbrown::hash_map::Entry; - let time = get_time_column(batch); - - let mut partition_ranges = HashMap::new(); - for (partition, range) in partition::partition_batch(batch, partition_template) { - let row_count = NonZeroUsize::new(range.end - range.start).unwrap(); - let (min_timestamp, max_timestamp) = min_max_time(&time[range.clone()]); - - match partition_ranges.entry(PartitionKey::from(partition?)) { - Entry::Vacant(v) => { - v.insert(PartitionWrite { - batch, - ranges: vec![range], - min_timestamp, - max_timestamp, - row_count, - }); - } - Entry::Occupied(mut o) => { - let pw = o.get_mut(); - pw.min_timestamp = pw.min_timestamp.min(min_timestamp); - pw.max_timestamp = pw.max_timestamp.max(max_timestamp); - pw.row_count = NonZeroUsize::new(pw.row_count.get() + row_count.get()).unwrap(); - pw.ranges.push(range); - } - } - } - Ok(partition_ranges) - } -} - -impl<'a> WritePayload for PartitionWrite<'a> { - fn write_to_batch(&self, batch: &mut MutableBatch) -> Result<()> { - batch.extend_from_ranges(self.batch, &self.ranges) - } -} - -fn get_time_column(batch: &MutableBatch) -> &[i64] { - let time_column = batch.column(TIME_COLUMN_NAME).expect("time column"); - match &time_column.data { - ColumnData::I64(col_data, _) => col_data, - x => unreachable!("expected i64 got {} for time column", x), - } -} - -fn min_max_time(col: &[i64]) -> (i64, i64) { - let mut min_timestamp = i64::MAX; - let mut max_timestamp = i64::MIN; - for t in col { - min_timestamp = min_timestamp.min(*t); - max_timestamp = max_timestamp.max(*t); - } - (min_timestamp, max_timestamp) -} diff --git a/mutable_batch/src/writer.rs b/mutable_batch/src/writer.rs index 3a1e2bcf117..8158077628f 100644 --- a/mutable_batch/src/writer.rs +++ b/mutable_batch/src/writer.rs @@ -1,7 +1,7 @@ //! 
A panic-safe write abstraction for [`MutableBatch`] use crate::{ - column::{Column, ColumnData, INVALID_DID}, + column::{Column, ColumnData, NULL_DID}, MutableBatch, }; use arrow_util::bitset::{iter_set_positions, iter_set_positions_with_offset, BitSet}; @@ -325,7 +325,7 @@ impl<'a> Writer<'a> { let mut stats = StatValues::new_empty(); match &mut col.data { ColumnData::Tag(col_data, dict, _) => { - col_data.resize(initial_rows + to_insert, INVALID_DID); + col_data.resize(initial_rows + to_insert, NULL_DID); for idx in set_position_iterator(valid_mask, to_insert) { let value = values.next().ok_or(Error::InsufficientValues)?; @@ -375,7 +375,7 @@ impl<'a> Writer<'a> { // Lazily compute mappings to handle dictionaries with unused mappings let mut mapping: Vec<_> = values.map(|value| (value, None)).collect(); - col_data.resize(initial_rows + to_insert, INVALID_DID); + col_data.resize(initial_rows + to_insert, NULL_DID); for idx in set_position_iterator(valid_mask, to_insert) { let key = keys.next().ok_or(Error::InsufficientValues)?; @@ -483,7 +483,7 @@ impl<'a> Writer<'a> { .collect(); dst_data.extend(src_data.iter().map(|src_id| match *src_id { - INVALID_DID => INVALID_DID, + NULL_DID => NULL_DID, _ => mapping[*src_id as usize], })); @@ -567,9 +567,9 @@ impl<'a> Writer<'a> { for range in ranges { dst_data.extend(src_data[range.clone()].iter().map( |src_id| match *src_id { - INVALID_DID => { + NULL_DID => { stats.update_for_nulls(1); - INVALID_DID + NULL_DID } _ => { let maybe_did = &mut mapping[*src_id as usize]; diff --git a/mutable_batch/tests/writer.rs b/mutable_batch/tests/writer.rs index 87da6b55575..96e1aa0575e 100644 --- a/mutable_batch/tests/writer.rs +++ b/mutable_batch/tests/writer.rs @@ -2,7 +2,7 @@ use arrow_util::assert_batches_eq; use data_types::{StatValues, Statistics}; use mutable_batch::{writer::Writer, MutableBatch, TimestampSummary}; use schema::Projection; -use std::num::NonZeroU64; +use std::{f64::NAN, num::NonZeroU64}; fn get_stats(batch: &MutableBatch) -> Vec<(&str, Statistics)> { let mut stats: Vec<_> = batch @@ -343,3 +343,110 @@ fn test_basic() { let timestamps = batch.timestamp_summary().unwrap(); assert_eq!(timestamps, expected_timestamps); } + +#[test] +fn test_null_only() { + let mut batch = MutableBatch::new(); + + let mut writer = Writer::new(&mut batch, 1); + + writer + .write_bool("b1", Some(&[0b00000000]), vec![].into_iter()) + .unwrap(); + + writer + .write_f64("f64", Some(&[0b00000000]), vec![].into_iter()) + .unwrap(); + + writer + .write_i64("i64", Some(&[0b00000000]), vec![].into_iter()) + .unwrap(); + + writer + .write_u64("u64", Some(&[0b00000000]), vec![].into_iter()) + .unwrap(); + + writer + .write_string("string", Some(&[0b00000000]), vec![].into_iter()) + .unwrap(); + + writer.write_time("time", vec![42].into_iter()).unwrap(); + + writer + .write_tag("tag1", Some(&[0b00000000]), vec![].into_iter()) + .unwrap(); + + writer.commit(); + + let stats: Vec<_> = get_stats(&batch); + + let expected_data = &[ + "+----+-----+-----+--------+------+--------------------------------+-----+", + "| b1 | f64 | i64 | string | tag1 | time | u64 |", + "+----+-----+-----+--------+------+--------------------------------+-----+", + "| | | | | | 1970-01-01T00:00:00.000000042Z | |", + "+----+-----+-----+--------+------+--------------------------------+-----+", + ]; + + let expected_stats = vec![ + ( + "b1", + Statistics::Bool(StatValues::new(None, None, 1, Some(1))), + ), + ( + "f64", + Statistics::F64(StatValues::new(None, None, 1, Some(1))), + ), + ( + "i64", + 
Statistics::I64(StatValues::new(None, None, 1, Some(1))), + ), + ( + "string", + Statistics::String(StatValues::new(None, None, 1, Some(1))), + ), + ( + "tag1", + Statistics::String(StatValues::new_with_distinct( + None, + None, + 1, + Some(1), + Some(1.try_into().unwrap()), + )), + ), + ( + "time", + Statistics::I64(StatValues::new(Some(42), Some(42), 1, Some(0))), + ), + ( + "u64", + Statistics::U64(StatValues::new(None, None, 1, Some(1))), + ), + ]; + + assert_batches_eq!(expected_data, &[batch.to_arrow(Projection::All).unwrap()]); + pretty_assertions::assert_eq!(expected_stats, stats); +} + +#[test] +fn test_nan_stats() { + let mut batch = MutableBatch::new(); + + let mut writer = Writer::new(&mut batch, 3); + + writer + .write_f64("f64", None, vec![4.2, NAN, 2.4].into_iter()) + .unwrap(); + + writer.commit(); + + let stats: Vec<_> = get_stats(&batch); + + let expected_stats = vec![( + "f64", + Statistics::F64(StatValues::new(Some(2.4), Some(4.2), 3, Some(0))), + )]; + + pretty_assertions::assert_eq!(expected_stats, stats); +} diff --git a/mutable_batch/tests/writer_fuzz.rs b/mutable_batch/tests/writer_fuzz.rs index 31c23f51c78..bf8183d4e96 100644 --- a/mutable_batch/tests/writer_fuzz.rs +++ b/mutable_batch/tests/writer_fuzz.rs @@ -19,7 +19,8 @@ use data_types::{ IsNan, StatValues, Statistics, }; use hashbrown::HashSet; -use mutable_batch::{writer::Writer, MutableBatch, PartitionWrite, WritePayload}; +use mutable_batch::{writer::Writer, MutableBatch, WritePayload}; +use partition::PartitionWrite; use rand::prelude::*; use schema::Projection; use std::{collections::BTreeMap, num::NonZeroU64, ops::Range, sync::Arc}; @@ -416,7 +417,7 @@ fn test_partition_write() { let mut batch = MutableBatch::new(); let expected = extend_batch(&mut rng, &mut batch); - let w = PartitionWrite::new(&batch); + let w = PartitionWrite::new(&batch).unwrap(); assert_eq!(w.rows().get(), expected.tag_expected.len()); let verify_write = |write: &PartitionWrite<'_>| { diff --git a/mutable_batch_lp/Cargo.toml b/mutable_batch_lp/Cargo.toml index 3f75bb21aee..89e367aa351 100644 --- a/mutable_batch_lp/Cargo.toml +++ b/mutable_batch_lp/Cargo.toml @@ -6,11 +6,15 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] hashbrown = { workspace = true } influxdb-line-protocol = { path = "../influxdb_line_protocol" } +itertools = "0.12.0" mutable_batch = { path = "../mutable_batch" } -snafu = "0.7" +snafu = "0.8" workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] @@ -18,12 +22,12 @@ arrow_util = { path = "../arrow_util" } assert_matches = "1.5.0" criterion = { version = "0.5", default-features = false, features = ["rayon"]} schema = { path = "../schema" } +test_helpers = { path = "../test_helpers" } [[bench]] name = "parse_lp" harness = false - [lib] # Allow --save-baseline to work # https://github.com/bheisler/criterion.rs/issues/275 diff --git a/mutable_batch_lp/fuzz/.gitignore b/mutable_batch_lp/fuzz/.gitignore new file mode 100644 index 00000000000..1a45eee7760 --- /dev/null +++ b/mutable_batch_lp/fuzz/.gitignore @@ -0,0 +1,4 @@ +target +corpus +artifacts +coverage diff --git a/mutable_batch_lp/fuzz/Cargo.lock b/mutable_batch_lp/fuzz/Cargo.lock new file mode 100644 index 00000000000..db2c6c7e4e5 --- /dev/null +++ b/mutable_batch_lp/fuzz/Cargo.lock @@ -0,0 +1,4129 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "addr2line" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "ahash" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01" +dependencies = [ + "cfg-if", + "const-random", + "getrandom", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "aho-corasick" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +dependencies = [ + "memchr", +] + +[[package]] +name = "allocator-api2" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" + +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anstream" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d664a92ecae85fd0a7392615844904654d1d5f5514837f471ddef4a057aba1b6" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87" + +[[package]] +name = "anstyle-parse" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" +dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" +dependencies = [ + "anstyle", + "windows-sys 0.52.0", +] + +[[package]] +name = "anyhow" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca" + +[[package]] +name = "arbitrary" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110" + +[[package]] +name = "arrow" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5bc25126d18a012146a888a0298f2c22e1150327bd2765fc76d710a556b2d614" +dependencies = [ + "ahash", + "arrow-arith", + 
"arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34ccd45e217ffa6e53bbb0080990e77113bdd4e91ddb84e97b77649810bcf1a7" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "num", +] + +[[package]] +name = "arrow-array" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bda9acea48b25123c08340f3a8ac361aa0f74469bb36f5ee9acf923fce23e9d" +dependencies = [ + "ahash", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "chrono-tz", + "half", + "hashbrown 0.14.3", + "num", +] + +[[package]] +name = "arrow-buffer" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01a0fc21915b00fc6c2667b069c1b64bdd920982f426079bc4a7cab86822886c" +dependencies = [ + "bytes", + "half", + "num", +] + +[[package]] +name = "arrow-cast" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dc0368ed618d509636c1e3cc20db1281148190a78f43519487b2daf07b63b4a" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "base64", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num", +] + +[[package]] +name = "arrow-csv" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e09aa6246a1d6459b3f14baeaa49606cfdbca34435c46320e14054d244987ca" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "csv", + "csv-core", + "lazy_static", + "lexical-core", + "regex", +] + +[[package]] +name = "arrow-data" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "907fafe280a3874474678c1858b9ca4cb7fd83fb8034ff5b6d6376205a08c634" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half", + "num", +] + +[[package]] +name = "arrow-ipc" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79a43d6808411886b8c7d4f6f7dd477029c1e77ffffffb7923555cc6579639cd" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "flatbuffers", +] + +[[package]] +name = "arrow-json" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d82565c91fd627922ebfe2810ee4e8346841b6f9361b87505a9acea38b614fee" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "indexmap 2.1.0", + "lexical-core", + "num", + "serde", + "serde_json", +] + +[[package]] +name = "arrow-ord" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b23b0e53c0db57c6749997fd343d4c0354c994be7eca67152dd2bdb9a3e1bb4" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "half", + "num", +] + +[[package]] +name = "arrow-row" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "361249898d2d6d4a6eeb7484be6ac74977e48da12a4dd81a708d620cc558117a" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half", + "hashbrown 
0.14.3", +] + +[[package]] +name = "arrow-schema" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09e28a5e781bf1b0f981333684ad13f5901f4cd2f20589eab7cf1797da8fc167" + +[[package]] +name = "arrow-select" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f6208466590960efc1d2a7172bc4ff18a67d6e25c529381d7f96ddaf0dc4036" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num", +] + +[[package]] +name = "arrow-string" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a48149c63c11c9ff571e50ab8f017d2a7cb71037a882b42f6354ed2da9acc7" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "num", + "regex", + "regex-syntax 0.8.2", +] + +[[package]] +name = "arrow_util" +version = "0.1.0" +dependencies = [ + "ahash", + "arrow", + "chrono", + "comfy-table", + "hashbrown 0.14.3", + "num-traits", + "once_cell", + "regex", + "snafu", + "uuid", + "workspace-hack", +] + +[[package]] +name = "async-stream" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "async-trait" +version = "0.1.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + +[[package]] +name = "atomic-write-file" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edcdbedc2236483ab103a53415653d6b4442ea6141baf1ffa85df29635e88436" +dependencies = [ + "nix", + "rand", +] + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "axum" +version = "0.6.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf" +dependencies = [ + "async-trait", + "axum-core", + "bitflags 1.3.2", + "bytes", + "futures-util", + "http", + "http-body", + "hyper", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "sync_wrapper", + "tower", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum-core" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "759fa577a247914fd3f7f76d62972792636412fbfd634cd452f6a385a74d2d2c" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http", + "http-body", + "mime", + "rustversion", + "tower-layer", + "tower-service", +] + +[[package]] +name = "backtrace" +version = "0.3.69" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + +[[package]] +name = "base64" +version = "0.21.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35636a1494ede3b646cc98f74f8e62c773a38a659ebc777a2cf26b9b74171df9" + +[[package]] +name = "base64ct" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" + +[[package]] +name = "bit-set" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" +dependencies = [ + "serde", +] + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "bumpalo" +version = "3.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" + +[[package]] +name = "cc" +version = "1.0.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +dependencies = [ + "jobserver", + "libc", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chrono" +version = "0.4.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "js-sys", + "num-traits", + "serde", + "wasm-bindgen", + "windows-targets 0.48.5", +] + +[[package]] +name = "chrono-tz" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91d7b79e99bfaa0d47da0687c43aa3b7381938a62ad3a6498599039321f660b7" +dependencies = [ + "chrono", + "chrono-tz-build", + "phf", +] + +[[package]] +name = "chrono-tz-build" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"433e39f13c9a060046954e0592a8d0a4bcb1040125cbf91cb8ee58964cfb350f" +dependencies = [ + "parse-zoneinfo", + "phf", + "phf_codegen", +] + +[[package]] +name = "clap" +version = "4.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcfab8ba68f3668e89f6ff60f5b205cea56aa7b769451a59f34b8682f51c056d" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb7fb5e4e979aec3be7791562fcba452f94ad85e954da024396433e0e25a79e9" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf9804afaaf59a91e75b022a30fb7229a7901f60c755489cc61c9b423b836442" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "clap_lex" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" + +[[package]] +name = "colorchoice" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" + +[[package]] +name = "comfy-table" +version = "7.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c64043d6c7b7a4c58e39e7efccfdea7b93d885a795d0c054a69dbbf4dd52686" +dependencies = [ + "strum", + "strum_macros", + "unicode-width", +] + +[[package]] +name = "const-oid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" + +[[package]] +name = "const-random" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aaf16c9c2c612020bcfd042e170f6e32de9b9d75adb5277cdbbd2e2c8c8299a" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom", + "once_cell", + "tiny-keccak", +] + +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" + +[[package]] +name = "cpufeatures" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce420fe07aecd3e67c5f910618fe65e94158f6dcc0adf44e00d69ce2bdfe0fd0" +dependencies = [ + "libc", +] + +[[package]] +name = "crc" +version = "3.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86ec7a15cbe22e59248fc7eadb1907dab5ba09372595da4d73dd805ed4417dfe" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + +[[package]] +name = "crc32fast" +version = "1.3.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "croaring" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7266f0a7275b00ce4c4f4753e8c31afdefe93828101ece83a06e2ddab1dd1010" +dependencies = [ + "byteorder", + "croaring-sys", +] + +[[package]] +name = "croaring-sys" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e47112498c394a7067949ebc07ef429b7384a413cf0efcf675846a47bcd307fb" +dependencies = [ + "cc", +] + +[[package]] +name = "crossbeam-queue" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adc6598521bb5a83d491e8c1fe51db7296019d2ca3cb93cc6c2a20369a4d78a2" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3a430a770ebd84726f584a90ee7f020d28db52c6d02138900f22341f866d39c" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "csv" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +dependencies = [ + "memchr", +] + +[[package]] +name = "data_types" +version = "0.1.0" +dependencies = [ + "arrow-buffer", + "bytes", + "chrono", + "croaring", + "generated_types", + "influxdb-line-protocol", + "iox_time", + "murmur3", + "observability_deps", + "once_cell", + "ordered-float 4.2.0", + "percent-encoding", + "prost", + "schema", + "serde_json", + "sha2", + "siphasher 1.0.0", + "snafu", + "sqlx", + "thiserror", + "uuid", + "workspace-hack", +] + +[[package]] +name = "der" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c" +dependencies = [ + "const-oid", + "pem-rfc7468", + "zeroize", +] + +[[package]] +name = "difflib" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "const-oid", + "crypto-common", + "subtle", +] + +[[package]] +name = "doc-comment" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" + +[[package]] +name = "dotenvy" +version = "0.15.7" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" + +[[package]] +name = "dyn-clone" +version = "1.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "545b22097d44f8a9581187cdf93de7a71e4722bf51200cfaba810865b49a495d" + +[[package]] +name = "either" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" +dependencies = [ + "serde", +] + +[[package]] +name = "encoding_rs" +version = "0.8.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7268b386296a025e474d5140678f75d6de9493ae55a5d709eeb9dd08149945e1" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "errno" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "etcetera" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "136d1b5283a1ab77bd9257427ffd09d8667ced0570b6f938942bc7568ed5b943" +dependencies = [ + "cfg-if", + "home", + "windows-sys 0.48.0", +] + +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + +[[package]] +name = "fastrand" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" + +[[package]] +name = "finl_unicode" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6" + +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + +[[package]] +name = "flatbuffers" +version = "23.5.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dac53e22462d78c16d64a1cd22371b54cc3fe94aa15e7886a2fa6e5d1ab8640" +dependencies = [ + "bitflags 1.3.2", + "rustc_version", +] + +[[package]] +name = "flate2" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "float-cmp" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98de4bbd547a563b716d8dfa9aad1cb19bfab00f4fa09a6a4ed21dbcf44ce9c4" +dependencies = [ + "num-traits", +] + +[[package]] +name = "flume" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55ac459de2512911e4b674ce33cf20befaba382d05b62b008afc1c8b57cbf181" +dependencies = [ + "futures-core", + "futures-sink", + "spin 0.9.8", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = 
"form_urlencoded" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "futures" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" + +[[package]] +name = "futures-executor" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-intrusive" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d930c203dd0b6ff06e0201a4a2fe9149b43c684fd4420555b26d21b1a02956f" +dependencies = [ + "futures-core", + "lock_api", + "parking_lot", +] + +[[package]] +name = "futures-io" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" + +[[package]] +name = "futures-macro" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "futures-sink" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" + +[[package]] +name = "futures-task" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" + +[[package]] +name = "futures-util" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "generated_types" +version = "0.1.0" +dependencies = [ + "observability_deps", + "pbjson", + "pbjson-build", + "pbjson-types", + "prost", + "prost-build", + "serde", + "tonic", + "tonic-build", + "uuid", + "workspace-hack", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"fe9006bed769170c11f845cf00c7c1e9092aeb3f268e007c3e760ac68008070f" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "gimli" +version = "0.28.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" + +[[package]] +name = "h2" +version = "0.3.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d6250322ef6e60f93f9a2162799302cd6f68f79f6e5d85c8c16f14d1d958178" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap 2.1.0", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "half" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc52e53916c08643f1b56ec082790d1e86a32e58dc5268f897f313fbae7b4872" +dependencies = [ + "cfg-if", + "crunchy", + "num-traits", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name = "hashbrown" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +dependencies = [ + "ahash", + "allocator-api2", +] + +[[package]] +name = "hashlink" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7" +dependencies = [ + "hashbrown 0.14.3", +] + +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "hermit-abi" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "hkdf" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b5f8eb2ad728638ea2c7d47a21db23b7b58a72ed6a38256b8a1849f15fbbdf7" +dependencies = [ + "hmac", +] + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + +[[package]] +name = "home" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "http" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8947b1a6fad4393052c7ba1f4cd97bed3e953a95c79c92ad9b051a04611d9fbb" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" +dependencies = [ + "bytes", + "http", + "pin-project-lite", +] + +[[package]] +name = "http-range-header" +version = "0.3.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "add0ab9360ddbd88cfeb3bd9574a1d85cfdfa14db10b3e21d3700dbc4328758f" + +[[package]] +name = "httparse" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + +[[package]] +name = "hyper" +version = "0.14.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf96e135eb83a2a8ddf766e426a841d8ddd7449d5f00d34ea02b41d2f19eef80" +dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" +dependencies = [ + "futures-util", + "http", + "hyper", + "log", + "rustls", + "rustls-native-certs", + "tokio", + "tokio-rustls", +] + +[[package]] +name = "hyper-timeout" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" +dependencies = [ + "hyper", + "pin-project-lite", + "tokio", + "tokio-io-timeout", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.59" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6a67363e2aa4443928ce15e57ebae94fd8949958fd1223c4cfc0cd473ad7539" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "idna" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", +] + +[[package]] +name = "indexmap" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d530e1a18b1cb4c484e6e34556a0d948706958449fca0cab753d649f2bce3d1f" +dependencies = [ + "equivalent", + "hashbrown 0.14.3", +] + +[[package]] +name = "influxdb-line-protocol" +version = "1.0.0" +dependencies = [ + "bytes", + "log", + "nom", + "smallvec", + "snafu", +] + +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + +[[package]] +name = "iox_time" 
+version = "0.1.0" +dependencies = [ + "chrono", + "parking_lot", + "tokio", + "workspace-hack", +] + +[[package]] +name = "ipnet" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" + +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25db6b064527c5d482d0423354fcd07a89a2dfe07b67892e62411946db7f07b0" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" + +[[package]] +name = "jobserver" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c37f63953c4c63420ed5fd3d6d398c719489b9f872b9fa683262f8edd363c7d" +dependencies = [ + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.66" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cee9c64da59eae3b50095c18d3e74f8b73c0b86d2792824ff01bbce68ba229ca" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "json-patch" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55ff1e1486799e3f64129f8ccad108b38290df9cd7015cd31bed17239f0789d6" +dependencies = [ + "serde", + "serde_json", + "thiserror", + "treediff", +] + +[[package]] +name = "k8s-openapi" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edc3606fd16aca7989db2f84bb25684d0270c6d6fa1dbcd0025af7b4130523a6" +dependencies = [ + "base64", + "bytes", + "chrono", + "schemars", + "serde", + "serde-value", + "serde_json", +] + +[[package]] +name = "kube-core" +version = "0.87.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5bba93d054786eba7994d03ce522f368ef7d48c88a1826faa28478d85fb63ae" +dependencies = [ + "chrono", + "form_urlencoded", + "http", + "json-patch", + "k8s-openapi", + "once_cell", + "schemars", + "serde", + "serde_json", + "thiserror", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +dependencies = [ + "spin 0.5.2", +] + +[[package]] +name = "lexical-core" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" +dependencies = [ + "lexical-parse-integer", + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-parse-integer" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] 
+name = "lexical-util" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc" +dependencies = [ + "static_assertions", +] + +[[package]] +name = "lexical-write-float" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" +dependencies = [ + "lexical-util", + "lexical-write-integer", + "static_assertions", +] + +[[package]] +name = "lexical-write-integer" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = "libc" +version = "0.2.151" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "302d7ab3130588088d277783b1e2d2e10c9e9e4a16dd9050e6ec93fb3e7048f4" + +[[package]] +name = "libfuzzer-sys" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a96cfd5557eb82f2b83fed4955246c988d331975a002961b07c81584d107e7f7" +dependencies = [ + "arbitrary", + "cc", + "once_cell", +] + +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" + +[[package]] +name = "libsqlite3-sys" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf4e226dcd58b4be396f7bd3c20da8fdee2911400705297ba7d2d7cc2c30f716" +dependencies = [ + "cc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "linux-raw-sys" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4cd1a83af159aa67994778be9070f0ae1bd732942279cabb14f86f986a21456" + +[[package]] +name = "lock_api" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" + +[[package]] +name = "matchers" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558" +dependencies = [ + "regex-automata 0.1.10", +] + +[[package]] +name = "matchit" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" + +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + +[[package]] +name = "memchr" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "miniz_oxide" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +dependencies = [ + "adler", +] + +[[package]] +name = "mio" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09" +dependencies = [ + "libc", + "log", + "wasi", + "windows-sys 0.48.0", +] + +[[package]] +name = "multimap" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" + +[[package]] +name = "murmur3" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9252111cf132ba0929b6f8e030cac2a24b507f3a4d6db6fb2896f27b354c714b" + +[[package]] +name = "mutable_batch" +version = "0.1.0" +dependencies = [ + "arrow", + "arrow_util", + "chrono", + "data_types", + "hashbrown 0.14.3", + "iox_time", + "itertools 0.12.0", + "percent-encoding", + "schema", + "snafu", + "thiserror", + "unicode-segmentation", + "workspace-hack", +] + +[[package]] +name = "mutable_batch_lp" +version = "0.0.0" +dependencies = [ + "hashbrown 0.14.3", + "libfuzzer-sys", + "mutable_batch", + "mutable_batch_lp 0.1.0", +] + +[[package]] +name = "mutable_batch_lp" +version = "0.1.0" +dependencies = [ + "hashbrown 0.14.3", + "influxdb-line-protocol", + "itertools 0.12.0", + "mutable_batch", + "snafu", + "workspace-hack", +] + +[[package]] +name = "nix" +version = "0.27.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053" +dependencies = [ + "bitflags 2.4.1", + "cfg-if", + "libc", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "normalize-line-endings" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61807f77802ff30975e01f4f071c8ba10c022052f98b3294119f3e615d13e5be" + +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + +[[package]] +name = "num" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "608e7659b5c3d7cba262d894801b9ec9d00de989e8a82bd4bef91d08da45cdc0" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-bigint-dig" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151" +dependencies = [ + "byteorder", + "lazy_static", + "libm", + "num-integer", + "num-iter", + 
"num-traits", + "rand", + "smallvec", + "zeroize", +] + +[[package]] +name = "num-complex" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" +dependencies = [ + "autocfg", + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" +dependencies = [ + "autocfg", + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "object" +version = "0.32.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +dependencies = [ + "memchr", +] + +[[package]] +name = "object_store" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2524735495ea1268be33d200e1ee97455096a0846295a21548cd2f3541de7050" +dependencies = [ + "async-trait", + "base64", + "bytes", + "chrono", + "futures", + "humantime", + "hyper", + "itertools 0.11.0", + "parking_lot", + "percent-encoding", + "quick-xml", + "rand", + "reqwest", + "ring", + "rustls-pemfile", + "serde", + "serde_json", + "snafu", + "tokio", + "tracing", + "url", + "walkdir", +] + +[[package]] +name = "observability_deps" +version = "0.1.0" +dependencies = [ + "tracing", + "workspace-hack", +] + +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +dependencies = [ + "parking_lot_core", +] + +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + +[[package]] +name = "ordered-float" +version = "2.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" +dependencies = [ + "num-traits", +] + +[[package]] +name = "ordered-float" +version = "4.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a76df7075c7d4d01fdcb46c912dd17fba5b60c78ea480b475f2b6ab6f666584e" +dependencies = [ + "num-traits", +] + +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + +[[package]] +name = "parking_lot" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets 0.48.5", +] + +[[package]] +name = "parse-zoneinfo" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c705f256449c60da65e11ff6626e0c16a0a0b96aaa348de61376b249bc340f41" +dependencies = [ + "regex", +] + +[[package]] +name = "paste" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" + +[[package]] +name = "pbjson" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1030c719b0ec2a2d25a5df729d6cff1acf3cc230bf766f4f97833591f7577b90" +dependencies = [ + "base64", + "serde", +] + +[[package]] +name = "pbjson-build" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2580e33f2292d34be285c5bc3dba5259542b083cfad6037b6d70345f24dcb735" +dependencies = [ + "heck", + "itertools 0.11.0", + "prost", + "prost-types", +] + +[[package]] +name = "pbjson-types" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18f596653ba4ac51bdecbb4ef6773bc7f56042dc13927910de1684ad3d32aa12" +dependencies = [ + "bytes", + "chrono", + "pbjson", + "pbjson-build", + "prost", + "prost-build", + "serde", +] + +[[package]] +name = "pem-rfc7468" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412" +dependencies = [ + "base64ct", +] + +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + +[[package]] +name = "petgraph" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1d3afd2628e69da2be385eb6f2fd57c8ac7977ceeff6dc166ff1657b0e386a9" +dependencies = [ + "fixedbitset", + "indexmap 2.1.0", +] + +[[package]] +name = "phf" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" +dependencies = [ + "phf_shared", + "rand", +] + +[[package]] +name = "phf_shared" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" 
+dependencies = [ + "siphasher 0.3.11", +] + +[[package]] +name = "pin-project" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fda4ed1c6c173e3fc7a83629421152e01d7b1f9b7f65fb301e490e8cfc656422" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkcs1" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" +dependencies = [ + "der", + "pkcs8", + "spki", +] + +[[package]] +name = "pkcs8" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" +dependencies = [ + "der", + "spki", +] + +[[package]] +name = "pkg-config" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69d3587f8a9e599cc7ec2c00e331f71c4e69a5f9a4b8a6efd5b07466b9736f9a" + +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + +[[package]] +name = "predicates" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dfc28575c2e3f19cb3c73b93af36460ae898d426eba6fc15b9bd2a5220758a0" +dependencies = [ + "anstyle", + "difflib", + "float-cmp", + "itertools 0.11.0", + "normalize-line-endings", + "predicates-core", + "regex", +] + +[[package]] +name = "predicates-core" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b794032607612e7abeb4db69adb4e33590fa6cf1149e95fd7cb00e634b92f174" + +[[package]] +name = "prettyplease" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a41cf62165e97c7f814d2221421dbb9afcbcdb0a88068e5ea206e19951c2cbb5" +dependencies = [ + "proc-macro2", + "syn 2.0.46", +] + +[[package]] +name = "proc-macro2" +version = "1.0.74" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2de98502f212cfcea8d0bb305bd0f49d7ebdd75b64ba0a68f937d888f4e0d6db" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "proptest" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31b476131c3c86cb68032fdc5cb6d5a1045e3e42d96b69fa599fd77701e1f5bf" +dependencies = [ + "bit-set", + "bit-vec", + "bitflags 2.4.1", + "lazy_static", + "num-traits", + "rand", + "rand_chacha", + "rand_xorshift", + "regex-syntax 0.8.2", + "rusty-fork", + "tempfile", + "unarray", +] + +[[package]] +name = "prost" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "146c289cda302b98a28d40c8b3b90498d6e526dd24ac2ecea73e4e491685b94a" +dependencies = [ + "bytes", + 
"prost-derive", +] + +[[package]] +name = "prost-build" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c55e02e35260070b6f716a2423c2ff1c3bb1642ddca6f99e1f26d06268a0e2d2" +dependencies = [ + "bytes", + "heck", + "itertools 0.11.0", + "log", + "multimap", + "once_cell", + "petgraph", + "prettyplease", + "prost", + "prost-types", + "regex", + "syn 2.0.46", + "tempfile", + "which", +] + +[[package]] +name = "prost-derive" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efb6c9a1dd1def8e2124d17e83a20af56f1570d6c2d2bd9e266ccb768df3840e" +dependencies = [ + "anyhow", + "itertools 0.11.0", + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "prost-types" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "193898f59edcf43c26227dcd4c8427f00d99d61e95dcde58dabd49fa291d470e" +dependencies = [ + "prost", +] + +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + +[[package]] +name = "quick-xml" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "quote" +version = "1.0.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rand_xorshift" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d25bf25ec5ae4a3f1b92f929810509a2f53d7dca2f50b794ff57e3face536c8f" +dependencies = [ + "rand_core", +] + +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "regex" +version = "1.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata 0.4.3", + "regex-syntax 0.8.2", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" +dependencies = [ + "regex-syntax 0.6.29", +] + +[[package]] +name = "regex-automata" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax 0.8.2", +] + +[[package]] +name = "regex-syntax" +version = "0.6.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" + +[[package]] +name = "regex-syntax" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" + +[[package]] +name = "reqwest" +version = "0.11.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b1ae8d9ac08420c66222fb9096fc5de435c3c48542bc5336c51892cffafb41" +dependencies = [ + "base64", + "bytes", + "encoding_rs", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "hyper", + "hyper-rustls", + "ipnet", + "js-sys", + "log", + "mime", + "once_cell", + "percent-encoding", + "pin-project-lite", + "rustls", + "rustls-native-certs", + "rustls-pemfile", + "serde", + "serde_json", + "serde_urlencoded", + "system-configuration", + "tokio", + "tokio-rustls", + "tokio-util", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", + "webpki-roots", + "winreg", +] + +[[package]] +name = "ring" +version = "0.17.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "688c63d65483050968b2a8937f7995f443e27041a0f7700aa59b0822aedebb74" +dependencies = [ + "cc", + "getrandom", + "libc", + "spin 0.9.8", + "untrusted", + "windows-sys 0.48.0", +] + +[[package]] +name = "rsa" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d0e5124fcb30e76a7e79bfee683a2746db83784b86289f6251b54b7950a0dfc" +dependencies = [ + "const-oid", + "digest", + "num-bigint-dig", + "num-integer", + "num-traits", + "pkcs1", + "pkcs8", + "rand_core", + "signature", + "spki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" + +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + +[[package]] +name = "rustix" +version = "0.38.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +dependencies = [ + "bitflags 2.4.1", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustls" +version = "0.21.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d5a6813c0759e4609cd494e8e725babae6a2ca7b62a5536a13daaec6fcb7ba" +dependencies = [ + "log", + "ring", + "rustls-webpki", + "sct", +] + +[[package]] +name = "rustls-native-certs" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" +dependencies = [ + "openssl-probe", + "rustls-pemfile", + "schannel", + "security-framework", +] + +[[package]] +name = "rustls-pemfile" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" +dependencies = [ + 
"base64", +] + +[[package]] +name = "rustls-webpki" +version = "0.101.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "rustversion" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4" + +[[package]] +name = "rusty-fork" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb3dcc6e454c328bb824492db107ab7c0ae8fcffe4ad210136ef014458c1bc4f" +dependencies = [ + "fnv", + "quick-error", + "tempfile", + "wait-timeout", +] + +[[package]] +name = "ryu" +version = "1.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "schannel" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" +dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "schema" +version = "0.1.0" +dependencies = [ + "arrow", + "hashbrown 0.14.3", + "indexmap 2.1.0", + "observability_deps", + "snafu", + "workspace-hack", +] + +[[package]] +name = "schemars" +version = "0.8.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45a28f4c49489add4ce10783f7911893516f15afe45d015608d41faca6bc4d29" +dependencies = [ + "dyn-clone", + "schemars_derive", + "serde", + "serde_json", +] + +[[package]] +name = "schemars_derive" +version = "0.8.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c767fd6fa65d9ccf9cf026122c1b555f2ef9a4f0cea69da4d7dbc3e258d30967" +dependencies = [ + "proc-macro2", + "quote", + "serde_derive_internals", + "syn 1.0.109", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "sct" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "security-framework" +version = "2.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05b64fb303737d99b81884b2c63433e9ae28abebe5eb5045dcdd175dc2ecf4de" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e932934257d3b408ed8f30db49d85ea163bfe74961f017f405b025af298f0c7a" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "semver" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97ed7a9823b74f99c7742f5336af7be5ecd3eeafcb1507d1fa93347b1d589b0" + +[[package]] +name = "serde" +version = "1.0.194" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0b114498256798c94a0689e1a15fec6005dee8ac1f41de56404b67afc2a4b773" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde-value" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3a1a3341211875ef120e117ea7fd5228530ae7e7036a779fdc9117be6b3282c" +dependencies = [ + "ordered-float 2.10.1", + "serde", +] + +[[package]] +name = "serde_derive" +version = "1.0.194" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3385e45322e8f9931410f01b3031ec534c3947d0e94c18049af4d9f9907d4e0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "serde_derive_internals" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85bf8229e7920a9f636479437026331ce11aa132b4dde37d121944a44d6e5f3c" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "serde_json" +version = "1.0.110" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fbd975230bada99c8bb618e0c365c2eefa219158d5c6c29610fd09ff1833257" +dependencies = [ + "indexmap 2.1.0", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" +dependencies = [ + "libc", +] + +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "digest", + "rand_core", +] + +[[package]] +name = "similar" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32fea41aca09ee824cc9724996433064c89f7777e60762749a4170a14abbfa21" + +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + +[[package]] +name = "siphasher" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54ac45299ccbd390721be55b412d41931911f654fa99e2cb8bfb57184b2061fe" + +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + +[[package]] +name = 
"smallvec" +version = "1.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dccd0940a2dcdf68d092b8cbab7dc0ad8fa938bf95787e1b916b0e3d0e8e970" + +[[package]] +name = "snafu" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4de37ad025c587a29e8f3f5605c00f70b98715ef90b9061a815b9e59e9042d6" +dependencies = [ + "doc-comment", + "snafu-derive", +] + +[[package]] +name = "snafu-derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990079665f075b699031e9c08fd3ab99be5029b96f3b78dc0709e8f77e4efebf" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "socket2" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9" +dependencies = [ + "libc", + "windows-sys 0.48.0", +] + +[[package]] +name = "spin" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" + +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +dependencies = [ + "lock_api", +] + +[[package]] +name = "spki" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +dependencies = [ + "base64ct", + "der", +] + +[[package]] +name = "sqlformat" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce81b7bd7c4493975347ef60d8c7e8b742d4694f4c49f93e0a12ea263938176c" +dependencies = [ + "itertools 0.12.0", + "nom", + "unicode_categories", +] + +[[package]] +name = "sqlx" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dba03c279da73694ef99763320dea58b51095dfe87d001b1d4b5fe78ba8763cf" +dependencies = [ + "sqlx-core", + "sqlx-macros", + "sqlx-mysql", + "sqlx-postgres", + "sqlx-sqlite", +] + +[[package]] +name = "sqlx-core" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d84b0a3c3739e220d94b3239fd69fb1f74bc36e16643423bd99de3b43c21bfbd" +dependencies = [ + "ahash", + "atoi", + "byteorder", + "bytes", + "crc", + "crossbeam-queue", + "dotenvy", + "either", + "event-listener", + "futures-channel", + "futures-core", + "futures-intrusive", + "futures-io", + "futures-util", + "hashlink", + "hex", + "indexmap 2.1.0", + "log", + "memchr", + "once_cell", + "paste", + "percent-encoding", + "rustls", + "rustls-pemfile", + "serde", + "serde_json", + "sha2", + "smallvec", + "sqlformat", + "thiserror", + "tokio", + "tokio-stream", + "tracing", + "url", + "uuid", + "webpki-roots", +] + +[[package]] +name = "sqlx-macros" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89961c00dc4d7dffb7aee214964b065072bff69e36ddb9e2c107541f75e4f2a5" +dependencies = [ + "proc-macro2", + "quote", + "sqlx-core", + "sqlx-macros-core", + "syn 1.0.109", +] + +[[package]] +name = "sqlx-macros-core" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0bd4519486723648186a08785143599760f7cc81c52334a55d6a83ea1e20841" +dependencies = [ + "atomic-write-file", + "dotenvy", + "either", + "heck", + "hex", + "once_cell", + 
"proc-macro2", + "quote", + "serde", + "serde_json", + "sha2", + "sqlx-core", + "sqlx-mysql", + "sqlx-postgres", + "sqlx-sqlite", + "syn 1.0.109", + "tempfile", + "tokio", + "url", +] + +[[package]] +name = "sqlx-mysql" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e37195395df71fd068f6e2082247891bc11e3289624bbc776a0cdfa1ca7f1ea4" +dependencies = [ + "atoi", + "base64", + "bitflags 2.4.1", + "byteorder", + "bytes", + "crc", + "digest", + "dotenvy", + "either", + "futures-channel", + "futures-core", + "futures-io", + "futures-util", + "generic-array", + "hex", + "hkdf", + "hmac", + "itoa", + "log", + "md-5", + "memchr", + "once_cell", + "percent-encoding", + "rand", + "rsa", + "serde", + "sha1", + "sha2", + "smallvec", + "sqlx-core", + "stringprep", + "thiserror", + "tracing", + "uuid", + "whoami", +] + +[[package]] +name = "sqlx-postgres" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6ac0ac3b7ccd10cc96c7ab29791a7dd236bd94021f31eec7ba3d46a74aa1c24" +dependencies = [ + "atoi", + "base64", + "bitflags 2.4.1", + "byteorder", + "crc", + "dotenvy", + "etcetera", + "futures-channel", + "futures-core", + "futures-io", + "futures-util", + "hex", + "hkdf", + "hmac", + "home", + "itoa", + "log", + "md-5", + "memchr", + "once_cell", + "rand", + "serde", + "serde_json", + "sha1", + "sha2", + "smallvec", + "sqlx-core", + "stringprep", + "thiserror", + "tracing", + "uuid", + "whoami", +] + +[[package]] +name = "sqlx-sqlite" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "210976b7d948c7ba9fced8ca835b11cbb2d677c59c79de41ac0d397e14547490" +dependencies = [ + "atoi", + "flume", + "futures-channel", + "futures-core", + "futures-executor", + "futures-intrusive", + "futures-util", + "libsqlite3-sys", + "log", + "percent-encoding", + "serde", + "sqlx-core", + "tracing", + "url", + "urlencoding", + "uuid", +] + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "stringprep" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb41d74e231a107a1b4ee36bd1214b11285b77768d2e3824aedafa988fd36ee6" +dependencies = [ + "finl_unicode", + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "strum" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.25.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.46", +] + +[[package]] +name = "subtle" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89456b690ff72fddcecf231caedbe615c59480c93358a93dfae7fc29e3ebbf0e" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sync_wrapper" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" + +[[package]] +name = "system-configuration" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "tempfile" +version = "3.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" +dependencies = [ + "cfg-if", + "fastrand", + "redox_syscall", + "rustix", + "windows-sys 0.52.0", +] + +[[package]] +name = "thiserror" +version = "1.0.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d54378c645627613241d077a3a79db965db602882668f9136ac42af9ecb730ad" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "thread_local" +version = "1.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152" +dependencies = [ + "cfg-if", + "once_cell", +] + +[[package]] +name = "threadpool" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d050e60b33d41c19108b32cea32164033a9013fe3b46cbd4457559bfbf77afaa" +dependencies = [ + "num_cpus", +] + +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "log", + "ordered-float 2.10.1", + "threadpool", +] + +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + +[[package]] +name = "tinyvec" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "tokio" +version = "1.35.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c89b4efa943be685f629b149f53829423f8f5531ea21249408e8e2f8671ec104" +dependencies = [ + "backtrace", + "bytes", + "libc", + "mio", + "num_cpus", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "tracing", + "windows-sys 0.48.0", +] + +[[package]] +name = "tokio-io-timeout" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30b74022ada614a1b4834de765f9bb43877f910cc8ce4be40e89042c9223a8bf" +dependencies = [ + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-macros" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "tokio-rustls" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" +dependencies = [ + "rustls", + "tokio", +] + +[[package]] +name = "tokio-stream" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "397c988d37662c7dda6d2208364a706264bf3d6138b11d436cbac0ad38832842" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" +dependencies = [ + "bytes", + "futures-core", + "futures-io", + "futures-sink", + "pin-project-lite", + "slab", + "tokio", + "tracing", +] + +[[package]] +name = "tonic" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d560933a0de61cf715926b9cac824d4c883c2c43142f787595e48280c40a1d0e" +dependencies = [ + "async-stream", + "async-trait", + "axum", + "base64", + "bytes", + "h2", + "http", + "http-body", + "hyper", + "hyper-timeout", + "percent-encoding", + "pin-project", + "prost", + "rustls", + "rustls-native-certs", + "rustls-pemfile", + "tokio", + "tokio-rustls", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-build" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d021fc044c18582b9a2408cd0dd05b1596e3ecdb5c4df822bb0183545683889" +dependencies = [ + "prettyplease", + "proc-macro2", + "prost-build", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "tower" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "indexmap 1.9.3", + "pin-project", + "pin-project-lite", + "rand", + "slab", + "tokio", + "tokio-util", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-http" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c5bb1d698276a2443e5ecfabc1008bf15a36c12e6a7176e7bf089ea9131140" +dependencies = [ + "base64", + "bitflags 2.4.1", + "bytes", + "futures-core", + "futures-util", + "http", + "http-body", + "http-range-header", + "mime", + "pin-project-lite", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-layer" +version = "0.3.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" + +[[package]] +name = "tower-service" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" + +[[package]] +name = "tracing" +version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +dependencies = [ + "log", + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "tracing-core" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +dependencies = [ + "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-serde" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1" +dependencies = [ + "serde", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "parking_lot", + "regex", + "serde", + "serde_json", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", + "tracing-serde", +] + +[[package]] +name = "treediff" +version = "4.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52984d277bdf2a751072b5df30ec0377febdb02f7696d64c2d7d54630bac4303" +dependencies = [ + "serde_json", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "typenum" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" + +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + +[[package]] +name = "unicode-bidi" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f2528f27a9eb2b21e69c95319b30bd0efd85d09c379741b0f78ea1d86be2416" + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "unicode-normalization" +version = "0.1.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" + +[[package]] +name = "unicode-width" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" + +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" + +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "url" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + +[[package]] +name = "utf8parse" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" + +[[package]] +name = "uuid" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e395fcf16a7a3d8127ec99782007af141946b4795001f876d54fb0d55978560" +dependencies = [ + "getrandom", +] + +[[package]] +name = "valuable" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "wait-timeout" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f200f5b12eb75f8c1ed65abd4b2db8a6e1b138a20de009dacee265a2498f3f6" +dependencies = [ + "libc", +] + +[[package]] +name = "walkdir" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71d857dc86794ca4c280d616f7da00d2dbfd8cd788846559a6813e6aa4b54ee" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ed0d4f68a3015cc185aff4db9506a015f4b96f95303897bfa23f846db54064e" 
+dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b56f625e64f3a1084ded111c4d5f477df9f8c92df113852fa5a374dbda78826" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn 2.0.46", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac36a15a220124ac510204aec1c3e5db8a22ab06fd6706d881dc6149f8ed9a12" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0162dbf37223cd2afce98f3d0785506dcb8d266223983e4b5b525859e6e182b2" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0eb82fcb7930ae6219a7ecfd55b217f5f0893484b7a13022ebb2b2bf20b5283" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.46", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ab9b36309365056cd639da3134bf87fa8f3d86008abf99e612384a6eecd459f" + +[[package]] +name = "wasm-streams" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4609d447824375f43e1ffbc051b50ad8f4b3ae8219680c94452ea05eb240ac7" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "web-sys" +version = "0.3.66" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50c24a44ec86bb68fbecd1b3efed7e85ea5621b39b35ef2766b66cd984f8010f" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "webpki-roots" +version = "0.25.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1778a42e8b3b90bff8d0f5032bf22250792889a5cdc752aa0020c84abe3aaf10" + +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix", +] + +[[package]] +name = "whoami" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22fc3756b8a9133049b26c7f61ab35416c130e8c09b660f5b3958b446f52cc50" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets 0.52.0", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.0", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" +dependencies = [ + "windows_aarch64_gnullvm 0.52.0", + "windows_aarch64_msvc 0.52.0", + "windows_i686_gnu 0.52.0", + "windows_i686_msvc 0.52.0", + "windows_x86_64_gnu 0.52.0", + "windows_x86_64_gnullvm 0.52.0", + "windows_x86_64_msvc 0.52.0", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" + +[[package]] +name = "winreg" +version = "0.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + +[[package]] +name = "workspace-hack" +version = "0.1.0" +dependencies = [ + "ahash", + "base64", + "bitflags 2.4.1", + "byteorder", + "bytes", + "cc", + "chrono", + "clap", + "clap_builder", + "crossbeam-utils", + "crypto-common", + "digest", + "either", + "fixedbitset", + "flatbuffers", + "flate2", + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", + "getrandom", + "hashbrown 0.14.3", + "heck", + "hyper", + "hyper-rustls", + "indexmap 2.1.0", + "itertools 0.11.0", + "k8s-openapi", + "kube-core", + "libc", + "lock_api", + "log", + "md-5", + "memchr", + "mio", + "nix", + "nom", + "num-traits", + "object_store", + "once_cell", + "parking_lot", + "percent-encoding", + "petgraph", + "phf_shared", + "predicates", + "proptest", + "prost", + "prost-types", + "rand", + "rand_core", + "regex", + "regex-automata 0.4.3", + "regex-syntax 0.8.2", + "reqwest", + "ring", + "rustls", + "serde", + "serde_json", + "sha2", + "similar", + "spin 0.9.8", + "sqlx", + "sqlx-core", + "sqlx-macros", + "sqlx-macros-core", + "sqlx-postgres", + "sqlx-sqlite", + "strum", + "syn 1.0.109", + "syn 2.0.46", + "thrift", + "tokio", + "tokio-stream", + "tokio-util", + "tower", + "tower-http", + "tracing", + "tracing-core", + "tracing-log", + "tracing-subscriber", + "unicode-bidi", + "unicode-normalization", + "url", + "uuid", + "winapi", + "windows-sys 0.48.0", + "windows-sys 0.52.0", +] + +[[package]] +name = "zerocopy" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "zeroize" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" diff --git a/mutable_batch_lp/fuzz/Cargo.toml b/mutable_batch_lp/fuzz/Cargo.toml new file mode 100644 index 00000000000..7a564adb7cf --- /dev/null +++ b/mutable_batch_lp/fuzz/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "mutable_batch_lp" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +hashbrown = "0.14" +libfuzzer-sys = "0.4" +mutable_batch_lp = { path = ".." } +mutable_batch = { path = "../../mutable_batch" } + +# Prevent this from interfering with workspaces +[workspace] +members = ["."] + +[profile.release] +debug = 1 + +[[bin]] +name = "lines_converter" +path = "fuzz_targets/lines_converter.rs" +test = false +doc = false diff --git a/mutable_batch_lp/fuzz/README.md b/mutable_batch_lp/fuzz/README.md new file mode 100644 index 00000000000..b978638c87c --- /dev/null +++ b/mutable_batch_lp/fuzz/README.md @@ -0,0 +1,46 @@ +# Fuzz tests + +The fuzz tests in this `fuzz` crate were created using [cargo-fuzz] version 0.11.3. + +[cargo-fuzz]: https://rust-fuzz.github.io/book/introduction.html + +## One-time setup + +To install `cargo-fuzz`: + +``` +$ cargo install cargo-fuzz +``` + +You'll also need a nightly Rust: + +``` +$ rustup install nightly +``` + +## Running + +To run an existing fuzz test, change to the `mutable_batch_lp` directory and run: + +``` +$ cargo +nightly fuzz run +``` + +where `` is the name of one of the files in `fuzz/fuzz_targets`. To list all targets, run: + +``` +$ cargo fuzz list +``` + +## Adding more + +To add more fuzzing targets, run: + +``` +$ cargo fuzz add +``` + +which will add a new file in `fuzz/fuzz_targets`. Edit the new file to call the code you want to +fuzz; see the [`cargo-fuzz` tutorial] for examples. 
+ +[`cargo-fuzz` tutorial]: https://rust-fuzz.github.io/book/cargo-fuzz/tutorial.html diff --git a/mutable_batch_lp/fuzz/fuzz_targets/lines_converter.rs b/mutable_batch_lp/fuzz/fuzz_targets/lines_converter.rs new file mode 100644 index 00000000000..34421898f63 --- /dev/null +++ b/mutable_batch_lp/fuzz/fuzz_targets/lines_converter.rs @@ -0,0 +1,66 @@ +#![no_main] + +use hashbrown::HashMap; +use libfuzzer_sys::fuzz_target; +use mutable_batch::{column::ColumnData, MutableBatch, PartitionWrite, WritePayload}; +use mutable_batch_lp::LinesConverter; + +fuzz_target!(|data: &[u8]| { + if let Ok(body) = std::str::from_utf8(data) { + let table_partition_template = Default::default(); + let mut converter = LinesConverter::new(10); + let errors = match converter.write_lp(body) { + Ok(_) => vec![], + Err(mutable_batch_lp::Error::PerLine { lines }) => lines, + Err(other) => panic!("unexpected error: `{other}` input: `{body}`"), + }; + + if let Ok((batches, stats)) = converter.finish() { + let mut total_rows = 0; + + let mut partitions: HashMap<_, HashMap<String, MutableBatch>> = + HashMap::default(); + + for (table_name, mutable_batch) in &batches { + assert!( + mutable_batch.column("time").is_ok(), + "batch for table `{table_name}` does not have a time column: \ + {mutable_batch:#?}\ninput: `{body}`\nerrors: `{errors:#?}`" + ); + + let data = mutable_batch.column("time").unwrap().data(); + assert!( + matches!(data, ColumnData::I64(_, _)), + "expected the time column to be I64, instead got `{data:?}`.\ninput: `{body}`" + ); + + for (partition_key, partition_payload) in + PartitionWrite::partition(&mutable_batch, &table_partition_template).unwrap() + { + let partition = partitions.entry(partition_key).or_default(); + + let mut table_batch = partition + .raw_entry_mut() + .from_key(table_name.as_str()) + .or_insert_with(|| (table_name.to_owned(), MutableBatch::default())); + partition_payload + .write_to_batch(&mut table_batch.1) + .unwrap(); + } + + total_rows += mutable_batch.rows(); + } + + for (_partition_key, table_batches) in partitions { + for (_table_name, batch) in table_batches { + assert_ne!(batch.rows(), 0); + } + } + + assert_eq!( + stats.num_lines, total_rows, + "batches: {batches:#?}\ninput: `{body}`\nerrors: `{errors:#?}`" + ); + } + } +}); diff --git a/mutable_batch_lp/src/lib.rs b/mutable_batch_lp/src/lib.rs index 9d0c07c8efc..5579a431963 100644 --- a/mutable_batch_lp/src/lib.rs +++ b/mutable_batch_lp/src/lib.rs @@ -26,24 +26,37 @@ use mutable_batch::writer::Writer; use mutable_batch::MutableBatch; use snafu::{ResultExt, Snafu}; -/// Error type for line protocol conversion +const MAXIMUM_RETURNED_ERRORS: usize = 100; + +/// Error type for a conversion attempt on a set of line protocol lines #[derive(Debug, Snafu)] #[allow(missing_docs)] pub enum Error { + #[snafu(display( + "errors encountered on line(s):\n{}", + itertools::join(lines.iter(), "\n") + ))] + PerLine { lines: Vec<LineError> }, + + #[snafu(display("empty write payload"))] + EmptyPayload, +} + +/// Errors which occur independently per line +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum LineError { #[snafu(display("error parsing line {} (1-based): {}", line, source))] LineProtocol { source: influxdb_line_protocol::Error, line: usize, }, - #[snafu(display("error writing line {}: {}", line, source))] + #[snafu(display("error writing line {} (1-based): {}", line, source))] Write { source: LineWriteError, line: usize }, - #[snafu(display("empty write payload"))] - EmptyPayload, - - #[snafu(display("timestamp overflows i64"))] - TimestampOverflow, + 
#[snafu(display("timestamp overflows i64 on line {} (1-based)", line))] + TimestampOverflow { line: usize }, } /// Result type for line protocol conversion @@ -107,40 +120,83 @@ impl LinesConverter { /// [`mutable_batch::writer::Error::TypeMismatch`] /// pub fn write_lp(&mut self, lines: &str) -> Result<()> { - for (line_idx, maybe_line) in parse_lines(lines).enumerate() { - let mut line = maybe_line.context(LineProtocolSnafu { line: line_idx + 1 })?; - - if let Some(t) = line.timestamp.as_mut() { - *t = t - .checked_mul(self.timestamp_base) - .ok_or(Error::TimestampOverflow)?; - } - - self.stats.num_lines += 1; - self.stats.num_fields += line.field_set.len(); - - let measurement = line.series.measurement.as_str(); - - let (_, batch) = self - .batches - .raw_entry_mut() - .from_key(measurement) - .or_insert_with(|| (measurement.to_string(), MutableBatch::new())); + let errors = parse_lines(lines) + .enumerate() + .filter_map(|(line_idx, maybe_line)| { + maybe_line + .context(LineProtocolSnafu { line: line_idx + 1 }) + .and_then(|line| self.rebase_timestamp(line, line_idx)) + .and_then(|line| self.add_line_to_batch(line, line_idx)) + .err() + }) + .take(MAXIMUM_RETURNED_ERRORS) + .collect::>(); + + if !errors.is_empty() { + return Err(Error::PerLine { lines: errors }); + } + Ok(()) + } - // TODO: Reuse writer - let mut writer = Writer::new(batch, 1); - write_line(&mut writer, &line, self.default_time) - .context(WriteSnafu { line: line_idx + 1 })?; - writer.commit(); + fn rebase_timestamp<'a>( + &self, + mut line: ParsedLine<'a>, + line_idx: usize, + ) -> Result, LineError> { + if let Some(t) = line.timestamp.as_mut() { + let updated_timestamp = match t.checked_mul(self.timestamp_base) { + Some(t) => t, + None => return Err(LineError::TimestampOverflow { line: line_idx + 1 }), + }; + *t = updated_timestamp; } + Ok(line) + } + + fn add_line_to_batch( + &mut self, + line: ParsedLine<'_>, + line_idx: usize, + ) -> Result<(), LineError> { + let measurement = line.series.measurement.as_str(); + + let (_, batch) = self + .batches + .raw_entry_mut() + .from_key(measurement) + .or_insert_with(|| (measurement.to_string(), MutableBatch::new())); + + // TODO: Reuse writer + let mut writer = Writer::new(batch, 1); + match write_line(&mut writer, &line, self.default_time) + .context(WriteSnafu { line: line_idx + 1 }) + { + Ok(_) => { + writer.commit(); + self.stats.num_lines += 1; + self.stats.num_fields += line.field_set.len(); + } + Err(e) => return Err(e), + }; Ok(()) } /// Consume this [`LinesConverter`] returning the [`MutableBatch`] /// and the [`PayloadStatistics`] for the written data pub fn finish(self) -> Result<(HashMap, PayloadStatistics)> { - match self.batches.is_empty() { - false => Ok((self.batches, self.stats)), + let Self { batches, stats, .. } = self; + + // Keep only batches that have rows. If add_line_to_batch returned a WriteError for all + // lines of that table, there will be an empty mutable batch in `batches` that will violate + // the assumptions that the partitioner makes later. + let nonempty_batches: HashMap<_, _> = batches + .into_iter() + .filter(|(_table, batch)| batch.rows() > 0) + .collect(); + + // If there aren't any nonempty batches, then we have an empty payload. 
+ match nonempty_batches.is_empty() { + false => Ok((nonempty_batches, stats)), true => Err(Error::EmptyPayload), } } @@ -332,6 +388,7 @@ pub mod test_helpers { #[cfg(test)] mod tests { use super::*; + use ::test_helpers::assert_error; use arrow_util::assert_batches_eq; use assert_matches::assert_matches; use schema::Projection; @@ -376,6 +433,58 @@ mod tests { ); } + #[test] + fn test_partial_line_conversion() { + let lp = r#"cpu,tag1=v1,tag2=v2 val=2i 0 + cpu,tag1=v4,tag2=v1 val=2i 0 + mem,tag1=v2 ival=3i 0 + ,tag2=v2 val=3i 1 + cpu,tag1=v1,tag2=v2 fval=2.0 + bad_line + mem,tag1=v5 ival=2i 1 + "#; + + let mut converter = LinesConverter::new(5); + let result = converter.write_lp(lp); + assert_matches!( + result, + Err(Error::PerLine { lines }) if matches!(&lines[..], [LineError::LineProtocol { .. }, LineError::LineProtocol { .. }]), + "expected an error returned from write_lp(), but found {:?}", result + ); + let (batches, _) = converter.finish().unwrap(); + assert_eq!( + batches.len(), + 2, + "expected both batches are written, instead found {:?}", + batches.len(), + ); + + assert_batches_eq!( + &[ + "+------+------+------+--------------------------------+-----+", + "| fval | tag1 | tag2 | time | val |", + "+------+------+------+--------------------------------+-----+", + "| | v1 | v2 | 1970-01-01T00:00:00Z | 2 |", + "| | v4 | v1 | 1970-01-01T00:00:00Z | 2 |", + "| 2.0 | v1 | v2 | 1970-01-01T00:00:00.000000005Z | |", + "+------+------+------+--------------------------------+-----+", + ], + &[batches["cpu"].to_arrow(Projection::All).unwrap()] + ); + + assert_batches_eq!( + &[ + "+------+------+--------------------------------+", + "| ival | tag1 | time |", + "+------+------+--------------------------------+", + "| 3 | v2 | 1970-01-01T00:00:00Z |", + "| 2 | v5 | 1970-01-01T00:00:00.000000001Z |", + "+------+------+--------------------------------+", + ], + &[batches["mem"].to_arrow(Projection::All).unwrap()] + ); + } + #[test] fn test_nulls_string_and_float() { let lp = r#"m f0="cat" 1639612800000000000 @@ -501,13 +610,12 @@ m b=t 1639612800000000000 let err = lines_to_batches(lp, 5).expect_err("type conflicted write should fail"); assert_matches!(err, - Error::Write { + Error::PerLine { lines } if matches!(&lines[..], + [LineError::Write { source: LineWriteError::ConflictedFieldTypes { name }, line: 1 - } - => { - assert_eq!(name, "val"); - }); + }] if name == "val" + )); } #[test] @@ -516,13 +624,13 @@ m b=t 1639612800000000000 let err = lines_to_batches(lp, 5).expect_err("duplicate tag write should fail"); assert_matches!(err, - Error::Write { - source: LineWriteError::DuplicateTag { name }, - line: 1 - } - => { - assert_eq!(name, "tag"); - }); + Error::PerLine { lines } if matches!( + &lines[..], + [LineError::Write { + source: LineWriteError::DuplicateTag { name }, + line: 1 + }] if name == "tag" + )); } #[test] @@ -531,13 +639,13 @@ m b=t 1639612800000000000 let err = lines_to_batches(lp, 5).expect_err("duplicate tag write should fail"); assert_matches!(err, - Error::Write { - source: LineWriteError::DuplicateTag { name }, - line: 1 - } - => { - assert_eq!(name, "tag"); - }); + Error::PerLine { lines } if matches!( + &lines[..], + [LineError::Write { + source: LineWriteError::DuplicateTag { name }, + line: 1 + }] if name == "tag" + )); } // NOTE: All tags are strings, so this should never be a type conflict. 
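// [Editor's note, not part of the patch] The error-shape assertions rewritten
// above and below all follow the same pattern: the top-level error is now
// Error::PerLine, and the tests match a single-element slice of LineError
// (with a guard on the field name where relevant) instead of destructuring a
// flat Error::Write.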
@@ -547,13 +655,13 @@ m b=t 1639612800000000000 let err = lines_to_batches(lp, 5).expect_err("type conflicted write should fail"); assert_matches!(err, - Error::Write { - source: LineWriteError::DuplicateTag { name }, - line: 1 - } - => { - assert_eq!(name, "tag"); - }); + Error::PerLine { lines } if matches!( + &lines[..], + [LineError::Write { + source: LineWriteError::DuplicateTag { name }, + line: 1 + }] if name == "tag" + )); } // NOTE: disallowed in IOx but accepted in TSM @@ -564,13 +672,14 @@ m b=t 1639612800000000000 let lp = "m1,v=1i v=1i 0"; let err = lines_to_batches(lp, 5).expect_err("type conflicted write should fail"); - assert_matches!( - err, - Error::Write { - source: LineWriteError::MutableBatch { .. }, - line: 1 - } - ); + assert_matches!(err, + Error::PerLine { lines } if matches!( + &lines[..], + [LineError::Write { + source: LineWriteError::MutableBatch { .. }, + line: 1 + }] + )); } #[test] @@ -578,13 +687,49 @@ m b=t 1639612800000000000 let lp = "m1,v=1i v=1.0 0"; let err = lines_to_batches(lp, 5).expect_err("type conflicted write should fail"); - assert_matches!( - err, - Error::Write { - source: LineWriteError::MutableBatch { .. }, - line: 1 - } - ); + assert_matches!(err, + Error::PerLine { lines } if matches!( + &lines[..], + [LineError::Write { + source: LineWriteError::MutableBatch { .. }, + line: 1 + }] + )); } } + + #[test] + fn dont_add_batches_when_there_are_write_errors() { + let lp = r#"6,,=0,,=^/+\---6,,=yY\w\w\,y-/- (=" +\_/1 (=""#; + + let mut converter = LinesConverter::new(10); + let _errors = match converter.write_lp(lp) { + Ok(_) => vec![], + Err(Error::PerLine { lines }) => lines, + Err(other) => panic!("unexpected error: `{other}` input: `{lp}`"), + }; + + assert_error!(converter.finish(), Error::EmptyPayload); + } + + #[test] + fn dont_add_stats_when_there_are_write_errors() { + let lp = "cpu,tag1=v1,tag2=v2 val=2i 0 +cpu val=4u"; + + let mut converter = LinesConverter::new(10); + // The second line has a different type for val + converter.write_lp(lp).unwrap_err(); + let (batches, stats) = converter.finish().unwrap(); + + let total_rows: usize = batches.iter().map(|(_table, batch)| batch.rows()).sum(); + assert_eq!(stats.num_lines, total_rows); + } + + #[test] + fn duplicate_field_names_when_one_contains_optional_escaping_doesnt_panic() { + let lp = "table ,field=33,\\,field=333"; + lines_to_batches(lp, 5).unwrap(); + } } diff --git a/mutable_batch_pb/Cargo.toml b/mutable_batch_pb/Cargo.toml index c9bc27acbd0..5af7558c399 100644 --- a/mutable_batch_pb/Cargo.toml +++ b/mutable_batch_pb/Cargo.toml @@ -6,6 +6,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] arrow_util = { path = "../arrow_util" } dml = { path = "../dml" } @@ -13,9 +16,10 @@ generated_types = { path = "../generated_types" } hashbrown = { workspace = true } mutable_batch = { path = "../mutable_batch" } schema = { path = "../schema" } -snafu = "0.7" +snafu = "0.8" workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] -mutable_batch_lp = { path = "../mutable_batch_lp" } data_types = { path = "../data_types" } +mutable_batch_lp = { path = "../mutable_batch_lp" } +partition = { path = "../partition" } diff --git a/mutable_batch_pb/src/decode.rs b/mutable_batch_pb/src/decode.rs index 4c334b0ef39..5cb4fa528dc 100644 --- a/mutable_batch_pb/src/decode.rs +++ b/mutable_batch_pb/src/decode.rs @@ -4,6 +4,7 @@ use generated_types::influxdata::pbdata::v1::{ column::{SemanticType, Values as 
PbValues}, Column as PbColumn, DatabaseBatch, PackedStrings, TableBatch, }; +use generated_types::DecodeError; use hashbrown::{HashMap, HashSet}; use mutable_batch::{writer::Writer, MutableBatch}; use schema::{InfluxColumnType, InfluxFieldType, TIME_COLUMN_NAME}; @@ -49,6 +50,9 @@ pub enum Error { #[snafu(display("column \"{}\" contains more than one type of values", column))] MultipleValues { column: String }, + #[snafu(display("unknown type for column {column}: {source}"))] + UnknownType { source: DecodeError, column: String }, + #[snafu(display("cannot infer type for column: {}", column))] InvalidType { column: String }, } @@ -365,12 +369,14 @@ fn pb_column_type(col: &PbColumn) -> Result { })?; let value_type = pb_value_type(&col.column_name, values)?; - let semantic_type = SemanticType::from_i32(col.semantic_type); + let semantic_type = SemanticType::try_from(col.semantic_type).context(UnknownTypeSnafu { + column: &col.column_name, + })?; match (semantic_type, value_type) { - (Some(SemanticType::Tag), InfluxFieldType::String) => Ok(InfluxColumnType::Tag), - (Some(SemanticType::Field), field) => Ok(InfluxColumnType::Field(field)), - (Some(SemanticType::Time), InfluxFieldType::Integer) + (SemanticType::Tag, InfluxFieldType::String) => Ok(InfluxColumnType::Tag), + (SemanticType::Field, field) => Ok(InfluxColumnType::Field(field)), + (SemanticType::Time, InfluxFieldType::Integer) if col.column_name.as_str() == TIME_COLUMN_NAME => { Ok(InfluxColumnType::Timestamp) diff --git a/mutable_batch_pb/src/lib.rs b/mutable_batch_pb/src/lib.rs index f42f556264d..6babb30310c 100644 --- a/mutable_batch_pb/src/lib.rs +++ b/mutable_batch_pb/src/lib.rs @@ -20,6 +20,8 @@ use data_types as _; #[cfg(test)] use mutable_batch_lp as _; +#[cfg(test)] +use partition as _; use workspace_hack as _; pub mod decode; diff --git a/mutable_batch_pb/tests/encode.rs b/mutable_batch_pb/tests/encode.rs index f43c2208e9f..2fd818bd038 100644 --- a/mutable_batch_pb/tests/encode.rs +++ b/mutable_batch_pb/tests/encode.rs @@ -1,7 +1,8 @@ use arrow_util::assert_batches_eq; use data_types::PartitionKey; -use mutable_batch::{writer::Writer, MutableBatch, PartitionWrite, WritePayload}; +use mutable_batch::{writer::Writer, MutableBatch, WritePayload}; use mutable_batch_pb::{decode::write_table_batch, encode::encode_batch}; +use partition::PartitionWrite; use schema::Projection; #[test] diff --git a/mutable_batch_tests/Cargo.toml b/mutable_batch_tests/Cargo.toml index 5b44ada698f..d6e14947556 100644 --- a/mutable_batch_tests/Cargo.toml +++ b/mutable_batch_tests/Cargo.toml @@ -6,19 +6,26 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] flate2 = "1.0" [dev-dependencies] bytes = "1.5" -criterion = { version = "0.5", default-features = false, features = ["rayon"]} +criterion = { version = "0.5", default-features = false, features = ["rayon"] } data_types = { path = "../data_types", default-features = false } dml = { path = "../dml" } generated_types = { path = "../generated_types" } mutable_batch = { path = "../mutable_batch" } mutable_batch_lp = { path = "../mutable_batch_lp" } mutable_batch_pb = { path = "../mutable_batch_pb" } -prost = "0.11" +prost = { workspace = true } + +[[bench]] +name = "statistics" +harness = false [[bench]] name = "write_lp" @@ -28,7 +35,6 @@ harness = false name = "write_pb" harness = false - [lib] # Allow --save-baseline to work # https://github.com/bheisler/criterion.rs/issues/275 diff --git a/mutable_batch_tests/benches/statistics.rs 
b/mutable_batch_tests/benches/statistics.rs new file mode 100644 index 00000000000..ef8d14cf16f --- /dev/null +++ b/mutable_batch_tests/benches/statistics.rs @@ -0,0 +1,184 @@ +use std::hint::black_box; + +use criterion::{criterion_group, criterion_main, BatchSize, Criterion, Throughput}; + +use data_types::StatValues; +use mutable_batch::{ + column::{recompute_min_max, Column, ColumnData}, + writer::Writer, + MutableBatch, +}; + +const N_VALUES: usize = 16_000; // Must be multiple of 8 + +fn generate_f64() -> Column { + let mut mb = MutableBatch::default(); + + let mut w = Writer::new(&mut mb, N_VALUES); + let mask = std::iter::repeat(0b01010101) + .take(N_VALUES / 8) + .collect::>(); + + let values = (0..).map(|v| v as f64).take(N_VALUES / 2); + + w.write_f64("v", Some(mask.as_slice()), values) + .expect("failed to generate test column"); + + w.commit(); + + mb.column("v").unwrap().clone() +} + +fn generate_u64() -> Column { + let mut mb = MutableBatch::default(); + + let mut w = Writer::new(&mut mb, N_VALUES); + let mask = std::iter::repeat(0b01010101) + .take(N_VALUES / 8) + .collect::>(); + + let values = (0..).map(|v| v as u64).take(N_VALUES / 2); + + w.write_u64("v", Some(mask.as_slice()), values) + .expect("failed to generate test column"); + + w.commit(); + + mb.column("v").unwrap().clone() +} + +fn generate_bool() -> Column { + let mut mb = MutableBatch::default(); + + let mut w = Writer::new(&mut mb, N_VALUES); + let mask = std::iter::repeat(0b01010101) + .take(N_VALUES / 8) + .collect::>(); + + let values = (0..).map(|v| v & 1 == 0).take(N_VALUES / 2); + + w.write_bool("v", Some(mask.as_slice()), values) + .expect("failed to generate test column"); + + w.commit(); + + mb.column("v").unwrap().clone() +} + +fn generate_tag() -> Column { + let mut mb = MutableBatch::default(); + + let mut w = Writer::new(&mut mb, N_VALUES); + let mask = std::iter::repeat(0b01010101) + .take(N_VALUES / 8) + .collect::>(); + + let values = (0..) 
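// [Editor's note, not part of the patch] As in the other generators in this
// new benchmark, the 0b01010101 validity mask marks every other slot NULL, so
// only N_VALUES / 2 values are produced; the `% 100` below keeps the tag
// dictionary at 100 distinct entries. The benchmark is run via
// `cargo bench --bench statistics` from the mutable_batch_tests crate, per the
// [[bench]] entry added to its Cargo.toml above.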
+ .map(|v| (v % 100).to_string()) + .take(N_VALUES / 2) + .collect::>(); + + w.write_tag( + "v", + Some(mask.as_slice()), + values.iter().map(|v| v.as_str()), + ) + .expect("failed to generate test column"); + + w.commit(); + + mb.column("v").unwrap().clone() +} + +fn bench_rebuild(data: &mut Column) { + recompute_min_max(data); +} + +fn bench_stats(col: &Column) { + match col.data() { + ColumnData::F64(data, _) => { + let mut s = StatValues::new(None, None, N_VALUES as _, None); + for (i, v) in data.iter().enumerate() { + if col.valid_mask().get(i) { + s.update(v) + } + } + black_box(s); + } + ColumnData::I64(data, _) => { + let mut s = StatValues::new(None, None, N_VALUES as _, None); + for (i, v) in data.iter().enumerate() { + if col.valid_mask().get(i) { + s.update(v) + } + } + black_box(s); + } + ColumnData::U64(data, _) => { + let mut s = StatValues::new(None, None, N_VALUES as _, None); + for (i, v) in data.iter().enumerate() { + if col.valid_mask().get(i) { + s.update(v) + } + } + black_box(s); + } + ColumnData::Bool(data, _) => { + let mut s = StatValues::new(None, None, N_VALUES as _, None); + for (i, v) in data.iter().enumerate() { + if col.valid_mask().get(i) { + s.update(&v) + } + } + black_box(s); + } + ColumnData::String(data, _) => { + let mut s = StatValues::new(None, None, N_VALUES as _, None); + for (i, v) in data.iter().enumerate() { + if col.valid_mask().get(i) { + s.update(v) + } + } + black_box(s); + } + ColumnData::Tag(data, dict, _) => { + let mut s = StatValues::new(None, None, N_VALUES as _, None); + for (i, id) in data.iter().enumerate() { + if col.valid_mask().get(i) { + s.update(dict.lookup_id(*id).unwrap()) + } + } + black_box(s); + } + } +} + +fn run_bench(col: Column, c: &mut Criterion) { + let mut group = c.benchmark_group(col.data().to_string()); + group.throughput(Throughput::Bytes(col.size() as u64)); + group.bench_function("StatValues", |b| { + b.iter(|| { + bench_stats(&col); + }); + }); + group.bench_function("recompute_min_max", |b| { + b.iter_batched( + || col.clone(), + |mut col| { + bench_rebuild(&mut col); + }, + BatchSize::SmallInput, + ); + }); + group.finish(); +} + +pub fn bench_statistics(c: &mut Criterion) { + run_bench(generate_f64(), c); + run_bench(generate_u64(), c); + run_bench(generate_bool(), c); + run_bench(generate_tag(), c); +} + +criterion_group!(benches, bench_statistics); +criterion_main!(benches); diff --git a/object_store_metrics/Cargo.toml b/object_store_metrics/Cargo.toml index 216213bd47b..3a9488516b8 100644 --- a/object_store_metrics/Cargo.toml +++ b/object_store_metrics/Cargo.toml @@ -5,17 +5,20 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # In alphabetical order -async-trait = "0.1.73" +async-trait = "0.1.77" bytes = "1.5" futures = "0.3" iox_time = { version = "0.1.0", path = "../iox_time" } metric = { version = "0.1.0", path = "../metric" } object_store = { workspace = true } pin-project = "1.1.3" -tokio = { version = "1.32", features = ["io-util"] } +tokio = { version = "1.35", features = ["io-util"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] # In alphabetical order -snafu = "0.7" -tokio = { version = "1.32", features = ["macros", "io-util"] } +snafu = "0.8" +tokio = { version = "1.35", features = ["macros", "io-util"] } diff --git a/object_store_metrics/src/dummy.rs b/object_store_metrics/src/dummy.rs index d25325b1855..9960e2ed910 100644 --- a/object_store_metrics/src/dummy.rs +++ 
b/object_store_metrics/src/dummy.rs @@ -3,11 +3,13 @@ use async_trait::async_trait; use bytes::Bytes; +use futures::StreamExt; use snafu::Snafu; use std::ops::Range; use object_store::{ - path::Path, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, + path::Path, Error as ObjectStoreError, GetOptions, GetResult, ListResult, MultipartId, + ObjectMeta, ObjectStore, PutOptions, PutResult, Result, }; use tokio::io::AsyncWrite; @@ -36,13 +38,13 @@ impl From for object_store::Error { #[derive(Debug, Clone)] #[allow(missing_copy_implementations)] /// An object store that always generates an error -pub struct DummyObjectStore { +pub(crate) struct DummyObjectStore { name: &'static str, } impl DummyObjectStore { /// Create a new [`DummyObjectStore`] that always fails - pub fn new(name: &'static str) -> Self { + pub(crate) fn new(name: &'static str) -> Self { Self { name } } } @@ -55,7 +57,12 @@ impl std::fmt::Display for DummyObjectStore { #[async_trait] impl ObjectStore for DummyObjectStore { - async fn put(&self, _location: &Path, _bytes: Bytes) -> Result<()> { + async fn put_opts( + &self, + _location: &Path, + _bytes: Bytes, + _opts: PutOptions, + ) -> Result { Ok(NotSupportedSnafu { name: self.name }.fail()?) } @@ -90,11 +97,16 @@ impl ObjectStore for DummyObjectStore { Ok(NotSupportedSnafu { name: self.name }.fail()?) } - async fn list( - &self, - _prefix: Option<&Path>, - ) -> Result>> { - Ok(NotSupportedSnafu { name: self.name }.fail()?) + fn list(&self, _prefix: Option<&Path>) -> futures::stream::BoxStream<'_, Result> { + futures::stream::once(async move { + NotSupportedSnafu { name: self.name } + .fail() + .map_err(|e| ObjectStoreError::Generic { + store: self.name, + source: Box::new(e), + }) + }) + .boxed() } async fn list_with_delimiter(&self, _prefix: Option<&Path>) -> Result { diff --git a/object_store_metrics/src/lib.rs b/object_store_metrics/src/lib.rs index c7d6812bdb8..09292baf450 100644 --- a/object_store_metrics/src/lib.rs +++ b/object_store_metrics/src/lib.rs @@ -14,7 +14,7 @@ unused_crate_dependencies )] -use object_store::{GetOptions, GetResultPayload}; +use object_store::{GetOptions, GetResultPayload, PutOptions, PutResult}; // Workaround for "unused crate" lint false positives. use workspace_hack as _; @@ -41,6 +41,118 @@ use tokio::io::AsyncWrite; #[cfg(test)] mod dummy; +#[derive(Debug, Clone)] +struct Metrics { + success_duration: DurationHistogram, + error_duration: DurationHistogram, +} + +impl Metrics { + fn new(registry: &metric::Registry, op: &'static str) -> Self { + // Call durations broken down by op & result + let duration: Metric = registry.register_metric( + "object_store_op_duration", + "object store operation duration", + ); + + Self { + success_duration: duration.recorder(&[("op", op), ("result", "success")]), + error_duration: duration.recorder(&[("op", op), ("result", "error")]), + } + } + + fn record(&self, t_begin: Time, t_end: Time, success: bool) { + // Avoid exploding if time goes backwards - simply drop the measurement + // if it happens. 
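// [Editor's note, not part of the patch] checked_duration_since() yields None
// when t_end precedes t_begin (the time provider is not guaranteed monotonic),
// so such a sample is silently discarded rather than recorded as zero or
// allowed to underflow.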
+ let Some(delta) = t_end.checked_duration_since(t_begin) else { + return; + }; + + if success { + self.success_duration.record(delta); + } else { + self.error_duration.record(delta); + } + } +} + +#[derive(Debug, Clone)] +struct MetricsWithBytes { + inner: Metrics, + success_bytes: U64Counter, + error_bytes: U64Counter, +} + +impl MetricsWithBytes { + fn new(registry: &metric::Registry, op: &'static str) -> Self { + // Byte counts up/down + let bytes = registry.register_metric::( + "object_store_transfer_bytes", + "cumulative count of file content bytes transferred to/from the object store", + ); + + Self { + inner: Metrics::new(registry, op), + success_bytes: bytes.recorder(&[("op", op), ("result", "success")]), + error_bytes: bytes.recorder(&[("op", op), ("result", "error")]), + } + } + + fn record_bytes_only(&self, success: bool, bytes: u64) { + if success { + self.success_bytes.inc(bytes); + } else { + self.error_bytes.inc(bytes); + } + } + + fn record(&self, t_begin: Time, t_end: Time, success: bool, bytes: Option) { + if let Some(bytes) = bytes { + self.record_bytes_only(success, bytes); + } + + self.inner.record(t_begin, t_end, success); + } +} + +#[derive(Debug, Clone)] +struct MetricsWithCount { + inner: Metrics, + success_count: U64Counter, + error_count: U64Counter, +} + +impl MetricsWithCount { + fn new(registry: &metric::Registry, op: &'static str) -> Self { + let count = registry.register_metric::( + "object_store_transfer_objects", + "cumulative count of objects transferred to/from the object store", + ); + + Self { + inner: Metrics::new(registry, op), + success_count: count.recorder(&[("op", op), ("result", "success")]), + error_count: count.recorder(&[("op", op), ("result", "error")]), + } + } + + fn record_count_only(&self, success: bool, count: u64) { + if success { + self.success_count.inc(count); + } else { + self.error_count.inc(count); + } + } + + fn record(&self, t_begin: Time, t_end: Time, success: bool, count: Option) { + if let Some(count) = count { + self.record_count_only(success, count); + } + + self.inner.record(t_begin, t_end, success); + } +} + /// An instrumentation decorator, wrapping an underlying [`ObjectStore`] /// implementation and recording bytes transferred and call latency. 
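// [Editor's sketch, not part of the patch] Wiring the decorator up, assuming
// the constructor shown further down and the InMemory store / SystemProvider
// already used by the tests in this file:
//
//     let registry = Arc::new(metric::Registry::default());
//     let time = Arc::new(SystemProvider::new());
//     let store = ObjectStoreMetrics::new(Arc::new(InMemory::new()), time, &registry);
//
// Every ObjectStore call made through `store` is then recorded under
// object_store_op_duration, object_store_transfer_bytes and
// object_store_transfer_objects, labelled by ("op", "result").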
/// @@ -92,26 +204,20 @@ pub struct ObjectStoreMetrics { inner: Arc, time_provider: Arc, - put_success_duration: DurationHistogram, - put_error_duration: DurationHistogram, - put_bytes: U64Counter, - - get_success_duration: DurationHistogram, - get_error_duration: DurationHistogram, - get_bytes: U64Counter, - - get_range_success_duration: DurationHistogram, - get_range_error_duration: DurationHistogram, - get_range_bytes: U64Counter, - - head_success_duration: DurationHistogram, - head_error_duration: DurationHistogram, - - delete_success_duration: DurationHistogram, - delete_error_duration: DurationHistogram, - - list_success_duration: DurationHistogram, - list_error_duration: DurationHistogram, + put: MetricsWithBytes, + get: MetricsWithBytes, + get_range: MetricsWithBytes, + get_ranges: MetricsWithBytes, + head: Metrics, + delete: Metrics, + delete_stream: MetricsWithCount, + list: MetricsWithCount, + list_with_offset: MetricsWithCount, + list_with_delimiter: MetricsWithCount, + copy: Metrics, + rename: Metrics, + copy_if_not_exists: Metrics, + rename_if_not_exists: Metrics, } impl ObjectStoreMetrics { @@ -121,65 +227,24 @@ impl ObjectStoreMetrics { time_provider: Arc, registry: &metric::Registry, ) -> Self { - // Byte counts up/down - let bytes = registry.register_metric::( - "object_store_transfer_bytes", - "cumulative count of file content bytes transferred to/from the object store", - ); - let put_bytes = bytes.recorder(&[("op", "put")]); - let get_bytes = bytes.recorder(&[("op", "get")]); - let get_range_bytes = bytes.recorder(&[("op", "get_range")]); - - // Call durations broken down by op & result - let duration: Metric = registry.register_metric( - "object_store_op_duration", - "object store operation duration", - ); - - let put_success_duration = duration.recorder(&[("op", "put"), ("result", "success")]); - let put_error_duration = duration.recorder(&[("op", "put"), ("result", "error")]); - - let get_success_duration = duration.recorder(&[("op", "get"), ("result", "success")]); - let get_error_duration = duration.recorder(&[("op", "get"), ("result", "error")]); - - let get_range_success_duration = - duration.recorder(&[("op", "get_range"), ("result", "success")]); - let get_range_error_duration = - duration.recorder(&[("op", "get_range"), ("result", "error")]); - - let head_success_duration = duration.recorder(&[("op", "head"), ("result", "success")]); - let head_error_duration = duration.recorder(&[("op", "head"), ("result", "error")]); - - let delete_success_duration = duration.recorder(&[("op", "delete"), ("result", "success")]); - let delete_error_duration = duration.recorder(&[("op", "delete"), ("result", "error")]); - - let list_success_duration = duration.recorder(&[("op", "list"), ("result", "success")]); - let list_error_duration = duration.recorder(&[("op", "list"), ("result", "error")]); - Self { inner, time_provider, - put_success_duration, - put_error_duration, - put_bytes, - - get_bytes, - get_success_duration, - get_error_duration, - - get_range_bytes, - get_range_success_duration, - get_range_error_duration, - - head_success_duration, - head_error_duration, - - delete_success_duration, - delete_error_duration, - - list_success_duration, - list_error_duration, + put: MetricsWithBytes::new(registry, "put"), + get: MetricsWithBytes::new(registry, "get"), + get_range: MetricsWithBytes::new(registry, "get_range"), + get_ranges: MetricsWithBytes::new(registry, "get_ranges"), + head: Metrics::new(registry, "head"), + delete: Metrics::new(registry, "delete"), + 
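// [Editor's note, not part of the patch] One helper instance per op replaces
// the hand-rolled duration/byte recorder fields above; ops that previously
// were not instrumented at all (get_ranges, delete_stream, list_with_offset,
// copy, rename and the *_if_not_exists variants), plus list_with_delimiter,
// which used to share the "list" label, each get their own "op" label here.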
delete_stream: MetricsWithCount::new(registry, "delete_stream"), + list: MetricsWithCount::new(registry, "list"), + list_with_offset: MetricsWithCount::new(registry, "list_with_offset"), + list_with_delimiter: MetricsWithCount::new(registry, "list_with_delimiter"), + copy: Metrics::new(registry, "copy"), + rename: Metrics::new(registry, "rename"), + copy_if_not_exists: Metrics::new(registry, "copy_if_not_exists"), + rename_if_not_exists: Metrics::new(registry, "rename_if_not_exists"), } } } @@ -192,22 +257,12 @@ impl std::fmt::Display for ObjectStoreMetrics { #[async_trait] impl ObjectStore for ObjectStoreMetrics { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { let t = self.time_provider.now(); - let size = bytes.len(); - let res = self.inner.put(location, bytes).await; - self.put_bytes.inc(size as _); - - // Avoid exploding if time goes backwards - simply drop the measurement - // if it happens. - if let Some(delta) = self.time_provider.now().checked_duration_since(t) { - match &res { - Ok(_) => self.put_success_duration.record(delta), - Err(_) => self.put_error_duration.record(delta), - }; - } - + let res = self.inner.put_opts(location, bytes, opts).await; + self.put + .record(t, self.time_provider.now(), res.is_ok(), Some(size as _)); res } @@ -231,15 +286,12 @@ impl ObjectStore for ObjectStoreMetrics { Ok(mut res) => { res.payload = match res.payload { GetResultPayload::File(file, path) => { - // Record the file size in bytes and time the inner call took. - if let Ok(m) = file.metadata() { - self.get_bytes.inc(m.len()); - if let Some(d) = - self.time_provider.now().checked_duration_since(started_at) - { - self.get_success_duration.record(d) - } - } + self.get.record( + started_at, + self.time_provider.now(), + true, + file.metadata().ok().map(|m| m.len()), + ); GetResultPayload::File(file, path) } GetResultPayload::Stream(s) => { @@ -249,9 +301,7 @@ impl ObjectStore for ObjectStoreMetrics { StreamMetricRecorder::new( s, started_at, - self.get_success_duration.clone(), - self.get_error_duration.clone(), - BytesStreamDelegate(self.get_bytes.clone()), + BytesStreamDelegate::new(self.get.clone()), ) .fuse(), ))) @@ -260,10 +310,8 @@ impl ObjectStore for ObjectStoreMetrics { Ok(res) } Err(e) => { - // Record the call duration in the error histogram. - if let Some(delta) = self.time_provider.now().checked_duration_since(started_at) { - self.get_error_duration.record(delta); - } + self.get + .record(started_at, self.time_provider.now(), false, None); Err(e) } } @@ -271,113 +319,135 @@ impl ObjectStore for ObjectStoreMetrics { async fn get_range(&self, location: &Path, range: Range) -> Result { let t = self.time_provider.now(); - let res = self.inner.get_range(location, range).await; + self.get_range.record( + t, + self.time_provider.now(), + res.is_ok(), + res.as_ref().ok().map(|b| b.len() as _), + ); + res + } - // Avoid exploding if time goes backwards - simply drop the measurement - // if it happens. 
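// [Editor's note, not part of the patch] In the upgraded object_store API,
// list() returns the stream directly rather than a Result wrapped in a future,
// so there is no single call to time; the duration and the object count are
// recorded by the stream decorator below once the listing completes (or is
// dropped part-way through).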
- if let Some(delta) = self.time_provider.now().checked_duration_since(t) { - match &res { - Ok(data) => { - self.get_range_success_duration.record(delta); - self.get_range_bytes.inc(data.len() as _); - } - Err(_) => self.get_range_error_duration.record(delta), - }; - } - + async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { + let t = self.time_provider.now(); + let res = self.inner.get_ranges(location, ranges).await; + self.get_ranges.record( + t, + self.time_provider.now(), + res.is_ok(), + res.as_ref() + .ok() + .map(|b| b.iter().map(|b| b.len() as u64).sum()), + ); res } async fn head(&self, location: &Path) -> Result { let t = self.time_provider.now(); - let res = self.inner.head(location).await; - - // Avoid exploding if time goes backwards - simply drop the measurement - // if it happens. - if let Some(delta) = self.time_provider.now().checked_duration_since(t) { - match &res { - Ok(_) => self.head_success_duration.record(delta), - Err(_) => self.head_error_duration.record(delta), - }; - } - + self.head.record(t, self.time_provider.now(), res.is_ok()); res } async fn delete(&self, location: &Path) -> Result<()> { let t = self.time_provider.now(); - let res = self.inner.delete(location).await; + self.delete.record(t, self.time_provider.now(), res.is_ok()); + res + } - // Avoid exploding if time goes backwards - simply drop the measurement - // if it happens. - if let Some(delta) = self.time_provider.now().checked_duration_since(t) { - match &res { - Ok(_) => self.delete_success_duration.record(delta), - Err(_) => self.delete_error_duration.record(delta), - }; - } + fn delete_stream<'a>( + &'a self, + locations: BoxStream<'a, Result>, + ) -> BoxStream<'a, Result> { + let started_at = self.time_provider.now(); - res + let s = self.inner.delete_stream(locations); + + // Wrap the object store data stream in a decorator to track the + // yielded data / wall clock, inclusive of the inner call above. + StreamMetricRecorder::new( + s, + started_at, + CountStreamDelegate::new(self.delete_stream.clone()), + ) + .fuse() + .boxed() } - async fn list(&self, prefix: Option<&Path>) -> Result>> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { let started_at = self.time_provider.now(); - let res = self.inner.list(prefix).await; + let s = self.inner.list(prefix); - match res { - Ok(s) => { - // Wrap the object store data stream in a decorator to track the - // yielded data / wall clock, inclusive of the inner call above. - Ok(Box::pin(Box::new( - StreamMetricRecorder::new( - s, - started_at, - self.list_success_duration.clone(), - self.list_error_duration.clone(), - NopStreamDelegate::default(), - ) - .fuse(), - ))) - } - Err(e) => { - // Record the call duration in the error histogram. - if let Some(delta) = self.time_provider.now().checked_duration_since(started_at) { - self.list_error_duration.record(delta); - } - Err(e) - } - } + // Wrap the object store data stream in a decorator to track the + // yielded data / wall clock, inclusive of the inner call above. 
+ StreamMetricRecorder::new(s, started_at, CountStreamDelegate::new(self.list.clone())) + .fuse() + .boxed() } - async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { - let t = self.time_provider.now(); + fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> BoxStream<'_, Result> { + let started_at = self.time_provider.now(); - let res = self.inner.list_with_delimiter(prefix).await; + let s = self.inner.list_with_offset(prefix, offset); - // Avoid exploding if time goes backwards - simply drop the measurement - // if it happens. - if let Some(delta) = self.time_provider.now().checked_duration_since(t) { - match &res { - Ok(_) => self.list_success_duration.record(delta), - Err(_) => self.list_error_duration.record(delta), - }; - } + // Wrap the object store data stream in a decorator to track the + // yielded data / wall clock, inclusive of the inner call above. + StreamMetricRecorder::new( + s, + started_at, + CountStreamDelegate::new(self.list_with_offset.clone()), + ) + .fuse() + .boxed() + } + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + let t = self.time_provider.now(); + let res = self.inner.list_with_delimiter(prefix).await; + self.list_with_delimiter.record( + t, + self.time_provider.now(), + res.is_ok(), + res.as_ref().ok().map(|res| res.objects.len() as _), + ); res } async fn copy(&self, from: &Path, to: &Path) -> Result<()> { - // TODO: Instrument me - self.inner.copy(from, to).await + let t = self.time_provider.now(); + let res = self.inner.copy(from, to).await; + self.copy.record(t, self.time_provider.now(), res.is_ok()); + res + } + + async fn rename(&self, from: &Path, to: &Path) -> Result<()> { + let t = self.time_provider.now(); + let res = self.inner.rename(from, to).await; + self.rename.record(t, self.time_provider.now(), res.is_ok()); + res } async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { - // TODO: Instrument me - self.inner.copy_if_not_exists(from, to).await + let t = self.time_provider.now(); + let res = self.inner.copy_if_not_exists(from, to).await; + self.copy_if_not_exists + .record(t, self.time_provider.now(), res.is_ok()); + res + } + + async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + let t = self.time_provider.now(); + let res = self.inner.rename_if_not_exists(from, to).await; + self.rename_if_not_exists + .record(t, self.time_provider.now(), res.is_ok()); + res } } @@ -386,8 +456,12 @@ impl ObjectStore for ObjectStoreMetrics { trait MetricDelegate { /// The type this delegate observes. type Item; + /// Invoked when the stream yields an `Ok(Item)`. fn observe_ok(&self, value: &Self::Item); + + /// Finish stream. + fn finish(&self, t_begin: Time, t_end: Time, success: bool); } /// A [`MetricDelegate`] for instrumented streams of [`Bytes`]. @@ -395,30 +469,44 @@ trait MetricDelegate { /// This impl is used to record the number of bytes yielded for /// [`ObjectStore::get()`] calls. 
#[derive(Debug)] -struct BytesStreamDelegate(U64Counter); +struct BytesStreamDelegate(MetricsWithBytes); + +impl BytesStreamDelegate { + fn new(metrics: MetricsWithBytes) -> Self { + Self(metrics) + } +} impl MetricDelegate for BytesStreamDelegate { type Item = Bytes; fn observe_ok(&self, bytes: &Self::Item) { - self.0.inc(bytes.len() as _); + self.0.record_bytes_only(true, bytes.len() as _); + } + + fn finish(&self, t_begin: Time, t_end: Time, success: bool) { + self.0.record(t_begin, t_end, success, None); } } #[derive(Debug)] -struct NopStreamDelegate(PhantomData); +struct CountStreamDelegate(MetricsWithCount, PhantomData); -impl Default for NopStreamDelegate { - fn default() -> Self { - Self(Default::default()) +impl CountStreamDelegate { + fn new(metrics: MetricsWithCount) -> Self { + Self(metrics, Default::default()) } } -impl MetricDelegate for NopStreamDelegate { +impl MetricDelegate for CountStreamDelegate { type Item = T; fn observe_ok(&self, _value: &Self::Item) { - // it does nothing! + self.0.record_count_only(true, 1); + } + + fn finish(&self, t_begin: Time, t_end: Time, success: bool) { + self.0.record(t_begin, t_end, success, None); } } @@ -472,9 +560,6 @@ where // Called when the stream yields an `Ok(T)` to allow the delegate to inspect // the `T`. metric_delegate: D, - - success_duration: DurationHistogram, - error_duration: DurationHistogram, } impl StreamMetricRecorder @@ -482,13 +567,7 @@ where S: Stream, D: MetricDelegate, { - fn new( - stream: S, - started_at: Time, - success_duration: DurationHistogram, - error_duration: DurationHistogram, - metric_delegate: D, - ) -> Self { + fn new(stream: S, started_at: Time, metric_delegate: D) -> Self { let time_provider = SystemProvider::default(); Self { inner: stream, @@ -504,8 +583,6 @@ where started_at, time_provider, - success_duration, - error_duration, metric_delegate, } } @@ -542,21 +619,13 @@ where Poll::Ready(None) => { // The stream has terminated - record the wall clock duration // immediately. - let hist = match this.last_call_ok { - true => this.success_duration, - false => this.error_duration, - }; - - // Take the last_yielded_at option, marking metrics as emitted - // so the drop impl does not duplicate them. - if let Some(d) = this - .last_yielded_at - .take() - .expect("no last_yielded_at value for fused stream") - .checked_duration_since(*this.started_at) - { - hist.record(d) - } + this.metric_delegate.finish( + *this.started_at, + this.last_yielded_at + .take() + .expect("no last_yielded_at value for fused stream"), + *this.last_call_ok, + ); Poll::Ready(None) } @@ -581,14 +650,8 @@ where // Only emit metrics if the end of the stream was not observed (and // therefore last_yielded_at is still Some). 
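// [Editor's note, not part of the patch] When the stream runs to completion,
// Poll::Ready(None) above takes last_yielded_at, so this drop path becomes a
// no-op and the duration is not double-counted; for a stream dropped part-way
// through, finish() attributes the wall clock to the success or error
// histogram according to the last item observed.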
if let Some(last) = self.last_yielded_at { - let hist = match self.last_call_ok { - true => &self.success_duration, - false => &self.error_duration, - }; - - if let Some(d) = last.checked_duration_since(self.started_at) { - hist.record(d) - } + self.metric_delegate + .finish(self.started_at, last, self.last_call_ok); } } } @@ -601,7 +664,7 @@ mod tests { time::Duration, }; - use futures::stream; + use futures::{stream, TryStreamExt}; use metric::Attributes; use std::io::Read; @@ -610,6 +673,7 @@ mod tests { use super::*; + #[track_caller] fn assert_histogram_hit( metrics: &metric::Registry, name: &'static str, @@ -626,6 +690,24 @@ mod tests { assert!(hit_count > 0, "metric {name} did not record any calls"); } + #[track_caller] + fn assert_histogram_not_hit( + metrics: &metric::Registry, + name: &'static str, + attr: [(&'static str, &'static str); N], + ) { + let histogram = metrics + .get_instrument::>(name) + .expect("failed to read histogram") + .get_observer(&Attributes::from(&attr)) + .expect("failed to get observer") + .fetch(); + + let hit_count = histogram.sample_count(); + assert!(hit_count == 0, "metric {name} did record {hit_count} calls"); + } + + #[track_caller] fn assert_counter_value( metrics: &metric::Registry, name: &'static str, @@ -656,7 +738,12 @@ mod tests { .await .expect("put should succeed"); - assert_counter_value(&metrics, "object_store_transfer_bytes", [("op", "put")], 5); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "put"), ("result", "success")], + 5, + ); assert_histogram_hit( &metrics, "object_store_op_duration", @@ -679,7 +766,12 @@ mod tests { .await .expect_err("put should error"); - assert_counter_value(&metrics, "object_store_transfer_bytes", [("op", "put")], 5); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "put"), ("result", "error")], + 5, + ); assert_histogram_hit( &metrics, "object_store_op_duration", @@ -691,12 +783,71 @@ mod tests { async fn test_list() { let metrics = Arc::new(metric::Registry::default()); let store = Arc::new(InMemory::new()); + store + .put(&Path::from("foo"), Bytes::default()) + .await + .unwrap(); + store + .put(&Path::from("bar"), Bytes::default()) + .await + .unwrap(); + let time = Arc::new(SystemProvider::new()); + let store = ObjectStoreMetrics::new(store, time, &metrics); + + store.list(None).try_collect::>().await.unwrap(); + + assert_counter_value( + &metrics, + "object_store_transfer_objects", + [("op", "list"), ("result", "success")], + 2, + ); + assert_histogram_hit( + &metrics, + "object_store_op_duration", + [("op", "list"), ("result", "success")], + ); + } + + #[tokio::test] + async fn test_list_with_offset() { + let metrics = Arc::new(metric::Registry::default()); + let store = Arc::new(InMemory::new()); + store + .put(&Path::from("foo"), Bytes::default()) + .await + .unwrap(); + store + .put(&Path::from("bar"), Bytes::default()) + .await + .unwrap(); + store + .put(&Path::from("baz"), Bytes::default()) + .await + .unwrap(); let time = Arc::new(SystemProvider::new()); let store = ObjectStoreMetrics::new(store, time, &metrics); - store.list(None).await.expect("list should succeed"); + store + .list_with_offset(None, &Path::from("bar")) + .try_collect::>() + .await + .unwrap(); + assert_counter_value( + &metrics, + "object_store_transfer_objects", + [("op", "list_with_offset"), ("result", "success")], + 2, + ); assert_histogram_hit( + &metrics, + "object_store_op_duration", + [("op", "list_with_offset"), ("result", "success")], + ); + + // NOT raw 
`list` call + assert_histogram_not_hit( &metrics, "object_store_op_duration", [("op", "list"), ("result", "success")], @@ -710,7 +861,10 @@ mod tests { let time = Arc::new(SystemProvider::new()); let store = ObjectStoreMetrics::new(store, time, &metrics); - assert!(store.list(None).await.is_err(), "mock configured to fail"); + assert!( + store.list(None).try_collect::>().await.is_err(), + "mock configured to fail" + ); assert_histogram_hit( &metrics, @@ -734,7 +888,7 @@ mod tests { assert_histogram_hit( &metrics, "object_store_op_duration", - [("op", "list"), ("result", "success")], + [("op", "list_with_delimiter"), ("result", "success")], ); } @@ -756,7 +910,7 @@ mod tests { assert_histogram_hit( &metrics, "object_store_op_duration", - [("op", "list"), ("result", "error")], + [("op", "list_with_delimiter"), ("result", "error")], ); } @@ -817,6 +971,212 @@ mod tests { ); } + #[tokio::test] + async fn test_getranges() { + let metrics = Arc::new(metric::Registry::default()); + let store = Arc::new(InMemory::new()); + store + .put(&Path::from("foo"), Bytes::from_static(b"bar")) + .await + .unwrap(); + let time = Arc::new(SystemProvider::new()); + let store = ObjectStoreMetrics::new(store, time, &metrics); + + store + .get_ranges(&Path::from("foo"), &[0..2, 1..2, 0..1]) + .await + .unwrap(); + + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "get_ranges"), ("result", "success")], + 4, + ); + assert_histogram_hit( + &metrics, + "object_store_op_duration", + [("op", "get_ranges"), ("result", "success")], + ); + + // NO `get_range` used! + assert_histogram_not_hit( + &metrics, + "object_store_op_duration", + [("op", "get_range"), ("result", "success")], + ); + } + + #[tokio::test] + async fn test_copy() { + let metrics = Arc::new(metric::Registry::default()); + let store = Arc::new(InMemory::new()); + store + .put(&Path::from("foo"), Bytes::default()) + .await + .unwrap(); + let time = Arc::new(SystemProvider::new()); + let store = ObjectStoreMetrics::new(store, time, &metrics); + + store + .copy(&Path::from("foo"), &Path::from("bar")) + .await + .unwrap(); + + assert_histogram_hit( + &metrics, + "object_store_op_duration", + [("op", "copy"), ("result", "success")], + ); + } + + #[tokio::test] + async fn test_copy_if_not_exists() { + let metrics = Arc::new(metric::Registry::default()); + let store = Arc::new(InMemory::new()); + store + .put(&Path::from("foo"), Bytes::default()) + .await + .unwrap(); + let time = Arc::new(SystemProvider::new()); + let store = ObjectStoreMetrics::new(store, time, &metrics); + + store + .copy_if_not_exists(&Path::from("foo"), &Path::from("bar")) + .await + .unwrap(); + + assert_histogram_hit( + &metrics, + "object_store_op_duration", + [("op", "copy_if_not_exists"), ("result", "success")], + ); + } + + #[tokio::test] + async fn test_rename() { + let metrics = Arc::new(metric::Registry::default()); + let store = Arc::new(InMemory::new()); + store + .put(&Path::from("foo"), Bytes::default()) + .await + .unwrap(); + let time = Arc::new(SystemProvider::new()); + let store = ObjectStoreMetrics::new(store, time, &metrics); + + store + .rename(&Path::from("foo"), &Path::from("bar")) + .await + .unwrap(); + + assert_histogram_hit( + &metrics, + "object_store_op_duration", + [("op", "rename"), ("result", "success")], + ); + + // NO `copy`/`delete` used! 
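// [Editor's note, not part of the patch] These negative assertions pin down
// that the decorator forwards rename() to the inner store as a single
// instrumented op rather than decomposing it into copy + delete, so only the
// "rename" histograms are expected to move.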
+ assert_histogram_not_hit( + &metrics, + "object_store_op_duration", + [("op", "copy"), ("result", "success")], + ); + assert_histogram_not_hit( + &metrics, + "object_store_op_duration", + [("op", "delete"), ("result", "success")], + ); + } + + #[tokio::test] + async fn test_rename_if_not_exists() { + let metrics = Arc::new(metric::Registry::default()); + let store = Arc::new(InMemory::new()); + store + .put(&Path::from("foo"), Bytes::default()) + .await + .unwrap(); + let time = Arc::new(SystemProvider::new()); + let store = ObjectStoreMetrics::new(store, time, &metrics); + + store + .rename_if_not_exists(&Path::from("foo"), &Path::from("bar")) + .await + .unwrap(); + + assert_histogram_hit( + &metrics, + "object_store_op_duration", + [("op", "rename_if_not_exists"), ("result", "success")], + ); + + // NO `copy`/`copy_if_not_exists`/`delete` used! + assert_histogram_not_hit( + &metrics, + "object_store_op_duration", + [("op", "copy"), ("result", "success")], + ); + assert_histogram_not_hit( + &metrics, + "object_store_op_duration", + [("op", "copy_if_not_exists"), ("result", "success")], + ); + assert_histogram_not_hit( + &metrics, + "object_store_op_duration", + [("op", "delete"), ("result", "success")], + ); + } + + #[tokio::test] + async fn test_delete_stream() { + let metrics = Arc::new(metric::Registry::default()); + let store = Arc::new(InMemory::new()); + store + .put(&Path::from("foo"), Bytes::default()) + .await + .unwrap(); + store + .put(&Path::from("bar"), Bytes::default()) + .await + .unwrap(); + store + .put(&Path::from("baz"), Bytes::default()) + .await + .unwrap(); + let time = Arc::new(SystemProvider::new()); + let store = ObjectStoreMetrics::new(store, time, &metrics); + + store + .delete_stream( + stream::iter(["foo", "baz"]) + .map(|s| Ok(Path::from(s))) + .boxed(), + ) + .try_collect::>() + .await + .unwrap(); + + assert_counter_value( + &metrics, + "object_store_transfer_objects", + [("op", "delete_stream"), ("result", "success")], + 2, + ); + assert_histogram_hit( + &metrics, + "object_store_op_duration", + [("op", "delete_stream"), ("result", "success")], + ); + + // NOT raw `delete` call + assert_histogram_not_hit( + &metrics, + "object_store_op_duration", + [("op", "delete"), ("result", "success")], + ); + } + #[tokio::test] async fn test_put_get_getrange_head_delete_file() { let metrics = Arc::new(metric::Registry::default()); @@ -844,7 +1204,12 @@ mod tests { v => panic!("not a file: {v:?}"), } - assert_counter_value(&metrics, "object_store_transfer_bytes", [("op", "get")], 5); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "get"), ("result", "success")], + 5, + ); assert_histogram_hit( &metrics, "object_store_op_duration", @@ -858,7 +1223,7 @@ mod tests { assert_counter_value( &metrics, "object_store_transfer_bytes", - [("op", "get_range")], + [("op", "get_range"), ("result", "success")], 3, ); assert_histogram_hit( @@ -905,7 +1270,12 @@ mod tests { v => panic!("not a stream: {v:?}"), } - assert_counter_value(&metrics, "object_store_transfer_bytes", [("op", "get")], 5); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "get"), ("result", "success")], + 5, + ); assert_histogram_hit( &metrics, "object_store_op_duration", @@ -930,21 +1300,12 @@ mod tests { let time_provider = SystemProvider::default(); let metrics = Arc::new(metric::Registry::default()); - let hist: Metric = metrics.register_metric("wall_clock", ""); - - let bytes = metrics - .register_metric::( - "object_store_transfer_bytes", - 
"cumulative count of file content bytes transferred to/from the object store", - ) - .recorder(&[]); + let m = MetricsWithBytes::new(&metrics, "test"); let mut stream = StreamMetricRecorder::new( inner, time_provider.now(), - hist.recorder(&[("result", "success")]), - hist.recorder(&[("result", "error")]), - BytesStreamDelegate(bytes), + BytesStreamDelegate::new(m.clone()), ); let got = stream @@ -953,7 +1314,12 @@ mod tests { .expect("should yield data") .expect("should succeed"); assert_eq!(got.len(), 1); - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 1); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 1, + ); // Sleep at least 10ms to assert the recorder to captures the wall clock // time. @@ -966,11 +1332,14 @@ mod tests { .expect("should yield data") .expect("should succeed"); assert_eq!(got.len(), 3); - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 4); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 4, + ); - let success_hist = hist - .get_observer(&metric::Attributes::from(&[("result", "success")])) - .expect("failed to get observer"); + let success_hist = &m.inner.success_duration; // Until the stream is fully consumed, there should be no wall clock // metrics emitted. @@ -983,7 +1352,12 @@ mod tests { // recorded. let hit_count = success_hist.fetch().sample_count(); assert_eq!(hit_count, 1, "wall clock duration recorded incorrectly"); - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 4); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 4, + ); // And it must be in a SLEEP or higher bucket. let hit_count: u64 = success_hist @@ -1002,7 +1376,12 @@ mod tests { drop(stream); let hit_count = success_hist.fetch().sample_count(); assert_eq!(hit_count, 1, "wall clock duration duplicated"); - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 4); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 4, + ); } // Ensures the stream decorator correctly records the wall clock duration @@ -1022,21 +1401,12 @@ mod tests { let time_provider = SystemProvider::default(); let metrics = Arc::new(metric::Registry::default()); - let hist: Metric = metrics.register_metric("wall_clock", ""); - - let bytes = metrics - .register_metric::( - "object_store_transfer_bytes", - "cumulative count of file content bytes transferred to/from the object store", - ) - .recorder(&[]); + let m = MetricsWithBytes::new(&metrics, "test"); let mut stream = StreamMetricRecorder::new( inner, time_provider.now(), - hist.recorder(&[("result", "success")]), - hist.recorder(&[("result", "error")]), - BytesStreamDelegate(bytes), + BytesStreamDelegate::new(m.clone()), ); let got = stream @@ -1045,7 +1415,12 @@ mod tests { .expect("should yield data") .expect("should succeed"); assert_eq!(got.len(), 1); - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 1); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 1, + ); // Sleep at least 10ms to assert the recorder to captures the wall clock // time. @@ -1057,15 +1432,16 @@ mod tests { // Now the stream is complete, the wall clock duration must have been // recorded. 
- let hit_count = hist - .get_observer(&metric::Attributes::from(&[("result", "success")])) - .expect("failed to get observer") - .fetch() - .sample_count(); + let hit_count = m.inner.success_duration.fetch().sample_count(); assert_eq!(hit_count, 1, "wall clock duration recorded incorrectly"); // And the number of bytes read must match the pre-drop value. - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 1); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 1, + ); } // Ensures the stream decorator records the wall clock duration into the @@ -1085,21 +1461,12 @@ mod tests { let time_provider = SystemProvider::default(); let metrics = Arc::new(metric::Registry::default()); - let hist: Metric = metrics.register_metric("wall_clock", ""); - - let bytes = metrics - .register_metric::( - "object_store_transfer_bytes", - "cumulative count of file content bytes transferred to/from the object store", - ) - .recorder(&[]); + let m = MetricsWithBytes::new(&metrics, "test"); let mut stream = StreamMetricRecorder::new( inner, time_provider.now(), - hist.recorder(&[("result", "success")]), - hist.recorder(&[("result", "error")]), - BytesStreamDelegate(bytes), + BytesStreamDelegate::new(m.clone()), ); let got = stream @@ -1108,7 +1475,18 @@ mod tests { .expect("should yield data") .expect("should succeed"); assert_eq!(got.len(), 1); - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 1); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 1, + ); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "error")], + 0, + ); let _err = stream .next() @@ -1120,15 +1498,22 @@ mod tests { drop(stream); // Ensure the wall clock was added to the "error" histogram. 
- let hit_count = hist - .get_observer(&metric::Attributes::from(&[("result", "error")])) - .expect("failed to get observer") - .fetch() - .sample_count(); + let hit_count = m.inner.error_duration.fetch().sample_count(); assert_eq!(hit_count, 1, "wall clock duration recorded incorrectly"); // And the number of bytes read must match - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 1); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 1, + ); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "error")], + 0, + ); } // Ensures the stream decorator records the wall clock duration into the @@ -1148,21 +1533,12 @@ mod tests { let time_provider = SystemProvider::default(); let metrics = Arc::new(metric::Registry::default()); - let hist: Metric = metrics.register_metric("wall_clock", ""); - - let bytes = metrics - .register_metric::( - "object_store_transfer_bytes", - "cumulative count of file content bytes transferred to/from the object store", - ) - .recorder(&[]); + let m = MetricsWithBytes::new(&metrics, "test"); let mut stream = StreamMetricRecorder::new( inner, time_provider.now(), - hist.recorder(&[("result", "success")]), - hist.recorder(&[("result", "error")]), - BytesStreamDelegate(bytes), + BytesStreamDelegate::new(m.clone()), ); let got = stream @@ -1171,7 +1547,12 @@ mod tests { .expect("should yield data") .expect("should succeed"); assert_eq!(got.len(), 1); - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 1); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 1, + ); let _err = stream .next() @@ -1185,22 +1566,28 @@ mod tests { .expect("should yield data") .expect("should succeed"); assert_eq!(got.len(), 3); - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 4); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 4, + ); // Drop after observing an error drop(stream); // Ensure the wall clock was added to the "success" histogram after // progressing past the transient error. 
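// [Editor's note, not part of the patch] last_call_ok tracks only the most
// recent poll result, so a transient mid-stream error followed by further
// successful items is recorded in the success histogram; only a stream whose
// final observed item was an error lands in the error histogram.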
- let hit_count = hist - .get_observer(&metric::Attributes::from(&[("result", "success")])) - .expect("failed to get observer") - .fetch() - .sample_count(); + let hit_count = m.inner.success_duration.fetch().sample_count(); assert_eq!(hit_count, 1, "wall clock duration recorded incorrectly"); // And the number of bytes read must match - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 4); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 4, + ); } // Ensures the wall clock time recorded by the stream decorator includes the @@ -1216,36 +1603,28 @@ mod tests { let time_provider = SystemProvider::default(); let metrics = Arc::new(metric::Registry::default()); - let hist: Metric = metrics.register_metric("wall_clock", ""); - - let bytes = metrics - .register_metric::( - "object_store_transfer_bytes", - "cumulative count of file content bytes transferred to/from the object store", - ) - .recorder(&[]); + let m = MetricsWithBytes::new(&metrics, "test"); let stream = StreamMetricRecorder::new( inner, time_provider.now(), - hist.recorder(&[("result", "success")]), - hist.recorder(&[("result", "error")]), - BytesStreamDelegate(bytes), + BytesStreamDelegate::new(m.clone()), ); // Drop immediately drop(stream); // Ensure the wall clock was added to the "success" histogram - let hit_count = hist - .get_observer(&metric::Attributes::from(&[("result", "success")])) - .expect("failed to get observer") - .fetch() - .sample_count(); + let hit_count = m.inner.success_duration.fetch().sample_count(); assert_eq!(hit_count, 1, "wall clock duration recorded incorrectly"); // And the number of bytes read must match - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 0); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 0, + ); } // Ensures the wall clock time recorded by the stream decorator emits a wall @@ -1260,35 +1639,27 @@ mod tests { let time_provider = SystemProvider::default(); let metrics = Arc::new(metric::Registry::default()); - let hist: Metric = metrics.register_metric("wall_clock", ""); - - let bytes = metrics - .register_metric::( - "object_store_transfer_bytes", - "cumulative count of file content bytes transferred to/from the object store", - ) - .recorder(&[]); + let m = MetricsWithBytes::new(&metrics, "test"); let mut stream = StreamMetricRecorder::new( inner, time_provider.now(), - hist.recorder(&[("result", "success")]), - hist.recorder(&[("result", "error")]), - BytesStreamDelegate(bytes), + BytesStreamDelegate::new(m.clone()), ); assert!(stream.next().await.is_none()); // Ensure the wall clock was added to the "success" histogram even // though it yielded no data. 
- let hit_count = hist - .get_observer(&metric::Attributes::from(&[("result", "success")])) - .expect("failed to get observer") - .fetch() - .sample_count(); + let hit_count = m.inner.success_duration.fetch().sample_count(); assert_eq!(hit_count, 1, "wall clock duration recorded incorrectly"); // And the number of bytes read must match - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 0); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 0, + ); } } diff --git a/observability_deps/Cargo.toml b/observability_deps/Cargo.toml index 20b64a89860..a24de4fb002 100644 --- a/observability_deps/Cargo.toml +++ b/observability_deps/Cargo.toml @@ -6,6 +6,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # In alphabetical order tracing = { version = "0.1", features = ["max_level_trace"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/panic_logging/Cargo.toml b/panic_logging/Cargo.toml index 75010930e0a..3c704e26821 100644 --- a/panic_logging/Cargo.toml +++ b/panic_logging/Cargo.toml @@ -5,6 +5,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # In alphabetical order metric = { path = "../metric" } observability_deps = { path = "../observability_deps" } diff --git a/panic_logging/src/lib.rs b/panic_logging/src/lib.rs index c1413b536c1..ef17394f2fe 100644 --- a/panic_logging/src/lib.rs +++ b/panic_logging/src/lib.rs @@ -60,7 +60,17 @@ impl SendPanicsToTracing { if let Some(metrics) = &metrics { metrics.inc(panic_type); } - error!(panic_type=panic_type.name(), panic_info=%info, "Thread panic"); + + let location = info.location(); + error!( + panic_type = panic_type.name(), + panic_message = message(info), + panic_file = location.map(|l| l.file()), + panic_line = location.map(|l| l.line()), + panic_column = location.map(|l| l.column()), + "Thread panic", + ); + current_panic_hook(info); })); @@ -149,20 +159,23 @@ impl PanicType { } fn classify(panic_info: &PanicInfo<'_>) -> Self { - let payload_any = panic_info.payload(); - - let maybe_msg = payload_any - .downcast_ref::<&str>() - .copied() - .or(payload_any.downcast_ref::().map(|s| s.as_str())); - - match maybe_msg { + match message(panic_info) { Some("offset overflow" | "offset") => Self::OffsetOverflow, _ => Self::Unknown, } } } +/// Extract string message from [`PanicInfo`] +fn message<'a>(panic_info: &'a PanicInfo<'a>) -> Option<&'a str> { + let payload_any = panic_info.payload(); + + payload_any + .downcast_ref::<&str>() + .copied() + .or(payload_any.downcast_ref::().map(|s| s.as_str())) +} + /// Metrics used for panics. 
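/// Counts observed panics grouped by classified [`PanicType`]; the tests below
/// assert on the "offset_overflow" and "unknown" buckets via `assert_count`.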
#[derive(Debug)] struct Metrics { @@ -195,6 +208,8 @@ impl Metrics { #[cfg(test)] mod tests { + use std::panic::panic_any; + use metric::{Attributes, Metric}; use test_helpers::{maybe_start_logging, tracing::TracingCapture}; @@ -246,6 +261,14 @@ mod tests { .join() .expect_err("wat"); + let capture2 = Arc::clone(&capture); + std::thread::spawn(move || { + capture2.register_in_current_thread(); + panic_any(1); + }) + .join() + .expect_err("wat"); + drop(guard); let capture2 = Arc::clone(&capture); std::thread::spawn(move || { @@ -256,16 +279,14 @@ mod tests { .expect_err("wat"); assert_count(&metrics, "offset_overflow", 2); - assert_count(&metrics, "unknown", 1); + assert_count(&metrics, "unknown", 2); assert_eq!( capture.to_string(), - "level = ERROR; message = Thread panic; panic_type = \"unknown\"; panic_info = panicked at panic_logging/src/lib.rs:227:13:\n\ - it's bananas; \n\ - level = ERROR; message = Thread panic; panic_type = \"offset_overflow\"; panic_info = panicked at panic_logging/src/lib.rs:235:13:\n\ - offset; \n\ - level = ERROR; message = Thread panic; panic_type = \"offset_overflow\"; panic_info = panicked at panic_logging/src/lib.rs:244:13:\n\ - offset overflow; " + "level = ERROR; message = Thread panic; panic_type = \"unknown\"; panic_message = \"it's bananas\"; panic_file = \"panic_logging/src/lib.rs\"; panic_line = 242; panic_column = 13; \n\ + level = ERROR; message = Thread panic; panic_type = \"offset_overflow\"; panic_message = \"offset\"; panic_file = \"panic_logging/src/lib.rs\"; panic_line = 250; panic_column = 13; \n\ + level = ERROR; message = Thread panic; panic_type = \"offset_overflow\"; panic_message = \"offset overflow\"; panic_file = \"panic_logging/src/lib.rs\"; panic_line = 259; panic_column = 13; \n\ + level = ERROR; message = Thread panic; panic_type = \"unknown\"; panic_file = \"panic_logging/src/lib.rs\"; panic_line = 267; panic_column = 13; " ); } } diff --git a/parquet_cache/Cargo.toml b/parquet_cache/Cargo.toml new file mode 100644 index 00000000000..7fc1b907166 --- /dev/null +++ b/parquet_cache/Cargo.toml @@ -0,0 +1,60 @@ +[package] +name = "parquet_cache" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[features] +test-with-server-port = [] + +[dependencies] +arc-swap = "1.6.0" +async-channel = "2.1.1" +async-trait = "0.1.77" +backoff = { path = "../backoff" } +bytes = "1.5.0" +chrono = "0.4.31" +data_types = { path = "../data_types" } +fnv = "1.0.7" +futures = "0.3.30" +http = "0.2.11" +hyper = { version = "0.14.27", features = ["http2"] } +iox_catalog = { path = "../iox_catalog" } +k8s-openapi = { version = "0.20.0", features = ["schemars", "earliest"] } +kube = { version = "0.87.1", features = ["runtime", "client", "derive"] } +moka = { version = "0.12.3", features = ["future"] } +mpchash = "1.2.1" +notify = "6.1.1" +object_store = { workspace = true } +observability_deps = { path = "../observability_deps" } +parking_lot = "0.12.1" +parquet_file = { path = "../parquet_file" } +pin-project = "1.1.3" +reqwest = { version = "0.11", default-features = false, features = ["json", "rustls-tls-native-roots"] } +schemars = "0.8.16" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0.111" +thiserror = "1.0.56" +tokio = "1.35.1" +tokio-util = { version = "0.7.10", features = ["codec"] } +tower = "0.4.13" +url = "2.5.0" +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] +ahash = "0.8.7" +assert_matches = 
"1.5.0" +bytes = "1.5.0" +iox_tests = { path = "../iox_tests" } +iox_time = { path = "../iox_time" } +kube_test = { path = "../kube_test" } +lazy_static = "1.4.0" +rand = "0.8.5" +tempfile = "3.9.0" +tokio-stream = "0.1.14" +uuid = "1.6.1" + diff --git a/parquet_cache/src/client.rs b/parquet_cache/src/client.rs new file mode 100644 index 00000000000..0f767f588ab --- /dev/null +++ b/parquet_cache/src/client.rs @@ -0,0 +1,16 @@ +//! Contains the cache client. + +/// Interface for the object store. Consumed by Iox components. +pub mod object_store; +/// Interface for write hinting. Consumed by Iox components. +pub mod write_hints; + +/// Connection to remote data cache. Used by the ObjectStore cache impl. +pub(crate) mod cache_connector; +pub(crate) mod http; +pub(crate) mod keyspace; +pub(crate) mod request; + +/// Mocks used for internal testing +#[cfg(test)] +pub(crate) mod mock; diff --git a/parquet_cache/src/client/cache_connector.rs b/parquet_cache/src/client/cache_connector.rs new file mode 100644 index 00000000000..6ec3c185bc7 --- /dev/null +++ b/parquet_cache/src/client/cache_connector.rs @@ -0,0 +1,37 @@ +use std::fmt::Debug; + +use tower::{Layer, ServiceBuilder}; + +use super::{http::HttpService, keyspace::HostKeyspaceService}; + +pub type ClientCacheConnector = HostKeyspaceService; + +/// Data cache errors. +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// Failure getting data from the cache. + #[error("Fetch error: {0}")] + FetchData(#[from] super::keyspace::Error), + + /// Failure reading the (already fetched) data from cache. + #[error("Data error: {0}")] + ReadData(String), +} + +/// Builder for the cache connector service. +pub fn build_cache_connector(ns_service_addr: impl ToString) -> ClientCacheConnector { + ServiceBuilder::new() + .layer(MapToHost(ns_service_addr.to_string())) + .service(HttpService::new()) +} + +#[derive(Debug)] +struct MapToHost(pub String); + +impl Layer for MapToHost { + type Service = HostKeyspaceService; + + fn layer(&self, service: S) -> Self::Service { + HostKeyspaceService::new(service, self.0.clone()) + } +} diff --git a/parquet_cache/src/client/http.rs b/parquet_cache/src/client/http.rs new file mode 100644 index 00000000000..6bac49e0ff5 --- /dev/null +++ b/parquet_cache/src/client/http.rs @@ -0,0 +1,62 @@ +use std::{pin::Pin, sync::Arc, task::Poll}; + +use futures::Future; +use hyper::{client::HttpConnector, Body, Client, Request, Response, StatusCode}; +use tower::Service; + +use super::request::{PinnedFuture, RawRequest}; + +#[derive(Debug, Clone)] +pub struct HttpService { + /// Pool of connections. 
+ client: Arc>, +} + +impl HttpService { + pub fn new() -> Self { + let client = Client::builder() + .http2_keep_alive_while_idle(true) + .http2_only(true) + .retry_canceled_requests(true) + .build_http::(); + + Self { + client: Arc::new(client), + } + } +} + +impl Default for HttpService { + fn default() -> Self { + Self::new() + } +} + +impl Service for HttpService { + type Response = Response; + type Error = hyper::Error; + type Future = PinnedFuture; + + fn poll_ready(&mut self, _cx: &mut std::task::Context<'_>) -> Poll> { + Poll::Ready(Ok(())) + } + + fn call(&mut self, req: RawRequest) -> Self::Future { + match Request::::try_from(req) { + Ok(req) => Box::pin(self.client.request(req)), + Err(e) => invalid_request(e), + } + } +} + +fn invalid_request( + error: impl std::error::Error, +) -> Pin, hyper::Error>> + Send>> { + let (mut parts, _) = Response::new("invalid request").into_parts(); + parts.status = StatusCode::BAD_REQUEST; + + let body = Body::from( + serde_json::json!({"status": 400, "description": error.to_string()}).to_string(), + ); + Box::pin(futures::future::ok(Response::from_parts(parts, body))) +} diff --git a/parquet_cache/src/client/keyspace.rs b/parquet_cache/src/client/keyspace.rs new file mode 100644 index 00000000000..d65a0b5d0d5 --- /dev/null +++ b/parquet_cache/src/client/keyspace.rs @@ -0,0 +1,314 @@ +use std::{collections::HashMap, sync::Arc, task::Poll}; + +use arc_swap::ArcSwap; +use backoff::{Backoff, BackoffConfig}; +use bytes::Buf; +use http::uri::Authority; +use hyper::{Body, Method, Response, StatusCode, Uri}; +use mpchash::HashRing; +use observability_deps::tracing::warn; +use tokio::sync::OnceCell; +use tower::{Service, ServiceExt}; + +use super::request::{PinnedFuture, RawRequest}; +use crate::data_types::{KeyspaceResponseBody, ServiceNode, ServiceNodeId}; + +/// Errors associated fetching data from the cache. +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// Generic connection failure to remote data cache service. + #[error("Connection error: {0}")] + Connection(#[from] hyper::Error), + + /// Error in constructing request. + #[error("Request error: {0}")] + Request(String), + + /// Error with hashring keyspace + #[error("Keyspace error: {0}")] + Keyspace(String), + + /// Invalid addr + #[error("Invalid addr: {0}")] + InvalidAddr(#[from] http::uri::InvalidUri), + + /// Failure reading data from cache. + #[error("Data error: {0}")] + ReadData(String), +} + +#[derive(Debug, Clone)] +pub struct HostKeyspaceService { + /// Inner service + service: S, + /// Namespace service addr (for requests to any cache server). + dst: String, + /// Inner state + inner: Arc, +} + +impl HostKeyspaceService { + /// Create keyspace middleware [`HostKeyspaceService`] + pub fn new(service: S, dst: String) -> Self { + Self { + service, + dst, + inner: Default::default(), + } + } +} + +impl HostKeyspaceService +where + S: Clone + Send + Sync + Service, Error = hyper::Error>, + for<'b> >::Future: std::marker::Send + 'b, +{ + /// Primary goal of [`HostKeyspaceService`] is to add the host to the [`RawRequest`]. + async fn add_host_to_request(&mut self, mut req: RawRequest) -> Result { + let host = match &req.key { + Some(obj_key) => self.hostname(obj_key).await?, + None => self.dst.clone(), // k8s namespace service addr + }; + + req.uri_parts.authority = + Some(Authority::from_maybe_shared(host).map_err(Error::InvalidAddr)?); + + Ok(req) + } + + /// Hostname provided based upon hashed keyspace. + /// Lookup, if missing the re-query service for the latest keyspace. 
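    /// The flow: hash the object key onto the ring to get a node id, look the
    /// id up in the node-to-hostname table, and on a miss refetch the keyspace
    /// from the namespace service (the `/keyspace` endpoint on `self.dst`),
    /// rebuild the ring and table, and retry the lookup once; a second miss
    /// surfaces as `Error::Keyspace`.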
+ async fn hostname(&mut self, key: &String) -> Result { + let node = self.inner.key_to_node(key); + + match self.inner.hostname_table.load().get(&node) { + Some(hostname) => Ok(hostname.to_owned()), + None => { + let keyspace = self.get_service_nodes().await?; + let inner = &mut self.inner; + inner.build_keyspace(keyspace); + let node = inner.key_to_node(key); + + let hostname = inner.hostname_table + .load() + .get(&node) + .ok_or(Error::Keyspace(format!("key {} was assigned to node {}, but node was not found in latest keyspace hosts", key, node)))? + .to_owned(); + Ok(hostname) + } + } + } + + /// Get list of [`ServiceNode`]s from cache service. + async fn get_service_nodes(&mut self) -> Result, Error> { + // use the Namespace service addr (self.dst), and not an individual server, to fetch the keyspace. + let uri_parts = format!("{}/keyspace", &self.dst) + .parse::() + .map(http::uri::Parts::from) + .map_err(Error::InvalidAddr)?; + + let req = RawRequest { + uri_parts, + method: Method::GET, + ..Default::default() + }; + + let service = self.service.ready().await?; + let resp = service.call(req).await.map_err(Error::Connection)?; + + match resp.status() { + StatusCode::OK => { + let reader = hyper::body::aggregate(resp.into_body()) + .await + .map_err(|e| Error::Keyspace(e.to_string()))? + .reader(); + + let keyspace_nodes: KeyspaceResponseBody = + serde_json::from_reader(reader).map_err(|e| Error::Keyspace(e.to_string()))?; + + Ok(keyspace_nodes.nodes) + } + _ => Err(Error::Keyspace(String::from("keyspace request failure"))), + } + } + + /// Initialize the keyspace on service start. + /// Has backoff-and-retry; intended to be called once. + async fn initialized(&mut self) { + Backoff::new(&BackoffConfig::default()) + .retry_all_errors("probe data cache service for keyspace", || { + let mut this = self.clone(); + async move { + let probe = this + .get_service_nodes() + .await + .map(|keyspace| this.inner.build_keyspace(keyspace)); + if probe.is_err() { + warn!("failed to build data cache keyspace"); + } + probe + } + }) + .await + .expect("retry forever") + } +} + +impl Service for HostKeyspaceService +where + S: Clone + + Service, Error = hyper::Error> + + Send + + Sync + + 'static, + for<'b> >::Future: std::marker::Send + 'b, +{ + type Response = S::Response; + type Error = Error; + type Future = PinnedFuture; + + fn poll_ready(&mut self, _cx: &mut std::task::Context<'_>) -> Poll> { + Poll::Ready(Ok(())) + } + + fn call(&mut self, req: RawRequest) -> Self::Future { + let mut this = self.clone(); + Box::pin(async move { + Arc::clone(&this.inner) + .initialize_once + .get_or_init(|| this.initialized()) + .await; + let req = this.add_host_to_request(req).await?; + this.service.call(req).await.map_err(Error::Connection) + }) + } +} + +#[derive(Debug, Default)] +struct HostKeyspace { + /// Hashring + keyspace: ArcSwap>, + /// Map nodes to hostname. + hostname_table: ArcSwap>, + /// A single init of the shared, clonable keyspace. + /// (Note that the re-building of an invalidated keyspace, is separate from this init.) + initialize_once: OnceCell<()>, +} + +impl HostKeyspace { + /// Lookup key in keyspace + fn key_to_node(&self, key: &String) -> ServiceNodeId { + self.keyspace + .load() + .as_ref() + .primary_node(key) + .unwrap() + .to_owned() + } + + /// Build keyspace for cache connector, from list of [`ServiceNode`]s. 
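    /// Each node id is added to the consistent-hash ring and mapped to its
    /// hostname; both structures are then swapped in atomically via `ArcSwap`,
    /// so concurrent lookups keep reading the previous keyspace until the swap
    /// completes.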
+ fn build_keyspace(&self, keyspace_nodes: Vec) { + let mut keyspace = HashRing::new(); + let mut hostname_table = HashMap::new(); + + for node in keyspace_nodes { + keyspace.add(node.id); + hostname_table.insert(node.id, node.hostname); + } + + self.keyspace.swap(Arc::new(keyspace)); + self.hostname_table.swap(Arc::new(hostname_table)); + } +} + +#[cfg(test)] +mod test { + use std::collections::hash_map::Entry; + + use parking_lot::Mutex; + use rand::seq::SliceRandom; + use uuid::Uuid; + + use super::super::http::HttpService; + use crate::data_types::ServiceNode; + + use super::*; + + async fn assert_consistent_hashing( + mut cache_connector: HostKeyspaceService, + prev_assignments: Arc>>, + ) { + // test with 100 files + for _ in 0..100 { + let key = format!("unique/location/{}/file.parquet", Uuid::new_v4()); + for _ in 0..1000 { + let key = key.clone(); + + let got = cache_connector + .hostname(&key) + .await + .expect("should assign hostname"); + let expected = match prev_assignments.lock().entry(key) { + Entry::Vacant(v) => { + v.insert(got.clone()); + got.clone() + } + Entry::Occupied(o) => o.get().clone(), + }; + + assert_eq!( + got, expected, + "should match previous assignment {}, instead got {}", + expected, got + ); + } + } + } + + #[tokio::test] + async fn test_keyspace_hashing_is_consistent() { + let remote_cache_connector = + HostKeyspaceService::new(HttpService::default(), "foo".to_string()); + + let keyspace_nodes = (0..100) + .map(|id| ServiceNode { + id, + hostname: format!("cache-server-hostname-{}", id), + }) + .collect(); + remote_cache_connector.inner.build_keyspace(keyspace_nodes); + + let prev_assignments = Arc::new(Mutex::new(HashMap::new())); // location_key, hostname_assigned + assert_consistent_hashing(remote_cache_connector, prev_assignments).await; + } + + #[tokio::test] + async fn test_keyspace_population_is_not_ordering_sensitive() { + // Sanity check. Asserting that the expected hashing properties hold true. 
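        // Concretely: populating the keyspace from the same set of nodes in a
        // different order must not change any key-to-hostname assignment, since
        // placement depends only on the hashed ring positions and not on
        // insertion order.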
+ + let remote_cache_connector = + HostKeyspaceService::new(HttpService::default(), "foo".to_string()); + let prev_assignments = Arc::new(Mutex::new(HashMap::new())); // location_key, hostname_assigned + + // test with 0..100 ordered nodes, used when building keyspace + let mut keyspace_nodes: Vec = (0..100) + .map(|id| ServiceNode { + id, + hostname: format!("cache-server-hostname-{}", id), + }) + .collect(); + remote_cache_connector + .inner + .build_keyspace(keyspace_nodes.clone()); + assert_consistent_hashing( + remote_cache_connector.clone(), + Arc::clone(&prev_assignments), + ) + .await; + + // shuffled nodes, test against same/original assignments + keyspace_nodes.shuffle(&mut rand::thread_rng()); + remote_cache_connector.inner.build_keyspace(keyspace_nodes); + assert_consistent_hashing(remote_cache_connector, prev_assignments).await; + } +} diff --git a/parquet_cache/src/client/mock.rs b/parquet_cache/src/client/mock.rs new file mode 100644 index 00000000000..584cb314641 --- /dev/null +++ b/parquet_cache/src/client/mock.rs @@ -0,0 +1,153 @@ +use std::collections::HashSet; +use std::{ops::Range, sync::Arc}; + +use async_trait::async_trait; +use bytes::Bytes; +use data_types::ParquetFileParams; +use futures::stream::BoxStream; +use object_store::{ + path::Path, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, + PutOptions, PutResult, Result, +}; +use parking_lot::Mutex; +use tokio::io::AsyncWrite; + +use crate::{ + data_types::WriteHintAck, DataCacheObjectStore, MockCacheServer, WriteHintingObjectStore, +}; + +use super::cache_connector::build_cache_connector; + +/// Build a cache client, +/// with a mocked server and mocked direct-to-store fallback. +pub async fn build_cache_server_client( + direct_to_store: Arc, +) -> (DataCacheObjectStore, MockCacheServer) { + // build server and client + let dst = "localhost:0"; + let cache_server = MockCacheServer::create(dst, Arc::clone(&direct_to_store)).await; + let cache_client = build_cache_connector(cache_server.addr()); + + // build object_store + let object_store = DataCacheObjectStore::new(cache_client, direct_to_store); + + (object_store, cache_server) +} + +/// A mocked direct-to-object-store, with the following characteristics: +/// * panics when used as fallback (for GET requests) +/// * tracks when called for PUT requests +#[derive(Debug, Default)] +pub struct MockDirectStore { + called: Mutex>, +} + +impl MockDirectStore { + pub fn was_called(&self, fn_name: &str) -> bool { + self.called.lock().contains(&String::from(fn_name)) + } +} + +#[async_trait] +impl ObjectStore for MockDirectStore { + async fn put_opts( + &self, + _location: &Path, + _bytes: Bytes, + _opts: PutOptions, + ) -> Result { + self.called.lock().insert(String::from("put")); + Ok(PutResult { + e_tag: None, + version: None, + }) + } + + async fn put_multipart( + &self, + _location: &Path, + ) -> Result<(MultipartId, Box)> { + self.called.lock().insert(String::from("put_multipart")); + Ok(( + String::from("AsyncWriter for MockDirectStore"), + Box::new(tokio::io::BufWriter::new(vec![])), + )) + } + + async fn abort_multipart(&self, _location: &Path, _multipart_id: &MultipartId) -> Result<()> { + self.called.lock().insert(String::from("abort_multipart")); + Ok(()) + } + + async fn get(&self, _location: &Path) -> Result { + panic!("object was not found in test cache") + } + + async fn get_opts(&self, _location: &Path, _options: GetOptions) -> Result { + // test may intentionally test fallback behavior of get_opts() + 
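        // The panic message below doubles as the `expected` string in the
        // #[should_panic] fallback tests, so a fallback hit is detected by the
        // test harness rather than by a call counter.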
panic!("direct_store.get_opts() was called during test") + } + + async fn get_range(&self, _location: &Path, _range: Range) -> Result { + panic!("direct_store should not be called during test") + } + + async fn get_ranges(&self, _location: &Path, _ranges: &[Range]) -> Result> { + panic!("direct_store should not be called during test") + } + + async fn head(&self, _location: &Path) -> Result { + // test may intentionally test fallback behavior of get_opts() + panic!("direct_store.head() was called during test") + } + + async fn delete(&self, _location: &Path) -> Result<()> { + self.called.lock().insert(String::from("delete")); + Ok(()) + } + + fn list(&self, _prefix: Option<&Path>) -> BoxStream<'_, Result> { + self.called.lock().insert(String::from("list")); + Box::pin(tokio_stream::iter(vec![])) + } + + async fn list_with_delimiter(&self, _prefix: Option<&Path>) -> Result { + self.called + .lock() + .insert(String::from("list_with_delimiter")); + Ok(ListResult { + common_prefixes: vec![], + objects: vec![], + }) + } + + async fn copy(&self, _from: &Path, _to: &Path) -> Result<()> { + self.called.lock().insert(String::from("copy")); + Ok(()) + } + + async fn copy_if_not_exists(&self, _from: &Path, _to: &Path) -> Result<()> { + self.called + .lock() + .insert(String::from("copy_if_not_exists")); + Ok(()) + } +} + +#[async_trait] +impl WriteHintingObjectStore for MockDirectStore { + async fn write_hint<'a>( + &self, + _location: &'a Path, + _new_file: &'a ParquetFileParams, + _ack_setting: WriteHintAck, + ) -> Result<()> { + panic!("direct_store should not be called during test"); + } +} + +impl std::fmt::Display for MockDirectStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "DirectStore") + } +} diff --git a/parquet_cache/src/client/object_store.rs b/parquet_cache/src/client/object_store.rs new file mode 100644 index 00000000000..b00642b5767 --- /dev/null +++ b/parquet_cache/src/client/object_store.rs @@ -0,0 +1,776 @@ +use std::collections::HashMap; +use std::io::{Error, ErrorKind}; +use std::{ops::Range, sync::Arc}; + +use async_trait::async_trait; +use bytes::{Buf, Bytes}; +use futures::stream::{BoxStream, StreamExt, TryStreamExt}; +use http::Method; +use hyper::StatusCode; +use hyper::{Body, Response}; +use object_store::{ + path::Path, Error as ObjectStoreError, GetOptions, GetResult, ListResult, MultipartId, + ObjectMeta, ObjectStore, PutOptions, PutResult, Result, +}; +use tokio::io::AsyncWrite; +use tower::{Service, ServiceExt}; + +use crate::data_types::{ + extract_usize_header, GetObjectMetaResponse, X_RANGE_END_HEADER, X_RANGE_START_HEADER, +}; + +use super::cache_connector::{ClientCacheConnector, Error as CacheClientError}; +use super::request::RawRequest; + +/// identifier for `object_store::Error::Generic` +const DATA_CACHE: &str = "object store to data cache"; + +/// Data cache, consumable by IOX Components. +pub struct DataCacheObjectStore { + pub(crate) cache: ClientCacheConnector, + pub(crate) direct_passthru: Arc, +} + +impl DataCacheObjectStore { + /// Create a new [`DataCacheObjectStore`]. + pub fn new(cache: ClientCacheConnector, direct_store: Arc) -> Self { + Self { + cache, + direct_passthru: Arc::new(direct_store), + } + } +} + +/// ObjectStore client for using the data cache. +/// +/// Defines when to use the direct (passthru) object store, +/// versus the data cache. +/// +/// Iox components all utilize the [`ObjectStore`] for store connection. +/// Based upon startup configuration, this may be the data cache. 
+#[async_trait] +impl ObjectStore for DataCacheObjectStore { + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + self.direct_passthru.put_opts(location, bytes, opts).await + } + + async fn put_multipart( + &self, + location: &Path, + ) -> Result<(MultipartId, Box)> { + self.direct_passthru.put_multipart(location).await + } + + async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()> { + self.direct_passthru + .abort_multipart(location, multipart_id) + .await + } + + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + let object_meta: ObjectMeta = self.head(location).await?; + + let key = location.to_string(); + + let uri_parts = format!("/object?location={}", key) + .parse::() + .map(http::uri::Parts::from) + .expect("should be valid uri"); + + let GetOptions { + if_match, + if_none_match, + if_modified_since, + if_unmodified_since, + range, + version, + head, + } = &options; + let headers = Headers(&mut HashMap::new()) + .add_header("If-Match", if_match) + .add_header("If-None-Match", if_none_match) + .add_header("If-Modified-Since", if_modified_since) + .add_header("If-Unmodified-Since", if_unmodified_since) + // Pass other options as non standard headers + .add_header("X-Version", version) + .add_header("X-Head", &Some(head)) + .add_range(range) + .0 + .to_owned(); + + let req = RawRequest { + method: Method::GET, + uri_parts, + headers, + key: Some(key), + ..Default::default() + }; + + let mut cache = self.cache.clone(); + let service = cache.ready().await.map_err(|e| ObjectStoreError::Generic { + store: DATA_CACHE, + source: Box::new(e), + })?; + + match service.call(req).await { + Ok(resp) => match resp.status() { + StatusCode::OK => { + match transform_get_object_response(resp, object_meta, range) { + Ok(res) => Ok(res), + Err(_) => self.direct_passthru.get_opts(location, options).await, // read_data error + } + } + code => { + if use_fallback(code) { + self.direct_passthru.get_opts(location, options).await // http code error + } else { + let source = Box::new(Error::new(ErrorKind::Other, code.to_string())); + Err(ObjectStoreError::Generic { + store: DATA_CACHE, + source, + }) + } + } + }, + Err(_) => self.direct_passthru.get_opts(location, options).await, // connection error + } + } + + async fn get_range(&self, location: &Path, range: Range) -> Result { + self.get_opts( + location, + GetOptions { + range: Some(range), + ..Default::default() + }, + ) + .await? 
+ .bytes() + .await + } + + async fn head(&self, location: &Path) -> Result { + let key = location.to_string(); + + let uri_parts = format!("/metadata?location={}", key) + .parse::() + .map(http::uri::Parts::from) + .expect("should be valid uri"); + + let req = RawRequest { + method: Method::GET, + uri_parts, + key: Some(key), + ..Default::default() + }; + + let mut cache = self.cache.clone(); + let service = cache.ready().await.map_err(|e| ObjectStoreError::Generic { + store: DATA_CACHE, + source: Box::new(e), + })?; + + match service.call(req).await { + Ok(mut resp) => match resp.status() { + StatusCode::OK => { + let maybe_meta: Result = + hyper::body::aggregate(resp.body_mut()) + .await + .map_err(|e| CacheClientError::ReadData(e.to_string())) + .map(|buf| buf.reader()) + .and_then(|reader| { + serde_json::from_reader(reader) + .map_err(|e| CacheClientError::ReadData(e.to_string())) + }) + .map(|get_meta_resp: GetObjectMetaResponse| { + ObjectMeta::from(get_meta_resp) + }); + + match maybe_meta { + Ok(meta) => Ok(meta), + Err(_) => self.direct_passthru.head(location).await, // read_data error + } + } + code => { + if use_fallback(code) { + self.direct_passthru.head(location).await // http code error + } else { + let source = Box::new(Error::new(ErrorKind::Other, code.to_string())); + Err(ObjectStoreError::Generic { + store: DATA_CACHE, + source, + }) + } + } + }, + Err(_) => self.direct_passthru.head(location).await, // connection error + } + } + + async fn delete(&self, location: &Path) -> Result<()> { + // Do not delete from cache, instead let it age out. + // Querier runs off of catalog snapshots of object_store state. + self.direct_passthru.delete(location).await + } + + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + // Use object_store directly as src of truth for currently existing files. + // Because cache cannot know about completeness of the file set. + self.direct_passthru.list(prefix) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + // Use object_store directly as src of truth for currently existing files. + // Because cache cannot know about completeness of the file set. + self.direct_passthru.list_with_delimiter(prefix).await + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + self.direct_passthru.copy(from, to).await + } + + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + self.direct_passthru.copy_if_not_exists(from, to).await + } +} + +impl std::fmt::Display for DataCacheObjectStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "DataCacheObjectStore") + } +} + +impl std::fmt::Debug for DataCacheObjectStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "DataCacheObjectStore") + } +} + +fn use_fallback(code: StatusCode) -> bool { + match code { + StatusCode::OK => unreachable!("should not be requesting fallback if response is OK"), + // Errors which should not result in trying the fallback. + StatusCode::BAD_REQUEST + | StatusCode::PRECONDITION_FAILED + | StatusCode::FORBIDDEN + | StatusCode::UNAUTHORIZED + | StatusCode::MOVED_PERMANENTLY + | StatusCode::NETWORK_AUTHENTICATION_REQUIRED => false, + // All other errors => use fallback. 
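        // This includes NOT_FOUND and 5xx responses, so a cache miss or a cache
        // outage degrades to a direct object-store read instead of surfacing an
        // error to the caller.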
+ _ => true, + } +} + +fn transform_get_object_response( + resp: Response, + meta: ObjectMeta, + expected_range: &Option>, +) -> Result { + let headers = resp.headers(); + let range = Range { + start: extract_usize_header(X_RANGE_START_HEADER, headers)?, + end: extract_usize_header(X_RANGE_END_HEADER, headers)?, + }; + + if let Some(expected_range) = expected_range { + if !expected_range.start.eq(&range.start) || !expected_range.end.eq(&range.end) { + return Err(CacheClientError::ReadData(format!( + "expected range {:?} but found range {:?}", + expected_range, range + ))); + } + }; + + let stream = resp + .into_body() + .map_err(|e| ObjectStoreError::Generic { + store: DATA_CACHE, + source: Box::new(e), + }) + .boxed(); + + Ok(GetResult { + payload: object_store::GetResultPayload::Stream(stream), + meta, + range, + }) +} + +/// Newtype around headers, for convenience methods. +struct Headers<'a>(pub &'a mut HashMap<&'static str, String>); + +impl<'a> Headers<'a> { + fn add_header(&mut self, k: &'static str, v: &Option) -> &mut Self { + if let Some(v) = v { + // let header_name = k.to_owned(); + self.0.insert(k, v.to_string()); + } + self + } + + fn add_range(&mut self, range: &Option>) -> &mut Self { + if let Some(v) = range { + self.0 + .insert("Range", format!("bytes={}-{}", v.start, v.end)); + } + self + } +} + +#[cfg(test)] +mod tests { + use assert_matches::assert_matches; + + use crate::client::mock::{build_cache_server_client, MockDirectStore}; + use crate::server::mock::{build_resp_body, ExpectedResponse}; + + use super::*; + + static FILE: &[u8] = "All my pretty data.".as_bytes(); + + #[tokio::test] + async fn test_writes_are_passed_to_store() { + let direct_to_store = Arc::new(MockDirectStore::default()); + + let casted_object_store = Arc::clone(&direct_to_store) as Arc; + let (object_store, cache_server) = build_cache_server_client(casted_object_store).await; + + assert!(object_store + .put(&Path::default(), FILE.into()) + .await + .is_ok()); + assert!( + Arc::clone(&direct_to_store).was_called("put"), + "put should be passed to direct store" + ); + + assert!(object_store.put_multipart(&Path::default()).await.is_ok()); + assert!( + Arc::clone(&direct_to_store).was_called("put_multipart"), + "put_multipart should be passed to direct store" + ); + + assert!(object_store + .abort_multipart(&Path::default(), &MultipartId::default()) + .await + .is_ok()); + assert!( + Arc::clone(&direct_to_store).was_called("abort_multipart"), + "abort_multipart should be passed to direct store" + ); + + assert!(object_store.delete(&Path::default()).await.is_ok()); + assert!( + Arc::clone(&direct_to_store).was_called("delete"), + "delete should be passed to direct store" + ); + + assert!(object_store + .copy(&Path::default(), &Path::default()) + .await + .is_ok()); + assert!( + Arc::clone(&direct_to_store).was_called("copy"), + "copy should be passed to direct store" + ); + + assert!(object_store + .copy_if_not_exists(&Path::default(), &Path::default()) + .await + .is_ok()); + assert!( + Arc::clone(&direct_to_store).was_called("copy_if_not_exists"), + "copy_if_not_exists should be passed to direct store" + ); + + cache_server.close().await; + } + + #[tokio::test] + async fn test_list_all_objects_are_passed_to_store() { + let direct_to_store = Arc::new(MockDirectStore::default()); + + let casted_object_store = Arc::clone(&direct_to_store) as Arc; + let (object_store, cache_server) = build_cache_server_client(casted_object_store).await; + + object_store.list(Some(&Path::default())); + assert!( + 
Arc::clone(&direct_to_store).was_called("list"), + "list should be passed to direct store" + ); + + assert!(object_store + .list_with_delimiter(Some(&Path::default())) + .await + .is_ok()); + assert!( + Arc::clone(&direct_to_store).was_called("list_with_delimiter"), + "list_with_delimiter should be passed to direct store" + ); + + cache_server.close().await; + } + + #[tokio::test] + async fn test_fetch_requests_hit_the_cache() { + let direct_to_store = Arc::new(MockDirectStore::default()); + + let casted_object_store = Arc::clone(&direct_to_store) as Arc; + let (object_store, cache_server) = build_cache_server_client(casted_object_store).await; + + let path = Path::from("my/scoped/data/file.parquet"); + + // GET /metadata + let route = format!("/metadata?location={}", &path.to_string()); + let expected_metadata_resp = GetObjectMetaResponse { + location: path.to_string(), + last_modified: Default::default(), + size: 42, + e_tag: None, + version: None, + }; + cache_server.respond_with( + route.clone(), + ExpectedResponse { + bytes: build_resp_body(&expected_metadata_resp), + range: None, + }, + ); + assert_matches!( + object_store.head(&path).await, + Ok(res) if res == ObjectMeta::from(expected_metadata_resp.clone()), + "payload was returned and parsed properly" + ); + assert!( + cache_server.was_called(&route), + "head should hit the cache server" + ); + + // GET fetch /object + // note: all fetch object requests use ObjectStore::get_opts() + let route = format!("/object?location={}", path); + cache_server.respond_with( + route.clone(), + ExpectedResponse { + bytes: std::str::from_utf8(FILE).unwrap().into(), + range: Some(Range { + start: 0, + end: FILE.len(), + }), + }, + ); + let object_resp = object_store.get(&path).await; + assert_matches!( + &object_resp, + Ok(GetResult {payload: _, meta, range: _}) if meta == &ObjectMeta::from(expected_metadata_resp), + "object metadata was returned and parsed properly" + ); // note: payload bytes will be asserted separately with the (non-mock-)server integration tests. 
+ assert!( + cache_server.was_called(&route), + "get should hit the cache server" + ); + + cache_server.close().await; + } + + #[tokio::test] + async fn test_fetch_range_request() { + let direct_to_store = Arc::new(MockDirectStore::default()); + + let casted_object_store = Arc::clone(&direct_to_store) as Arc; + let (object_store, cache_server) = build_cache_server_client(casted_object_store).await; + + let path = Path::from("my/scoped/data/file.parquet"); + + // add mock metadata + let route = format!("/metadata?location={}", &path.to_string()); + let expected_metadata_resp = GetObjectMetaResponse { + location: path.to_string(), + last_modified: Default::default(), + size: 42, + e_tag: None, + version: None, + }; + cache_server.respond_with( + route.clone(), + ExpectedResponse { + bytes: build_resp_body(&expected_metadata_resp), + range: None, + }, + ); + + // add mock file + let route = format!("/object?location={}", &path.to_string()); + cache_server.respond_with( + route.clone(), + ExpectedResponse { + bytes: std::str::from_utf8(&FILE[3..9]).unwrap().into(), + range: Some(Range { start: 3, end: 9 }), + }, + ); + + // TEST: get_range() + let range = Range { start: 3, end: 9 }; + let object_resp = object_store.get_range(&path, range.clone()).await; + assert_matches!( + &object_resp, + Ok(bytes) if bytes.len() == range.len(), + "returns proper bytes size for the range" + ); + assert!( + cache_server.was_called(&route), + "get should hit the cache server" + ); + + // TEST: multiple get_ranges() + let object_resp = object_store + .get_ranges(&path, &[range.clone(), range.clone()]) + .await; + assert_matches!( + &object_resp, + Ok(vec_bytes) if matches!( + &vec_bytes[..], + [bytes, bytes_2] if bytes.len() == range.len() && bytes_2.len() == range.len() + ), + "returns proper bytes size for multiple ranges" + ); + + cache_server.close().await; + } + + mod test_range_failures { + use super::*; + + #[should_panic(expected = "direct_store.get_opts() was called during test")] + #[tokio::test] + async fn test_get_opts_will_use_fallback_if_returned_range_does_not_match() { + let direct_to_store = Arc::new(MockDirectStore::default()); + + let casted_object_store = Arc::clone(&direct_to_store) as Arc; + let (object_store, cache_server) = build_cache_server_client(casted_object_store).await; + + let path = Path::from("my/scoped/data/file.parquet"); + + // add mock metadata + let route = format!("/metadata?location={}", &path.to_string()); + let expected_metadata_resp = GetObjectMetaResponse { + location: path.to_string(), + last_modified: Default::default(), + size: 42, + e_tag: None, + version: None, + }; + cache_server.respond_with( + route.clone(), + ExpectedResponse { + bytes: build_resp_body(&expected_metadata_resp), + range: None, + }, + ); + + // add mock file + let route = format!("/object?location={}", &path.to_string()); + cache_server.respond_with( + route.clone(), + ExpectedResponse { + bytes: std::str::from_utf8(&FILE[3..9]).unwrap().into(), + range: Some(Range { start: 3, end: 9 }), + }, + ); + + // TEST: get_range() + let range = Range { start: 1, end: 7 }; + let _ = object_store.get_range(&path, range.clone()).await; + + cache_server.close().await; + } + } + + mod test_head_failures { + use super::*; + + #[should_panic(expected = "direct_store.head() was called during test")] + #[tokio::test] + async fn test_use_fallback_when_missing_data() { + let direct_to_store = Arc::new(MockDirectStore::default()); + + let casted_object_store = Arc::clone(&direct_to_store) as Arc; + let 
(object_store, _cache_server) = + build_cache_server_client(casted_object_store).await; + + let path = Path::from("my/scoped/data/file.parquet"); + + // TEST: metadata never provided to mock + let _ = object_store.head(&path).await; + } + + #[should_panic(expected = "direct_store.head() was called during test")] + #[tokio::test] + async fn test_use_fallback_when_bad_data() { + let direct_to_store = Arc::new(MockDirectStore::default()); + + let casted_object_store = Arc::clone(&direct_to_store) as Arc; + let (object_store, cache_server) = build_cache_server_client(casted_object_store).await; + + let path = Path::from("my/scoped/data/file.parquet"); + + // TEST: incorrect metadata provided to mock + let route = format!("/metadata?location={}", &path.to_string()); + cache_server.respond_with( + route.clone(), + ExpectedResponse { + bytes: vec![].into(), // BAD: should be metadata + range: None, + }, + ); + let _ = object_store.head(&path).await; + } + + #[should_panic(expected = "direct_store.head() was called during test")] + #[tokio::test] + async fn test_use_fallback_on_connection_failed() { + let direct_to_store = Arc::new(MockDirectStore::default()); + + let casted_object_store = Arc::clone(&direct_to_store) as Arc; + let (object_store, cache_server) = build_cache_server_client(casted_object_store).await; + + let path = Path::from("my/scoped/data/file.parquet"); + + // GET /metadata is working + let route = format!("/metadata?location={}", &path.to_string()); + let expected_metadata_resp = GetObjectMetaResponse { + location: path.to_string(), + last_modified: Default::default(), + size: 42, + e_tag: None, + version: None, + }; + cache_server.respond_with( + route.clone(), + ExpectedResponse { + bytes: build_resp_body(&expected_metadata_resp), + range: None, + }, + ); + assert_matches!( + object_store.head(&path).await, + Ok(res) if res == ObjectMeta::from(expected_metadata_resp.clone()), + "payload was returned and parsed properly" + ); + + // kill server + cache_server.close().await; + + // TEST: connection fails + let _ = object_store.head(&path).await; + } + } + + mod test_get_opts_failures { + use crate::MockCacheServer; + + use super::*; + + async fn setup_metadata_head(path: &Path, cache_server: &MockCacheServer) { + // GET /metadata is working + let route = format!("/metadata?location={}", path); + let expected_metadata_resp = GetObjectMetaResponse { + location: path.to_string(), + last_modified: Default::default(), + size: 42, + e_tag: None, + version: None, + }; + cache_server.respond_with( + route.clone(), + ExpectedResponse { + bytes: build_resp_body(&expected_metadata_resp), + range: None, + }, + ); + } + + #[should_panic(expected = "direct_store.get_opts() was called during test")] + #[tokio::test] + async fn test_use_fallback_when_missing_data() { + let direct_to_store = Arc::new(MockDirectStore::default()); + + let casted_object_store = Arc::clone(&direct_to_store) as Arc; + let (object_store, cache_server) = build_cache_server_client(casted_object_store).await; + + let path = Path::from("my/scoped/data/file.parquet"); + setup_metadata_head(&path, &cache_server).await; + assert!( + object_store.head(&path).await.is_ok(), + "should have functioning metadata/head request" + ); + + // TEST: object never provided to mock + let _ = object_store.get(&path).await; + } + + #[should_panic(expected = "direct_store.get_opts() was called during test")] + #[tokio::test] + async fn test_use_fallback_when_bad_data() { + let direct_to_store = Arc::new(MockDirectStore::default()); + + let 
casted_object_store = Arc::clone(&direct_to_store) as Arc; + let (object_store, cache_server) = build_cache_server_client(casted_object_store).await; + + let path = Path::from("my/scoped/data/file.parquet"); + setup_metadata_head(&path, &cache_server).await; + assert!( + object_store.head(&path).await.is_ok(), + "should have functioning metadata/head request" + ); + + // TEST: incorrect metadata provided to mock + let route = format!("/object?location={}", &path.to_string()); + cache_server.respond_with( + route.clone(), + ExpectedResponse { + bytes: vec![].into(), // BAD: should be object + range: None, + }, + ); + let _ = object_store.get(&path).await; + } + + // since server is shutdown, will fail on head() request before get_opts() request + #[should_panic(expected = "direct_store.head() was called during test")] + #[tokio::test] + async fn test_use_fallback_on_connection_failed() { + let direct_to_store = Arc::new(MockDirectStore::default()); + + let casted_object_store = Arc::clone(&direct_to_store) as Arc; + let (object_store, cache_server) = build_cache_server_client(casted_object_store).await; + + let path = Path::from("my/scoped/data/file.parquet"); + setup_metadata_head(&path, &cache_server).await; + assert!( + object_store.head(&path).await.is_ok(), + "should have functioning metadata/head request" + ); + + // GET /object is working + let route = format!("/object?location={}", path); + cache_server.respond_with( + route.clone(), + ExpectedResponse { + bytes: std::str::from_utf8(FILE).unwrap().into(), + range: Some(Range { + start: 0, + end: FILE.len(), + }), + }, + ); + assert!(object_store.get(&path).await.is_ok()); + + // kill server + cache_server.close().await; + + // TEST: connection fails + let _ = object_store.get(&path).await; + } + } +} diff --git a/parquet_cache/src/client/request.rs b/parquet_cache/src/client/request.rs new file mode 100644 index 00000000000..bb0e7335af4 --- /dev/null +++ b/parquet_cache/src/client/request.rs @@ -0,0 +1,46 @@ +use std::{collections::HashMap, pin::Pin}; + +use futures::Future; +use http::uri::Scheme; +use hyper::{header::HeaderValue, Body, Method, Request, Uri}; + +pub type PinnedFuture = Pin> + Send>>; + +#[derive(Debug, Default)] +pub struct RawRequest { + pub headers: HashMap<&'static str, String>, + pub body: Body, + pub uri_parts: http::uri::Parts, + pub method: Method, + pub key: Option, +} + +impl TryFrom for Request { + type Error = http::Error; + + fn try_from(value: RawRequest) -> Result { + let RawRequest { + headers: req_headers, + body, + mut uri_parts, + method, + key: _, + } = value; + + // reduce unnecessary (within cluster) overhead from https + uri_parts.scheme = Some(Scheme::HTTP); + + let mut req = Request::builder() + .method(method) + .uri(Uri::from_parts(uri_parts)?); + + for (k, v) in req_headers.into_iter() { + req = req.header( + k, + HeaderValue::from_str(v.as_str()).map_err(http::Error::from)?, + ); + } + + req.body(body) + } +} diff --git a/parquet_cache/src/client/write_hints.rs b/parquet_cache/src/client/write_hints.rs new file mode 100644 index 00000000000..4d091ac3a7d --- /dev/null +++ b/parquet_cache/src/client/write_hints.rs @@ -0,0 +1,223 @@ +use async_trait::async_trait; +use bytes::{BufMut, BytesMut}; +use data_types::ParquetFileParams; +use futures::FutureExt; +use hyper::Method; +use object_store::{limit::LimitStore, path::Path, Error as ObjectStoreError, ObjectStore, Result}; +use tower::{Service, ServiceExt}; + +use crate::data_types::{WriteHint, WriteHintAck, WriteHintRequestBody}; +use 
crate::DataCacheObjectStore; + +use super::request::RawRequest; + +/// identifier for `object_store::Error::Generic` +const DATA_CACHE: &str = "write hint to data cache"; + +/// An [`ObjectStore`] which handles write hinting. +/// +/// In some cases, the write hinting request does nothing (e.g. for direct-to-store impls). +#[async_trait] +pub trait WriteHintingObjectStore: ObjectStore { + /// Handle any write hinting performed by the [`ObjectStore`]. + async fn write_hint<'a>( + &self, + location: &'a Path, + new_file: &'a ParquetFileParams, + ack_setting: WriteHintAck, + ) -> Result<()>; +} + +#[async_trait] +impl WriteHintingObjectStore for DataCacheObjectStore { + /// Provide write hinting to data cache. + /// + /// Response is configuration based on [`WriteHintAck`]. + async fn write_hint<'a>( + &self, + location: &'a Path, + new_file: &'a ParquetFileParams, + ack_setting: WriteHintAck, + ) -> Result<()> { + let mut buf = BytesMut::new().writer(); + serde_json::to_writer( + &mut buf, + &WriteHintRequestBody { + location: location.to_string(), + hint: WriteHint::from(new_file), + ack_setting, + }, + ) + .map_err(|e| ObjectStoreError::Generic { + store: DATA_CACHE, + source: Box::new(e), + })?; + + let key = location.to_string(); + + let uri_parts = "/write-hint" + .parse::() + .map(http::uri::Parts::from) + .expect("should be valid uri"); + + let req = RawRequest { + method: Method::POST, + uri_parts, + key: Some(key), + body: hyper::Body::from(buf.into_inner().freeze()), + ..Default::default() + }; + + let mut cache = self.cache.clone(); + let service = cache.ready().await.map_err(|e| ObjectStoreError::Generic { + store: DATA_CACHE, + source: Box::new(e), + })?; + + let write_hints = service.call(req); + + match ack_setting { + WriteHintAck::Sent => { + write_hints.now_or_never(); + Ok(()) + } + WriteHintAck::Received => { + // server responds ok after receipt + write_hints.await.map_err(|e| ObjectStoreError::Generic { + store: DATA_CACHE, + source: Box::new(e), + })?; + Ok(()) + } + WriteHintAck::Completed => { + // server responds ok after downstream actions complete + write_hints.await.map_err(|e| ObjectStoreError::Generic { + store: DATA_CACHE, + source: Box::new(e), + })?; + Ok(()) + } + } + } +} + +#[async_trait] +impl WriteHintingObjectStore for LimitStore { + /// Enable our store interface to always use `Arc`. + /// (Aws, Azure, and Gcp [`ObjectStore`] impls are all [`LimitStore`].) + /// + /// When data cache is not used, the write hinting does not occur. 
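    /// This no-op impl lets callers unconditionally chain a `write_hint` after
    /// a successful `put`, without checking whether the configured store is the
    /// data cache or a direct-to-store `LimitStore`.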
+ async fn write_hint<'a>( + &self, + _location: &'a Path, + _new_file: &'a ParquetFileParams, + _ack_setting: WriteHintAck, + ) -> Result<()> { + Ok(()) + } +} + +#[cfg(test)] +mod test { + use std::sync::Arc; + + use data_types::{ + ColumnId, ColumnSet, CompactionLevel, NamespaceId, ObjectStoreId, PartitionId, TableId, + Timestamp, + }; + use object_store::{ + aws::AmazonS3Builder, azure::MicrosoftAzureBuilder, gcp::GoogleCloudStorageBuilder, + limit::LimitStore, + }; + + use crate::client::mock::{build_cache_server_client, MockDirectStore}; + + use super::*; + + fn new_file() -> ParquetFileParams { + ParquetFileParams { + namespace_id: NamespaceId::new(0), + table_id: TableId::new(0), + partition_id: PartitionId::new(0), + partition_hash_id: None, + object_store_id: ObjectStoreId::new(), + min_time: Timestamp::new(1), + max_time: Timestamp::new(5), + file_size_bytes: 0, + row_count: 0, + compaction_level: CompactionLevel::Initial, + created_at: Timestamp::new(1234), + column_set: ColumnSet::new([ColumnId::new(1), ColumnId::new(2)]), + max_l0_created_at: Timestamp::new(1234), + } + } + + #[tokio::test] + async fn test_write_hinting_always_available() { + // This test confirms that any external interfaces can always utilize + // the object_store, without awareness of whether or not it's the data cache + // or a direct_to_store. + // + // if object_store.put(&location).await.is_ok() { + // object_store.write_hints(&location, new_files, ack_setting).await + // } + // + // This avoids leaking any configuration details (for conditional checks) across the codebase. + + let location = Path::from("my/scoped/data/file.parquet"); + let new_file = new_file(); + let ack_setting = WriteHintAck::Received; + + // impl with gcp store + let builder = GoogleCloudStorageBuilder::new().with_bucket_name("foo".to_string()); + let direct_store: Arc = + Arc::new(LimitStore::new(builder.build().unwrap(), 10)); + assert!(direct_store + .write_hint(&location, &new_file, ack_setting) + .await + .is_ok()); + + // impl with aws store + let builder = AmazonS3Builder::new() + .with_bucket_name("foo".to_string()) + .with_region("mars".to_string()); + let direct_store: Arc = + Arc::new(LimitStore::new(builder.build().unwrap(), 10)); + assert!(direct_store + .write_hint(&location, &new_file, ack_setting) + .await + .is_ok()); + + // impl with azure store + let builder = MicrosoftAzureBuilder::new() + .with_container_name("foo".to_string()) + .with_account("dabozz".to_string()); + let direct_store: Arc = + Arc::new(LimitStore::new(builder.build().unwrap(), 10)); + assert!(direct_store + .write_hint(&location, &new_file, ack_setting) + .await + .is_ok()); + } + + #[tokio::test] + async fn test_write_hinting_hits_the_cache() { + let direct_to_store = Arc::new(MockDirectStore::default()); + + let casted_object_store = Arc::clone(&direct_to_store) as Arc; + let (object_store, cache_server) = build_cache_server_client(casted_object_store).await; + + let location = Path::from("my/scoped/data/file.parquet"); + let new_file = new_file(); + let ack_setting = WriteHintAck::Received; + + assert!(object_store + .write_hint(&location, &new_file, ack_setting) + .await + .is_ok()); + assert!( + cache_server.was_called(&"/write-hint".to_string()), + "write-hint should hit the cache server" + ); // note: payload bytes will be asserted separately with the (non-mock-)server integration tests. 
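        // With WriteHintAck::Received the call awaits the server's receipt
        // acknowledgement, so the mock server has observed the POST /write-hint
        // request by the time was_called() is checked.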
+ } +} diff --git a/parquet_cache/src/controller.rs b/parquet_cache/src/controller.rs new file mode 100644 index 00000000000..e2ae2484663 --- /dev/null +++ b/parquet_cache/src/controller.rs @@ -0,0 +1,53 @@ +//! The controller module contains the API and functionality +//! used to implement the controller for a DataCacheSet. + +use futures::future::select; +use kube::Client; +use std::time::Duration; + +mod error; +pub use error::{Error, Result}; +mod kube_util; +mod parquet_cache; +pub use parquet_cache::{ + ParquetCache, ParquetCacheInstanceSet, ParquetCacheSpec, ParquetCacheStatus, +}; + +mod parquet_cache_controller; + +mod parquet_cache_set; +pub use parquet_cache_set::{ParquetCacheSet, ParquetCacheSetSpec, ParquetCacheSetStatus}; + +mod parquet_cache_set_controller; + +mod state_service; + +/// The name of the controller. +const CONTROLLER_NAME: &str = "parquet-cache-set-controller"; + +/// Label used to annotate the objects with the hash of the pod template. +const POD_TEMPLATE_HASH_LABEL: &str = "pod-template-hash"; + +/// Label used to annotate objects with the count of parquet cache replicas. +const PARQUET_CACHE_REPLICAS_LABEL: &str = "parquet-cache-replicas"; + +/// The time to wait before re-executing when waiting for cache instances to warm, or cool. +const SHORT_WAIT: Duration = Duration::from_secs(60); + +/// The time to wait before re-executing when there is no longer any active work to do, or +/// the controller will be awoken by changes to owned objects. +const LONG_WAIT: Duration = Duration::from_secs(3600); + +/// Run the controllers for ParquetCache and ParquetCacheSet resources to completion. +pub async fn run(client: Client, namespace: Option) -> Result<(), kube::Error> { + let parquet_cache_join_handle = + parquet_cache_controller::spawn_controller(client.clone(), namespace.clone()); + let parquet_cache_set_join_handle = + parquet_cache_set_controller::spawn_controller(client.clone(), namespace.clone()); + + select(parquet_cache_join_handle, parquet_cache_set_join_handle) + .await + .factor_first() + .0 + .unwrap() +} diff --git a/parquet_cache/src/controller/error.rs b/parquet_cache/src/controller/error.rs new file mode 100644 index 00000000000..bb3dc653350 --- /dev/null +++ b/parquet_cache/src/controller/error.rs @@ -0,0 +1,29 @@ +/// Errors that can be generated by the controller. +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// Error when encoding a resource object. + #[error("encoding error: {0}")] + EncodingError(#[from] serde_json::Error), + + /// Error performing a kubernetes operation. + #[error("kubernetes error: {0}")] + KubeError(#[from] kube::Error), + + /// Error getting a cache node's state. + #[error("node state error: {0}")] + NodeStateError(Box), + + /// Error caused by an internal failure, this is almost certainly a bug. + #[error("internal error: {0}")] + InternalError(String), +} + +impl Error { + /// Create a new [Error::InternalError] with the provided message. + pub fn internal(msg: &str) -> Self { + Self::InternalError(String::from(msg)) + } +} + +/// Result type for the controller. 
+pub type Result = std::result::Result; diff --git a/parquet_cache/src/controller/kube_util.rs b/parquet_cache/src/controller/kube_util.rs new file mode 100644 index 00000000000..67f847fdff3 --- /dev/null +++ b/parquet_cache/src/controller/kube_util.rs @@ -0,0 +1,93 @@ +use fnv::FnvHasher; +use k8s_openapi::apimachinery::pkg::apis::meta::v1::{LabelSelector, OwnerReference}; +use kube::{Api, Error, Resource, ResourceExt}; +use serde::de::DeserializeOwned; +use serde::Serialize; +use std::fmt::Debug; +use std::hash::Hasher; + +/// The set of characters kubernetes considers safe for generated strings. +const SAFE_CHARS: [char; 27] = [ + 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', + 'z', '2', '4', '5', '6', '7', '8', '9', +]; + +/// Encode a string using a small character set that is considered safe. This +/// minimizes the chances of accidental vulgarity. +pub fn safe_string(s: &str) -> String { + s.chars() + .map(|c| SAFE_CHARS[c as usize % SAFE_CHARS.len()]) + .collect() +} + +/// Get a hash value for the provided object. The hashed value has no guaranteed properties +/// other than the same input will have the same resulting hash. There is no attempt made to +/// hash the value in the same way that kubernetes controllers will. +pub fn hash_object(obj: &T) -> Result +where + T: ?Sized + Serialize, +{ + let bytes = serde_json::to_vec(obj)?; + let mut hasher = FnvHasher::with_key(0); + hasher.write(&bytes); + Ok(safe_string(&format!( + "{}", + (hasher.finish() & 0xFFFFFFFF) as u32 + ))) +} + +/// Format label selectors so they can be used with ListParams. +pub fn selectors(selector: &LabelSelector) -> Option { + let mut clauses = vec![]; + if let Some(expressions) = &selector.match_expressions { + clauses.extend(expressions.iter().filter_map(|requirement| { + match requirement.operator.as_ref() { + "In" => requirement + .values + .as_ref() + .map(|values| format!("{} in ({})", requirement.key, values.join(","))), + "NotIn" => requirement + .values + .as_ref() + .map(|values| format!("{} notin ({})", requirement.key, values.join(","))), + "Exists" => Some(requirement.key.clone()), + "DoesNotExist" => Some(format!("!{}", requirement.key)), + _ => None, // Skip unknown operator. 
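                // Only In / NotIn / Exists / DoesNotExist are translated; an
                // unrecognised operator drops that clause rather than failing
                // the whole selector string.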
+ } + })); + } + if let Some(labels) = &selector.match_labels { + clauses.extend(labels.iter().map(|(k, v)| format!("{k}={v}"))) + } + match clauses.len() { + 0 => None, + _ => Some(clauses.join(",")), + } +} + +pub async fn list_owned(api: &Api, owner_uid: &String) -> Result, Error> +where + K: Debug + Clone + Resource + DeserializeOwned + Send + Sync + 'static, +{ + let object_list = api.list(&Default::default()).await?; + Ok(object_list + .items + .into_iter() + .filter(|obj| obj.owner_references().iter().any(|or| &or.uid == owner_uid)) + .collect()) +} + +pub fn owner_reference(obj: &R) -> OwnerReference +where + R: Resource, +{ + let meta = obj.meta(); + OwnerReference { + api_version: R::api_version(&()).into(), + block_owner_deletion: Some(true), + controller: Some(true), + kind: R::kind(&()).into(), + name: meta.name.clone().unwrap_or_default(), + uid: meta.uid.clone().unwrap_or_default(), + } +} diff --git a/parquet_cache/src/controller/parquet_cache.rs b/parquet_cache/src/controller/parquet_cache.rs new file mode 100644 index 00000000000..8c65bc7ddc5 --- /dev/null +++ b/parquet_cache/src/controller/parquet_cache.rs @@ -0,0 +1,139 @@ +use super::{Error, Result, PARQUET_CACHE_REPLICAS_LABEL}; +use k8s_openapi::api::core::v1::PodTemplateSpec; +use k8s_openapi::apimachinery::pkg::apis::meta::v1::LabelSelector; +use k8s_openapi::schemars::JsonSchema; +use kube::CustomResource; +use serde::{Deserialize, Serialize}; + +/// Specification of a ParquetCache. +#[derive(Debug, Default, Clone, CustomResource, Deserialize, Serialize, JsonSchema)] +#[kube( + kind = "ParquetCache", + group = "iox.influxdata.com", + version = "v1alpha1", + namespaced +)] +#[kube(status = "ParquetCacheStatus")] +#[kube(derive = "Default")] +#[serde(rename_all = "camelCase")] +pub struct ParquetCacheSpec { + /// The name of the config map to generate containing the data cache set + /// state. This config map must be volume mounted in the pod template. + /// If a name isn't set then the config map will have the same name as + /// the data cache set. + pub config_map_name: Option, + + /// The number of replicas that are required to be in the data cache set. + pub replicas: Option, + + /// Selector is a label query over pods that should match the replica + /// count. Label keys and values that must match in order to be controlled + /// by this data cache set. It must match the pod template's labels. + pub selector: LabelSelector, + + /// Port running on the pods that should be used to query the working state + /// using the `/state` endpoint. + pub state_port: Option, + + /// Template is the object that describes the pod that will be created + /// if insufficient replicas are detected. + pub template: PodTemplateSpec, +} + +/// Status of a ParquetCache. +#[derive(Debug, Default, Deserialize, Serialize, Clone, JsonSchema)] +pub struct ParquetCacheStatus { + /// The current cache instance set. + pub current: ParquetCacheInstanceSet, + + /// The upcoming cache instance set. + pub next: ParquetCacheInstanceSet, +} + +/// The set of instances that form a parquet cache group. +#[derive(Debug, Clone, Default, PartialEq, Deserialize, Serialize, JsonSchema)] +pub struct ParquetCacheInstanceSet { + /// The revision number of the cache instance set. + pub revision: i64, + + /// The set of instances that form the cache set. 
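+    /// Each entry is a pod name formed from the owning ParquetCacheSet name plus a
+    /// replica suffix.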
+ pub instances: Vec, +} + +impl ParquetCache { + fn name(&self) -> Result<&String> { + self.metadata + .name + .as_ref() + .ok_or(Error::internal("ParquetCache has no name")) + } + + /// Get the name of the [k8s_openapi::api::core::v1::ConfigMap] that should be created to + /// contain the status information required by the parquet servers. + pub(super) fn config_map_name(&self) -> Result<&String> { + if let Some(name) = &self.spec.config_map_name { + Ok(name) + } else { + self.name() + } + } + + /// The number of replicas specified for this ParquetCache. + pub(super) fn replicas(&self) -> i32 { + self.spec.replicas.unwrap_or(1) + } + + /// Get the PodTemplateSpec to pass on to the [super::ParquetCacheSet]. This will make necessary + /// changes to the template supplied in the [ParquetCacheSpec]. + /// + /// The generated [PodTemplateSpec] includes a label containing the requested replica count. + /// This ensures that a different [super::ParquetCacheSet] is created even if the only change to the + /// [ParquetCache] is a change in the replica count. + pub(super) fn parquet_cache_set_template(&self) -> PodTemplateSpec { + let mut template = self.spec.template.clone(); + let metadata = template.metadata.get_or_insert(Default::default()); + let labels = metadata.labels.get_or_insert(Default::default()); + labels.insert( + String::from(PARQUET_CACHE_REPLICAS_LABEL), + format!("{}", self.replicas()), + ); + template + } + + /// Generate a name for a ParquetCacheSet derived from this ParquetCache. + pub(super) fn parquet_cache_set_name(&self, pod_template_hash: &str) -> Result { + let name = self.name()?; + Ok(format!("{name}-{pod_template_hash}")) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta; + + #[test] + fn config_map_name() { + let pc = ParquetCache { + metadata: ObjectMeta { + name: Some(String::from("test-data-cache-set")), + ..Default::default() + }, + ..Default::default() + }; + assert_eq!("test-data-cache-set", pc.config_map_name().unwrap()); + + let pc = ParquetCache { + metadata: ObjectMeta { + name: Some("test-data-cache-set".to_string()), + ..Default::default() + }, + spec: ParquetCacheSpec { + config_map_name: Some(String::from("config-map")), + ..Default::default() + }, + ..Default::default() + }; + assert_eq!("config-map", pc.config_map_name().unwrap()); + } +} diff --git a/parquet_cache/src/controller/parquet_cache_controller.rs b/parquet_cache/src/controller/parquet_cache_controller.rs new file mode 100644 index 00000000000..3dba0587b66 --- /dev/null +++ b/parquet_cache/src/controller/parquet_cache_controller.rs @@ -0,0 +1,1446 @@ +use super::{ + kube_util::{hash_object, list_owned, owner_reference}, + Error, ParquetCache, ParquetCacheInstanceSet, ParquetCacheSet, ParquetCacheSetSpec, + ParquetCacheStatus, Result, LONG_WAIT, PARQUET_CACHE_REPLICAS_LABEL, POD_TEMPLATE_HASH_LABEL, + SHORT_WAIT, +}; +use crate::data_types::InstanceState; +use chrono::Utc; +use futures::StreamExt; +use k8s_openapi::api::core::v1::{ConfigMap, PodTemplateSpec}; +use k8s_openapi::apimachinery::pkg::apis::meta::v1::{ObjectMeta, OwnerReference}; +use kube::runtime::controller::Action; +use kube::runtime::Controller; +use kube::{Api, Client, Resource, ResourceExt}; +use observability_deps::tracing::{debug, error, info}; +use std::collections::{BTreeMap, BTreeSet}; +use std::fmt::Debug; +use std::sync::Arc; +use std::time::Duration; +use tokio::task::JoinHandle; + +/// Start a new controller task to reconcile [ParquetCacheSet] 
objects. +pub fn spawn_controller(client: Client, ns: Option) -> JoinHandle> { + tokio::spawn(run_controller(client, ns)) +} + +async fn run_controller(client: Client, ns: Option) -> Result<(), kube::Error> { + let parquet_cache_api = match &ns { + Some(ns) => Api::::namespaced(client.clone(), ns), + None => Api::::all(client.clone()), + }; + let parquet_cache_set_api = match &ns { + Some(ns) => Api::::namespaced(client.clone(), ns), + None => Api::::all(client.clone()), + }; + + Controller::new(parquet_cache_api, Default::default()) + .owns(parquet_cache_set_api, Default::default()) + .run(reconcile, error_policy, Arc::new(Context { client })) + .for_each(|_| futures::future::ready(())) + .await; + Ok(()) +} + +async fn reconcile(obj: Arc, ctx: Arc) -> Result { + let namespace = obj.metadata.namespace.as_deref(); + let name = obj.name_any(); + info!(namespace, name, "reconcile request"); + let sleep = ParquetCacheController::new(obj.as_ref().clone(), ctx.client.clone()) + .reconcile() + .await?; + Ok(Action::requeue(sleep)) +} + +fn error_policy(_object: Arc, err: &Error, _ctx: Arc) -> Action { + // TODO add exponential backoff + let sleep = Duration::from_secs(5); + error!( + err = err as &dyn std::error::Error, + "reconcile failed, requeue in {:?}", sleep + ); + Action::requeue(sleep) +} + +/// Context used when reconciling [ParquetCacheSet] objects. +struct Context { + client: Client, +} + +const COOLING_SECONDS: i64 = 300; + +/// Controller for the ParquetCache custom resource. This controller maintains ParquetCacheSet +/// resources for a ParquetCache. +#[derive(Debug)] +struct ParquetCacheController { + config_map_api: Api, + parquet_cache_api: Api, + parquet_cache_set_api: Api, + + parquet_cache: ParquetCache, +} + +impl ParquetCacheController { + /// Create a new ParquetCacheSetController instance for the provided [ParquetCacheSet] + /// and [Client]. + fn new(parquet_cache: ParquetCache, client: Client) -> Self { + let ns = parquet_cache.metadata.namespace.as_ref().unwrap(); + let config_maps = Api::namespaced(client.clone(), ns); + let parquet_caches = Api::namespaced(client.clone(), ns); + let parquet_cache_sets = Api::namespaced(client.clone(), ns); + + Self { + config_map_api: config_maps, + parquet_cache_api: parquet_caches, + parquet_cache_set_api: parquet_cache_sets, + parquet_cache, + } + } + + /// Perform the business logic required to move the DataCacheSet state forward towards the + /// desired state. + pub async fn reconcile(&mut self) -> Result { + // ensure the config map exists before attempting to start pods. + let cm = self.status_config_map()?; + match self.config_map_api.create(&Default::default(), &cm).await { + Ok(_) => { + info!(name = cm.metadata.name, "Created ConfigMap"); + } + Err(kube::Error::Api(status)) if status.reason == "AlreadyExists" => (), + Err(error) => return Err(error)?, + } + + let duration = self.reconcile_inner().await?; + + // update the config map with the latest set. + let cm = self.status_config_map()?; + debug!("update config map"); + self.config_map_api + .replace( + self.parquet_cache.config_map_name()?, + &Default::default(), + &cm, + ) + .await?; + debug!("update ParquetCache status"); + self.parquet_cache_api + .replace_status( + self.parquet_cache.metadata.name.as_ref().unwrap(), + &Default::default(), + serde_json::to_vec(&self.parquet_cache)?, + ) + .await?; + Ok(duration) + } + + /// Perform the changes required to reconcile the state of the ParquetCache. 
Changes to the + /// status are written to memory and will updated after this method returns. + async fn reconcile_inner(&mut self) -> Result { + let template = self.parquet_cache.parquet_cache_set_template(); + let pod_template_hash = hash_object(&template)?; + + // find and remove any owned cache sets that are no longer required. + self.remove_empty_cache_sets(&pod_template_hash).await?; + + if self.check_warming_pods().await? { + self.status_mut().current = self.status_mut().next.clone(); + } else { + // Some pods are still warming, check again soon. + return Ok(SHORT_WAIT); + } + if !self.check_cooling_pods(&pod_template_hash).await? { + // Some pods are still cooling, check again soon. + return Ok(SHORT_WAIT); + } + if self.status_mut().current.instances.len() != self.parquet_cache.replicas() as usize { + self.resize(&pod_template_hash, &template).await?; + } else { + self.migrate(&pod_template_hash, &template).await?; + } + + // If we get to here then either there is nothing to change, or some changes + // have been made and the controller will be woken by those changes. + Ok(LONG_WAIT) + } + + async fn remove_empty_cache_sets(&mut self, pod_template_hash: &String) -> Result<()> { + let parquet_cache_sets = self.owned_parquet_cache_sets().await?; + let to_delete = parquet_cache_sets + .into_iter() + .filter(|pcs| { + let is_latest = if let Some(pth) = pcs + .metadata + .labels + .as_ref() + .and_then(|labels| labels.get(POD_TEMPLATE_HASH_LABEL).cloned()) + { + &pth == pod_template_hash + } else { + false + }; + let is_empty = if let Some(pods) = + pcs.status.as_ref().and_then(|status| status.pods.as_ref()) + { + pods.is_empty() + } else { + true + }; + !is_latest && is_empty + }) + .collect::>(); + + for pcs in to_delete { + info!(name = pcs.metadata.name, "Deleting ParquetCacheSet"); + self.parquet_cache_set_api + .delete(&pcs.metadata.name.unwrap(), &Default::default()) + .await?; + } + Ok(()) + } + + async fn check_warming_pods(&mut self) -> Result { + let status = self.status_mut(); + if status.current.revision == status.next.revision { + return Ok(true); + } + for instance in status.next.instances.clone() { + let (parquet_cache_set_name, _) = instance.rsplit_once('-').unwrap(); + let parquet_cache_set = self + .parquet_cache_set_api + .get(parquet_cache_set_name) + .await?; + let parquet_cache_set_status = parquet_cache_set.status.unwrap_or_default(); + let pod_status = parquet_cache_set_status + .pods + .as_ref() + .and_then(|pods| pods.get(&instance)); + let phase = pod_status + .and_then(|status| status.phase.as_ref()) + .map(String::as_str); + let state = pod_status + .and_then(|status| status.state.as_ref()) + .map(|state| state.state.to_string()); + debug!(name = &instance, phase, state, "Checking Pod status"); + if phase.unwrap_or("") != "Running" { + return Ok(false); + } + if pod_status + .and_then(|status| status.state.as_ref()) + .map(|state| state.state != InstanceState::Warming) + .unwrap_or(true) + { + return Ok(false); + } + } + Ok(true) + } + + async fn check_cooling_pods(&mut self, pod_template_hash: &String) -> Result { + let mut live_pods = self + .status_mut() + .current + .instances + .iter() + .cloned() + .collect::>(); + for pod in &self.status_mut().next.instances { + live_pods.insert(pod.clone()); + } + let parquet_cache_sets = self.owned_parquet_cache_sets().await?; + + let current_status = parquet_cache_sets + .iter() + .filter_map(|pcs| pcs.status.as_ref()) + .filter_map(|status| status.pods.as_ref()) + .flat_map(|pods| 
pods.clone().into_iter().collect::>()) + .filter(|(k, _)| self.status_mut().current.instances.contains(k)) + .map(|(k, status)| { + let (_, suffix) = split_pod_name(&k); + (suffix, status) + }) + .collect::>(); + + let cooling_pods = parquet_cache_sets + .iter() + .filter(|&pcs| !has_pod_template_hash(pcs, pod_template_hash)) + .filter_map(|pcs| pcs.status.as_ref()) + .filter_map(|status| status.pods.as_ref()) + .flat_map(|pods| pods.keys().cloned().collect::>()) + .filter(|key| !live_pods.contains(key)) + .collect::>(); + + let mut cooling = false; + for pod in cooling_pods { + let (pcs_name, suffix) = split_pod_name(&pod); + if let Some(change) = current_status + .get(&suffix) + .and_then(|status| status.state.as_ref()) + .map(|state| state.state_changed) + { + if change > Utc::now().timestamp() - COOLING_SECONDS { + // If the pod has been cooling for less than the wait time, keep waiting. + cooling = true; + continue; + } + } + let mut pcs = self.parquet_cache_set_api.get(&pcs_name).await?; + pcs.spec + .replica_suffixes + .as_mut() + .unwrap() + .retain(|s| s != &suffix); + self.parquet_cache_set_api + .replace(&pcs_name, &Default::default(), &pcs) + .await?; + } + Ok(!cooling) + } + + async fn resize( + &mut self, + pod_template_hash: &String, + template: &PodTemplateSpec, + ) -> Result<()> { + let owned = self.owned_parquet_cache_sets().await?; + + // Clear any ParquetCacheSets that are not the required one. + for mut pcs in owned { + let is_current = pcs + .metadata + .labels + .as_ref() + .and_then(|labels| labels.get(POD_TEMPLATE_HASH_LABEL)) + .map(|v| v == pod_template_hash) + .unwrap_or_default(); + if is_current { + continue; + } + pcs.spec.replica_suffixes = None; + self.set_parquet_cache_set(&pcs).await?; + } + + // Create the desired ParquetCacheSet. 
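+        // Its name includes the pod template hash, and the replica suffixes are populated
+        // for the full requested replica count in one step; unlike `migrate`, a resize does
+        // not move pods across sets one at a time.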
+ let mut pcs = self + .get_parquet_cache_set(pod_template_hash, template) + .await?; + let suffixes = (0..self.parquet_cache.replicas()) + .map(|n| format!("{n}")) + .collect::>(); + pcs.spec.replica_suffixes = Some(suffixes.clone()); + self.set_parquet_cache_set(&pcs).await?; + let next_revision = self.status_mut().next.revision + 1; + let instances = suffixes + .iter() + .map(|suffix| format!("{}-{suffix}", pcs.metadata.name.as_ref().unwrap())) + .collect(); + self.status_mut().next = ParquetCacheInstanceSet { + revision: next_revision, + instances, + }; + self.status_mut().current = self.status_mut().next.clone(); + Ok(()) + } + + async fn migrate(&mut self, pod_template_hash: &str, template: &PodTemplateSpec) -> Result<()> { + let current = self.status_mut().current.clone(); + assert_eq!(current.revision, self.status_mut().next.revision); + let parquet_cache_set_name = self + .parquet_cache + .parquet_cache_set_name(pod_template_hash)?; + + for (idx, name) in current.instances.iter().enumerate() { + let (prefix, suffix) = split_pod_name(name); + if prefix == parquet_cache_set_name { + continue; + } + let mut pcs = self + .get_parquet_cache_set(pod_template_hash, template) + .await?; + if pcs.spec.replica_suffixes.is_none() { + pcs.spec.replica_suffixes = Some(vec![]); + } + pcs.spec + .replica_suffixes + .as_mut() + .unwrap() + .push(suffix.clone()); + self.set_parquet_cache_set(&pcs).await?; + self.status_mut().next.revision = current.revision + 1; + self.status_mut().next.instances[idx] = format!("{parquet_cache_set_name}-{suffix}"); + break; + } + Ok(()) + } + + async fn owned_parquet_cache_sets(&self) -> Result> { + let uid = self + .parquet_cache + .metadata + .uid + .as_ref() + .ok_or(Error::internal("ParquetCache has no uid"))?; + Ok(list_owned(&self.parquet_cache_set_api, uid).await?) + } + + /// Create or update the specified ParquetCacheSet. + async fn set_parquet_cache_set(&mut self, pcs: &ParquetCacheSet) -> Result { + let name = pcs.metadata.name.as_ref().ok_or(Error::internal( + "attempt to set a ParquetCacheSet without a name", + ))?; + let pp = Default::default(); + if pcs.metadata.uid.is_some() { + Ok(self.parquet_cache_set_api.replace(name, &pp, pcs).await?) + } else { + Ok(self.parquet_cache_set_api.create(&pp, pcs).await?) + } + } + + /// Retrieve the ParquetCacheSet for the specified Pod template hash. If there is no such + /// ParquetCacheSet then create a ParquetCacheSet object with appropriate defaults taken from + /// the current ParquetCache document. + async fn get_parquet_cache_set( + &mut self, + pod_template_hash: &str, + template: &PodTemplateSpec, + ) -> Result { + let name = self + .parquet_cache + .parquet_cache_set_name(pod_template_hash)?; + Ok(self + .parquet_cache_set_api + .get_opt(&name) + .await? 
+ .unwrap_or_else(|| self.new_parquet_cache_set(name, pod_template_hash, template))) + } + + fn new_parquet_cache_set( + &self, + name: String, + pod_template_hash: &str, + template: &PodTemplateSpec, + ) -> ParquetCacheSet { + let pod_template_hash_key = String::from(POD_TEMPLATE_HASH_LABEL); + let replica_count_key = String::from(PARQUET_CACHE_REPLICAS_LABEL); + let replica_count_value = format!("{}", self.parquet_cache.replicas()); + + let mut labels = self + .parquet_cache + .metadata + .labels + .clone() + .unwrap_or_default(); + labels.insert( + pod_template_hash_key.clone(), + String::from(pod_template_hash), + ); + labels.insert(replica_count_key.clone(), replica_count_value.clone()); + let mut selector = self.parquet_cache.spec.selector.clone(); + let match_labels = selector.match_labels.get_or_insert(Default::default()); + match_labels.insert( + pod_template_hash_key.clone(), + String::from(pod_template_hash), + ); + match_labels.insert(replica_count_key.clone(), replica_count_value.clone()); + + let mut template = template.clone(); + let template_metadata = template.metadata.get_or_insert(Default::default()); + template_metadata.namespace = self.parquet_cache.metadata.namespace.clone(); + let template_labels = template_metadata.labels.get_or_insert(Default::default()); + template_labels.insert( + pod_template_hash_key.clone(), + String::from(pod_template_hash), + ); + + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(labels), + name: Some(name), + namespace: self.parquet_cache.metadata.namespace.clone(), + owner_references: Some(vec![self.owner_reference()]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: None, + selector, + state_port: self.parquet_cache.spec.state_port.clone(), + template: Some(template), + }, + status: None, + } + } + + fn status_config_map(&mut self) -> Result { + let mut data = BTreeMap::new(); + let status = self.status_mut(); + data.insert( + "current".to_string(), + serde_json::to_string(&status.current)?, + ); + data.insert("next".to_string(), serde_json::to_string(&status.next)?); + Ok(ConfigMap { + metadata: ObjectMeta { + namespace: self.parquet_cache.metadata.namespace.clone(), + name: Some(self.parquet_cache.config_map_name()?.clone()), + owner_references: Some(vec![self.owner_reference()]), + ..Default::default() + }, + data: Some(data), + ..Default::default() + }) + } + + /// Generate an owner reference for the current ParquetCache document. 
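+    ///
+    /// Owned ConfigMaps and ParquetCacheSets carry this reference, so kubernetes garbage
+    /// collects them when the ParquetCache is deleted.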
+ fn owner_reference(&self) -> OwnerReference { + owner_reference(&self.parquet_cache) + } + + fn status_mut(&mut self) -> &mut ParquetCacheStatus { + self.parquet_cache.status.get_or_insert(Default::default()) + } +} + +fn split_pod_name(name: &str) -> (String, String) { + if let Some((prefix, suffix)) = name.rsplit_once('-') { + (String::from(prefix), String::from(suffix)) + } else { + (String::from(name), String::from("")) + } +} + +fn has_pod_template_hash(obj: &K, pod_template_hash: &String) -> bool +where + K: Resource, +{ + if let Some(hash) = obj + .meta() + .labels + .as_ref() + .and_then(|labels| labels.get(POD_TEMPLATE_HASH_LABEL)) + { + hash == pod_template_hash + } else { + false + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::controller::parquet_cache::{ParquetCacheInstanceSet, ParquetCacheSpec}; + use crate::controller::parquet_cache_set::InstanceStatus; + use crate::controller::{ParquetCacheSet, ParquetCacheSetStatus, SHORT_WAIT}; + use crate::data_types::{InstanceState, State}; + use hyper::Body; + use kube::client::ClientBuilder; + use kube::ResourceExt; + use kube_test::{AsHandler, ResourceHandler, Service}; + use std::ops::Sub; + use std::sync::Arc; + + #[tokio::test] + async fn create_config_map() { + let ns = "create_config_map"; + let name = "parquet-cache"; + let fixture: Fixture = Default::default(); + let pc = fixture.parquet_caches.set(ns, name, Default::default()); + let uid = pc.metadata.uid.clone().unwrap_or_default(); + + fixture.reconcile(ns, pc).await.unwrap(); + + let cm = fixture.config_maps.get(ns, name).unwrap(); + assert_eq!(ns, cm.metadata.namespace.as_ref().unwrap()); + assert_eq!(name, cm.metadata.name.as_ref().unwrap()); + assert_eq!(uid, cm.metadata.owner_references.as_ref().unwrap()[0].uid); + assert!(!cm.data.as_ref().unwrap().get("current").unwrap().is_empty()); + assert!(!cm.data.as_ref().unwrap().get("next").unwrap().is_empty()); + } + + #[tokio::test] + async fn create_config_map_no_fail_on_existing() { + let ns = "create_config_map_no_fail_on_existing"; + let name = "parquet-cache"; + let fixture: Fixture = Default::default(); + fixture.config_maps.set(ns, name, Default::default()); + let pc = fixture.parquet_caches.set(ns, name, Default::default()); + + fixture.reconcile(ns, pc).await.unwrap(); + + let cm = fixture.config_maps.get(ns, name).unwrap(); + assert_eq!(ns, cm.metadata.namespace.as_ref().unwrap()); + assert_eq!(name, cm.metadata.name.as_ref().unwrap()); + } + + #[tokio::test] + async fn create_initial_parquet_cache_set_at_full_size() { + let ns = "create_initial_parquet_cache_set_at_full_size"; + let name = "parquet-cache"; + let fixture: Fixture = Default::default(); + let pc = fixture.parquet_caches.set( + ns, + name, + ParquetCache { + spec: ParquetCacheSpec { + replicas: Some(5), + ..Default::default() + }, + ..Default::default() + }, + ); + let uid = pc.metadata.uid.clone().unwrap_or_default(); + let template_hash = hash_object(&pc.parquet_cache_set_template()).unwrap(); + + fixture.reconcile(ns, pc.clone()).await.unwrap(); + + let parquet_cache_sets = fixture + .parquet_cache_sets + .all(ns) + .into_iter() + .filter(|pcs| pcs.owner_references().iter().any(|or| or.uid == uid)) + .collect::>(); + + assert_eq!(1, parquet_cache_sets.len()); + let pcs = &parquet_cache_sets[0]; + assert_eq!( + &template_hash, + pcs.metadata + .labels + .as_ref() + .and_then(|map| map.get(POD_TEMPLATE_HASH_LABEL)) + .unwrap() + ); + assert_eq!( + 5, + pcs.spec + .replica_suffixes + .as_ref() + .map(Vec::len) + .unwrap_or_default() 
+ ); + + let cm = fixture.config_maps.get(ns, name).unwrap(); + let current = cm.data.as_ref().unwrap().get("current").unwrap().clone(); + let next = cm.data.as_ref().unwrap().get("next").unwrap().clone(); + assert_eq!(current, next); + + let pcis = serde_json::from_str::(¤t).unwrap(); + assert_eq!(5, pcis.instances.len()); + } + + #[tokio::test] + async fn old_parquet_cache_set_removed() { + let ns = "old_parquet_cache_set_removed"; + let name = "parquet-cache"; + let fixture: Fixture = Default::default(); + let pc = fixture.parquet_caches.set(ns, name, Default::default()); + let pcs1_name = format!("{name}-aaaaaaaaaa"); + fixture.parquet_cache_sets.set( + ns, + &pcs1_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("aaaaaaaaaa"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([])), + }), + }, + ); + let pcs2_name = format!("{name}-bbbbbbbbbb"); + fixture.parquet_cache_sets.set( + ns, + &pcs2_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("bbbbbbbbbb"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![String::from("0")]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + format!("{pcs2_name}-0"), + Default::default(), + )])), + }), + }, + ); + let template_hash = hash_object(&pc.parquet_cache_set_template()).unwrap(); + let pcs3_name = format!("{name}-{template_hash}"); + fixture.parquet_cache_sets.set( + ns, + &pcs3_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + template_hash.clone(), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([])), + }), + }, + ); + + fixture.reconcile(ns, pc).await.unwrap(); + + assert!(fixture.parquet_cache_sets.get(ns, &pcs1_name).is_none()); + assert!(fixture.parquet_cache_sets.get(ns, &pcs2_name).is_some()); + assert!(fixture.parquet_cache_sets.get(ns, &pcs3_name).is_some()); + } + + #[tokio::test] + async fn warming_pods_retry_shortly() { + let ns = "warming_pods_retry_shortly"; + let name = "parquet-cache"; + let fixture: Fixture = Default::default(); + let pc = fixture.parquet_caches.set( + ns, + name, + ParquetCache { + status: Some(ParquetCacheStatus { + current: ParquetCacheInstanceSet { + revision: 1, + instances: vec![format!("{name}-aaaaaaaaaa-0")], + }, + next: ParquetCacheInstanceSet { + revision: 2, + instances: vec![format!("{name}-bbbbbbbbbb-0")], + }, + }), + ..Default::default() + }, + ); + + let pcs1_name = format!("{name}-aaaaaaaaaa"); + fixture.parquet_cache_sets.set( + ns, + &pcs1_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("aaaaaaaaaa"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![]), + ..Default::default() + }, + status: 
Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + format!("{name}-aaaaaaaaaa-0"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(600)) + .timestamp(), + current_node_set_revision: 1, + next_node_set_revision: 2, + }), + }, + )])), + }), + }, + ); + + let pcs2_name = format!("{name}-bbbbbbbbbb"); + fixture.parquet_cache_sets.set( + ns, + &pcs2_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("bbbbbbbbbb"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + format!("{name}-bbbbbbbbbb-0"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(30)) + .timestamp(), + current_node_set_revision: 1, + next_node_set_revision: 2, + }), + }, + )])), + }), + }, + ); + + assert_eq!(SHORT_WAIT, fixture.reconcile(ns, pc).await.unwrap()); + } + + #[tokio::test] + async fn warm_pods_update_status() { + let ns = "warm_pods_update_status"; + let name = "parquet-cache"; + let fixture: Fixture = Default::default(); + let pc = fixture.parquet_caches.set( + ns, + name, + ParquetCache { + status: Some(ParquetCacheStatus { + current: ParquetCacheInstanceSet { + revision: 1, + instances: vec![format!("{name}-aaaaaaaaaa-0")], + }, + next: ParquetCacheInstanceSet { + revision: 2, + instances: vec![format!("{name}-bbbbbbbbbb-0")], + }, + }), + ..Default::default() + }, + ); + + let pcs1_name = format!("{name}-aaaaaaaaaa"); + fixture.parquet_cache_sets.set( + ns, + &pcs1_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("aaaaaaaaaa"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + format!("{name}-aaaaaaaaaa-0"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(600)) + .timestamp(), + current_node_set_revision: 1, + next_node_set_revision: 2, + }), + }, + )])), + }), + }, + ); + + let pcs2_name = format!("{name}-bbbbbbbbbb"); + fixture.parquet_cache_sets.set( + ns, + &pcs2_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("bbbbbbbbbb"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + format!("{name}-bbbbbbbbbb-0"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(30)) + .timestamp(), + current_node_set_revision: 1, + next_node_set_revision: 2, + }), + }, + )])), + }), + }, + ); + + fixture.reconcile(ns, pc).await.unwrap(); + + let status = 
fixture + .parquet_caches + .get(ns, name) + .unwrap() + .status + .unwrap(); + assert_eq!(status.next, status.current); + } + + #[tokio::test] + async fn cooling_pods_retry_shortly() { + let ns = "cooling_pods_retry_shortly"; + let name = "parquet-cache"; + let fixture: Fixture = Default::default(); + let pc = fixture.parquet_caches.set( + ns, + name, + ParquetCache { + status: Some(ParquetCacheStatus { + current: ParquetCacheInstanceSet { + revision: 2, + instances: vec![format!("{name}-bbbbbbbbbb-0")], + }, + next: ParquetCacheInstanceSet { + revision: 2, + instances: vec![format!("{name}-bbbbbbbbbb-0")], + }, + }), + ..Default::default() + }, + ); + + let pcs1_name = format!("{name}-aaaaaaaaaa"); + fixture.parquet_cache_sets.set( + ns, + &pcs1_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("aaaaaaaaaa"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + format!("{name}-aaaaaaaaaa-0"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(600)) + .timestamp(), + current_node_set_revision: 2, + next_node_set_revision: 2, + }), + }, + )])), + }), + }, + ); + + let pcs2_name = format!("{name}-bbbbbbbbbb"); + fixture.parquet_cache_sets.set( + ns, + &pcs2_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("bbbbbbbbbb"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + format!("{name}-bbbbbbbbbb-0"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(30)) + .timestamp(), + current_node_set_revision: 2, + next_node_set_revision: 2, + }), + }, + )])), + }), + }, + ); + + assert_eq!(SHORT_WAIT, fixture.reconcile(ns, pc).await.unwrap()); + } + + #[tokio::test] + async fn cooled_pods_are_removed() { + let ns = "cooled_pods_are_removed"; + let name = "parquet-cache"; + let fixture: Fixture = Default::default(); + let pc = fixture.parquet_caches.set( + ns, + name, + ParquetCache { + status: Some(ParquetCacheStatus { + current: ParquetCacheInstanceSet { + revision: 2, + instances: vec![format!("{name}-bbbbbbbbbb-0")], + }, + next: ParquetCacheInstanceSet { + revision: 2, + instances: vec![format!("{name}-bbbbbbbbbb-0")], + }, + }), + ..Default::default() + }, + ); + + let pcs1_name = format!("{name}-aaaaaaaaaa"); + fixture.parquet_cache_sets.set( + ns, + &pcs1_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("aaaaaaaaaa"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + format!("{name}-aaaaaaaaaa-0"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + 
state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(600)) + .timestamp(), + current_node_set_revision: 2, + next_node_set_revision: 2, + }), + }, + )])), + }), + }, + ); + + let pcs2_name = format!("{name}-bbbbbbbbbb"); + fixture.parquet_cache_sets.set( + ns, + &pcs2_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("bbbbbbbbbb"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + format!("{name}-bbbbbbbbbb-0"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(400)) + .timestamp(), + current_node_set_revision: 2, + next_node_set_revision: 2, + }), + }, + )])), + }), + }, + ); + + fixture.reconcile(ns, pc).await.unwrap(); + + let pcs1 = fixture.parquet_cache_sets.get(ns, pcs1_name).unwrap(); + assert!( + pcs1.spec.replica_suffixes.is_none() || pcs1.spec.replica_suffixes.unwrap().is_empty() + ); + } + + #[tokio::test] + async fn resizing_recreates_everything() { + let ns = "resizing_recreates_everything"; + let name = "parquet-cache"; + let fixture: Fixture = Default::default(); + let pc = fixture.parquet_caches.set( + ns, + name, + ParquetCache { + spec: ParquetCacheSpec { + replicas: Some(2), + ..Default::default() + }, + status: Some(ParquetCacheStatus { + current: ParquetCacheInstanceSet { + revision: 1, + instances: vec![format!("{name}-aaaaaaaaaa-0")], + }, + next: ParquetCacheInstanceSet { + revision: 1, + instances: vec![format!("{name}-aaaaaaaaaa-0")], + }, + }), + ..Default::default() + }, + ); + + let pcs1_name = format!("{name}-aaaaaaaaaa"); + fixture.parquet_cache_sets.set( + ns, + &pcs1_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("aaaaaaaaaa"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![String::from("0")]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + format!("{name}-aaaaaaaaaa-0"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(600)) + .timestamp(), + current_node_set_revision: 1, + next_node_set_revision: 1, + }), + }, + )])), + }), + }, + ); + let template = pc.parquet_cache_set_template(); + let hash = hash_object(&template).unwrap(); + let pcs2_name = pc.parquet_cache_set_name(&hash).unwrap(); + + fixture.reconcile(ns, pc).await.unwrap(); + + let pcs1 = fixture.parquet_cache_sets.get(ns, pcs1_name).unwrap(); + assert!( + pcs1.spec.replica_suffixes.is_none() || pcs1.spec.replica_suffixes.unwrap().is_empty() + ); + let pcs2 = fixture.parquet_cache_sets.get(ns, pcs2_name).unwrap(); + assert_eq!(2, pcs2.spec.replica_suffixes.unwrap().len()) + } + + #[tokio::test] + async fn template_change_starts_migration() { + let ns = "template_change_starts_migration"; + let name = "parquet-cache"; + let fixture: Fixture = Default::default(); + let pc = fixture.parquet_caches.set( + ns, + name, + ParquetCache { + status: Some(ParquetCacheStatus { + 
current: ParquetCacheInstanceSet { + revision: 1, + instances: vec![format!("{name}-aaaaaaaaaa-0")], + }, + next: ParquetCacheInstanceSet { + revision: 1, + instances: vec![format!("{name}-aaaaaaaaaa-0")], + }, + }), + ..Default::default() + }, + ); + + let pcs1_name = format!("{name}-aaaaaaaaaa"); + fixture.parquet_cache_sets.set( + ns, + &pcs1_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("aaaaaaaaaa"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![String::from("0")]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + format!("{name}-aaaaaaaaaa-0"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(600)) + .timestamp(), + current_node_set_revision: 1, + next_node_set_revision: 1, + }), + }, + )])), + }), + }, + ); + let template = pc.parquet_cache_set_template(); + let hash = hash_object(&template).unwrap(); + let pcs2_name = pc.parquet_cache_set_name(&hash).unwrap(); + + fixture.reconcile(ns, pc).await.unwrap(); + + let pc = fixture.parquet_caches.get(ns, name).unwrap(); + let status = &pc.status.unwrap(); + assert!(status.current.revision < status.next.revision); + assert_eq!(format!("{name}-aaaaaaaaaa-0"), status.current.instances[0]); + assert_eq!(format!("{name}-{hash}-0"), status.next.instances[0]); + + let pcs1 = fixture.parquet_cache_sets.get(ns, pcs1_name).unwrap(); + assert_eq!(1, pcs1.spec.replica_suffixes.unwrap().len()); + let pcs2 = fixture.parquet_cache_sets.get(ns, pcs2_name).unwrap(); + assert_eq!(1, pcs2.spec.replica_suffixes.unwrap().len()) + } + + #[tokio::test] + async fn one_pod_migrated_at_a_time() { + let ns = "template_change_starts_migration"; + let name = "parquet-cache"; + let fixture: Fixture = Default::default(); + let pc = fixture.parquet_caches.set( + ns, + name, + ParquetCache { + spec: ParquetCacheSpec { + replicas: Some(3), + ..Default::default() + }, + ..Default::default() + }, + ); + + let pcs1_name = format!("{name}-aaaaaaaaaa"); + fixture.parquet_cache_sets.set( + ns, + &pcs1_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("aaaaaaaaaa"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![String::from("1"), String::from("2")]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([ + ( + format!("{name}-aaaaaaaaaa-1"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(1800)) + .timestamp(), + current_node_set_revision: 2, + next_node_set_revision: 2, + }), + }, + ), + ( + format!("{name}-aaaaaaaaaa-2"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(1800)) + .timestamp(), + current_node_set_revision: 2, + next_node_set_revision: 2, + }), + }, + ), + ])), + }), + }, + ); + let template = pc.parquet_cache_set_template(); + let hash = hash_object(&template).unwrap(); + let pcs2_name = 
pc.parquet_cache_set_name(&hash).unwrap(); + fixture.parquet_cache_sets.set( + ns, + &pcs2_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + hash.clone(), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![String::from("0")]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + format!("{name}-{hash}-0"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(600)) + .timestamp(), + current_node_set_revision: 2, + next_node_set_revision: 2, + }), + }, + )])), + }), + }, + ); + + let pc = fixture.parquet_caches.set( + ns, + name, + ParquetCache { + status: Some(ParquetCacheStatus { + current: ParquetCacheInstanceSet { + revision: 2, + instances: vec![ + format!("{name}-{hash}-0"), + format!("{name}-aaaaaaaaaa-1"), + format!("{name}-aaaaaaaaaa-2"), + ], + }, + next: ParquetCacheInstanceSet { + revision: 2, + instances: vec![ + format!("{name}-{hash}-0"), + format!("{name}-aaaaaaaaaa-1"), + format!("{name}-aaaaaaaaaa-2"), + ], + }, + }), + ..pc + }, + ); + + fixture.reconcile(ns, pc).await.unwrap(); + + let pc = fixture.parquet_caches.get(ns, name).unwrap(); + let status = &pc.status.unwrap(); + assert!(status.current.revision < status.next.revision); + assert_eq!(status.current.instances[0], status.next.instances[0]); + assert_eq!(format!("{name}-aaaaaaaaaa-1"), status.current.instances[1]); + assert_eq!(format!("{name}-{hash}-1"), status.next.instances[1]); + assert_eq!(status.current.instances[2], status.next.instances[2]); + + let pcs1 = fixture.parquet_cache_sets.get(ns, pcs1_name).unwrap(); + assert_eq!(2, pcs1.spec.replica_suffixes.unwrap().len()); + let pcs2 = fixture.parquet_cache_sets.get(ns, pcs2_name).unwrap(); + assert_eq!(2, pcs2.spec.replica_suffixes.unwrap().len()) + } + + #[derive(Debug, Default)] + struct Fixture { + pub config_maps: Arc>, + pub parquet_cache_sets: Arc>, + pub parquet_caches: Arc>, + } + + impl Fixture { + fn service(&self) -> Service { + let service = Service::new(); + service.add_handler(self.config_maps.as_handler()); + service.add_handler(self.parquet_cache_sets.as_handler()); + service.add_handler(self.parquet_caches.as_handler()); + service + } + + async fn reconcile( + &self, + ns: impl Into + Send, + pc: ParquetCache, + ) -> Result { + let service = self.service(); + let client = ClientBuilder::new(service, ns).build::(); + let mut controller = ParquetCacheController::new(pc, client); + let hnd = tokio::spawn(async move { controller.reconcile().await }); + hnd.await.unwrap() + } + } +} diff --git a/parquet_cache/src/controller/parquet_cache_set.rs b/parquet_cache/src/controller/parquet_cache_set.rs new file mode 100644 index 00000000000..bb172fa15bf --- /dev/null +++ b/parquet_cache/src/controller/parquet_cache_set.rs @@ -0,0 +1,75 @@ +use crate::data_types::{InstanceState, State}; +use k8s_openapi::api::core::v1::PodTemplateSpec; +use k8s_openapi::apimachinery::pkg::apis::meta::v1::LabelSelector; +use k8s_openapi::schemars::JsonSchema; +use kube::CustomResource; +use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; + +/// Specification of a ParquetCacheSet. 
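+///
+/// A ParquetCacheSet owns a fixed group of pods whose names are formed from the set name
+/// plus one of the replica suffixes, similar to the way a ReplicaSet tracks its pods.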
+#[derive(Debug, Default, Clone, CustomResource, Deserialize, Serialize, JsonSchema)] +#[kube( + kind = "ParquetCacheSet", + group = "iox.influxdata.com", + version = "v1alpha1", + namespaced +)] +#[kube(status = "ParquetCacheSetStatus")] +#[kube(derive = "Default")] +#[serde(rename_all = "camelCase")] +pub struct ParquetCacheSetSpec { + /// Suffixes for the pods required to be in the set. + pub replica_suffixes: Option>, + + /// Selector is a label query over pods that should match the replica + /// count. Label keys and values that must match in order to be controlled + /// by this parquet cache set. It must match the pod template's labels. + pub selector: LabelSelector, + + /// Port to connect to on the pod in order to enquire about the status of + /// the cache. + pub state_port: Option, + + /// Template is the object that describes the pod that will be created + /// if insufficient replicas are detected. + pub template: Option, +} + +/// Status of a ParquetCacheSet. +#[derive(Debug, Default, Deserialize, Serialize, Clone, JsonSchema)] +pub struct ParquetCacheSetStatus { + /// Status of the pods that form the set. + pub pods: Option>, +} + +#[derive(Debug, Default, Deserialize, Serialize, Clone, JsonSchema)] +pub struct InstanceStatus { + /// The phase the pod is in. + pub phase: Option, + + /// The state reported by the pod. This is only included if the pod is in the "Running" phase + /// and the state could be queried successfully. + pub state: Option, +} + +impl InstanceStatus { + /// Determine if the status represents a warming instance. + pub(super) fn is_warming(&self) -> bool { + match &self.phase { + None => false, + Some(phase) => match phase.as_str() { + "Running" => match &self.state { + None => true, + Some(state) => state.state == InstanceState::Warming, + }, + _ => false, + }, + } + } +} + +impl ParquetCacheSet { + pub(super) fn selectors(&self) -> Option { + super::kube_util::selectors(&self.spec.selector) + } +} diff --git a/parquet_cache/src/controller/parquet_cache_set_controller.rs b/parquet_cache/src/controller/parquet_cache_set_controller.rs new file mode 100644 index 00000000000..ad478bf1d10 --- /dev/null +++ b/parquet_cache/src/controller/parquet_cache_set_controller.rs @@ -0,0 +1,676 @@ +use super::{ + kube_util::owner_reference, parquet_cache_set::InstanceStatus, state_service, Error, + ParquetCacheSet, ParquetCacheSetStatus, Result, CONTROLLER_NAME, LONG_WAIT, + PARQUET_CACHE_REPLICAS_LABEL, POD_TEMPLATE_HASH_LABEL, SHORT_WAIT, +}; +use futures::StreamExt; +use k8s_openapi::api::core::v1::Pod; +use k8s_openapi::apimachinery::pkg::apis::meta::v1::{ObjectMeta, OwnerReference}; +use kube::api::{ListParams, PostParams}; +use kube::runtime::controller::Action; +use kube::runtime::watcher::Config; +use kube::runtime::Controller; +use kube::{Api, Client, ResourceExt}; +use observability_deps::tracing::{error, info}; +use std::collections::BTreeSet; +use std::fmt::Debug; +use std::sync::Arc; +use std::time::Duration; +use tokio::task::JoinHandle; + +/// Start a new controller task to reconcile [ParquetCacheSet] objects. 
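+///
+/// The controller watches ParquetCacheSet objects and the Pods they own that carry the
+/// pod-template-hash and replica-count labels, scoped to `ns` when a namespace is given.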
+pub fn spawn_controller(client: Client, ns: Option) -> JoinHandle> { + tokio::spawn(run_controller(client, ns)) +} + +async fn run_controller(client: Client, ns: Option) -> Result<(), kube::Error> { + let parquet_cache_set_api = match &ns { + Some(ns) => Api::::namespaced(client.clone(), ns), + None => Api::::all(client.clone()), + }; + let pod_api = match &ns { + Some(ns) => Api::::namespaced(client.clone(), ns), + None => Api::::all(client.clone()), + }; + + Controller::new(parquet_cache_set_api, Default::default()) + .owns( + pod_api, + Config::default().labels(&format!( + "{},{}", + PARQUET_CACHE_REPLICAS_LABEL, POD_TEMPLATE_HASH_LABEL + )), + ) + .run( + reconcile, + error_policy, + Arc::new(Context { + client, + state_service: Default::default(), + }), + ) + .for_each(|_| futures::future::ready(())) + .await; + Ok(()) +} + +async fn reconcile(obj: Arc, ctx: Arc) -> Result { + let namespace = obj.metadata.namespace.as_deref(); + let name = obj.name_any(); + info!(namespace, name, "reconcile request"); + let sleep = ParquetCacheSetController::new( + obj.as_ref().clone(), + ctx.state_service.clone(), + ctx.client.clone(), + ) + .reconcile() + .await?; + Ok(Action::requeue(sleep)) +} + +fn error_policy(_object: Arc, err: &Error, _ctx: Arc) -> Action { + // TODO add exponential backoff + let sleep = Duration::from_secs(5); + error!( + err = err as &dyn std::error::Error, + "reconcile failed, requeue in {:?}", sleep + ); + Action::requeue(sleep) +} + +/// Context used when reconciling [ParquetCacheSet] objects. +struct Context { + client: Client, + state_service: state_service::Client, +} + +/// Controller for the ParquetCacheSet custom resource. This controller maintains the set of pods +/// created by a ParquetCacheSet. +#[derive(Debug)] +struct ParquetCacheSetController { + parquet_cache_set_api: Api, + pod_api: Api, + state_service: state_service::Client, + + parquet_cache_set: ParquetCacheSet, +} + +impl ParquetCacheSetController { + /// Create a new ParquetCacheSetController instance for the provided [ParquetCacheSet] + /// and [Client]. + fn new( + parquet_cache_set: ParquetCacheSet, + state_service: state_service::Client, + client: Client, + ) -> Self { + let ns = parquet_cache_set.metadata.namespace.as_ref().unwrap(); + let parquet_cache_sets: Api = Api::namespaced(client.clone(), ns); + let pods: Api = Api::namespaced(client.clone(), ns); + + Self { + parquet_cache_set_api: parquet_cache_sets, + pod_api: pods, + state_service, + parquet_cache_set, + } + } + + /// Perform the business logic required to move the ParquetCacheSet state forward towards the + /// desired state. + async fn reconcile(&mut self) -> Result { + let duration = self.reconcile_inner().await?; + + // Ensure the status is always kept up-to-date. 
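+        // The status subresource is replaced even when reconcile_inner made no changes, so
+        // the recorded pod phases and cache states always reflect the latest observation.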
+ self.parquet_cache_set_api + .replace_status( + self.parquet_cache_set.metadata.name.as_ref().unwrap(), + &Default::default(), + serde_json::to_vec(&self.parquet_cache_set)?, + ) + .await?; + Ok(duration) + } + + async fn reconcile_inner(&mut self) -> Result { + let prefix = self.parquet_cache_set.metadata.name.as_ref().unwrap(); + let pod_names = self + .parquet_cache_set + .spec + .replica_suffixes + .as_ref() + .map_or_else(BTreeSet::new, |v| { + v.iter() + .map(|suffix| format!("{prefix}-{suffix}")) + .collect::>() + }); + + self.delete_removed(&pod_names).await?; + self.create_missing(&pod_names).await?; + self.update_status(&pod_names).await?; + + let warming = self + .status_mut() + .pods + .as_ref() + .map(|pods| pods.iter().any(|(_, status)| status.is_warming())) + .unwrap_or(false); + + // If there are cache pods in the warming state then check them in a minute, otherwise wait + // for an hour, or for a state change. + Ok(if warming { SHORT_WAIT } else { LONG_WAIT }) + } + + async fn delete_removed(&mut self, pod_names: &BTreeSet) -> Result<()> { + let pods = self + .pod_api + .list(&ListParams { + label_selector: self.parquet_cache_set.selectors(), + ..Default::default() + }) + .await?; + let to_delete = pods + .iter() + .filter_map(|pod| pod.metadata.name.as_ref()) + .filter(|&name| !pod_names.contains(name)) + .collect::>(); + + for pod_name in to_delete { + info!(name = pod_name, "Deleting Pod"); + self.pod_api.delete(pod_name, &Default::default()).await?; + } + Ok(()) + } + + async fn create_missing(&mut self, pods: &BTreeSet) -> Result<()> { + for pod in pods { + if !self.pod_exists(pod).await? { + info!(name = pod, "Creating Pod"); + self.create_pod(pod.clone()).await?; + } + } + Ok(()) + } + + async fn update_status(&mut self, pod_names: &BTreeSet) -> Result<()> { + if let Some(pods) = self.status_mut().pods.as_mut() { + pods.clear(); + } + for name in pod_names { + let pod = self.pod_api.get_status(name).await?; + let phase = pod.status.clone().and_then(|status| status.phase); + let state = match phase.as_deref() { + Some("Running") => { + self.state_service + .state(&pod, &self.parquet_cache_set.spec.state_port) + .await? + } + _ => None, + }; + self.status_mut() + .pods + .get_or_insert(Default::default()) + .insert(name.clone(), InstanceStatus { phase, state }); + } + Ok(()) + } + + async fn pod_exists(&self, name: &str) -> Result { + match self.pod_api.get(name).await { + Ok(_) => Ok(true), + Err(kube::Error::Api(error_response)) if error_response.reason == "NotFound" => { + Ok(false) + } + Err(error) => Err(Error::from(error)), + } + } + + /// Create a new data cache instance pod. + async fn create_pod(&self, name: String) -> Result { + let template = self + .parquet_cache_set + .spec + .template + .clone() + .unwrap_or_default(); + let pod = Pod { + metadata: ObjectMeta { + namespace: self.parquet_cache_set.metadata.namespace.clone(), + name: Some(name), + owner_references: Some(vec![self.owner_reference()]), + ..template.metadata.unwrap_or_default() + }, + spec: template.spec, + ..Default::default() + }; + Ok(self + .pod_api + .create( + &PostParams { + dry_run: false, + field_manager: Some(CONTROLLER_NAME.to_string()), + }, + &pod, + ) + .await?) + } + + /// Generate an owner reference for the current ParquetCacheSet document. 
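+    ///
+    /// Pods created by `create_pod` carry this reference, so deleting the ParquetCacheSet
+    /// also removes the pods it created.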
+ fn owner_reference(&self) -> OwnerReference { + owner_reference(&self.parquet_cache_set) + } + + fn status_mut(&mut self) -> &mut ParquetCacheSetStatus { + self.parquet_cache_set + .status + .get_or_insert(Default::default()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::controller::state_service::Request; + use crate::controller::{ParquetCacheSet, ParquetCacheSetSpec}; + use crate::data_types::{InstanceState, State}; + use hyper::Body; + use k8s_openapi::api::core::v1::{Pod, PodSpec, PodTemplateSpec}; + use kube::client::ClientBuilder; + use kube_test::{AsHandler, ResourceHandler, Service}; + use std::collections::BTreeMap; + use std::sync::Arc; + use std::task::{Context, Poll}; + + #[tokio::test] + async fn create_pods() { + let ns = "create_pods"; + let name = "parquet-cache-aaaaaaaaaa"; + + let fixture: Fixture = Default::default(); + + let pcs = fixture.parquet_cache_sets.set( + ns, + name, + ParquetCacheSet { + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![String::from("0"), String::from("1")]), + template: Some(PodTemplateSpec { + spec: Some(PodSpec { + priority: Some(2), + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }, + ..Default::default() + }, + ); + + fixture.reconcile(ns, pcs.clone()).await.unwrap(); + + let pods = fixture.pods.all(ns); + assert_eq!(2, pods.len()); + + let mut pod_names = pods + .iter() + .map(|pod| pod.metadata.name.as_ref().unwrap().clone()) + .collect::>(); + pod_names.sort(); + assert_eq!( + &vec!["parquet-cache-aaaaaaaaaa-0", "parquet-cache-aaaaaaaaaa-1"], + &pod_names + ); + + // Make sure the provided template has been used, and the pods are owned by the + // ParquetCacheSet. + for pod in &pods { + assert_eq!(2, pod.spec.as_ref().unwrap().priority.unwrap()); + assert_eq!( + owner_reference(&pcs), + pod.metadata.owner_references.as_ref().unwrap()[0].clone() + ); + } + } + + #[tokio::test] + async fn remove_pods() { + let ns = "remove_pods"; + let name = "parquet-cache-aaaaaaaaaa"; + + let fixture: Fixture = Default::default(); + + let pod0_name = format!("{name}-0"); + let pod1_name = format!("{name}-1"); + let pod2_name = format!("{name}-2"); + + let pcs = fixture.parquet_cache_sets.set( + ns, + name, + ParquetCacheSet { + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![String::from("1"), String::from("2")]), + template: Some(PodTemplateSpec { + spec: Some(PodSpec { + priority: Some(2), + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([ + ( + pod0_name.clone(), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + ..Default::default() + }), + }, + ), + ( + pod1_name.clone(), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + ..Default::default() + }), + }, + ), + ( + pod2_name.clone(), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + ..Default::default() + }), + }, + ), + ])), + }), + ..Default::default() + }, + ); + + fixture.pods.set( + ns, + &pod0_name, + Pod { + metadata: ObjectMeta { + owner_references: Some(vec![owner_reference(&pcs)]), + ..Default::default() + }, + ..Default::default() + }, + ); + + fixture.pods.set( + ns, + &pod1_name, + Pod { + metadata: ObjectMeta { + owner_references: Some(vec![owner_reference(&pcs)]), + ..Default::default() + }, + ..Default::default() + 
}, + ); + + fixture.pods.set( + ns, + &pod2_name, + Pod { + metadata: ObjectMeta { + owner_references: Some(vec![owner_reference(&pcs)]), + ..Default::default() + }, + ..Default::default() + }, + ); + + fixture.reconcile(ns, pcs).await.unwrap(); + + let pods = fixture.pods.all(ns); + assert_eq!(2, pods.len()); + + let mut pod_names = pods + .iter() + .map(|pod| pod.metadata.name.as_ref().unwrap().clone()) + .collect::>(); + pod_names.sort(); + assert_eq!(vec![pod1_name.clone(), pod2_name.clone()], pod_names); + } + + #[tokio::test] + async fn warming_pods_refresh_shortly() { + let ns = "warming_pods_refresh_shortly"; + let name = "parquet-cache-aaaaaaaaaa"; + + let mut fixture: Fixture = Default::default(); + + let pod0_name = format!("{name}-0"); + + let pcs = fixture.parquet_cache_sets.set( + ns, + name, + ParquetCacheSet { + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![String::from("0")]), + template: Some(PodTemplateSpec { + spec: Some(PodSpec { + priority: Some(2), + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + pod0_name.clone(), + InstanceStatus { + phase: Some(String::from("Pending")), + state: None, + }, + )])), + }), + ..Default::default() + }, + ); + + fixture.pods.set( + ns, + &pod0_name, + Pod { + metadata: ObjectMeta { + owner_references: Some(vec![owner_reference(&pcs)]), + ..Default::default() + }, + status: Some(k8s_openapi::api::core::v1::PodStatus { + phase: Some(String::from("Running")), + ..Default::default() + }), + ..Default::default() + }, + ); + + fixture.instance_state.insert( + pod0_name.clone(), + State { + state: InstanceState::Warming, + ..Default::default() + }, + ); + + assert_eq!(SHORT_WAIT, fixture.reconcile(ns, pcs).await.unwrap()); + + let pcs = fixture.parquet_cache_sets.get(ns, name).unwrap(); + assert_eq!( + "Running", + pcs.status + .as_ref() + .unwrap() + .pods + .as_ref() + .unwrap() + .get(&pod0_name) + .unwrap() + .phase + .as_deref() + .unwrap() + ); + assert_eq!( + InstanceState::Warming, + pcs.status + .as_ref() + .unwrap() + .pods + .as_ref() + .unwrap() + .get(&pod0_name) + .unwrap() + .state + .as_ref() + .unwrap() + .state + ); + } + + #[tokio::test] + async fn no_status_pods_refresh_shortly() { + let ns = "no_status_pods_refresh_shortly"; + let name = "parquet-cache-aaaaaaaaaa"; + + let fixture: Fixture = Default::default(); + + let pod0_name = format!("{name}-0"); + + let pcs = fixture.parquet_cache_sets.set( + ns, + name, + ParquetCacheSet { + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![String::from("0")]), + template: Some(PodTemplateSpec { + spec: Some(PodSpec { + priority: Some(2), + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + pod0_name.clone(), + InstanceStatus { + phase: Some(String::from("Pending")), + state: None, + }, + )])), + }), + ..Default::default() + }, + ); + + fixture.pods.set( + ns, + &pod0_name, + Pod { + metadata: ObjectMeta { + owner_references: Some(vec![owner_reference(&pcs)]), + ..Default::default() + }, + status: Some(k8s_openapi::api::core::v1::PodStatus { + phase: Some(String::from("Running")), + ..Default::default() + }), + ..Default::default() + }, + ); + + assert_eq!(SHORT_WAIT, fixture.reconcile(ns, pcs).await.unwrap()); + + let pcs = fixture.parquet_cache_sets.get(ns, name).unwrap(); + assert_eq!( + "Running", + pcs.status + .as_ref() + .unwrap() + .pods + 
.as_ref() + .unwrap() + .get(&pod0_name) + .unwrap() + .phase + .as_deref() + .unwrap() + ); + assert!(&pcs + .status + .unwrap() + .pods + .unwrap() + .get(&pod0_name) + .unwrap() + .state + .is_none()); + } + + #[derive(Debug, Default)] + struct Fixture { + pub parquet_cache_sets: Arc>, + pub pods: Arc>, + pub instance_state: BTreeMap, + } + + impl Fixture { + fn service(&self) -> Service { + let service = Service::new(); + service.add_handler(self.parquet_cache_sets.as_handler()); + service.add_handler(self.pods.as_handler()); + service + } + + async fn reconcile( + &self, + ns: impl Into + Send, + pcs: ParquetCacheSet, + ) -> Result { + let service = self.service(); + let client = ClientBuilder::new(service, ns).build::(); + let state_service_client = + state_service::Client::new(StateService(self.instance_state.clone())); + let mut controller = ParquetCacheSetController::new(pcs, state_service_client, client); + let hnd = tokio::spawn(async move { controller.reconcile().await }); + hnd.await.unwrap() + } + } + + #[derive(Debug, Clone)] + struct StateService(BTreeMap); + + impl tower::Service for StateService { + type Response = Option; + type Error = Box; + type Future = std::future::Ready>; + + fn poll_ready( + &mut self, + _cx: &mut Context<'_>, + ) -> Poll> { + Poll::Ready(Ok(())) + } + + fn call(&mut self, req: Request) -> Self::Future { + std::future::ready(Ok(self + .0 + .get(req.pod.metadata.name.as_deref().unwrap_or_default()) + .cloned())) + } + } +} diff --git a/parquet_cache/src/controller/state_service.rs b/parquet_cache/src/controller/state_service.rs new file mode 100644 index 00000000000..847d3b887cf --- /dev/null +++ b/parquet_cache/src/controller/state_service.rs @@ -0,0 +1,109 @@ +use super::{Error, Result}; +use crate::data_types::State; +use hyper::service::Service; +use k8s_openapi::api::core::v1::Pod; +use observability_deps::tracing::debug; +use std::fmt::{Debug, Formatter}; +use std::future::{poll_fn, Future}; +use std::pin::Pin; +use std::task::{Context, Poll}; +use tower::buffer::Buffer; +use tower::util::BoxService; +use tower::{BoxError, ServiceExt}; + +#[derive(Debug, Clone)] +pub struct Request { + pub pod: Pod, + pub port: Option, +} + +#[derive(Clone)] +pub struct Client { + inner: Buffer, BoxError>, Request>, +} + +impl Client { + pub fn new(svc: S) -> Self + where + S: Service> + Clone + Send + 'static, + S::Error: Into + Send + Sync, + S::Future: Future, S::Error>> + Send + 'static, + { + Self { + inner: Buffer::new(BoxService::new(svc.map_err(|e| e.into())), 1024), + } + } + + pub async fn state(&mut self, pod: &Pod, port: &Option) -> Result> { + let request = Request { + pod: pod.clone(), + port: port.clone(), + }; + poll_fn(|cx| (self.inner.poll_ready(cx))) + .await + .map_err(Error::NodeStateError)?; + self.inner + .call(request) + .await + .map_err(Error::NodeStateError) + } +} + +impl Debug for Client { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "pod state service client") + } +} + +impl Default for Client { + fn default() -> Self { + Self::new(ReqwestClient {}) + } +} + +#[derive(Debug, Clone)] +struct ReqwestClient {} + +impl Service for ReqwestClient { + type Response = Option; + type Error = reqwest::Error; + type Future = Pin> + Send>>; + + fn poll_ready(&mut self, _cx: &mut Context<'_>) -> Poll> { + Poll::Ready(Ok(())) + } + + fn call(&mut self, req: Request) -> Self::Future { + let fut = async { + let url = req + .pod + .status + .and_then(|status| status.pod_ip) + .map(|ip_addr| match req.port { + 
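// Illustration only (the address and port are placeholder values): with a pod IP of
// "10.1.2.3", the match below yields "http://10.1.2.3:8080/state" when a port of 8080 is
// configured, and "http://10.1.2.3/state" when no port is set.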
+                Some(port) => format!("http://{ip_addr}:{port}/state"),
+                None => format!("http://{ip_addr}/state"),
+            });
+        debug!(url, "Getting pod state");
+        if let Some(url) = url {
+            let response = match reqwest::get(url).await {
+                Ok(response) => Some(response.json().await?),
+                Err(error) => {
+                    debug!(
+                        error = &error as &dyn std::error::Error,
+                        "Error getting state"
+                    );
+                    if error.is_connect() {
+                        None
+                    } else {
+                        return Err(error);
+                    }
+                }
+            };
+            Ok(response)
+        } else {
+            Ok(None)
+        }
+    };
+    Box::pin(fut)
+    }
+}
diff --git a/parquet_cache/src/data_types.rs b/parquet_cache/src/data_types.rs
new file mode 100644
index 00000000000..f69dd60c321
--- /dev/null
+++ b/parquet_cache/src/data_types.rs
@@ -0,0 +1,12 @@
+//! Contains the datatypes to be shared across the data cache server and client.
+
+mod keyspace;
+pub use keyspace::*;
+mod objects;
+pub use objects::*;
+mod policy;
+pub use policy::*;
+mod state;
+pub use state::*;
+mod write_hints;
+pub use write_hints::*;
diff --git a/parquet_cache/src/data_types/keyspace.rs b/parquet_cache/src/data_types/keyspace.rs
new file mode 100644
index 00000000000..faecda8fed4
--- /dev/null
+++ b/parquet_cache/src/data_types/keyspace.rs
@@ -0,0 +1,164 @@
+use crate::data_types::State;
+use k8s_openapi::schemars::JsonSchema;
+use serde::{Deserialize, Serialize, Serializer};
+use std::time::{SystemTime, UNIX_EPOCH};
+
+use super::state::InstanceState;
+
+/// Response body for keyspace request.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct KeyspaceResponseBody {
+    /// Complete list of nodes for the hashring assignment of keyspace.
+    pub nodes: Vec<ServiceNode>,
+}
+
+/// Identifier used by a data cache node.
+///
+/// This identifier should remain consistent for any nodes being cycled (e.g. k8s),
+/// as it determines the location in the hashring.
+pub type ServiceNodeId = u64;
+
+/// Hostname of a data cache node.
+pub type ServiceNodeHostname = String;
+
+/// Data cache service node.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ServiceNode {
+    /// Id of data cache service node.
+    pub id: ServiceNodeId,
+    /// Hostname.
+    pub hostname: ServiceNodeHostname,
+}
+
+/// The set of instances that form a parquet cache group.
+#[derive(Debug, Clone, Default, PartialEq, Deserialize, Serialize, JsonSchema)]
+pub struct ParquetCacheInstanceSet {
+    /// The revision number of the cache instance set.
+    pub revision: i64,
+
+    /// The set of instances that form the cache set.
+    pub instances: Vec<ServiceNodeHostname>,
+}
+
+impl ParquetCacheInstanceSet {
+    /// Returns true if the given hostname is a member of this instance set.
+    pub fn contains(&self, node_hostname: &ServiceNodeHostname) -> bool {
+        self.instances.contains(node_hostname)
+    }
+}
+
+// TODO: make on-disc and in-mem representations match!
+/// Converts an on-disc representation of the keyspace from the controller
+/// into the keyspace representation consumed by the cache client & server.
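// For a concrete illustration of the conversion below (hostnames and revision are placeholder
// values): an on-disc instance set such as
//
//     ParquetCacheInstanceSet { revision: 3, instances: vec!["host-a".into(), "host-b".into()] }
//
// becomes
//
//     KeyspaceResponseBody {
//         nodes: vec![
//             ServiceNode { id: 0, hostname: "host-a".into() },
//             ServiceNode { id: 1, hostname: "host-b".into() },
//         ],
//     }
//
// i.e. a node's id is simply its index in the configured instance list, and the revision is not
// carried over into the keyspace response.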
+impl From<&ParquetCacheInstanceSet> for KeyspaceResponseBody { + fn from(value: &ParquetCacheInstanceSet) -> Self { + Self { + nodes: value + .clone() + .instances + .into_iter() + .enumerate() + .map(|(id, hostname)| ServiceNode { + id: id as u64, + hostname, + }) + .collect(), + } + } +} + +impl From<&KeyspaceVersion> for InstanceState { + fn from(value: &KeyspaceVersion) -> Self { + match (&value.current, &value.next) { + (Some(current), Some(next)) => { + match ( + current.contains(&value.self_node), + next.contains(&value.self_node), + ) { + (false, true) => Self::Warming, + (true, true) => Self::Running, + (true, false) => Self::Cooling, + (false, false) => Self::Cooling, + } + } + (None, Some(next)) if next.contains(&value.self_node) => Self::Warming, + (Some(_), None) => unreachable!("next should always be set, if curr exists"), + _ => Self::Pending, + } + } +} + +/// Tracker of Keyspace version changes. +/// +/// The response of `GET /state` is the serialized version of this struct. +#[derive(Clone, Debug)] +pub struct KeyspaceVersion { + /// Hostname of node, in order to identify self in [`ParquetCacheInstanceSet`]. + /// + /// Does not change. + self_node: ServiceNodeHostname, + /// current ParquetCacheInstanceSet + pub current: Option, + /// next ParquetCacheInstanceSet + pub next: Option, + /// time that the service was last updated + pub changed: SystemTime, +} + +impl Serialize for KeyspaceVersion { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let state = State { + state: InstanceState::from(self), + state_changed: self + .changed + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs() as i64, + current_node_set_revision: self.current.as_ref().map(|pcis| pcis.revision).unwrap_or(0), + next_node_set_revision: self.next.as_ref().map(|pcis| pcis.revision).unwrap_or(0), + }; + state.serialize(serializer) + } +} + +impl KeyspaceVersion { + /// Initialize the KeyspaceVersion with only the hostname (config option) known. + pub fn new(self_node: ServiceNodeHostname) -> Self { + Self { + self_node, + current: None, + next: None, + changed: SystemTime::now(), + } + } + + /// Get hostname. + pub fn hostname(&self) -> &ServiceNodeHostname { + &self.self_node + } + + /// Duplicate the `next` to `current`. + /// + /// This method is tightly coupled to the definition of InstanceState::from(KeyspaceVersion). + pub fn clone_next_to_curr(&self) -> Self { + Self { + self_node: self.self_node.clone(), + current: self.next.clone(), + next: self.next.clone(), + changed: SystemTime::now(), + } + } + + /// Set next. + pub fn set_next(&self, next: ParquetCacheInstanceSet) -> Self { + Self { + self_node: self.self_node.clone(), + current: self.next.clone(), // increment forward + next: Some(next), + changed: SystemTime::now(), + } + } +} diff --git a/parquet_cache/src/data_types/objects.rs b/parquet_cache/src/data_types/objects.rs new file mode 100644 index 00000000000..55555698313 --- /dev/null +++ b/parquet_cache/src/data_types/objects.rs @@ -0,0 +1,79 @@ +use hyper::{header::HeaderValue, HeaderMap}; +use serde::{Deserialize, Serialize}; + +use crate::client::cache_connector::Error as CacheServerError; + +pub static X_RANGE_START_HEADER: &str = "x-object-range-start"; +pub static X_RANGE_END_HEADER: &str = "x-object-range-end"; + +pub fn extract_usize_header( + header: &'static str, + values: &HeaderMap, +) -> Result { + let val = values + .get(header) + .ok_or(CacheServerError::ReadData(format!( + "missing header {}", + header + )))? 
+ .to_str() + .map_err(|_| CacheServerError::ReadData(format!("missing {} header", header)))?; + + val.parse::() + .map_err(|_| CacheServerError::ReadData(format!("invalid {} header", header))) +} + +/// Metadata for object. +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub struct GetObjectMetaResponse { + /// The full path to the object + pub location: String, + /// The last modified time + pub last_modified: chrono::DateTime, + /// The size in bytes of the object + pub size: usize, + /// The unique identifier for the object + pub e_tag: Option, + /// A version indicator for this object + pub version: Option, +} + +impl From for object_store::ObjectMeta { + fn from(value: GetObjectMetaResponse) -> Self { + let GetObjectMetaResponse { + location, + last_modified, + size, + e_tag, + version, + } = value; + + Self { + location: object_store::path::Path::parse(location).expect("should be valid path"), + last_modified, + size, + e_tag, + version, + } + } +} + +impl From for GetObjectMetaResponse { + fn from(value: object_store::ObjectMeta) -> Self { + let object_store::ObjectMeta { + location, + last_modified, + size, + e_tag, + version, + } = value; + + Self { + location: location.to_string(), + last_modified, + size, + e_tag, + version, + } + } +} diff --git a/parquet_cache/src/data_types/policy.rs b/parquet_cache/src/data_types/policy.rs new file mode 100644 index 00000000000..da13b6be98a --- /dev/null +++ b/parquet_cache/src/data_types/policy.rs @@ -0,0 +1,17 @@ +use serde::{Deserialize, Serialize}; + +/// TODO: clap_blocks +#[derive(Debug, Default, Clone, Copy)] +pub struct PolicyConfig { + pub max_capacity: u64, + pub event_recency_max_duration_nanoseconds: u64, +} + +#[derive(Debug, Serialize, Deserialize, Clone, Copy, Default)] +pub struct ObjectParams { + pub namespace_id: i64, + pub table_id: i64, + pub min_time: i64, + pub max_time: i64, + pub file_size_bytes: i64, +} diff --git a/parquet_cache/src/data_types/state.rs b/parquet_cache/src/data_types/state.rs new file mode 100644 index 00000000000..9afe54e8740 --- /dev/null +++ b/parquet_cache/src/data_types/state.rs @@ -0,0 +1,52 @@ +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use std::fmt::{Display, Formatter}; + +#[derive(Debug, Default, Clone, PartialEq, Deserialize, Copy, Serialize, JsonSchema)] +pub struct State { + /// The current state of the cache node. + pub state: InstanceState, + + /// Timestamp (seconds from unix epoch) that the state last changed. + pub state_changed: i64, + + /// The revision number of the current node set known to the cache node. + pub current_node_set_revision: i64, + + /// The revision number of the next node set known to the cache node. + pub next_node_set_revision: i64, +} + +#[derive(Debug, Deserialize, Serialize, PartialEq, Eq, Default, Copy, Clone, JsonSchema)] +pub enum InstanceState { + #[default] + /// Default state, prior to loading any configmap keyspace. + #[serde(rename = "pending")] + Pending, + /// Have configmap, are warming, and not receiving traffic. + /// + /// Can still respond to `GET /state` requests (from controller). + #[serde(rename = "warming")] + Warming, + /// Ready for traffic. + /// + /// Includes own host in `GET /keyspace` responses. + #[serde(rename = "running")] + Running, + /// Response to `GET /keyspace` requests are now directing traffic elsewhere. + /// + /// May still have ongoing requests. 
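// Given the serde renames on this enum and the field names on `State` above, a serialized
// state (the numeric values here are placeholders) looks roughly like:
//
//     {"state":"cooling","state_changed":1706201497,"current_node_set_revision":4,"next_node_set_revision":5}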
+ #[serde(rename = "cooling")] + Cooling, +} + +impl Display for InstanceState { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Pending => write!(f, "pending"), + Self::Warming => write!(f, "warming"), + Self::Running => write!(f, "running"), + Self::Cooling => write!(f, "cooling"), + } + } +} diff --git a/parquet_cache/src/data_types/write_hints.rs b/parquet_cache/src/data_types/write_hints.rs new file mode 100644 index 00000000000..fdff107a5fc --- /dev/null +++ b/parquet_cache/src/data_types/write_hints.rs @@ -0,0 +1,81 @@ +use data_types::{ParquetFile, ParquetFileParams}; +use serde::{Deserialize, Serialize}; + +use super::ObjectParams; + +/// Request payload provided on WriteHinting. +#[derive(Debug, Serialize, Deserialize)] +pub struct WriteHintRequestBody { + /// Object store [`Path`](object_store::path::Path) converted to cache key. + pub location: String, + /// The actual [`WriteHint`]. + pub hint: WriteHint, + /// Requested server contract to fulfill prior to ACK. + pub ack_setting: WriteHintAck, +} + +/// DataCache is a read-only, write-hinting service. +/// +/// Cache writes to store, then hints to pull into cache. +/// Return ok based upon a configurable level of cache server ack. +#[derive(Debug, Default, Copy, Clone, Serialize, Deserialize)] +pub enum WriteHintAck { + /// cache client sent write hint + Sent, + /// cache server received write hint + Received, + /// cache server completed downstream action + #[default] + Completed, +} + +impl std::fmt::Display for WriteHintAck { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self) + } +} + +/// Write hint metadata provided by the client. +pub type WriteHint = ObjectParams; + +impl From<&ParquetFileParams> for WriteHint { + fn from(value: &ParquetFileParams) -> Self { + let ParquetFileParams { + namespace_id, + table_id, + min_time, + max_time, + file_size_bytes, + .. + } = value; + + Self { + namespace_id: namespace_id.get(), + table_id: table_id.get(), + min_time: min_time.get(), + max_time: max_time.get(), + file_size_bytes: file_size_bytes.to_owned(), + } + } +} + +impl From<&ParquetFile> for WriteHint { + fn from(value: &ParquetFile) -> Self { + let ParquetFile { + namespace_id, + table_id, + min_time, + max_time, + file_size_bytes, + .. + } = value; + + Self { + namespace_id: namespace_id.get(), + table_id: table_id.get(), + min_time: min_time.get(), + max_time: max_time.get(), + file_size_bytes: file_size_bytes.to_owned(), + } + } +} diff --git a/parquet_cache/src/lib.rs b/parquet_cache/src/lib.rs new file mode 100644 index 00000000000..b4f4d5d1b77 --- /dev/null +++ b/parquet_cache/src/lib.rs @@ -0,0 +1,51 @@ +//! IOx parquet cache client. +//! +//! ParquetCache client interface to be used by IOx components to +//! get and put parquet files into the cache. + +#![deny(rustdoc::broken_intra_doc_links, rust_2018_idioms)] +#![warn( + missing_copy_implementations, + missing_docs, + clippy::explicit_iter_loop, + // See https://github.com/influxdata/influxdb_iox/pull/1671 + clippy::future_not_send, + clippy::use_self, + clippy::clone_on_ref_ptr, + clippy::todo, + clippy::dbg_macro, + unused_crate_dependencies +)] +#![allow(rustdoc::private_intra_doc_links, unreachable_pub)] + +// Workaround for "unused crate" lint false positives. 
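// A minimal usage sketch of the `make_client` constructor defined further down in this file
// (the in-memory store and service address are assumed example values):
//
//     use object_store::memory::InMemory;
//
//     let direct_store: Arc<dyn ObjectStore> = Arc::new(InMemory::new());
//     let cached_store = make_client("http://parquet-cache.iox.svc:8080".to_string(), direct_store);
//
// The returned handle wraps the provided direct store with the cache-aware
// `DataCacheObjectStore`, so reads can consult the cache service and fall back to the direct
// store on a miss.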
+use workspace_hack as _; + +mod client; +pub use client::{cache_connector::Error, write_hints::WriteHintingObjectStore}; + +pub mod controller; + +pub(crate) mod data_types; + +mod server; +#[cfg(test)] +pub use server::mock::MockCacheServer; +pub use server::{build_cache_server, ParquetCacheServer, ParquetCacheServerConfig, ServerError}; + +use object_store::ObjectStore; +use std::sync::Arc; + +use crate::client::{cache_connector::build_cache_connector, object_store::DataCacheObjectStore}; + +// TODO: change this to `Arc` +// and have consumers (e.g. ingester, compactor) issue write-hints. +// +/// Build a cache client. +pub fn make_client( + namespace_service_address: String, + object_store: Arc, +) -> Arc { + let server_connection = build_cache_connector(namespace_service_address); + Arc::new(DataCacheObjectStore::new(server_connection, object_store)) +} diff --git a/parquet_cache/src/server.rs b/parquet_cache/src/server.rs new file mode 100644 index 00000000000..0f5308b3f82 --- /dev/null +++ b/parquet_cache/src/server.rs @@ -0,0 +1,482 @@ +#![allow(dead_code)] +//! Contains the cache server. + +use std::sync::Arc; + +use iox_catalog::interface::Catalog; +use object_store::ObjectStore; +use tower::ServiceBuilder; + +use crate::data_types::PolicyConfig; + +use self::{ + cache::{BuildCacheService, CacheService}, + data::DataService, + keyspace::{BuildKeyspaceService, KeyspaceService}, + precondition::{BuildPreconditionService, PreconditionService}, +}; + +// Layers in the cache server: +mod cache; +mod data; +mod keyspace; +mod precondition; + +// Shared server types: +mod error; +pub use error::Error as ServerError; +mod response; + +#[cfg(test)] +pub(crate) mod mock; + +/// The cache server type. +pub type ParquetCacheServer = CacheService>>; + +/// Config for cache server. +#[derive(Debug)] +pub struct ParquetCacheServerConfig { + /// The path to the config file for the keyspace. + pub keyspace_config_path: String, + /// The hostname of the cache instance (k8s pod) running this process. + pub hostname: String, + /// The local directory to store data. + pub local_dir: String, + /// The policy config for the cache eviction. + pub policy_config: PolicyConfig, +} + +/// Build a cache server. 
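// Construction sketch (the paths, hostname, and in-scope `direct_store`/`catalog` values are
// assumptions for illustration):
//
//     let config = ParquetCacheServerConfig {
//         keyspace_config_path: "/etc/parquet-cache/keyspace.json".to_string(),
//         hostname: "parquet-cache-0".to_string(),
//         local_dir: "/var/lib/parquet-cache".to_string(),
//         policy_config: PolicyConfig::default(),
//     };
//     let server = build_cache_server(config, direct_store, catalog).await;
//
// The resulting tower stack is CacheService -> KeyspaceService -> PreconditionService ->
// DataService, matching the layer comments in the builder below.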
+pub async fn build_cache_server( + config: ParquetCacheServerConfig, + direct_store: Arc, + catalog: Arc, +) -> ParquetCacheServer { + let ParquetCacheServerConfig { + keyspace_config_path: configfile_path, + hostname: node_hostname, + local_dir, + policy_config, + } = config; + + ServiceBuilder::new() + // outermost layer 0 + .layer(BuildCacheService) + // layer 1 + .layer(BuildKeyspaceService { + configfile_path, + node_hostname, + }) + // layer 2 + .layer(BuildPreconditionService) + // innermost layer 3 + .service(DataService::new(direct_store, catalog, policy_config, Some(local_dir)).await) +} + +#[cfg(test)] +mod integration_tests { + use std::{ + fs::create_dir_all, + io::{Seek, Write}, + path::Path, + time::Duration, + }; + + use bytes::{Buf, BufMut, BytesMut}; + use http::{Method, StatusCode}; + use hyper::{Body, Request}; + use iox_tests::{TestCatalog, TestParquetFileBuilder}; + use object_store::{local::LocalFileSystem, ObjectMeta}; + use serde::Deserialize; + use serde_json::Deserializer; + use tempfile::{tempdir, NamedTempFile, TempDir}; + use tower::Service; + + use crate::data_types::{ + GetObjectMetaResponse, InstanceState, KeyspaceResponseBody, ParquetCacheInstanceSet, + ServiceNode, State, WriteHint, WriteHintRequestBody, + }; + use crate::server::response::Response as ServerInternalResponse; + + use super::*; + + fn create_fs_direct_store(local_dir: &Path) -> Arc { + create_dir_all(local_dir).unwrap(); + Arc::new(LocalFileSystem::new_with_prefix(local_dir).expect("should create fs ObjectStore")) + } + + #[tokio::test] + async fn test_invalid_path() { + let tmpdir = tempdir().unwrap(); + let direct_store = create_fs_direct_store(tmpdir.path()); + let catalog = iox_tests::TestCatalog::new(); + + let config = ParquetCacheServerConfig { + keyspace_config_path: "/tmp".to_string(), + hostname: "localhost".to_string(), + local_dir: tmpdir.path().to_str().unwrap().to_string(), + policy_config: PolicyConfig::default(), + }; + + let mut server = build_cache_server(config, direct_store, catalog.catalog()).await; + + let req = Request::get("http://foo.io/invalid-path/") + .body(Body::empty()) + .unwrap(); + let resp = server.call(req).await; + + // assert expected http response + assert_matches::assert_matches!( + resp, + Err(ServerError::BadRequest(msg)) if msg.contains("invalid path"), + "expected bad request, instead found {:?}", resp + ); + } + + const VALID_HOSTNAME: &str = "hostname-a"; + lazy_static::lazy_static! 
{ + static ref KEYSPACE_DEFINITION: ParquetCacheInstanceSet = ParquetCacheInstanceSet { + revision: 0, + // a single node in the keyspace, therefore all keys should hash to this keyspace + instances: vec![VALID_HOSTNAME].into_iter().map(String::from).collect(), + }; + } + + const LOCATION: &str = "0/0/partition_key/00000000-0000-0000-0000-000000000001.parquet"; + const DATA: &[u8] = b"all my pretty words"; + + async fn setup_service_and_direct_store( + direct_store: Arc, + cache_tmpdir: TempDir, + file: &mut NamedTempFile, + ) -> (ParquetCacheServer, Arc, ObjectMeta) { + let catalog = iox_tests::TestCatalog::new(); + + let policy_config = PolicyConfig { + max_capacity: 3_200_000_000, + event_recency_max_duration_nanoseconds: 1_000_000_000 * 5, // 5 seconds + }; + + writeln!(file, "{}", serde_json::json!(*KEYSPACE_DEFINITION)) + .expect("should write keyspace definition to configfile"); + + let obj_store_path = object_store::path::Path::from(LOCATION); + + let config = ParquetCacheServerConfig { + keyspace_config_path: file.path().to_str().unwrap().to_string(), + hostname: VALID_HOSTNAME.to_string(), + local_dir: cache_tmpdir.path().to_str().unwrap().to_string(), + policy_config, + }; + + let server = build_cache_server(config, Arc::clone(&direct_store), catalog.catalog()).await; + + // add object to direct store + direct_store + .put(&obj_store_path, DATA.into()) + .await + .expect("should write object to direct store"); + let expected_meta = direct_store + .head(&obj_store_path) + .await + .expect("should have object in direct store"); + + // wait until service is ready + let mut this = server.clone(); + futures::future::poll_fn(move |cx| this.poll_ready(cx)) + .await + .expect("should not have failed"); + + (server, catalog, expected_meta) + } + + async fn confirm_data_exists(expected_meta: ObjectMeta, server: &mut ParquetCacheServer) { + // issue read metadata + let req = Request::builder() + .method(Method::GET) + .uri(format!("http://foo.io/metadata?location={}", LOCATION)) + .body(Body::empty()) + .unwrap(); + let resp = server.call(req).await.expect("should get a response"); + + // assert expected http response for metadata + assert_eq!( + resp.status(), + StatusCode::OK, + "expected http 200, instead found {:?}", + resp + ); + let resp_body: GetObjectMetaResponse = serde_json::from_reader( + hyper::body::aggregate(resp.into_body()) + .await + .expect("should create reader") + .reader(), + ) + .expect("should read response body"); + let resp_meta: object_store::ObjectMeta = resp_body.into(); + assert_eq!( + resp_meta, expected_meta, + "expected proper metadata, instead found {:?}", + resp_meta + ); + + // issue read object + let req = Request::builder() + .method(Method::GET) + .uri(format!("http://foo.io/object?location={}", LOCATION)) + .body(Body::empty()) + .unwrap(); + let resp = server.call(req).await.expect("should get a response"); + + // assert expected http response for object + assert_eq!( + resp.status(), + StatusCode::OK, + "expected http 200, instead found {:?}", + resp + ); + let body = hyper::body::to_bytes(resp.into_body()) + .await + .expect("reading response body"); + assert_eq!( + body.len(), + DATA.to_vec().len(), + "expected data in body, instead found {}", + std::str::from_utf8(&body).unwrap() + ); + } + + #[tokio::test] + async fn test_write_hint_and_read() { + // keep in scope so they are not dropped + let dir_store_tmpdir = tempdir().unwrap(); + let cache_tmpdir = tempdir().unwrap(); + let mut configfile = NamedTempFile::new().unwrap(); + let direct_store = 
create_fs_direct_store(dir_store_tmpdir.path()); + + // setup server + let (mut server, _, expected_meta) = + setup_service_and_direct_store(direct_store, cache_tmpdir, &mut configfile).await; + + // issue write-hint + let mut buf = BytesMut::new().writer(); + serde_json::to_writer( + &mut buf, + &WriteHintRequestBody { + location: LOCATION.into(), + hint: WriteHint { + file_size_bytes: DATA.to_vec().len() as i64, + ..Default::default() + }, + ack_setting: crate::data_types::WriteHintAck::Completed, + }, + ) + .expect("should write request body"); + let req = Request::builder() + .method(Method::POST) + .uri(format!("http://foo.io/write-hint?location={}", LOCATION)) + .body(hyper::Body::from(buf.into_inner().freeze())) + .unwrap(); + let resp = server.call(req).await.expect("should get a response"); + + // assert expected http response for write-hint + let expected_resp = ServerInternalResponse::Written; + assert_eq!( + resp.status(), + expected_resp.code(), + "expected http response status code to match, instead found {:?}", + resp + ); + let body = hyper::body::to_bytes(resp.into_body()) + .await + .expect("reading response body"); + assert_eq!( + body.len(), + 0, + "expected empty body, instead found {}", + std::str::from_utf8(&body).unwrap() + ); + + confirm_data_exists(expected_meta, &mut server).await; + } + + #[tokio::test] + async fn test_cache_miss_writeback_and_read() { + // keep in scope so they are not dropped + let dir_store_tmpdir = tempdir().unwrap(); + let cache_tmpdir = tempdir().unwrap(); + let mut configfile = NamedTempFile::new().unwrap(); + let direct_store = create_fs_direct_store(dir_store_tmpdir.path()); + + // setup server + let (mut server, catalog, expected_meta) = + setup_service_and_direct_store(direct_store, cache_tmpdir, &mut configfile).await; + + // write-back requires catalog data, therefore insert into catalog + let namespace = catalog.create_namespace_1hr_retention("ns0").await; + let table = namespace.create_table("table0").await; + let partition = table.create_partition("partition_key").await; + + // insert parquet file into catalog, with proper matching object store id + let parquet_file_path = parquet_file::ParquetFilePath::try_from(&LOCATION.to_string()) + .expect("should be valid parquet file path"); + let parquet_file = TestParquetFileBuilder::default() + .with_creation_time(iox_time::Time::from_date_time(expected_meta.last_modified)) + .with_file_size_bytes(DATA.to_vec().len() as u64) + .with_object_store_id(parquet_file_path.object_store_id()); + partition + .create_parquet_file_catalog_record(parquet_file) + .await; + + // trigger cache miss + let req = Request::builder() + .method(Method::GET) + .uri(format!("http://foo.io/metadata?location={}", LOCATION)) + .body(Body::empty()) + .unwrap(); + let resp = server.call(req).await; + assert_matches::assert_matches!( + resp, + Err(ServerError::CacheMiss), + "expected cache miss, instead found {:?}", + resp + ); + + // wait for write-back to complete + tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; + + confirm_data_exists(expected_meta, &mut server).await; + } + + #[tokio::test] + async fn test_state_responses() { + // keep in scope so they are not dropped + let dir_store_tmpdir = tempdir().unwrap(); + let cache_tmpdir = tempdir().unwrap(); + let mut configfile = NamedTempFile::new().unwrap(); + let direct_store = create_fs_direct_store(dir_store_tmpdir.path()); + + // setup server + let (mut server, _, _meta) = + setup_service_and_direct_store(direct_store, cache_tmpdir, &mut 
configfile).await; + + // check keyspace status is running + let req = Request::builder() + .method(Method::GET) + .uri("http://foo.io/state") + .body(Body::empty()) + .unwrap(); + let resp = server.call(req).await.expect("should get a response"); + assert_eq!( + resp.status(), + StatusCode::OK, + "expected http 200, instead found {:?}", + resp + ); + let resp_body_json = hyper::body::to_bytes(resp.into_body()) + .await + .expect("should read response body"); + let mut de = Deserializer::from_slice(&resp_body_json); + let mut state = State::deserialize(&mut de).expect("valid State object"); + state.state_changed = 0; // ignore the timestamp + assert_eq!( + state, + State { + state: InstanceState::Running, + state_changed: 0, + current_node_set_revision: 0, + next_node_set_revision: 0, + }, + ); + + // tell keyspace to cool, by changing keyspace definition + let new_keyspace_definition = serde_json::json!(ParquetCacheInstanceSet { + revision: 1, + instances: vec!["another-node"].into_iter().map(String::from).collect(), + }) + .to_string(); + let mut file = std::fs::OpenOptions::new() + .write(true) + .truncate(true) + .open(configfile.path()) + .unwrap(); + file.seek(std::io::SeekFrom::Start(0)).unwrap(); // move pointer to start, to overwrite + writeln!(file, "{}", new_keyspace_definition.as_str()) + .expect("should write keyspace definition to configfile"); + file.sync_all().unwrap(); + + // waiting for new_keyspace_definition to load + // cannot use poll_ready, as it is already returning ready (to accept `GET /state` requests) + tokio::time::sleep(Duration::from_secs(10)).await; + + // check keyspace status is cooling + let req = Request::builder() + .method(Method::GET) + .uri("http://foo.io/state") + .body(Body::empty()) + .unwrap(); + let resp = server.call(req).await.expect("should get a response"); + assert_eq!( + resp.status(), + StatusCode::OK, + "expected http 200, instead found {:?}", + resp + ); + let resp_body_json = hyper::body::to_bytes(resp.into_body()) + .await + .expect("should read response body"); + let mut de = Deserializer::from_slice(&resp_body_json); + let mut state = State::deserialize(&mut de).expect("valid State object"); + state.state_changed = 0; // ignore the timestamp + assert_eq!( + state, + State { + state: InstanceState::Cooling, + state_changed: 0, + current_node_set_revision: 0, + next_node_set_revision: 1, + }, + ); + } + + #[tokio::test] + async fn test_keyspace_nodes() { + // keep in scope so they are not dropped + let dir_store_tmpdir = tempdir().unwrap(); + let cache_tmpdir = tempdir().unwrap(); + let mut configfile = NamedTempFile::new().unwrap(); + let direct_store = create_fs_direct_store(dir_store_tmpdir.path()); + + // setup server + let (mut server, _, _meta) = + setup_service_and_direct_store(direct_store, cache_tmpdir, &mut configfile).await; + + // get keyspace nodes + let req = Request::builder() + .method(Method::GET) + .uri("http://foo.io/keyspace") + .body(Body::empty()) + .unwrap(); + let resp = server.call(req).await.expect("should get a response"); + assert_eq!( + resp.status(), + StatusCode::OK, + "expected http 200, instead found {:?}", + resp + ); + let resp_body: KeyspaceResponseBody = serde_json::from_reader( + hyper::body::aggregate(resp.into_body()) + .await + .expect("should create reader") + .reader(), + ) + .expect("should read response body"); + assert_matches::assert_matches!( + resp_body, + KeyspaceResponseBody { nodes } if matches!( + &nodes[..], + [ServiceNode { id: 0, hostname }] if hostname == VALID_HOSTNAME + ) + ); 
+ } +} diff --git a/parquet_cache/src/server/cache.rs b/parquet_cache/src/server/cache.rs new file mode 100644 index 00000000000..6acb7c4e46c --- /dev/null +++ b/parquet_cache/src/server/cache.rs @@ -0,0 +1,113 @@ +use std::{pin::Pin, task::Poll}; + +use futures::{ready, Future}; +use http::{Method, Request, Response, StatusCode}; +use hyper::Body; +use tokio::sync::OnceCell; +use tower::{Layer, Service}; + +use super::response::PinnedFuture; + +pub type FinalResponseFuture = + Pin, super::error::Error>> + Send>>; + +/// Cache Service +#[derive(Debug, Clone)] +pub struct CacheService { + inner: S, + initialize_once: OnceCell<()>, +} + +impl CacheService +where + S: Service, Future = PinnedFuture> + Clone + Send + Sync + 'static, +{ + pub fn new(inner: S) -> Self { + Self { + inner, + initialize_once: Default::default(), + } + } + + pub async fn prewarm(&mut self) -> Result<(), super::error::Error> { + // TODO: + // 0. (already done): LruCacheManager::new() => should have cache policy. + // 1. (already done): Keyspace::poll_ready() => should have the keyspace. + // 2. TODO(optional): may have persisted state from previous LruCacheManager, to reduce catalog load + // 3. GET list of obj_keys from catalog. + // * Query limits based on cache policy. + // * Use slower prewarming, paginated catalog queries, prioritized cache insertion. + // 4. for key in list => self.call(<`/write-hint` request for key>) + // * inner KeyspaceService will filter by key hash + // * inner DataService will filter by cache eviction policy + // * inner WriteService will handle write-back + + // 5. message to inner that prewarming is done. + let req = Request::builder() + .method(Method::PATCH) + .uri("/warmed") + .body(Body::empty()) + .expect("should create prewarm PATCH /warmed req"); + self.inner + .call(req) + .await + .map_err(|e| super::error::Error::Warming(e.to_string()))?; + + Ok(()) + } +} + +impl Service> for CacheService +where + S: Service, Future = PinnedFuture> + Clone + Send + Sync + 'static, +{ + type Response = Response; + type Error = super::error::Error; + type Future = FinalResponseFuture; + + fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll> { + // wait for inner service to receive requests + let _ = ready!(self.inner.poll_ready(cx)); + + // initialize once (which issues a request to inner service) + let mut this = self.clone(); + Box::pin(async move { + self.initialize_once + .get_or_try_init(|| this.prewarm()) + .await + }) + .as_mut() + .poll(cx) + .map_ok(|_| ()) + } + + fn call(&mut self, req: Request) -> Self::Future { + let clone = self.inner.clone(); + let mut inner = std::mem::replace(&mut self.inner, clone); + Box::pin(async move { + match inner.call(req).await { + Ok(resp) => match Response::builder().status(resp.code()).body(resp.into()) { + Ok(resp) => Ok(resp), + Err(e) => Ok(Response::builder() + .status(StatusCode::INTERNAL_SERVER_ERROR) + .body(e.to_string().into()) + .expect("should build error response")), + }, + Err(e) => Err(e), + } + }) + } +} + +pub struct BuildCacheService; + +impl Layer for BuildCacheService +where + S: Service, Future = PinnedFuture> + Clone + Send + Sync + 'static, +{ + type Service = CacheService; + + fn layer(&self, service: S) -> Self::Service { + CacheService::new(service) + } +} diff --git a/parquet_cache/src/server/data.rs b/parquet_cache/src/server/data.rs new file mode 100644 index 00000000000..fd50aab2eaf --- /dev/null +++ b/parquet_cache/src/server/data.rs @@ -0,0 +1,810 @@ +mod manager; +mod reads; +mod store; +mod writes; + 
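// The eviction channel created in `DataService::new` below is currently unbounded (see the
// TODO there). A bounded variant, sketched here with an assumed capacity, would let
// `send().await` apply back pressure on cache inserts when the evictor falls behind:
//
//     let (evict_tx, evict_rx) = async_channel::bounded(1024);
//     let data_accessor_ = Arc::clone(&data_accessor);
//     let handle = tokio::spawn(async move {
//         while let Ok(key) = evict_rx.recv().await {
//             let _ = data_accessor_.delete_object(&key).await;
//         }
//     });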
+use std::{sync::Arc, task::Poll}; + +use backoff::{Backoff, BackoffConfig}; +use bytes::Buf; +use http::{Request, Uri}; +use hyper::{Body, Method}; +use iox_catalog::interface::Catalog; +use object_store::ObjectStore; +use observability_deps::tracing::{error, warn}; +use tokio::task::JoinHandle; +use tower::Service; + +use self::{ + manager::{CacheManager, CacheManagerValue}, + reads::ReadHandler, + store::LocalStore, + writes::WriteHandler, +}; +use super::{error::Error, response::Response}; +use crate::data_types::{PolicyConfig, WriteHint, WriteHintRequestBody}; + +#[derive(Debug, thiserror::Error)] +pub enum DataError { + #[error("Read error: {0}")] + Read(String), + #[error("Write-stream error: {0}")] + Stream(String), + #[error("Write-file error: {0}")] + File(String), + #[error("Bad Request: {0}")] + BadRequest(String), + #[error("Bad Request: object location does not exist in catalog or object store")] + DoesNotExist, +} + +/// Service that provides access to the data. +#[derive(Debug, Clone)] +pub struct DataService { + catalog: Arc, + cache_manager: Arc, + read_handler: ReadHandler, + write_hander: WriteHandler, + handle: Arc>, + backoff_config: BackoffConfig, +} + +impl DataService { + pub async fn new( + direct_store: Arc, + catalog: Arc, + config: PolicyConfig, + dir: Option, + ) -> Self { + let data_accessor = Arc::new(LocalStore::new(dir)); + + // TODO: use a bounded channel + // Apply back pressure if we can't keep up (a.k.a. the actual eviction from the local store). + let (evict_tx, evict_rx) = async_channel::unbounded(); + + // start background task to evict from local store + let data_accessor_ = Arc::clone(&data_accessor); + let handle = tokio::spawn(async move { + while let Ok(key) = evict_rx.recv().await { + let _ = data_accessor_.delete_object(&key).await; + } + }); + + Self { + catalog, + read_handler: ReadHandler::new(Arc::clone(&data_accessor)), + write_hander: WriteHandler::new(Arc::clone(&data_accessor), direct_store), + cache_manager: Arc::new(CacheManager::new(config, evict_tx)), + handle: Arc::new(handle), + backoff_config: Default::default(), + } + } + + async fn create_write_hint(&self, location: &String) -> Result { + let parquet_file_path = parquet_file::ParquetFilePath::try_from(location) + .map_err(|e| Error::BadRequest(e.to_string()))?; + + let maybe_parquet_file = Backoff::new(&self.backoff_config) + .retry_all_errors("lookup write-hint in catalog", || async { + self.catalog + .repositories() + .parquet_files() + .get_by_object_store_id(parquet_file_path.object_store_id()) + .await + }) + .await + .expect("retry forever"); + + match maybe_parquet_file { + None => Err(Error::DoesNotExist), + Some(parquet_file) => Ok(WriteHint::from(&parquet_file)), + } + } + + async fn write_back(&self, location: String, write_hint: WriteHint) -> Result<(), Error> { + // confirm valid location + parquet_file::ParquetFilePath::try_from(&location) + .map_err(|e| Error::BadRequest(e.to_string()))?; + + // write to local store + let metadata = self + .write_hander + .write_local(&location, &write_hint) + .await?; + + // update cache manager + self.cache_manager + .insert( + location, + CacheManagerValue { + params: write_hint, + metadata, + }, + ) + .await; + + Ok(()) + } +} + +impl Service> for DataService { + type Response = Response; + type Error = Error; + type Future = super::response::PinnedFuture; + + fn poll_ready(&mut self, _cx: &mut std::task::Context<'_>) -> Poll> { + Poll::Ready(Ok(())) + } + + fn call(&mut self, req: Request) -> Self::Future { + match 
(req.method(), req.uri().path()) { + (&Method::GET, "/state") + | (&Method::PATCH, "/warmed") + | (&Method::GET, "/keyspace") => { + unreachable!("`this request should have already been handled in the KeyspaceLayer`") + } + (&Method::GET, "/metadata") | (&Method::GET, "/object") => { + let this = self.clone(); + Box::pin(async move { + let obj_location = parse_object_location(req.uri())?; + match this.cache_manager.in_cache(&obj_location).await { + Ok(_) => match req.uri().path() { + "/metadata" => { + let meta = this.cache_manager.fetch_metadata(&obj_location).await?; + Ok(Response::Head(meta.into())) + } + "/object" => { + let stream = this.read_handler.read_local(&obj_location).await?; + Ok(Response::Data(stream)) + } + _ => unreachable!(), + }, + Err(Error::CacheMiss) => { + // trigger write-back on another thread + let this_ = this.clone(); + tokio::spawn(async move { + let write_hint = match this_.create_write_hint(&obj_location).await + { + Ok(write_hint) => write_hint, + Err(error) => { + warn!(%error, "write-back failed to create write-hint (likely missing from catalog)"); + return; + } + }; + + if let Err(error) = this_.write_back(obj_location, write_hint).await + { + error!(%error, "write-back failed to perform local-store write"); + } + }); + + // still return immediate response, such that client will use direct_store fallback + Err(Error::CacheMiss) + } + Err(e) => Err(e), + } + }) + } + (&Method::POST, "/write-hint") => { + let this = self.clone(); + Box::pin(async move { + let reader = hyper::body::aggregate(req.into_body()) + .await + .map_err(|e| Error::BadRequest(e.to_string()))? + .reader(); + let write_hint: WriteHintRequestBody = serde_json::from_reader(reader) + .map_err(|e| Error::BadRequest(e.to_string()))?; + + match this.cache_manager.in_cache(&write_hint.location).await { + Ok(_) => Ok(Response::Written), + Err(_) => { + this.write_back(write_hint.location, write_hint.hint) + .await?; + Ok(Response::Written) + } + } + }) + } + (any_method, any_path) => { + let msg = format!("invalid path: {} {}", any_method, any_path); + Box::pin(async { Err(Error::BadRequest(msg)) }) + } + } + } +} + +fn parse_object_location(uri: &Uri) -> Result { + let as_url = url::Url::parse(uri.to_string().as_str()) + .expect("should be already validated path & query"); + match as_url.query_pairs().find(|(k, _v)| k.eq("location")) { + None => Err(Error::BadRequest( + "missing required query parameter: location".into(), + )), + Some((_key, location)) => Ok(location.to_string()), + } +} + +#[cfg(test)] +mod tests { + use std::{collections::HashMap, fs::File, io::Write, ops::Range, path::PathBuf}; + + use assert_matches::assert_matches; + use bytes::{BufMut, Bytes, BytesMut}; + use chrono::{DateTime, Utc}; + use futures::{stream::BoxStream, TryStreamExt}; + use iox_tests::TestParquetFileBuilder; + use object_store::{ + path::Path, GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, ObjectMeta, + ObjectStore, PutOptions, PutResult, + }; + use tempfile::{tempdir, TempDir}; + use tokio::{fs::create_dir_all, io::AsyncWrite}; + + use crate::data_types::GetObjectMetaResponse; + + use super::*; + + const ONE_SECOND: u64 = 1_000_000_000; + + // refer to valid path in parquet_file::ParquetFilePath + const LOCATION_F: &str = "0/0/partition_key/00000000-0000-0000-0000-000000000000.parquet"; + const LOCATION_S: &str = "0/0/partition_key/00000000-0000-0000-0000-000000000001.parquet"; + const LOCATION_MISSING: &str = "0/0/partition_key/00000000-0000-0000-0000-000000000002.parquet"; // not in 
catalog, nor remote store + + const DATA: &[u8] = b"all my pretty words"; + + lazy_static::lazy_static! { + static ref LAST_MODIFIED: DateTime = Utc::now(); + } + + #[derive(Debug)] + struct MockData(Bytes, bool /* as_stream */); + + #[derive(Debug)] + struct MockDirectStore { + mocked: HashMap, + temp_dir: TempDir, + } + + impl MockDirectStore { + fn default() -> Self { + Self { + mocked: HashMap::new(), + temp_dir: tempdir().expect("should create temp dir"), + } + } + + fn put_mock(&mut self, location: String, data: MockData) { + self.mocked.insert(location, data); + } + } + + #[async_trait::async_trait] + impl ObjectStore for MockDirectStore { + async fn get_opts( + &self, + location: &Path, + _options: GetOptions, + ) -> object_store::Result { + let MockData(bytes, as_stream) = match self.mocked.get(&location.to_string()) { + Some(data) => data, + _ => { + return Err(object_store::Error::NotFound { + path: location.to_string(), + source: "not found in remote store".into(), + }) + } + }; + + let meta = ObjectMeta { + location: location.clone(), + last_modified: *LAST_MODIFIED, + size: DATA.to_vec().len(), + e_tag: Default::default(), + version: Default::default(), + }; + + let bytes = bytes.to_owned(); + let payload = + match as_stream { + true => GetResultPayload::Stream(Box::pin(futures::stream::once(async move { + Ok(bytes) + }))), + false => { + let path = self.temp_dir.path().join(location.to_string()); + create_dir_all(path.parent().unwrap()) + .await + .expect("should create nested path"); + let mut file = + File::create(path.as_path()).expect("should be able to open temp file"); + file.write_all(&bytes) + .expect("should be able to write to temp file"); + file.flush().expect("should be able to flush temp file"); + GetResultPayload::File(file, path) + } + }; + + Ok(GetResult { + payload, + meta, + range: Range { + start: 0, + end: DATA.to_vec().len(), + }, + }) + } + + async fn put_opts( + &self, + _location: &Path, + _bytes: Bytes, + _opts: PutOptions, + ) -> object_store::Result { + unimplemented!() + } + async fn put_multipart( + &self, + _location: &Path, + ) -> object_store::Result<(MultipartId, Box)> { + unimplemented!() + } + async fn abort_multipart( + &self, + _location: &Path, + _multipart_id: &MultipartId, + ) -> object_store::Result<()> { + unimplemented!() + } + async fn delete(&self, _location: &Path) -> object_store::Result<()> { + unimplemented!() + } + fn list(&self, _prefix: Option<&Path>) -> BoxStream<'_, object_store::Result> { + unimplemented!() + } + async fn list_with_delimiter( + &self, + _prefix: Option<&Path>, + ) -> object_store::Result { + unimplemented!() + } + async fn copy(&self, _from: &Path, _to: &Path) -> object_store::Result<()> { + unimplemented!() + } + async fn copy_if_not_exists(&self, _from: &Path, _to: &Path) -> object_store::Result<()> { + unimplemented!() + } + } + + impl std::fmt::Display for MockDirectStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "MockDirectStore") + } + } + + fn make_parquet_file(location: &str) -> TestParquetFileBuilder { + let parquet_file_path = parquet_file::ParquetFilePath::try_from(&location.to_string()) + .expect("should be valid parquet file path"); + + TestParquetFileBuilder::default() + .with_creation_time(iox_time::Time::from_date_time(*LAST_MODIFIED)) + .with_file_size_bytes(DATA.to_vec().len() as u64) + .with_object_store_id(parquet_file_path.object_store_id()) + } + + async fn make_service(temp_dir: PathBuf, policy_config: Option) -> DataService { + let mut 
direct_store = MockDirectStore::default(); + // data returned as file, for write-back + direct_store.put_mock( + LOCATION_F.to_string(), + MockData(Bytes::from(DATA.to_vec()), false), + ); + // data returned as stream, for write-back + direct_store.put_mock( + LOCATION_S.to_string(), + MockData(Bytes::from(DATA.to_vec()), true), + ); + + // create catalog + let test_catalog = iox_tests::TestCatalog::new(); + let namespace = test_catalog.create_namespace_1hr_retention("ns0").await; + let table = namespace.create_table("table0").await; + let partition = table.create_partition("partition_key").await; + + // add parquet files to catalog + partition + .create_parquet_file_catalog_record(make_parquet_file(LOCATION_F)) + .await; + partition + .create_parquet_file_catalog_record(make_parquet_file(LOCATION_S)) + .await; + + DataService::new( + Arc::new(direct_store), + test_catalog.catalog(), + policy_config.unwrap_or(PolicyConfig { + max_capacity: 3_200_000, + event_recency_max_duration_nanoseconds: ONE_SECOND * 60 * 2, + }), + Some(temp_dir.to_str().unwrap()), + ) + .await + } + + // note: uses file for write-back + #[tokio::test] + async fn test_metadata_writeback_on_cache_miss() { + // setup + let dir = tempdir().expect("should create temp dir"); + let mut service = make_service(PathBuf::from(dir.path()), None).await; + + // return cache miss + let req = Request::builder() + .method(Method::GET) + .uri(format!("http://foo.io/metadata?location={}", LOCATION_F)) + .body(Body::empty()) + .unwrap(); + let resp = service.call(req).await; + assert_matches!( + resp, + Err(Error::CacheMiss), + "should return cache miss, instead found {:?}", + resp + ); + + // wait for write-back to complete + tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; + + // return cache hit + let req = Request::builder() + .method(Method::GET) + .uri(format!("http://foo.io/metadata?location={}", LOCATION_F)) + .body(Body::empty()) + .unwrap(); + let resp = service.call(req).await; + let expected = GetObjectMetaResponse::from(ObjectMeta { + location: LOCATION_F.into(), + size: DATA.to_vec().len(), + last_modified: *LAST_MODIFIED, + e_tag: Default::default(), + version: Default::default(), + }); + assert_matches!( + resp, + Ok(Response::Head(meta)) if meta == expected, + "should return metadata for location, instead found {:?}", resp + ); + } + + // note: uses file for write-back + #[tokio::test] + async fn test_object_writeback_on_cache_miss() { + // setup + let dir = tempdir().expect("should create temp dir"); + let mut service = make_service(PathBuf::from(dir.path()), None).await; + + // return cache miss + let req = Request::builder() + .method(Method::GET) + .uri(format!("http://foo.io/object?location={}", LOCATION_F)) + .body(Body::empty()) + .unwrap(); + let resp = service.call(req).await; + assert_matches!( + resp, + Err(Error::CacheMiss), + "should return cache miss, instead found {:?}", + resp + ); + + // wait for write-back to complete + tokio::time::sleep(tokio::time::Duration::from_secs(1)).await; + + // return cache hit + let req = Request::builder() + .method(Method::GET) + .uri(format!("http://foo.io/object?location={}", LOCATION_F)) + .body(Body::empty()) + .unwrap(); + let resp = service.call(req).await; + match resp { + Ok(Response::Data(stream)) => { + let data = stream.try_collect::>().await.unwrap(); + assert_eq!( + data, + vec![DATA.to_vec()], + "should have returned matching bytes" + ); + } + _ => panic!("should return data for location, instead found {:?}", resp), + } + } + + // note: 
uses stream for write-back + #[tokio::test] + async fn test_write_hint() { + // setup + let dir = tempdir().expect("should create temp dir"); + let mut service = make_service(PathBuf::from(dir.path()), None).await; + + // issue write-hint + let mut buf = BytesMut::new().writer(); + serde_json::to_writer( + &mut buf, + &WriteHintRequestBody { + location: LOCATION_S.into(), + hint: WriteHint { + file_size_bytes: DATA.to_vec().len() as i64, + ..Default::default() + }, + ack_setting: crate::data_types::WriteHintAck::Completed, + }, + ) + .expect("should write request body"); + let req = Request::builder() + .method(Method::POST) + .uri("http://foo.io/write-hint") + .body(hyper::Body::from(buf.into_inner().freeze())) + .unwrap(); + let resp = service.call(req).await; + assert_matches!( + resp, + Ok(Response::Written), + "should return successful write-back, instead found {:?}", + resp + ); + + // return cache hit -- metadata + let req = Request::builder() + .method(Method::GET) + .uri(format!("http://foo.io/metadata?location={}", LOCATION_S)) + .body(Body::empty()) + .unwrap(); + let resp = service.call(req).await; + let expected = GetObjectMetaResponse::from(ObjectMeta { + location: LOCATION_S.into(), + size: DATA.to_vec().len(), + last_modified: *LAST_MODIFIED, + e_tag: Default::default(), + version: Default::default(), + }); + assert_matches!( + resp, + Ok(Response::Head(meta)) if meta == expected, + "should return metadata for location, instead found {:?}", resp + ); + + // return cache hit -- object + let req = Request::builder() + .method(Method::GET) + .uri(format!("http://foo.io/object?location={}", LOCATION_S)) + .body(Body::empty()) + .unwrap(); + let resp = service.call(req).await; + match resp { + Ok(Response::Data(stream)) => { + let data = stream.try_collect::>().await.unwrap(); + assert_eq!( + data, + vec![DATA.to_vec()], + "should have returned matching bytes" + ); + } + _ => panic!("should return data for location, instead found {:?}", resp), + } + } + + #[tokio::test] + async fn test_write_hint_fails_for_invalid_path() { + // setup + let dir = tempdir().expect("should create temp dir"); + let mut service = make_service(PathBuf::from(dir.path()), None).await; + + // issue write-hint + let mut buf = BytesMut::new().writer(); + serde_json::to_writer( + &mut buf, + &WriteHintRequestBody { + location: "not_a_valid_path.parquet".into(), + hint: WriteHint { + file_size_bytes: DATA.to_vec().len() as i64, + ..Default::default() + }, + ack_setting: crate::data_types::WriteHintAck::Completed, + }, + ) + .expect("should write request body"); + let req = Request::builder() + .method(Method::POST) + .uri("http://foo.io/write-hint") + .body(hyper::Body::from(buf.into_inner().freeze())) + .unwrap(); + let resp = service.call(req).await; + assert_matches!( + resp, + Err(Error::BadRequest(_)), + "should return failed write-back, instead found {:?}", + resp + ); + } + + #[tokio::test] + async fn test_write_hint_fails_for_incorrect_size() { + // setup + let dir = tempdir().expect("should create temp dir"); + let mut service = make_service(PathBuf::from(dir.path()), None).await; + + // issue write-hint + let mut buf = BytesMut::new().writer(); + serde_json::to_writer( + &mut buf, + &WriteHintRequestBody { + location: LOCATION_S.into(), + hint: WriteHint { + file_size_bytes: 12312, + ..Default::default() + }, + ack_setting: crate::data_types::WriteHintAck::Completed, + }, + ) + .expect("should write request body"); + let req = Request::builder() + .method(Method::POST) + 
.uri("http://foo.io/write-hint") + .body(hyper::Body::from(buf.into_inner().freeze())) + .unwrap(); + let resp = service.call(req).await; + assert_matches!( + resp, + Err(Error::Data(_)), + "should error for incorrect file size in write-hint, instead found {:?}", + resp + ); + } + + #[tokio::test] + async fn test_fails_for_nonexistent_object() { + // setup + let dir = tempdir().expect("should create temp dir"); + let mut service = make_service(PathBuf::from(dir.path()), None).await; + + // issue write-hint + // Fails when looking up in remote store. Does not check catalog first. + let mut buf = BytesMut::new().writer(); + serde_json::to_writer( + &mut buf, + &WriteHintRequestBody { + location: LOCATION_MISSING.into(), + hint: WriteHint { + file_size_bytes: DATA.to_vec().len() as i64, + ..Default::default() + }, + ack_setting: crate::data_types::WriteHintAck::Completed, + }, + ) + .expect("should write request body"); + let req = Request::builder() + .method(Method::POST) + .uri("http://foo.io/write-hint") + .body(hyper::Body::from(buf.into_inner().freeze())) + .unwrap(); + let resp = service.call(req).await; + assert_matches!( + resp, + Err(Error::Data(DataError::DoesNotExist)), + "should return failed write-back, instead found {:?}", + resp + ); + } + + #[tokio::test] + async fn test_eviction() { + // setup + let policy_config = PolicyConfig { + max_capacity: DATA.to_vec().len() as u64 + 1, + event_recency_max_duration_nanoseconds: ONE_SECOND * 60 * 2, + }; + let dir = tempdir().expect("should create temp dir"); + let mut service = make_service(PathBuf::from(dir.path()), Some(policy_config)).await; + + // issue write-hint + let mut buf = BytesMut::new().writer(); + serde_json::to_writer( + &mut buf, + &WriteHintRequestBody { + location: LOCATION_S.into(), + hint: WriteHint { + file_size_bytes: DATA.to_vec().len() as i64, + ..Default::default() + }, + ack_setting: crate::data_types::WriteHintAck::Completed, + }, + ) + .expect("should write request body"); + let req = Request::builder() + .method(Method::POST) + .uri("http://foo.io/write-hint") + .body(hyper::Body::from(buf.into_inner().freeze())) + .unwrap(); + let resp = service.call(req).await; + assert_matches!( + resp, + Ok(Response::Written), + "should return successful write-back, instead found {:?}", + resp + ); + + // return cache hit + let req = Request::builder() + .method(Method::GET) + .uri(format!("http://foo.io/metadata?location={}", LOCATION_S)) + .body(Body::empty()) + .unwrap(); + let resp = service.call(req).await; + assert_matches!( + resp, + Ok(Response::Head(_)), + "should return metadata for location, instead found {:?}", + resp + ); + service.cache_manager.flush_pending().await; + + // issue 2nd write-hint + let mut buf = BytesMut::new().writer(); + serde_json::to_writer( + &mut buf, + &WriteHintRequestBody { + location: LOCATION_F.into(), + hint: WriteHint { + file_size_bytes: DATA.to_vec().len() as i64, + ..Default::default() + }, + ack_setting: crate::data_types::WriteHintAck::Completed, + }, + ) + .expect("should write request body"); + let req = Request::builder() + .method(Method::POST) + .uri("http://foo.io/write-hint") + .body(hyper::Body::from(buf.into_inner().freeze())) + .unwrap(); + let resp = service.call(req).await; + assert_matches!( + resp, + Ok(Response::Written), + "should return successful write-back, instead found {:?}", + resp + ); + service.cache_manager.flush_pending().await; + + // eviction should have happened + // should return cache miss + let req = Request::builder() + .method(Method::GET) 
+ .uri(format!("http://foo.io/metadata?location={}", LOCATION_S)) + .body(Body::empty()) + .unwrap(); + let resp = service.call(req).await; + assert_matches!( + resp, + Err(Error::CacheMiss), + "should return cache miss, instead found {:?}", + resp + ); + + // other object should still be in cache + let req = Request::builder() + .method(Method::GET) + .uri(format!("http://foo.io/metadata?location={}", LOCATION_F)) + .body(Body::empty()) + .unwrap(); + let resp = service.call(req).await; + assert_matches!( + resp, + Ok(Response::Head(_)), + "should return metadata for location, instead found {:?}", + resp + ); + + dir.close().expect("should close temp dir"); + } +} diff --git a/parquet_cache/src/server/data/manager.rs b/parquet_cache/src/server/data/manager.rs new file mode 100644 index 00000000000..5b29e1dc9bb --- /dev/null +++ b/parquet_cache/src/server/data/manager.rs @@ -0,0 +1,836 @@ +use std::cmp::Ordering; +use std::collections::BinaryHeap; +use std::sync::atomic::{AtomicU64, Ordering as AtomicOrdering}; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use arc_swap::ArcSwap; +use async_channel::Sender; +use moka::future::{Cache, FutureExt}; +use moka::notification::ListenerFuture; +use moka::Expiry; +use object_store::ObjectMeta; +use observability_deps::tracing::error; +use parking_lot::Mutex; +use tokio::task::JoinSet; + +use crate::data_types::{ObjectParams, PolicyConfig}; +use crate::server::error::Error; + +type ExternalRequestKey = String; +type CacheManagerKey = Arc; + +#[derive(Debug, Clone)] +pub struct CacheManagerValue { + /// Required for eviction policy. + pub params: ObjectParams, + /// Returned on `GET /metadata` head requests. + pub metadata: ObjectMeta, +} + +type InMemoryCache = Cache; + +/// Manages the cache eviction policy. +/// +/// Cache manager built upon a fast, concurrent in-memory cache. +/// In-memory will be the keys, as well as minimum metadata for managing cache eviction. +#[derive(Debug)] +pub struct CacheManager { + /// High-concurrency in-memory cache, used for the eviction policy. + manager: Arc, + /// Current size of the cache. 
+    current_size: Arc<AtomicU64>,
+}
+
+impl CacheManager {
+    pub fn new(config: PolicyConfig, evict_tx: Sender<ExternalRequestKey>) -> Self {
+        let current_size = Arc::new(AtomicU64::new(0));
+
+        // listener => then evict from local store
+        let current_size_ = Arc::clone(&current_size);
+        let listener =
+            move |k: Arc<CacheManagerKey>, v: CacheManagerValue, _cause| -> ListenerFuture {
+                let evict_tx = evict_tx.clone();
+                let current_size = Arc::clone(&current_size_);
+                async move {
+                    // use async_channel to ensure evicted, before removing from current_size
+                    match evict_tx.send((**k).clone()).await {
+                        Ok(_) => {
+                            current_size
+                                .fetch_sub(v.params.file_size_bytes as u64, AtomicOrdering::SeqCst);
+                        }
+                        Err(e) => {
+                            error!("CacheManager eviction listener failed to send: {:?}", e);
+                        }
+                    }
+                }
+                .boxed()
+            };
+
+        // event-recency
+        let evicter = Arc::new(Evictor::new_with_placeholder_cache_ref());
+        let expiry = Arc::new(EventRecency::new(
+            Arc::clone(&current_size),
+            config.max_capacity,
+            Arc::clone(&evicter),
+        ));
+
+        // cache manager
+        let manager = Arc::new(
+            Cache::builder()
+                .max_capacity(config.max_capacity)
+                .weigher(Self::size_weigher) // triggers eviction
+                .expire_after(EntryExpiry::new(config, Arc::clone(&expiry))) // triggered on insert & read
+                .async_eviction_listener(listener)
+                .build(),
+        );
+
+        // set cache on evicter
+        evicter.set_cache(Arc::clone(&manager));
+
+        Self {
+            manager,
+            current_size,
+        }
+    }
+
+    /// Maps the max_capacity to the disk bytes.
+    fn size_weigher(_k: &CacheManagerKey, v: &CacheManagerValue) -> u32 {
+        v.params.file_size_bytes as u32
+    }
+
+    /// Inserts the key-value pair into the cache.
+    pub async fn insert(&self, k: ExternalRequestKey, v: CacheManagerValue) {
+        let size = v.params.file_size_bytes;
+        self.manager.entry(Arc::new(k)).or_insert(v).await;
+        self.current_size
+            .fetch_add(size as u64, AtomicOrdering::SeqCst);
+    }
+
+    /// Returns Ok if the key is in the cache.
+    pub async fn in_cache(&self, k: &ExternalRequestKey) -> Result<(), Error> {
+        self.manager
+            .get(k)
+            .await
+            .map(|_| ())
+            .ok_or(Error::CacheMiss)
+    }
+
+    /// Returns the metadata for the object.
+    pub async fn fetch_metadata(&self, k: &ExternalRequestKey) -> Result<ObjectMeta, Error> {
+        Ok(self.manager.get(k).await.ok_or(Error::CacheMiss)?.metadata)
+    }
+
+    /// Explicitly evict key from cache.
+    #[cfg(test)]
+    async fn invalidate(&self, k: ExternalRequestKey) {
+        self.manager.invalidate(&k).await;
+    }
+
+    /// Trigger moka to flush all pending tasks. Use for testing ONLY.
+    #[cfg(test)]
+    pub(crate) async fn flush_pending(&self) {
+        self.manager.run_pending_tasks().await;
+    }
+}
+
+#[derive(Clone)]
+pub struct EntryExpiry {
+    /// Outer bound on how long to hold.
+    max_recency_duration: Duration,
+    /// Handles event recency.
+    event_recency: Arc<EventRecency>,
+}
+
+impl EntryExpiry {
+    fn new(config: PolicyConfig, evicter: Arc<EventRecency>) -> Self {
+        Self {
+            max_recency_duration: Duration::from_nanos(
+                config.event_recency_max_duration_nanoseconds,
+            ),
+            event_recency: evicter,
+        }
+    }
+}
+
+/// Moka helps achieve high concurrency with buffered inserts.
+///
+/// When pending tasks are applied, if more space is needed then existing keys are flushed
+/// based upon expiration.
+impl Expiry<CacheManagerKey, CacheManagerValue> for EntryExpiry {
+    /// Sets the expiry duration for every insertion.
+    /// If incoming should not be inserted, then set expiry to 0.
+    fn expire_after_create(
+        &self,
+        k: &CacheManagerKey,
+        v: &CacheManagerValue,
+        _inserted_at: Instant,
+    ) -> Option<Duration> {
+        if !self.event_recency.should_insert(k, v) {
+            return Some(Duration::from_secs(0));
+        }
+
+        if let Some(now) = chrono::Utc::now().timestamp_nanos_opt() {
+            let event_timestamp_nanos = v.params.max_time;
+
+            let age_out_nanoseconds =
+                event_timestamp_nanos.saturating_add(self.max_recency_duration.as_nanos() as i64);
+            let duration_until_event_ages_out = age_out_nanoseconds.saturating_sub(now);
+
+            Some(Duration::from_nanos(duration_until_event_ages_out as u64))
+        } else {
+            None
+        }
+    }
+}
+
+/// Tracks the event time recency, and evicts based upon the event time.
+struct EventRecency {
+    /// Current size of the cache.
+    ///
+    /// Used to determine when to evict.
+    /// Does not rely upon the moka-buffered inserts (unlike [`Cache`].weighted_size()).
+    current_size: Arc<AtomicU64>,
+    /// Upper bound on cache size.
+    max_capacity: u64,
+
+    /// Min-heap, used to track event time recency.
+    min_heap: Arc<Mutex<BinaryHeap<Slot>>>,
+    /// Tracks the current min, which will be updated with store() to minimize lock contention.
+    current_min: Arc<AtomicU64>,
+    /// Handles updates to min-heap on separate threads, to avoid locking on the hot path.
+    background_tasks: JoinSet<()>,
+    insert_tx: tokio::sync::mpsc::UnboundedSender<Slot>,
+    remove_tx: tokio::sync::mpsc::UnboundedSender<()>,
+}
+
+impl EventRecency {
+    /// Creates a new [`EventRecency`].
+    fn new(current_size: Arc<AtomicU64>, max_capacity: u64, evictor: Arc<Evictor>) -> Self {
+        let min_heap: Arc<Mutex<BinaryHeap<Slot>>> = Default::default();
+        let current_min: Arc<AtomicU64> = Default::default();
+
+        // TODO: replace with bounded channels.
+        let (insert_tx, mut insert_rx) = tokio::sync::mpsc::unbounded_channel();
+        let (remove_tx, mut remove_rx) = tokio::sync::mpsc::unbounded_channel();
+
+        // insert into min-heap, off the hot path
+        let mut background_tasks = JoinSet::new();
+        let min_heap_ = Arc::clone(&min_heap);
+        background_tasks.spawn(async move {
+            loop {
+                if let Some(slot) = insert_rx.recv().await {
+                    let mut guard = min_heap_.lock();
+                    guard.push(slot);
+                    drop(guard);
+                }
+            }
+        });
+
+        // remove from min-heap, off the hot path
+        let min_heap_ = Arc::clone(&min_heap);
+        let current_min_ = Arc::clone(&current_min);
+        background_tasks.spawn(async move {
+            loop {
+                if remove_rx.recv().await.is_some() {
+                    let mut guard = min_heap_.lock();
+                    let to_evict = guard.pop().expect("should have entry via peek");
+                    let new_min = guard.peek().map(|slot| slot.max_time);
+                    drop(guard);
+
+                    if let Some(new_min) = new_min {
+                        current_min_.store(new_min as u64, AtomicOrdering::Release);
+                    }
+
+                    evictor.evict_from_cache(to_evict.key);
+                }
+            }
+        });
+
+        Self {
+            current_size,
+            max_capacity,
+            min_heap,
+            current_min,
+            background_tasks,
+            insert_tx,
+            remove_tx,
+        }
+    }
+
+    /// Returns true if the incoming entry should be inserted.
+    fn should_insert(&self, incoming_k: &CacheManagerKey, incoming_v: &CacheManagerValue) -> bool {
+        let incoming_size = incoming_v.params.file_size_bytes as u64;
+        let should_insert =
+            if self.current_size.load(AtomicOrdering::SeqCst) + incoming_size > self.max_capacity {
+                self.max_capacity_should_insert(incoming_v)
+            } else {
+                true
+            };
+
+        if should_insert {
+            self.insert_tx
+                .send(Slot {
+                    max_time: incoming_v.params.max_time,
+                    key: Arc::clone(incoming_k),
+                })
+                .expect("should send min heap insert");
+        }
+        should_insert
+    }
+
+    /// Returns true if incoming entry should be inserted.
+ /// + /// Handles the case where the cache is at max_capacity, + /// by either evicting based upon event time recency, + /// or rejecting the incoming entry. + fn max_capacity_should_insert(&self, incoming_v: &CacheManagerValue) -> bool { + match self + .current_min + .load(AtomicOrdering::Relaxed) + .partial_cmp(&(incoming_v.params.max_time as u64)) + { + Some(Ordering::Less) | Some(Ordering::Equal) => { + // incoming event time is more recent than current min + // therefore, evict current min + let _ = self.remove_tx.send(()); + true + } + Some(Ordering::Greater) => false, // incoming event time is older than current min + None => true, // no entries in min-heap + } + } +} + +/// Slot in the min-heap, used to evict based upon event timestamp recency. +/// +/// [`BinaryHeap`] is a max-heap, therefore the Ord implementation is reversed. +#[derive(Debug, Eq, PartialEq)] +struct Slot { + max_time: i64, + key: CacheManagerKey, +} + +#[allow(clippy::non_canonical_partial_ord_impl)] +impl PartialOrd for Slot { + fn partial_cmp(&self, other: &Self) -> Option { + other.max_time.partial_cmp(&self.max_time) + } +} + +impl Ord for Slot { + fn cmp(&self, other: &Self) -> Ordering { + self.partial_cmp(other).unwrap() + } +} + +/// Does the eviction. +#[derive(Debug)] +struct Evictor { + /// Ref to cache, in order to evict. + cache_manager: ArcSwap, +} + +impl Evictor { + /// Creates a new [`Evictor`], with a placeholder cache ref. + fn new_with_placeholder_cache_ref() -> Self { + Self { + cache_manager: ArcSwap::new(Arc::new(Cache::new(0))), + } + } + + /// Sets the cache ref. + fn set_cache(&self, cache: Arc) { + self.cache_manager.store(cache); + } + + /// Evicts the key from the cache. + /// + /// Must be a non-blocking downstream action from [`EntryExpiry`]. + /// + /// [`Cache`].invalidate() provides immediate invalidation of the entry, + /// outside of any pending moka insert tasks. + /// + /// When pending moka insert tasks are applied, if max_capacity is reached + /// then existing keys are flushed based upon expiration. + /// As we are spawning a non-blocking thread, we are not guaranteed + /// to have this eviction occur prior to the flushing of the task queue. + /// + /// Worst case scenario is that an incoming key is rejected (not accepted into cache) due to space. 
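The reversed comparison on `Slot` defined above is what makes the standard library's max-heap behave as a min-heap over `max_time`. A minimal, self-contained sketch of the same trick, using a hypothetical `DemoSlot` in place of the crate's `Slot`:

use std::cmp::Ordering;
use std::collections::BinaryHeap;

// Stand-in for Slot: only the event timestamp matters for ordering.
#[derive(Debug, Eq, PartialEq)]
struct DemoSlot {
    max_time: i64,
    key: String,
}

impl Ord for DemoSlot {
    // Reverse the comparison so the std max-heap yields the *smallest* max_time first.
    fn cmp(&self, other: &Self) -> Ordering {
        other.max_time.cmp(&self.max_time)
    }
}

impl PartialOrd for DemoSlot {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

fn main() {
    let mut heap = BinaryHeap::new();
    heap.push(DemoSlot { max_time: 30, key: "c".into() });
    heap.push(DemoSlot { max_time: 10, key: "a".into() });
    heap.push(DemoSlot { max_time: 20, key: "b".into() });

    // Oldest event time pops first, i.e. the next eviction candidate.
    assert_eq!(heap.pop().unwrap().key, "a");
    assert_eq!(heap.pop().unwrap().key, "b");
}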
+ fn evict_from_cache(&self, key: CacheManagerKey) { + let guard = self.cache_manager.load(); + let cache = guard.as_ref().clone(); + tokio::spawn(async move { + cache.invalidate(&key).await; + }); + } +} + +#[cfg(test)] +mod tests { + use assert_matches::assert_matches; + use async_channel::unbounded; + + use crate::data_types::PolicyConfig; + + use super::*; + + fn now_nanos() -> i64 { + chrono::Utc::now().timestamp_nanos_opt().unwrap() + } + + fn cache_manager_value(size: usize, max_time: Option) -> CacheManagerValue { + let max_time = max_time.unwrap_or(now_nanos()); + + CacheManagerValue { + params: ObjectParams { + file_size_bytes: size as i64, + max_time, + min_time: max_time - 1_000_000_000, + ..Default::default() + }, + metadata: ObjectMeta { + last_modified: chrono::Utc::now(), + location: object_store::path::Path::from("not_used"), + size, + e_tag: None, + version: None, + }, + } + } + + fn policy_config(max_capacity: u64) -> PolicyConfig { + PolicyConfig { + max_capacity, + event_recency_max_duration_nanoseconds: 1_000_000_000 * 60 * 60, + } + } + + #[tokio::test] + async fn test_eviction_listener() { + let (evict_tx, evict_rx) = unbounded(); + + // build cache manager + let max_capacity: usize = 3_200_000; + let cache_manager = Arc::new(CacheManager::new( + policy_config(max_capacity as u64), + evict_tx, + )); + + // insert entry + let value = cache_manager_value( + 1_000_000, None, // all will have same event timestamp + ); + let to_evict = "k_a".to_string(); + cache_manager.insert(to_evict.clone(), value.clone()).await; + + // check current_size + assert_eq!( + cache_manager.current_size.load(AtomicOrdering::SeqCst), + 1_000_000 + ); + + // explicitly evict + cache_manager.invalidate(to_evict.clone()).await; + + // eviction listener should receive notification + assert_matches!( + evict_rx.recv().await, + Ok(_), + "should have received eviction notice", + ); + + assert_eq!( + cache_manager.current_size.load(AtomicOrdering::SeqCst), + 0, + "should have zero current_size after eviction" + ); + } + + #[tokio::test] + async fn test_evicts_at_max_capacity() { + let (evict_tx, evict_rx) = unbounded(); + + // build cache manager + let max_capacity: usize = 3_200_000; + let cache_manager = Arc::new(CacheManager::new( + policy_config(max_capacity as u64), + evict_tx, + )); + + // insert 2 entries + let value = cache_manager_value( + max_capacity / 2, + None, // all will have same event timestamp + ); + let oldest = "k_a".to_string(); + cache_manager.insert(oldest.clone(), value.clone()).await; + cache_manager.insert("k_b".into(), value.clone()).await; + + // To Discuss: this flush is needed, in order to apply ordering in k_a+k_b, as before k_c. 
+ // otherwise, the k_c is evicted instead + cache_manager.manager.run_pending_tasks().await; + + // insert 1 more entry, which should force an eviction (over capacity) + cache_manager.insert("k_c".into(), value).await; + cache_manager.manager.run_pending_tasks().await; + + // should evict oldest inserted entry + let res = evict_rx.recv().await; + assert_matches!( + res, + Ok(v) if *v == oldest, + "should have evicted oldest inserted key, instead found {:?}", res + ); + + // should still have other 2 entries + assert!( + cache_manager.in_cache(&"k_b".to_string()).await.is_ok(), + "should still have k_b" + ); + assert!( + cache_manager.in_cache(&"k_c".to_string()).await.is_ok(), + "should still have k_c" + ); + } + + #[tokio::test] + async fn test_lfu_eviction() { + let (evict_tx, evict_rx) = unbounded(); + + // build cache manager + let max_capacity: usize = 3_200_000; + let cache_manager = Arc::new(CacheManager::new( + policy_config(max_capacity as u64), + evict_tx, + )); + + // insert 2 entries + let value = cache_manager_value( + max_capacity / 2, + None, // all will have same event timestamp + ); + let read = "k_a".to_string(); + cache_manager.insert(read.clone(), value.clone()).await; + let not_read = "k_b".to_string(); + cache_manager.insert(not_read.clone(), value.clone()).await; + + // read one entry, many times, to pass the probability threshold + // To Discuss: is this sufficient for LFU? + // * the write-back will be triggered on a single cache miss + // * the LFU eviction would be using moka's probabilistic algorithm + for _ in 0..63 { + assert!( + cache_manager.in_cache(&read).await.is_ok(), + "should have read key" + ); + } + + // insert 1 more entry + cache_manager.insert("k_c".into(), value).await; + cache_manager.manager.run_pending_tasks().await; + + // should evict unread entry + let res = evict_rx.recv().await; + assert_matches!( + res, + Ok(v) if *v == not_read, + "should have evicted unread key, instead found {:?}", res + ); + + // should have other 2 entries + assert!( + cache_manager.in_cache(&read).await.is_ok(), + "should still have the read key" + ); + assert!( + cache_manager.in_cache(&"k_c".to_string()).await.is_ok(), + "should have newly inserted k_c" + ); + } + + #[tokio::test] + async fn test_event_time_recency_eviction() { + let (evict_tx, evict_rx) = unbounded(); + + // build cache manager + let max_capacity: usize = 3_200_000; + let cache_manager = Arc::new(CacheManager::new( + policy_config(max_capacity as u64), + evict_tx, + )); + + // insert 2 entries, where the older entry has a more recent event time + let older_event_time = cache_manager_value( + max_capacity / 2, + Some(now_nanos() - 5_000_000_000), // 5 seconds ago + ); + let newer_event_time = cache_manager_value( + max_capacity / 2, + Some(now_nanos() - 1_000_000_000), // 1 second ago + ); + let should_keep = "younger_event_time_but_older_insert".to_string(); + cache_manager + .insert(should_keep.clone(), newer_event_time.clone()) + .await; + + let should_evict = "older_event_time_but_younger_insert".to_string(); + cache_manager + .insert(should_evict.clone(), older_event_time) + .await; + + // insert 1 more entry, with same event time as should_keep + cache_manager + .insert("k_c".into(), newer_event_time.clone()) + .await; + + // To Discuss: this is waiting for event time recency eviction to occur + // before the moka task queue is flushed. + // This is the race condition explained in doc comments for + // Evicter::evict_from_cache(). 
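The fixed sleep below papers over the race described in this comment. One possible alternative, sketched here under the assumption that the test keeps the `evict_rx` receiver it handed to `CacheManager::new`, is to await the eviction notification with a timeout instead of sleeping:

use std::time::Duration;
use async_channel::Receiver;

// Hypothetical test helper: block until an eviction notice arrives, or give up.
async fn wait_for_eviction(evict_rx: &Receiver<String>) -> Option<String> {
    tokio::time::timeout(Duration::from_secs(5), evict_rx.recv())
        .await
        .ok()                     // None if the timeout elapsed
        .and_then(|res| res.ok()) // None if the channel was closed
}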
+ tokio::time::sleep(Duration::from_micros(1)).await; + cache_manager.manager.run_pending_tasks().await; + + // should evict based on event time, not insertion order + let res = evict_rx.recv().await; + assert_matches!( + res, + Ok(v) if *v == should_evict, + "should have evicted older_event_time, instead found {:?}", res + ); + + // LFU as a tie-breaker with same event time + assert!( + cache_manager.in_cache(&should_keep).await.is_ok(), + "should have read key" + ); + cache_manager.insert("k_d".into(), newer_event_time).await; // now have 3 with newer_event_time + cache_manager.manager.run_pending_tasks().await; + let res = evict_rx.recv().await; + assert_matches!( + res, + Ok(v) if v == "k_c", + "should have evicted least recently queried key, instead found {:?}", res + ); + } + + #[tokio::test] + async fn test_event_time_trumps_lfu() { + let (evict_tx, evict_rx) = unbounded(); + + // build cache manager + let max_capacity: usize = 3_200_000; + let cache_manager = Arc::new(CacheManager::new( + policy_config(max_capacity as u64), + evict_tx, + )); + + // insert 2 entries, where the older entry has a more recent event time + let older_event_time = cache_manager_value( + max_capacity / 2, + Some(now_nanos() - 5_000_000_000), // 5 seconds ago + ); + let newer_event_time = cache_manager_value( + max_capacity / 2, + Some(now_nanos() - 1_000_000_000), // 1 second ago + ); + let should_keep = "younger_event_time_but_older_insert".to_string(); + cache_manager + .insert(should_keep.clone(), newer_event_time.clone()) + .await; + let should_evict = "older_event_time_but_younger_insert".to_string(); + cache_manager + .insert(should_evict.clone(), older_event_time) + .await; + + // query the older timestamp, many times, to pass the probability threshold + for _ in 0..63 { + assert!( + cache_manager.in_cache(&should_evict).await.is_ok(), + "should have read key" + ); + } + + // insert 1 more entry, with same event time as should_keep + cache_manager + .insert("k_c".into(), newer_event_time.clone()) + .await; + + // To Discuss: this is waiting for event time recency eviction to occur + // before the moka task queue is flushed. + // This is the race condition explained in doc comments for + // Evicter::evict_from_cache(). 
+ tokio::time::sleep(Duration::from_micros(1)).await; + cache_manager.manager.run_pending_tasks().await; + + // should evict based on event time, not LFU + let res = evict_rx.recv().await; + assert_matches!( + res, + Ok(v) if *v == should_evict, + "should have evicted older_event_time, instead found {:?}", res + ); + } + + #[tokio::test] + async fn test_event_time_recency_age_out() { + let (evict_tx, _) = unbounded(); + + // build cache manager, with 2 second ageout + let max_capacity: usize = 3_200_000; + let cache_manager = Arc::new(CacheManager::new( + PolicyConfig { + max_capacity: max_capacity as u64, + event_recency_max_duration_nanoseconds: 1_000_000_000 * 2, + }, + evict_tx, + )); + + // insert + let value = cache_manager_value( + max_capacity / 2, + None, // will have current event timestamp + ); + let now = "now_event_time".to_string(); + cache_manager.insert(now.clone(), value.clone()).await; + assert!( + cache_manager.in_cache(&now).await.is_ok(), + "should have now" + ); + + // age out + tokio::time::sleep(Duration::from_secs(3)).await; + cache_manager.manager.run_pending_tasks().await; + assert!( + cache_manager.in_cache(&now).await.is_err(), + "should no longer have now" + ); + } + + #[tokio::test] + async fn test_event_time_recency_age_out_with_future_time() { + let (evict_tx, _) = unbounded(); + + // build cache manager, with 2 second ageout + let max_capacity: usize = 3_200_000; + let cache_manager = Arc::new(CacheManager::new( + PolicyConfig { + max_capacity: max_capacity as u64, + event_recency_max_duration_nanoseconds: 1_000_000_000 * 2, + }, + evict_tx, + )); + + // insert + let value = cache_manager_value( + max_capacity / 2, + Some(now_nanos() + 2_000_000_000), // 2 seconds into future + ); + let future_event = "future_event_time".to_string(); + cache_manager + .insert(future_event.clone(), value.clone()) + .await; + assert!( + cache_manager.in_cache(&future_event).await.is_ok(), + "should have future_event" + ); + + // age out + tokio::time::sleep(Duration::from_secs(3 + 2)).await; + cache_manager.manager.run_pending_tasks().await; + assert!( + cache_manager.in_cache(&future_event).await.is_err(), + "should no longer have future_event" + ); + } + + #[tokio::test] + async fn test_fetch_metadata() { + let (evict_tx, evict_rx) = unbounded(); + + // build cache manager + let max_capacity: usize = 3_200_000; + let cache_manager = Arc::new(CacheManager::new( + policy_config(max_capacity as u64), + evict_tx, + )); + + // insert 2 entries + let value = cache_manager_value( + max_capacity / 2, + None, // all will have same event timestamp + ); + let read = "k_a".to_string(); + cache_manager.insert(read.clone(), value.clone()).await; + let not_read = "k_b".to_string(); + cache_manager.insert(not_read.clone(), value.clone()).await; + + // assert can find metadata + let expected_metadata = value.clone().metadata; + assert_matches!( + cache_manager.fetch_metadata(&read).await, + Ok(metadata) if metadata == expected_metadata, + "should have found metadata" + ); + + // assert metadata access applies to LFU eviction policy + for _ in 0..63 { + assert!( + cache_manager.fetch_metadata(&read).await.is_ok(), + "should have read key" + ); + } + cache_manager.manager.run_pending_tasks().await; + + // insert 1 more entry + cache_manager.insert("k_c".into(), value).await; + cache_manager.manager.run_pending_tasks().await; + + // should evict unread entry + let res = evict_rx.recv().await; + assert_matches!( + res, + Ok(v) if *v == not_read, + "should have evicted unread key, instead found 
{:?}", res + ); + + // should have other 2 entries + assert!( + cache_manager.in_cache(&read).await.is_ok(), + "should still have the read key" + ); + assert!( + cache_manager.in_cache(&"k_c".to_string()).await.is_ok(), + "should have newly inserted k_c" + ); + } + + #[tokio::test] + async fn test_cache_misses() { + let (evict_tx, _) = unbounded(); + + // build cache manager + let max_capacity: usize = 3_200_000; + let cache_manager = Arc::new(CacheManager::new( + policy_config(max_capacity as u64), + evict_tx, + )); + + // cache misses + assert_matches!( + cache_manager + .fetch_metadata(&"not_in_cache".to_string()) + .await, + Err(Error::CacheMiss), + "should have returned cache miss for metadata", + ); + assert_matches!( + cache_manager.in_cache(&"not_in_cache".to_string()).await, + Err(Error::CacheMiss), + "should have returned cache miss for object", + ); + // when cache miss: + // 1. return error + // 2. upper layer (DataService) will handle any write back + } +} diff --git a/parquet_cache/src/server/data/reads.rs b/parquet_cache/src/server/data/reads.rs new file mode 100644 index 00000000000..07c09afcae5 --- /dev/null +++ b/parquet_cache/src/server/data/reads.rs @@ -0,0 +1,23 @@ +use std::sync::Arc; + +use super::store::{LocalStore, StreamedObject}; +use super::DataError; + +/// Service that handles the READ requests (`GET /object`). +#[derive(Debug, Clone)] +pub struct ReadHandler { + cache: Arc, +} + +impl ReadHandler { + pub fn new(cache: Arc) -> Self { + Self { cache } + } + + pub async fn read_local(&self, location: &String) -> Result { + self.cache + .read_object(location) + .await + .map_err(|e| DataError::Read(e.to_string())) + } +} diff --git a/parquet_cache/src/server/data/store.rs b/parquet_cache/src/server/data/store.rs new file mode 100644 index 00000000000..e4ef7956f0b --- /dev/null +++ b/parquet_cache/src/server/data/store.rs @@ -0,0 +1,510 @@ +use std::{ + path::{Path, PathBuf}, + pin::Pin, + task::{Context, Poll}, +}; + +use bytes::{Bytes, BytesMut}; +use futures::{ + stream::{BoxStream, StreamExt}, + FutureExt, TryStreamExt, +}; +use pin_project::pin_project; +use tokio::fs::{create_dir_all, remove_dir, remove_file, File}; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, Error, ReadBuf}; +use tokio_util::codec::{BytesCodec, FramedRead}; + +/// object_store expected stream IO type +pub type StreamedObject = BoxStream<'static, object_store::Result>; + +/// identifier for `object_store::Error::Generic` +const DATA_CACHE: &str = "local store accessor"; + +/// Access to stored data. 
+#[derive(Debug)] +pub struct LocalStore { + dir: PathBuf, +} + +impl LocalStore { + pub fn new(path: Option) -> Self { + let dir = path.map(|p| p.to_string()).unwrap_or("/tmp".to_string()); + Self { + dir: Path::new(dir.as_str()).to_owned(), + } + } + + fn local_path(&self, location: &String) -> PathBuf { + self.dir.join(location) + } + + /// Move a given file location, into cache + pub async fn move_file_to_cache(&self, from: PathBuf, location: &String) -> Result<(), Error> { + let to = self.local_path(location); + match to.parent() { + None => { + return Err(Error::new( + std::io::ErrorKind::InvalidData, + "object location is not valid", + )) + } + Some(path) => create_dir_all(path).await?, + }; + std::fs::rename(from, to) + } + + /// Async write operation + pub async fn write_object( + &self, + location: &String, + size: i64, + mut stream: StreamedObject, + ) -> Result<(), Error> { + if location.starts_with('/') { + return Err(Error::new( + std::io::ErrorKind::InvalidData, + "object location cannot be an absolute path", + )); + } + let path = self.local_path(location); + let mut obj = AsyncStoreObject::new(path.as_path(), size).await?; + + while let Some(maybe_bytes) = stream.next().await { + if maybe_bytes.is_err() { + let _ = obj.delete().await; + return Err(Error::new( + std::io::ErrorKind::InvalidData, + "error reading incoming byte stream", + )); + } + + match obj.write_all(&maybe_bytes.unwrap()).await { + Ok(_) => continue, + Err(e) => { + let _ = obj.delete().await; + return Err(e); + } + } + } + + Ok(()) + } + + /// Read `GET /object` returns a stream + pub async fn read_object(&self, location: &String) -> Result { + if location.starts_with('/') { + return Err(Error::new( + std::io::ErrorKind::InvalidData, + "object location cannot be an absolute path", + )); + } + + // Potential TODO: replace the StreamedObject with sendfile? + // the the client can return a GetResultPayload::File() through the interface. + let path = self.dir.join(location); + Ok(AsyncStoreObject::open(path.as_path()).await?.read_stream()) + } + + /// Delete object in local store, such as on cache eviction. + pub async fn delete_object(&self, location: &String) -> Result<(), Error> { + if location.starts_with('/') { + return Err(Error::new( + std::io::ErrorKind::InvalidData, + "object location cannot be an absolute path", + )); + } + + let path = self.dir.join(location); + AsyncStoreObject::open(path.as_path()).await?.delete().await + } +} + +#[pin_project] +pub struct AsyncStoreObject<'a> { + #[pin] + inner: File, + path: &'a Path, +} + +impl<'a> AsyncStoreObject<'a> { + /// Create a new AsyncStoreObject, honoring the path provided. + async fn new(path: &'a Path, size: i64) -> std::io::Result { + // The path of the object (in the ObjectStore implementations) is: + // ///. + // + // Future cache eviction policies may be mapped to resource allocation per table_id. 
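The absolute-path checks in `write_object`, `read_object`, and `delete_object` above exist because `Path::join` silently discards the base when handed an absolute path; a quick illustration with hypothetical paths:

use std::path::Path;

fn main() {
    let root = Path::new("/var/cache/parquet");
    // Relative keys stay under the cache root.
    assert_eq!(
        root.join("ns/table/partition/file.parquet"),
        Path::new("/var/cache/parquet/ns/table/partition/file.parquet")
    );
    // An absolute key would replace the root entirely, escaping the cache directory.
    assert_eq!(root.join("/etc/hosts"), Path::new("/etc/hosts"));
}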
+ create_dir_all(path.parent().unwrap_or(path)).await?; + let file = File::create(path).await?; + file.set_len(size as u64).await?; + + Ok(Self { inner: file, path }) + } + + async fn open(path: &'a Path) -> std::io::Result { + Ok(Self { + inner: File::open(path).await?, + path, + }) + } + + fn read_stream(self) -> StreamedObject { + Box::pin( + FramedRead::new(self.inner, BytesCodec::new()) + .map_ok(BytesMut::freeze) + .map_err(|e| object_store::Error::Generic { + store: DATA_CACHE, + source: Box::new(e), + }), + ) + } + + async fn delete(&self) -> std::io::Result<()> { + remove_file(self.path).await?; + let dir = self.path.parent().unwrap_or(self.path); + if dir.read_dir()?.next().is_none() { + remove_dir(dir).await + } else { + Ok(()) + } + } +} + +impl<'a> AsyncRead for AsyncStoreObject<'a> { + fn poll_read( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + let this = self.project(); + this.inner.poll_read(cx, buf) + } +} + +impl<'a> AsyncWrite for AsyncStoreObject<'a> { + fn poll_write( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + let this = self.project(); + this.inner.poll_write(cx, buf) + } + + fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.project(); + Box::pin(this.inner.get_mut().sync_all()).poll_unpin(cx) + } + + fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.project(); + Box::pin(this.inner.get_mut().shutdown()).poll_unpin(cx) + } +} + +#[cfg(test)] +mod test { + use std::{hash::Hasher, io::ErrorKind}; + + use assert_matches::assert_matches; + use rand::{distributions::Alphanumeric, thread_rng, Rng}; + use tempfile::TempDir; + use tokio::io::AsyncReadExt; + + use super::*; + + async fn create_incoming_stream(file_path: &PathBuf) -> StreamedObject { + let mut writeable = File::create(file_path) + .await + .expect("should create file in tempdir"); + + for _ in 0..5 { + let rand_string: String = thread_rng() + .sample_iter(&Alphanumeric) + .take(1_000_000) + .map(char::from) + .collect(); + writeable + .write_all(rand_string.as_bytes()) + .await + .expect("should write to mock incoming"); + } + writeable + .sync_all() + .await + .expect("should fsync incoming mock data file"); + + let readable = File::open(file_path) + .await + .expect("file should be readable"); + Box::pin( + FramedRead::new(readable, BytesCodec::new()) + .map_ok(BytesMut::freeze) + .map_err(|e| object_store::Error::Generic { + store: DATA_CACHE, + source: Box::new(e), + }), + ) + } + + async fn run_write_read_test() { + let tempdir = TempDir::new().expect("should make tempdir"); + let incoming_file_path = tempdir.path().join("./incoming-io.txt"); + let obj_stream = create_incoming_stream(&incoming_file_path).await; + + let local_store = LocalStore::new(tempdir.path().to_str()); + let location = "obj/to/write.parquet"; + + let write_res = local_store + .write_object(&location.to_string(), 1_000_000 * 5, obj_stream) + .await; + assert_matches!( + write_res, + Ok(()), + "write should return ok, instead found {:?}", + write_res + ); + + let read_res = local_store.read_object(&location.to_string()).await; + assert!(read_res.is_ok(), "read should return ok"); + + // expected == data which was streamed in to WRITE + let mut expected = Vec::new(); + File::open(incoming_file_path) + .await + .expect("should open original incoming data file") + .read_to_end(&mut expected) + .await + .unwrap(); + let mut expected_hash = ahash::AHasher::default(); + 
expected_hash.write(&expected); + + // got == data that was WRITE then READ + let mut got = Vec::new(); + tokio_util::io::StreamReader::new(read_res.unwrap()) + .read_to_end(&mut got) + .await + .unwrap(); + let mut got_hash = ahash::AHasher::default(); + got_hash.write(&got); + + assert_eq!( + 1_000_000 * 5, + expected.len(), + "incoming mock file stream was incorrect" + ); + assert_eq!( + expected.len(), + got.len(), + "expected {} bytes but found {} bytes", + expected.len(), + got.len() + ); + assert_eq!( + got_hash.finish(), + expected_hash.finish(), + "hash of file contents do not match" + ); + + tempdir.close().expect("should delete tempdir"); + } + + async fn can_duplicate_write_to_key() { + let tempdir = TempDir::new().expect("should make tempdir"); + let incoming_file_path = tempdir.path().join("./incoming-dupe-writes.txt"); + + let local_store = LocalStore::new(tempdir.path().to_str()); + let location = "obj/to/write.parquet"; + + let write_res = local_store + .write_object( + &location.to_string(), + 1_000_000 * 5, + create_incoming_stream(&incoming_file_path).await, + ) + .await; + assert!( + write_res.is_ok(), + "first write should succeed, instead found {:?}", + write_res + ); + + let duplicate_write = local_store + .write_object( + &location.to_string(), + 1_000_000 * 5, + create_incoming_stream(&incoming_file_path).await, + ) + .await; + assert!( + duplicate_write.is_ok(), + "second write should also succeed, instead found {:?}", + duplicate_write + ); + + tempdir.close().expect("should delete tempdir"); + } + + async fn run_delete_test() { + let tempdir = TempDir::new().expect("should make tempdir"); + let incoming_file_path = tempdir.path().join("./incoming-io.txt"); + let obj_stream = create_incoming_stream(&incoming_file_path).await; + + let local_store = LocalStore::new(tempdir.path().to_str()); + let location = "obj/to/write.parquet"; + + let write_res = local_store + .write_object(&location.to_string(), 1_000_000 * 5, obj_stream) + .await; + assert_matches!( + write_res, + Ok(()), + "write should return ok, instead found {:?}", + write_res + ); + + // confirm obj is written + let written_obj_path = tempdir.path().join(location); + let mut written_obj = Vec::new(); + File::open(written_obj_path.clone()) + .await + .expect("should open original incoming data file") + .read_to_end(&mut written_obj) + .await + .unwrap(); + assert_eq!( + 1_000_000 * 5, + written_obj.len(), + "object should be written to full length" + ); + + // delete obj + let del_res = local_store.delete_object(&location.to_string()).await; + assert!(del_res.is_ok(), "should return OK on delete"); + + // confirm does not exist + let should_be_err = File::open(written_obj_path).await; + assert_matches!( + should_be_err, + Err(e) if e.kind() == ErrorKind::NotFound, + "cache obj should not exist" + ); + + tempdir.close().expect("should delete tempdir"); + } + + async fn error_with_absolute_path_in_obj_key() { + let tempdir = TempDir::new().expect("should make tempdir"); + let incoming_file_path = tempdir.path().join("./incoming-abs-key-path.txt"); + + let local_store = LocalStore::new(tempdir.path().to_str()); + let location = "/absolute/pathed/object.parquet"; + + let write_res = local_store + .write_object( + &location.to_string(), + 1_000_000 * 5, + create_incoming_stream(&incoming_file_path).await, + ) + .await; + assert_matches!( + write_res, + Err(e) if e.to_string().contains("object location cannot be an absolute path"), + "expected write to error, instead found {:?}", + write_res + ); + + let 
read_res = local_store.read_object(&location.to_string()).await; + assert!(read_res.is_err(), "expected read to error",); + + let delete_res = local_store.delete_object(&location.to_string()).await; + assert!(delete_res.is_err(), "expected delete to error",); + + tempdir.close().expect("should delete tempdir"); + } + + async fn write_aborts_are_handled() { + let stream_with_partial_write = Box::pin(tokio_stream::iter(vec![Err( + object_store::Error::Generic { + store: "error in bytes stream from remote object store", + source: "delete on first write".into(), + }, + )])) as StreamedObject; + + let tempdir = TempDir::new().expect("should make tempdir"); + let local_store = LocalStore::new(tempdir.path().to_str()); + let location = "obj/to/write.parquet"; + + let write_res = local_store + .write_object( + &location.to_string(), + 1_000_000 * 5, + stream_with_partial_write, + ) + .await; + assert_matches!( + write_res, + Err(e) if e.to_string().contains("error reading incoming byte stream"), + "expected write to error, instead found {:?}", + write_res + ); + + tempdir.close().expect("should delete tempdir"); + } + + async fn partial_files_are_deleted_on_write_abort() { + let stream_with_partial_write = Box::pin(tokio_stream::iter(vec![ + Ok(Bytes::from(&b"good yield"[..])), + Err(object_store::Error::Generic { + store: "error in bytes stream from remote object store", + source: "foobar".into(), + }), + ])) as StreamedObject; + + let tempdir = TempDir::new().expect("should make tempdir"); + let local_store = LocalStore::new(tempdir.path().to_str()); + let location = "obj/to/write.parquet"; + + let write_res = local_store + .write_object( + &location.to_string(), + 1_000_000 * 5, + stream_with_partial_write, + ) + .await; + assert_matches!( + write_res, + Err(e) if e.to_string().contains("error reading incoming byte stream"), + "expected write to error, instead found {:?}", + write_res + ); + + let incoming_file_path = tempdir.path().join("./incoming-partial.txt"); + let should_not_exist = File::open(incoming_file_path).await; + assert_matches!( + should_not_exist, + Err(e) if e.kind() == ErrorKind::NotFound, + "file partial should not exist" + ); + + tempdir.close().expect("should delete tempdir"); + } + + #[tokio::test] + async fn test_write_read_object() { + futures::join!(run_write_read_test(), can_duplicate_write_to_key(),); + } + + #[tokio::test] + async fn test_delete_object() { + run_delete_test().await; + } + + #[tokio::test] + async fn test_error_handling() { + futures::join!( + error_with_absolute_path_in_obj_key(), + write_aborts_are_handled(), + partial_files_are_deleted_on_write_abort(), + ); + } +} diff --git a/parquet_cache/src/server/data/writes.rs b/parquet_cache/src/server/data/writes.rs new file mode 100644 index 00000000000..d42fc5b417f --- /dev/null +++ b/parquet_cache/src/server/data/writes.rs @@ -0,0 +1,69 @@ +use std::sync::Arc; + +use object_store::{GetResult, GetResultPayload, ObjectMeta, ObjectStore}; +use observability_deps::tracing::warn; + +use crate::data_types::WriteHint; + +use super::{store::LocalStore, DataError}; + +/// Handles the WRITE requests (`/write-hint`) +#[derive(Debug, Clone)] +pub struct WriteHandler { + cache: Arc, + direct_store: Arc, +} + +impl WriteHandler { + pub fn new(cache: Arc, direct_store: Arc) -> Self { + Self { + cache, + direct_store, + } + } + + pub async fn write_local( + &self, + location: &str, + write_hint: &WriteHint, + ) -> Result { + // get from remote + let WriteHint { + file_size_bytes, .. 
+ } = write_hint; + let GetResult { meta, payload, .. } = self + .direct_store + .get(&location.into()) + .await + .map_err(|e| match e { + object_store::Error::NotFound { .. } => DataError::DoesNotExist, + _ => DataError::Stream(e.to_string()), + })?; + + if !(meta.size as i64).eq(file_size_bytes) { + warn!( + "failed to perform writeback due to file size mismatch: {} != {}", + meta.size, file_size_bytes + ); + return Err(DataError::BadRequest( + "failed to perform writeback due to file size mismatch".to_string(), + )); + } + + // write local + match payload { + GetResultPayload::File(_, pathbuf) => self + .cache + .move_file_to_cache(pathbuf, &location.into()) + .await + .map_err(|e| DataError::File(e.to_string()))?, + GetResultPayload::Stream(stream) => self + .cache + .write_object(&location.into(), *file_size_bytes, stream) + .await + .map_err(|e| DataError::Stream(e.to_string()))?, + }; + + Ok(meta) + } +} diff --git a/parquet_cache/src/server/error.rs b/parquet_cache/src/server/error.rs new file mode 100644 index 00000000000..24e87c49f91 --- /dev/null +++ b/parquet_cache/src/server/error.rs @@ -0,0 +1,55 @@ +use hyper::StatusCode; + +use crate::server::data::DataError; + +/// Error type for the server. +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// Error in the keyspace layer. + #[error("Keyspace error: {0}")] + Keyspace(String), + /// Error in the precondition layer. + #[error("Precondition error: {0}")] + Precondition(String), + /// Error in the data layer. + #[error("Data error: {0}")] + Data(#[from] DataError), + + /// Error with warming. + #[error("Warming error: {0}")] + Warming(String), + /// Cache miss. + #[error("Cache miss")] + CacheMiss, + /// Bad request from the user. + #[error("Bad Request: {0}")] + BadRequest(String), + /// Object does not exist. + #[error("Bad Request: object location does not exist in catalog or object store")] + DoesNotExist, + /// Error due to server shutdown. + #[error("Server shutdown")] + ServerShutdown, +} + +impl Error { + /// Return the HTTP status code for this error. + /// + /// Should match the handling, per code, in the [client](crate::client::object_store::DataCacheObjectStore). + pub fn code(&self) -> StatusCode { + match self { + // If errors here, have the client return an error. + Self::BadRequest(_) + | Self::DoesNotExist + | Self::Data(DataError::BadRequest(_)) + | Self::Data(DataError::DoesNotExist) => StatusCode::BAD_REQUEST, + Self::Precondition(_) => StatusCode::PRECONDITION_FAILED, + // If errors below here, result in the client using the fallback. 
+ Self::CacheMiss => StatusCode::NOT_FOUND, + Self::Keyspace(_) | Self::Warming(_) | Self::Data(_) => { + StatusCode::INTERNAL_SERVER_ERROR + } + Self::ServerShutdown => StatusCode::SERVICE_UNAVAILABLE, + } + } +} diff --git a/parquet_cache/src/server/keyspace.rs b/parquet_cache/src/server/keyspace.rs new file mode 100644 index 00000000000..88ea0f8310f --- /dev/null +++ b/parquet_cache/src/server/keyspace.rs @@ -0,0 +1,957 @@ +use std::{path::Path, sync::Arc, task::Poll}; + +use arc_swap::ArcSwap; +use futures::Future; +use http::{Method, Request}; +use hyper::Body; +use mpchash::HashRing; +use notify::{RecommendedWatcher, RecursiveMode, Watcher}; +use observability_deps::tracing::error; +use tokio::{sync::Notify, task::JoinHandle}; +use tower::{Layer, Service}; + +use crate::{ + data_types::{ + InstanceState, KeyspaceResponseBody, KeyspaceVersion, ParquetCacheInstanceSet, ServiceNode, + ServiceNodeHostname, ServiceNodeId, + }, + server::response::Response, +}; + +use super::{error::Error, response::PinnedFuture}; + +struct BackgroundTask { + path: String, + fswatcher: RecommendedWatcher, + notifier_handle: JoinHandle<()>, +} + +impl Drop for BackgroundTask { + fn drop(&mut self) { + if let Err(e) = self.fswatcher.unwatch(Path::new(&self.path)) { + error!("KeyspaceService fswatcher failed to unwatch: {}", e) + } + self.notifier_handle.abort(); + } +} + +/// Service that applies the keyspace per request. +pub struct KeyspaceService { + shared: Arc, + ready_tx: Arc, + ready_rx: std::pin::Pin + Send + Sync + 'static>>, + keyspace: Arc, + inner: S, +} + +impl std::fmt::Debug for KeyspaceService { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("KeyspaceService") + .field("keyspace", &self.keyspace) + .finish_non_exhaustive() + } +} + +impl Clone for KeyspaceService { + fn clone(&self) -> Self { + let ready_rx = Arc::clone(&self.ready_tx); + let ready_rx = Box::pin(async move { + ready_rx.notified().await; + }); + + Self { + shared: Arc::clone(&self.shared), + ready_tx: Arc::clone(&self.ready_tx), + ready_rx, + keyspace: Arc::clone(&self.keyspace), + inner: self.inner.clone(), + } + } +} + +impl> + Clone + Send + Sync + 'static> KeyspaceService { + fn new(inner: S, configfile_path: String, node_hostname: String) -> Result { + let path = configfile_path.clone(); + + let data = Arc::new(KeyspaceData::new(node_hostname)); + let keyspace = Arc::new(Keyspace { + data: data.into(), + configfile_path, + }); + + let ready_tx = Arc::new(Notify::new()); + let (fswatcher, notifier_handle) = + Self::start_background_task(Arc::clone(&keyspace), Arc::clone(&ready_tx))?; + + let ready_rx = Arc::clone(&ready_tx); + let ready_rx = Box::pin(async move { + ready_rx.notified().await; + }); + + Ok(Self { + shared: Arc::new(BackgroundTask { + path, + fswatcher, + notifier_handle, + }), + ready_tx, + ready_rx, + keyspace, + inner, + }) + } + + fn start_background_task( + keyspace: Arc, + ready_tx: Arc, + ) -> Result<(RecommendedWatcher, JoinHandle<()>), Error> { + let changed = Arc::new(Notify::new()); + let has_changed = Arc::clone(&changed); + + let configfile_path = keyspace.configfile_path.clone(); + let ready_tx_ = Arc::clone(&ready_tx); + let keyspace_ = Arc::clone(&keyspace); + + // start watcher -- default is to poll for changes every 30 seconds + let watcher_and_listener = + notify::recommended_watcher(move |res: notify::Result| match res { + Ok(notify::Event { kind, .. 
}) => { + if kind.is_modify() || kind.is_create() { + has_changed.notify_one(); + } + } + Err(e) => error!(error=%e, "KeyspaceService fswatcher failed"), + }) + .and_then(move |mut watcher| { + watcher.watch(Path::new(&configfile_path), RecursiveMode::NonRecursive)?; + Ok(( + watcher, + tokio::spawn(async move { + loop { + changed.notified().await; + keyspace.update(Arc::clone(&ready_tx)).await; + } + }), + )) + }) + .map_err(|e| Error::Keyspace(e.to_string()))?; + + // handle race where the file is created before the watcher is started + if Path::exists(Path::new(&keyspace_.configfile_path)) { + tokio::spawn(async move { + keyspace_.update(ready_tx_).await; + }); + } + + Ok(watcher_and_listener) + } +} + +impl Service> for KeyspaceService +where + S: Service, Future = PinnedFuture, Error = Error> + Clone + Send + Sync + 'static, +{ + type Response = super::response::Response; + type Error = Error; + type Future = PinnedFuture; + + fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll> { + if !self.keyspace.ready() { + futures::ready!(self.ready_rx.as_mut().poll(cx)); + } + self.inner.poll_ready(cx) + } + + fn call(&mut self, req: Request) -> Self::Future { + match (req.method(), req.uri().path()) { + (&Method::GET, "/state") => { + let this = self.clone(); + Box::pin(async move { + // return the version we have loaded + // serde serialization will add the CacheState enum, based on this version + Ok(Response::KeyspaceVersion( + this.keyspace.data.load().version.clone(), + )) + }) + } + (&Method::PATCH, "/warmed") => { + let this = self.clone(); + Box::pin(async move { + this.keyspace.set_to_running(); + Ok(Response::Ready) + }) + } + (&Method::GET, "/keyspace") => { + let this = self.clone(); + Box::pin(async move { + let (_, _, keyspace) = this.keyspace.read_definition().await; + Ok(Response::Keyspace(keyspace)) + }) + } + (&Method::GET, "/metadata") + | (&Method::GET, "/object") + | (&Method::POST, "/write-hint") => { + let clone = self.inner.clone(); + let mut inner = std::mem::replace(&mut self.inner, clone); + let this = self.clone(); + Box::pin(async move { + let as_url = url::Url::parse(req.uri().to_string().as_str()) + .expect("should be already validated path & query"); + let obj_location = match as_url.query_pairs().find(|(k, _v)| k.eq("location")) { + None => { + return Err(Error::Keyspace( + "invalid or missing object location".into(), + )); + } + Some((_key, location)) => location.to_string(), + }; + + // when keyspace is invalid (being re-built), return error such that + // cache client decides to (1) re-fetch keyspace, and/or (2) uses fallback + match this.keyspace.in_keyspace(&obj_location) { + true => inner.call(req).await, + false => Err(Error::Keyspace(format!( + "object {} is not found in keyspace", + obj_location + ))), + } + }) + } + (any_method, any_path) => { + let msg = format!("invalid path: {} {}", any_method, any_path); + Box::pin(async { Err(Error::BadRequest(msg)) }) + } + } + } +} + +#[derive(Debug, Clone)] +struct KeyspaceData { + /// ID self + /// Is none if keyspace has been invalidated. + own: Option, + // Hashring + keyspace: Arc>, + /// Versioning, so can provide current vs next, per GET `/state` request + /// Is none if Self::Pending (a.k.a. 
no definition loaded yet) + version: KeyspaceVersion, +} + +impl KeyspaceData { + pub fn new(self_node: ServiceNodeHostname) -> Self { + Self { + own: None, + keyspace: Default::default(), + version: KeyspaceVersion::new(self_node), + } + } +} + +#[derive(Debug)] +struct Keyspace { + /// Atomically updated keyspace data. + data: ArcSwap, + /// Fs-accessible file containing the [`ParquetCacheInstanceSet`] + configfile_path: String, +} + +impl Keyspace { + /// `Valid` as in able to check keyspace hashring. + /// + /// Returns true if the keyspace definition exists, and own-node is within keyspace. + fn is_valid(&self) -> bool { + self.data.load().own.is_some() + } + + /// `Ready` as in poll_ready (to receive requests). + /// Returns true if the keyspace is not in the init phase. + /// + /// Request include `GET /state` requests during warming and cooling phases. + fn ready(&self) -> bool { + let data = self.data.load(); + match InstanceState::from(&data.as_ref().version) { + InstanceState::Pending => false, + InstanceState::Warming | InstanceState::Running | InstanceState::Cooling => true, + } + } + + /// `Running` as in the [`InstanceState`]. + fn set_to_running(&self) { + self.data.rcu(|data| KeyspaceData { + own: data.own, + keyspace: Arc::clone(&data.keyspace), + version: data.version.clone_next_to_curr(), + }); + } + + /// Returns true if the object location is in the keyspace. + fn in_keyspace(&self, object: &String) -> bool { + let data = self.data.load(); + self.is_valid() + && match data.own { + None => false, + Some(id) => match data.keyspace.primary_node(object) { + Some(&assigned_node) => assigned_node == id, + None => false, + }, + } + } + + /// Read keyspace definition from file. + async fn read_definition( + &self, + ) -> ( + ParquetCacheInstanceSet, /* KeyspaceVersion.next */ + Option, /* None == current node is not in KeyspaceVersion.next */ + Vec, /* full set of KeyspaceVersion.next hashring */ + ) { + let current_instance_set_next = tokio::fs::read_to_string(self.configfile_path.clone()) + .await + .expect("config map file should always exist on pod"); + let parquet_cache_instance_set: ParquetCacheInstanceSet = + serde_json::from_str(current_instance_set_next.as_str()) + .expect("should have valid ParquetCacheInstanceSet format"); + + let service_nodes = KeyspaceResponseBody::from(&parquet_cache_instance_set).nodes; + + let self_hostname = self.data.load().version.hostname().clone(); + ( + parquet_cache_instance_set, + service_nodes + .iter() + .position(|node| node.hostname == self_hostname) + .map(|node_id| node_id as u64), + service_nodes, + ) + } + + /// Update keyspace definition. + async fn update(&self, ready: Arc) { + let (next_version, own, all_nodes) = self.read_definition().await; + + let mut keyspace = HashRing::new(); + for ServiceNode { id, hostname: _ } in all_nodes { + keyspace.add(id); + } + let keyspace = Arc::new(keyspace); + + // determine if KeyspaceVersion changed + let prev_data = self.data.rcu(|curr_data| { + match &curr_data.version.next { + Some(next) if next_version.revision == next.revision => { + // no change -- already knows about next + Arc::clone(curr_data) + } + _ => Arc::new(KeyspaceData { + own, + keyspace: Arc::clone(&keyspace), + version: curr_data.version.set_next(next_version.to_owned()), + }), + } + }); + + if InstanceState::from(&prev_data.version) == InstanceState::Pending && self.ready() { + // Let anyone waiting on poll_ready know that we're no longer pending. 
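The membership check in `in_keyspace` above boils down to hashing the object location onto the ring and comparing the owning node id with this instance's own id. A minimal sketch, reusing the same `HashRing` calls and assuming node ids are the `u64` positions assigned in `read_definition`:

use mpchash::HashRing;

// Hypothetical helpers mirroring Keyspace::update and Keyspace::in_keyspace.
fn build_ring(node_ids: impl IntoIterator<Item = u64>) -> HashRing<u64> {
    let mut ring = HashRing::new();
    for id in node_ids {
        ring.add(id);
    }
    ring
}

fn owns(ring: &HashRing<u64>, own_id: u64, location: &String) -> bool {
    matches!(ring.primary_node(location), Some(&id) if id == own_id)
}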
+ ready.notify_waiters(); + } + } +} + +pub struct BuildKeyspaceService { + pub configfile_path: String, + pub node_hostname: String, +} + +impl> + Clone + Send + Sync + 'static> Layer for BuildKeyspaceService { + type Service = KeyspaceService; + + fn layer(&self, service: S) -> Self::Service { + KeyspaceService::new( + service, + self.configfile_path.clone(), + self.node_hostname.clone(), + ) + .expect("cache server failed to deploy due to keyspace layer init error") + } +} + +#[cfg(test)] +mod test { + use std::{ + io::{Seek, Write}, + sync::atomic::{AtomicU32, Ordering}, + task::Context, + time::Duration, + }; + + use assert_matches::assert_matches; + use futures::{future, task::noop_waker_ref}; + use tempfile::{NamedTempFile, TempDir}; + use tokio::io::AsyncWriteExt; + use tokio_stream::StreamExt; + use tower::{ServiceBuilder, ServiceExt}; + + use super::super::response::Response; + use super::*; + + const VALID_HOSTNAME: &str = "hostname-a"; + lazy_static::lazy_static! { + static ref KEYSPACE_DEFINITION: String = serde_json::json!(ParquetCacheInstanceSet { + revision: 0, + // a single node in the keyspace, therefore all keys should hash to this keyspace + instances: vec![VALID_HOSTNAME].into_iter().map(String::from).collect(), + }).to_string(); + } + + #[derive(Clone, Default)] + struct MockInnermostService { + call: Arc, + poll_ready: Arc, + } + + impl Service> for MockInnermostService { + type Response = Response; + type Error = Error; + type Future = PinnedFuture; + + fn poll_ready( + &mut self, + _cx: &mut std::task::Context<'_>, + ) -> Poll> { + self.poll_ready.fetch_add(1, Ordering::SeqCst); + Poll::Ready(Ok(())) + } + fn call(&mut self, _req: Request) -> Self::Future { + self.call.fetch_add(1, Ordering::SeqCst); + Box::pin(future::ok(Response::Ready)) + } + } + + fn metadata_req() -> Request { + Request::builder() + .method(Method::GET) + .uri("http://foo.io/metadata?location=bar") + .body(Body::empty()) + .unwrap() + } + + fn object_req() -> Request { + Request::builder() + .method(Method::GET) + .uri("http://foo.io/object?location=bar") + .body(Body::empty()) + .unwrap() + } + + fn write_hint_req() -> Request { + Request::builder() + .method(Method::POST) + .uri("http://foo.io/write-hint?location=bar") + .body(Body::empty()) + .unwrap() + } + + fn state_req() -> Request { + Request::builder() + .method(Method::GET) + .uri("/state") + .body(Body::empty()) + .unwrap() + } + + fn warmed_req() -> Request { + Request::builder() + .method(Method::PATCH) + .uri("/warmed") + .body(Body::empty()) + .unwrap() + } + + fn keyspace_defn_req() -> Request { + Request::builder() + .method(Method::GET) + .uri("/keyspace") + .body(Body::empty()) + .unwrap() + } + + async fn write_defn_to_file(defn: &[u8], configfile_path: &Path) { + let mut file = tokio::fs::File::create(&configfile_path).await.unwrap(); + file.write_all(defn) + .await + .expect("should write keyspace definition to configfile"); + + // notify fswatcher will sometimes skip events when the file descriptor is still open + file.shutdown() + .await + .expect("should shutdown file descriptor"); + } + + #[allow(clippy::future_not_send)] + async fn wait_until_service_is_ready(server: &mut KeyspaceService) { + future::poll_fn(move |cx| server.poll_ready(cx)) + .await + .expect("should not have failed"); + } + + #[tokio::test] + async fn test_keyspace_can_load_definition() { + let mut file = NamedTempFile::new().unwrap(); + writeln!(file, "{}", KEYSPACE_DEFINITION.as_str()) + .expect("should write keyspace definition to configfile"); + 
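For reference, the definition written to the watched file above is just the serialized `ParquetCacheInstanceSet`; assuming default serde field names, it looks roughly like the JSON sketched in this check:

// Hypothetical shape of the watched config file: {"revision":0,"instances":["hostname-a"]}
let expected = serde_json::json!({
    "revision": 0,
    "instances": ["hostname-a"],
});
assert_eq!(
    serde_json::from_str::<serde_json::Value>(KEYSPACE_DEFINITION.as_str()).unwrap(),
    expected
);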
+ let keyspace = Keyspace { + configfile_path: file.path().to_str().unwrap().to_string(), + data: Arc::new(KeyspaceData::new(VALID_HOSTNAME.into())).into(), + }; + + assert!( + !keyspace.is_valid(), + "default keyspace should be invalid, due to no definition loaded" + ); + + let notify = Arc::new(Notify::new()); + keyspace.update(Arc::clone(¬ify)).await; + assert!( + keyspace.is_valid(), + "keyspace should be valid, after definition is loaded" + ); + + // remove from keyspace, by changing keyspace definition + let new_keyspace_definition = serde_json::json!(ParquetCacheInstanceSet { + revision: 1, + instances: vec!["another-node"].into_iter().map(String::from).collect(), + }) + .to_string(); + let mut file = std::fs::OpenOptions::new() + .write(true) + .truncate(true) + .open(file.path()) + .unwrap(); + file.seek(std::io::SeekFrom::Start(0)).unwrap(); // move pointer to start, to overwrite + writeln!(file, "{}", new_keyspace_definition.as_str()) + .expect("should write keyspace definition to configfile"); + file.sync_all().unwrap(); + + // should no longer be in keyspace + keyspace.update(Arc::clone(¬ify)).await; + assert!( + !keyspace.is_valid(), + "keyspace should not be valid, when own-hostname not in definition" + ); + } + + #[tokio::test] + async fn test_keyspace_poll_ready_during_instance_phases() { + let mut file = NamedTempFile::new().unwrap(); + writeln!(file, "{}", KEYSPACE_DEFINITION.as_str()) + .expect("should write keyspace definition to configfile"); + + let keyspace = Keyspace { + configfile_path: file.path().to_str().unwrap().to_string(), + data: Arc::new(KeyspaceData::new(VALID_HOSTNAME.into())).into(), + }; + + // init phase + assert!(!keyspace.ready(), "default keyspace should not poll_ready"); + + // warming phase + // this in when the outer service layers will be calling the inner KeyspaceService + let notify = Arc::new(Notify::new()); + keyspace.update(Arc::clone(¬ify)).await; + assert!( + keyspace.ready(), + "keyspace should poll_ready, after definition (with own node) is loaded" + ); + + // running phase + keyspace.set_to_running(); + assert!(keyspace.ready(), "keyspace should poll_ready, when running"); + + // remove from keyspace, by changing keyspace definition + let new_keyspace_definition = serde_json::json!(ParquetCacheInstanceSet { + revision: 1, + instances: vec!["another-node"].into_iter().map(String::from).collect(), + }) + .to_string(); + let mut file = std::fs::OpenOptions::new() + .write(true) + .truncate(true) + .open(file.path()) + .unwrap(); + file.seek(std::io::SeekFrom::Start(0)).unwrap(); // move pointer to start, to overwrite + writeln!(file, "{}", new_keyspace_definition.as_str()) + .expect("should write keyspace definition to configfile"); + file.sync_all().unwrap(); + + // cooling phase + keyspace.update(notify).await; + assert!( + keyspace.ready(), + "keyspace should still poll_ready when cooling, to handle `GET /state` requests" + ); + } + + #[tokio::test] + async fn test_watcher_consumes_definition_file() { + // no keyspace definition + let dir = TempDir::new().unwrap(); + let configfile_path = dir.path().join("configfile.json"); + let mut file = tokio::fs::File::create(&configfile_path).await.unwrap(); + + // start service + let mut server = ServiceBuilder::new() + .layer(BuildKeyspaceService { + configfile_path: configfile_path.to_str().unwrap().to_string(), + node_hostname: VALID_HOSTNAME.into(), + }) + .service(MockInnermostService::default()); + + // assert poll_ready returns pending, when no keyspace definition + assert_matches!( + 
server.poll_ready(&mut Context::from_waker(noop_waker_ref())), + Poll::Pending, + "should return pending status, as keyspace definition does not yet exist" + ); + + // write keyspace definition to configfile + file.write_all(KEYSPACE_DEFINITION.as_bytes()) + .await + .expect("should write keyspace definition to configfile"); + file.shutdown() + .await + .expect("should shutdown file descriptor"); + + // wait for keyspace to be loaded by the watcher + wait_until_service_is_ready(&mut server).await; + + // call service + let res = server.call(state_req()).await; + assert_matches!( + res, + Ok(Response::KeyspaceVersion(ver)) if InstanceState::from(&ver) == InstanceState::Warming, + "should return successful response, instead found {:?}", + res + ); + } + + #[tokio::test] + async fn test_service_instance_phases() { + // provide keyspace definition + let dir = TempDir::new().unwrap(); + let configfile_path = dir.path().join("configfile.json"); + write_defn_to_file(KEYSPACE_DEFINITION.as_bytes(), &configfile_path).await; + + // start service + let innermost_service = MockInnermostService::default(); + let mut server = ServiceBuilder::new() + .layer(BuildKeyspaceService { + configfile_path: configfile_path.to_str().unwrap().to_string(), + node_hostname: VALID_HOSTNAME.into(), + }) + .service(innermost_service.clone()); + + // wait for service.poll_ready to return ready + wait_until_service_is_ready(&mut server).await; + + // call service when warming + let res = server.call(state_req()).await; + assert_matches!( + res, + Ok(Response::KeyspaceVersion(ver)) if InstanceState::from(&ver) == InstanceState::Warming, + "should return InstanceState::Warming, instead found {:?}", + res + ); + + // tell keyspace it's warmed + assert!( + server.call(warmed_req()).await.is_ok(), + "should be able to PATCH /warmed" + ); + + // call poll_ready when warmed + assert_matches!( + server.poll_ready(&mut Context::from_waker(noop_waker_ref())), + Poll::Ready(Ok(_)), + "should return ready status" + ); + + // call `GET /state` when warmed + let res = server.call(state_req()).await; + assert_matches!( + res, + Ok(Response::KeyspaceVersion(ver)) if InstanceState::from(&ver) == InstanceState::Running, + "should return InstanceState::Running, instead found {:?}", + res + ); + + // tell keyspace to cool, by changing keyspace definition + let new_keyspace_definition = serde_json::json!(ParquetCacheInstanceSet { + revision: 1, + instances: vec!["another-node"].into_iter().map(String::from).collect(), + }) + .to_string(); + let mut file = std::fs::OpenOptions::new() + .write(true) + .truncate(true) + .open(&configfile_path) + .unwrap(); + file.seek(std::io::SeekFrom::Start(0)).unwrap(); // move pointer to start, to overwrite + writeln!(file, "{}", new_keyspace_definition.as_str()) + .expect("should write keyspace definition to configfile"); + file.sync_all().unwrap(); + + // waiting for new_keyspace_definition to load + // cannot use poll_ready, as it is already returning ready (to accept `GET /state` requests) + tokio::time::sleep(Duration::from_secs(10)).await; + + // call poll_ready when cooling + assert_matches!( + server.poll_ready(&mut Context::from_waker(noop_waker_ref())), + Poll::Ready(Ok(_)), + "should return ready status" + ); + // call `GET /state` when cooling + let res = server.call(state_req()).await; + assert_matches!( + res, + Ok(Response::KeyspaceVersion(ver)) if InstanceState::from(&ver) == InstanceState::Cooling, + "should return InstanceState::Cooling, instead found {:?}", + res + ); + } + + 
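+    // A minimal sketch of what the configfile contents look like at each phase of
+    // the test above, assuming `ParquetCacheInstanceSet` serializes with its
+    // default serde field names:
+    //
+    //   warming/running: {"revision":0,"instances":["hostname-a"]}
+    //   cooling trigger: {"revision":1,"instances":["another-node"]}
+    //
+    // Bumping `revision` and dropping this node's own hostname from `instances`
+    // is what moves the service from Running to Cooling.
+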
#[tokio::test] + async fn test_keyspace_service_oks_for_included_key() { + // provide keyspace definition + let dir = TempDir::new().unwrap(); + let configfile_path = dir.path().join("configfile.json"); + write_defn_to_file(KEYSPACE_DEFINITION.as_bytes(), &configfile_path).await; + + // start service + let innermost_service = MockInnermostService::default(); + let mut server = ServiceBuilder::new() + .layer(BuildKeyspaceService { + configfile_path: configfile_path.to_str().unwrap().to_string(), + node_hostname: VALID_HOSTNAME.into(), + }) + .service(innermost_service.clone()); + + // wait for service.poll_ready to return ready + wait_until_service_is_ready(&mut server).await; + + // GET /metadata + let res = server.call(metadata_req()).await; + assert!( + res.is_ok(), + "should return successful `GET /metadata`, instead found {:?}", + res + ); + + // GET /object + let res = server.call(object_req()).await; + assert!( + res.is_ok(), + "should return successful `GET /object`, instead found {:?}", + res + ); + + // GET /write-hint + let res = server.call(write_hint_req()).await; + assert!( + res.is_ok(), + "should return successful `POST /write-hint`, instead found {:?}", + res + ); + } + + #[tokio::test] + async fn test_keyspace_service_errs_for_excluded_key() { + // provide keyspace definition + let dir = TempDir::new().unwrap(); + let configfile_path = dir.path().join("configfile.json"); + write_defn_to_file(KEYSPACE_DEFINITION.as_bytes(), &configfile_path).await; + + // start service + let innermost_service = MockInnermostService::default(); + let mut server = ServiceBuilder::new() + .layer(BuildKeyspaceService { + configfile_path: configfile_path.to_str().unwrap().to_string(), + node_hostname: VALID_HOSTNAME.into(), + }) + .service(innermost_service.clone()); + + // wait for keyspace to be loaded by the watcher + wait_until_service_is_ready(&mut server).await; + + // update, to remove self from keyspace + server.keyspace.data.rcu(|data| { + Arc::new(KeyspaceData { + own: None, + keyspace: Arc::clone(&data.keyspace), + version: data.version.set_next(ParquetCacheInstanceSet { + revision: data.version.next.as_ref().unwrap().revision + 1, + instances: vec!["another-node"].into_iter().map(String::from).collect(), + }), + }) + }); + + // GET /metadata + let res = server.call(metadata_req()).await; + assert_matches!( + res, + Err(Error::Keyspace(_)), + "should return errored `GET /metadata`, instead found {:?}", + res + ); + + // GET /object + let res = server.call(object_req()).await; + assert_matches!( + res, + Err(Error::Keyspace(_)), + "should return errored `GET /object`, instead found {:?}", + res + ); + + // GET /write-hint + let res = server.call(write_hint_req()).await; + assert_matches!( + res, + Err(Error::Keyspace(_)), + "should return errored `POST /write-hint`, instead found {:?}", + res + ); + } + + #[tokio::test] + async fn test_keyspace_service_fetch_keyspace() { + // provide keyspace definition + let dir = TempDir::new().unwrap(); + let configfile_path = dir.path().join("configfile.json"); + write_defn_to_file(KEYSPACE_DEFINITION.as_bytes(), &configfile_path).await; + + // start service + let innermost_service = MockInnermostService::default(); + let mut server = ServiceBuilder::new() + .layer(BuildKeyspaceService { + configfile_path: configfile_path.to_str().unwrap().to_string(), + node_hostname: VALID_HOSTNAME.into(), + }) + .service(innermost_service.clone()); + + // wait for service.poll_ready to return ready + wait_until_service_is_ready(&mut server).await; + + // GET 
/keyspace + let res = server.call(keyspace_defn_req()).await; + assert_matches!( + res, + Ok(Response::Keyspace(nodes)) if matches!( + &nodes[..], + [ServiceNode { id: 0, hostname }] if hostname == VALID_HOSTNAME + ), + "should return successful `GET /keyspace`, instead found {:?}", + res + ); + } + + mod usage_of_poll_ready { + use super::*; + + #[tokio::test] + async fn test_poll_ready_is_not_triggered_on_call() { + // provide keyspace definition + let dir = TempDir::new().unwrap(); + let configfile_path = dir.path().join("configfile.json"); + write_defn_to_file(KEYSPACE_DEFINITION.as_bytes(), &configfile_path).await; + + // start service + let innermost_service = MockInnermostService::default(); + let mut server = ServiceBuilder::new() + .layer(BuildKeyspaceService { + configfile_path: configfile_path.to_str().unwrap().to_string(), + node_hostname: VALID_HOSTNAME.into(), + }) + .service(innermost_service.clone()); + + // wait for keyspace to be loaded by the watcher + wait_until_service_is_ready(&mut server).await; + let init_poll_ready = innermost_service.poll_ready.load(Ordering::SeqCst); + + // call service + // use `GET /object` since it calls inner service + let res = server.call(object_req()).await; + assert!( + res.is_ok(), + "should return successful response, instead found {:?}", + res + ); + + // assert that poll_ready was not called + assert_eq!( + innermost_service.call.load(Ordering::SeqCst), + 1, + "should call innermost service once" + ); + assert_eq!( + innermost_service.poll_ready.load(Ordering::SeqCst), + init_poll_ready, + "should not have called innermost poll_ready, on Service::call()" + ); + } + + #[tokio::test] + async fn test_poll_ready_used_when_connected_to_stream() { + // provide keyspace definition + let dir = TempDir::new().unwrap(); + let configfile_path = dir.path().join("configfile.json"); + write_defn_to_file(KEYSPACE_DEFINITION.as_bytes(), &configfile_path).await; + + // start service + let innermost_service = MockInnermostService::default(); + let mut server = ServiceBuilder::new() + .layer(BuildKeyspaceService { + configfile_path: configfile_path.to_str().unwrap().to_string(), + node_hostname: VALID_HOSTNAME.into(), + }) + .service(innermost_service.clone()); + + // Stream of requests, processed by service. 
+ let (reqs, rx) = futures::channel::mpsc::unbounded(); + let mut resps = server.clone().call_all(rx); + + // wait for service.poll_ready to return ready + wait_until_service_is_ready(&mut server).await; + let init_poll_ready = innermost_service.poll_ready.load(Ordering::SeqCst); + + // stream Service::call() requests + vec![metadata_req(), object_req(), write_hint_req()] + .into_iter() + .for_each(|req| { + reqs.unbounded_send(req).unwrap(); + }); + drop(reqs); + + // await responses + while let Some(rsp) = resps.next().await { + assert!( + rsp.is_ok(), + "should return successful response, instead found {:?}", + rsp + ); + } + + // assert that Service::poll_ready() was called at least as many times as Service::call() + assert_eq!( + innermost_service.call.load(Ordering::SeqCst), + 3, + "should call innermost service once" + ); + assert!( + innermost_service.poll_ready.load(Ordering::SeqCst) >= 3 + init_poll_ready, + "should have called innermost poll_ready" + ); + } + } +} diff --git a/parquet_cache/src/server/mock.rs b/parquet_cache/src/server/mock.rs new file mode 100644 index 00000000000..deebe15ce58 --- /dev/null +++ b/parquet_cache/src/server/mock.rs @@ -0,0 +1,217 @@ +use std::{ + collections::{HashMap, HashSet}, + convert::Infallible, + ops::Range, + sync::Arc, +}; + +use bytes::{BufMut, Bytes, BytesMut}; +use hyper::{ + server::conn::{AddrIncoming, AddrStream}, + service::{make_service_fn, service_fn}, + Body, Method, Request, Response, Server, +}; +use object_store::ObjectStore; +use parking_lot::Mutex; +use std::net::SocketAddr; +use tokio::{net::TcpListener, sync::oneshot, task::JoinHandle}; + +use crate::data_types::{ + KeyspaceResponseBody, ServiceNode, X_RANGE_END_HEADER, X_RANGE_START_HEADER, +}; + +#[allow(missing_debug_implementations)] +pub struct MockCacheServer { + addr: SocketAddr, + stop: oneshot::Sender<()>, + join: JoinHandle<()>, + req_handler: Arc, +} + +impl MockCacheServer { + pub async fn create(addr: &str, _object_store: Arc) -> Self { + let listener = TcpListener::bind(addr) + .await + .expect("listener should have bound to addr"); + let addr = listener.local_addr().unwrap(); + + let req_handler: Arc = + Arc::new(MockCacheServerRequestHandler::new(addr.to_string())); + + let handler = Arc::clone(&req_handler); + let make_svc = make_service_fn(move |_socket: &AddrStream| { + let handler = Arc::clone(&handler); + async move { + Ok::<_, Infallible>(service_fn(move |req: Request| { + let handler = Arc::clone(&handler); + async move { Arc::clone(&handler).handle(req) } + })) + } + }); + + let (tx, rx) = tokio::sync::oneshot::channel::<()>(); + + let join = tokio::spawn(async { + Server::builder(AddrIncoming::from_listener(listener).unwrap()) + .http2_only(true) + .serve(make_svc) + .with_graceful_shutdown(async { + rx.await.ok(); + }) + .await + .unwrap() + }); + + Self { + addr, + stop: tx, + join, + req_handler, + } + } + + pub fn addr(&self) -> String { + format!("http://{}", self.addr) + } + + pub async fn close(self) { + self.stop + .send(()) + .expect("Error sending stop signal to server"); + self.join + .await + .expect("Error stopping parquet cache server"); + } + + pub fn was_called(&self, path_and_query: &String) -> bool { + self.req_handler.called.lock().contains(path_and_query) + } + + pub fn was_called_with_payload(&self, path_and_query: &String) -> bool { + self.req_handler.called.lock().contains(path_and_query) + } + + pub fn respond_with(&self, path_and_query: String, expected: ExpectedResponse) { + self.req_handler + .respond_with + .lock() + 
.insert(path_and_query, expected); + } +} + +#[derive(Clone)] +pub struct MockCacheServerRequestHandler { + pub hostname: String, + pub called: Arc>>, // route_&_query + pub respond_with: Arc>>, // route_&_query, reponse_payload_body +} + +#[derive(Clone, Debug)] +pub struct ExpectedResponse { + pub bytes: Bytes, + pub range: Option>, +} + +impl MockCacheServerRequestHandler { + fn new(hostname: String) -> Self { + Self { + hostname, + called: Default::default(), + respond_with: Default::default(), + } + } + + fn handle(&self, req: Request) -> Result, Infallible> { + let path_and_query = req.uri().path_and_query().unwrap().to_string(); + + match (req.method(), req.uri().path()) { + (&Method::GET, "/keyspace") => { + self.insert_into_tracker(req); + + let body = KeyspaceResponseBody { + nodes: vec![ServiceNode { + id: 42, + hostname: self.hostname.clone(), + }], + }; + + Ok::<_, Infallible>(Response::new(Body::from(build_resp_body(&body)))) + } + (&Method::GET, "/metadata") => { + self.insert_into_tracker(req); + Ok::<_, Infallible>(Response::new(self.get_resp_body(&path_and_query))) + } + (&Method::GET, "/object") => { + // assert range header in mock server + if let Some(range) = req.headers().get("range") { + // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Range + // =- + let mut range = range.to_str().unwrap().to_string(); + range = range + .strip_prefix("bytes=") + .expect("should start range header with `bytes=`") + .to_string(); + let (start, end) = range.split_at( + range + .find('-') + .expect("should have dash delineating range `start-end`"), + ); + assert!(start.parse::().is_ok()); + assert!(end[1..].parse::().is_ok()); + }; + + self.insert_into_tracker(req); + + let range = self + .get_size_range(&path_and_query) + .expect("should have used respond_with() for mocked response"); + + let resp = Response::builder() + .header(X_RANGE_START_HEADER, range.start.to_string()) + .header(X_RANGE_END_HEADER, range.end.to_string()) + .body(self.get_resp_body(&path_and_query)) + .expect("should be a valid response"); + + Ok::<_, Infallible>(resp) + } + (&Method::POST, "/write-hint") => { + self.insert_into_tracker(req); + Ok::<_, Infallible>(Response::new(Body::empty())) + } + _ => unimplemented!(), + } + } + + fn insert_into_tracker(&self, req: Request) { + self.called.lock().insert( + req.uri() + .path_and_query() + .expect("should exist") + .to_string(), + ); + } + + fn get_resp_body(&self, path_and_query: &String) -> Body { + match self.respond_with.lock().get(path_and_query) { + None => Body::empty(), + Some(expected) => Body::from(expected.clone().bytes), + } + } + + fn get_size_range(&self, path_and_query: &String) -> Option> { + self.respond_with + .lock() + .get(path_and_query) + .map(|expected| expected.clone().range.unwrap()) + } +} + +pub fn build_resp_body(body: &T) -> Bytes +where + T: Sized + serde::Serialize, +{ + let mut buf = BytesMut::new().writer(); + serde_json::to_writer(&mut buf, body).expect("should write response body"); + + buf.into_inner().freeze() +} diff --git a/parquet_cache/src/server/precondition.rs b/parquet_cache/src/server/precondition.rs new file mode 100644 index 00000000000..a591e7c0f08 --- /dev/null +++ b/parquet_cache/src/server/precondition.rs @@ -0,0 +1,57 @@ +use std::task::Poll; + +use http::{HeaderMap, Request}; +use hyper::Body; +use object_store::ObjectMeta; +use tower::{Layer, Service}; + +use super::error::Error; +use super::response::PinnedFuture; + +/// Service that applies the preconditions per request. 
+/// +/// Refer to GetOptions: +/// +#[derive(Debug, Clone)] +pub struct PreconditionService { + inner: S, +} + +impl PreconditionService { + pub fn new(inner: S) -> Self { + Self { inner } + } + + fn passes(&self, _preconditions: HeaderMap, _metadata: ObjectMeta) -> bool { + unimplemented!("TODO: precondition applied for any request, per HTTP header contract") + } +} + +impl Service> for PreconditionService +where + S: Service, Future = PinnedFuture, Error = Error> + Clone + Send + Sync + 'static, +{ + type Response = super::response::Response; + type Error = Error; + type Future = super::response::PinnedFuture; + + fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll> { + self.inner.poll_ready(cx) + } + + fn call(&mut self, req: Request) -> Self::Future { + let clone = self.inner.clone(); + let mut inner = std::mem::replace(&mut self.inner, clone); + Box::pin(async move { inner.call(req).await }) + } +} + +pub struct BuildPreconditionService; + +impl Layer for BuildPreconditionService { + type Service = PreconditionService; + + fn layer(&self, service: S) -> Self::Service { + PreconditionService::new(service) + } +} diff --git a/parquet_cache/src/server/response.rs b/parquet_cache/src/server/response.rs new file mode 100644 index 00000000000..70cb31a255d --- /dev/null +++ b/parquet_cache/src/server/response.rs @@ -0,0 +1,83 @@ +use std::{fmt::Debug, pin::Pin}; + +use bytes::{BufMut, Bytes, BytesMut}; +use futures::{stream::BoxStream, Future}; +use http::StatusCode; +use hyper::Body; + +use crate::data_types::{ + GetObjectMetaResponse, KeyspaceResponseBody, KeyspaceVersion, ServiceNode, +}; + +pub type PinnedFuture = Pin> + Send>>; + +pub enum Response { + /// Internal-only response used during pre-warming, for `PATCH /warmed` + Ready, + /// For `GET /keyspace` + Keyspace(Vec), + /// For `GET /state` + KeyspaceVersion(KeyspaceVersion), + /// For `GET /metadata` + Head(GetObjectMetaResponse), + /// For `GET /object` + Data(BoxStream<'static, object_store::Result>), + /// For `POST /write-hint` + Written, +} + +impl Debug for Response { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Ready => write!(f, "Response::Ready"), + Self::Keyspace(k) => write!(f, "Response::Keyspace({:?})", k), + Self::KeyspaceVersion(v) => write!(f, "Response::KeyspaceVersion({:?})", v), + Self::Head(h) => write!(f, "Response::Head({:?})", h), + Self::Data(_) => write!(f, "Response::Data"), + Self::Written => write!(f, "Response::Written"), + } + } +} + +impl Response { + pub fn code(&self) -> StatusCode { + match self { + Self::Ready => { + unreachable!("should be an internal-only Response, and not sent across the wire") + } + Self::Keyspace(_) | Self::KeyspaceVersion(_) | Self::Head(_) | Self::Data(_) => { + StatusCode::OK + } + Self::Written => StatusCode::CREATED, + } + } +} + +impl From for Body { + fn from(value: Response) -> Self { + match value { + Response::Ready => { + unreachable!("should be an internal-only Response, and not sent across the wire") + } + Response::Keyspace(nodes) => { + Self::from(build_resp_body(&KeyspaceResponseBody { nodes })) + } + Response::KeyspaceVersion(version) => { + Self::from(serde_json::json!(version).to_string()) + } + Response::Head(data) => Self::from(build_resp_body(&data)), + Response::Data(stream) => Self::wrap_stream(stream), + Response::Written => Self::empty(), + } + } +} + +fn build_resp_body(body: &T) -> Bytes +where + T: Sized + serde::Serialize, +{ + let mut buf = BytesMut::new().writer(); + 
serde_json::to_writer(&mut buf, body).expect("should write response body"); + + buf.into_inner().freeze() +} diff --git a/parquet_file/Cargo.toml b/parquet_file/Cargo.toml index 5f616f1d3f3..4f59e04dd97 100644 --- a/parquet_file/Cargo.toml +++ b/parquet_file/Cargo.toml @@ -5,8 +5,11 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # In alphabetical order -arrow = { workspace = true, features = ["prettyprint"] } +arrow = { workspace = true } base64 = "0.21" bytes = "1.5" data_types = { path = "../data_types" } @@ -17,19 +20,19 @@ generated_types = { path = "../generated_types" } iox_time = { path = "../iox_time" } object_store = { workspace = true } observability_deps = { path = "../observability_deps" } -parquet = { workspace = true, features = ["experimental"]} -pbjson-types = "0.5" -prost = "0.11" +parquet = { workspace = true } +pbjson-types = { workspace = true } +prost = { workspace = true } schema = { path = "../schema" } -snafu = "0.7" -thiserror = "1.0.48" +snafu = "0.8" +thiserror = "1.0.56" thrift = "0.17" -tokio = { version = "1.32", features = ["macros", "parking_lot", "rt", "rt-multi-thread", "sync"] } +tokio = { version = "1.35", features = ["macros", "parking_lot", "rt", "rt-multi-thread", "sync"] } uuid = { version = "1", features = ["v4"] } -zstd = "0.12" +zstd = { version = "0.13", default-features = false } workspace-hack = { version = "0.1", path = "../workspace-hack" } - [dev-dependencies] # In alphabetical order +assert_matches = "1.5.0" rand = "0.8.3" -test_helpers = { version = "0.1.0", path = "../test_helpers" } \ No newline at end of file +test_helpers = { version = "0.1.0", path = "../test_helpers" } diff --git a/parquet_file/src/chunk.rs b/parquet_file/src/chunk.rs index 4e7f4e6bbbb..c9c43257b6b 100644 --- a/parquet_file/src/chunk.rs +++ b/parquet_file/src/chunk.rs @@ -5,10 +5,9 @@ use crate::{ storage::{ParquetExecInput, ParquetStorage}, ParquetFilePath, }; -use data_types::{ParquetFile, TimestampMinMax}; +use data_types::{ObjectStoreId, ParquetFile, TimestampMinMax}; use schema::Schema; use std::{mem, sync::Arc}; -use uuid::Uuid; /// A abstract representation of a Parquet file in object storage, with /// associated metadata. @@ -45,7 +44,7 @@ impl ParquetChunk { } /// Return object store id - pub fn object_store_id(&self) -> Uuid { + pub fn object_store_id(&self) -> ObjectStoreId { self.parquet_file.object_store_id } diff --git a/parquet_file/src/lib.rs b/parquet_file/src/lib.rs index 55ab7f470cb..4dc5a7f2234 100644 --- a/parquet_file/src/lib.rs +++ b/parquet_file/src/lib.rs @@ -17,6 +17,8 @@ )] #![allow(clippy::missing_docs_in_private_items)] +use std::{path::PathBuf, str::FromStr}; + // Workaround for "unused crate" lint false positives. use workspace_hack as _; @@ -26,9 +28,11 @@ pub mod serialize; pub mod storage; pub mod writer; -use data_types::{NamespaceId, ParquetFile, ParquetFileParams, TableId, TransitionPartitionId}; +use data_types::{ + NamespaceId, ObjectStoreId, ParquetFile, ParquetFileParams, PartitionKey, TableId, + TransitionPartitionId, +}; use object_store::path::Path; -use uuid::Uuid; /// Location of a Parquet file within a namespace's object store. /// The exact format is an implementation detail and is subject to change. 
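+// As exercised by the path-parsing tests added later in this patch, the current
+// layout is `<namespace_id>/<table_id>/<partition>/<object_store_id>.parquet`;
+// a minimal sketch (the IDs and partition segment here are illustrative only):
+//
+//   1/2/4/00000000-0000-0000-0000-000000000000.parquet
+//
+// The new `TryFrom<&String>` impl below splits such a path on
+// `object_store::path::DELIMITER` and parses the segments back into a
+// `ParquetFilePath`.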
@@ -37,7 +41,7 @@ pub struct ParquetFilePath { namespace_id: NamespaceId, table_id: TableId, partition_id: TransitionPartitionId, - object_store_id: Uuid, + object_store_id: ObjectStoreId, } impl ParquetFilePath { @@ -46,7 +50,7 @@ impl ParquetFilePath { namespace_id: NamespaceId, table_id: TableId, partition_id: &TransitionPartitionId, - object_store_id: Uuid, + object_store_id: ObjectStoreId, ) -> Self { Self { namespace_id, @@ -73,12 +77,12 @@ impl ParquetFilePath { } /// Get object store ID. - pub fn objest_store_id(&self) -> Uuid { + pub fn object_store_id(&self) -> ObjectStoreId { self.object_store_id } /// Set new object store ID. - pub fn with_object_store_id(self, object_store_id: Uuid) -> Self { + pub fn with_object_store_id(self, object_store_id: ObjectStoreId) -> Self { Self { object_store_id, ..self @@ -108,7 +112,10 @@ impl From<&ParquetFile> for ParquetFilePath { Self { namespace_id: f.namespace_id, table_id: f.table_id, - partition_id: f.partition_id.clone(), + partition_id: TransitionPartitionId::from_parts( + f.partition_id, + f.partition_hash_id.clone(), + ), object_store_id: f.object_store_id, } } @@ -116,19 +123,80 @@ impl From<&ParquetFile> for ParquetFilePath { impl From<&ParquetFileParams> for ParquetFilePath { fn from(f: &ParquetFileParams) -> Self { + let partition_id = + TransitionPartitionId::from_parts(f.partition_id, f.partition_hash_id.clone()); + Self { + partition_id, namespace_id: f.namespace_id, table_id: f.table_id, - partition_id: f.partition_id.clone(), object_store_id: f.object_store_id, } } } +impl TryFrom<&String> for ParquetFilePath { + type Error = object_store::path::Error; + + fn try_from(path: &String) -> Result { + let mut parts = path.split(object_store::path::DELIMITER); + + let namespace_id = parts + .next() + .ok_or(Self::Error::EmptySegment { + path: path.to_owned(), + })? + .parse::() + .map_err(|_| Self::Error::InvalidPath { + path: PathBuf::from(path.to_owned()), + })?; + + let table_id = parts + .next() + .ok_or(Self::Error::EmptySegment { + path: path.to_owned(), + })? 
+ .parse::() + .map_err(|_| Self::Error::InvalidPath { + path: path.clone().into(), + })?; + let table_id = TableId::new(table_id); + + let partition_id = parts.next().ok_or(Self::Error::EmptySegment { + path: path.to_owned(), + })?; + let partition_key = PartitionKey::from(partition_id); + + let object_store_id = parts.next().ok_or(Self::Error::EmptySegment { + path: path.to_owned(), + })?; // uuid.parquet + let object_store_id = + object_store_id + .split('.') + .next() + .ok_or(Self::Error::EmptySegment { + path: path.to_owned(), + })?; + + Ok(Self { + namespace_id: NamespaceId::new(namespace_id), + table_id, + partition_id: TransitionPartitionId::new(table_id, &partition_key), + object_store_id: ObjectStoreId::from_str(object_store_id).map_err(|_| { + Self::Error::InvalidPath { + path: path.clone().into(), + } + })?, + }) + } +} + #[cfg(test)] mod tests { use super::*; + use assert_matches::assert_matches; use data_types::{PartitionId, PartitionKey, TransitionPartitionId}; + use uuid::Uuid; #[test] fn parquet_file_absolute_dirs_and_file_path_database_partition_ids() { @@ -136,7 +204,7 @@ mod tests { NamespaceId::new(1), TableId::new(2), &TransitionPartitionId::Deprecated(PartitionId::new(4)), - Uuid::nil(), + ObjectStoreId::from_uuid(Uuid::nil()), ); let path = pfp.object_store_path(); assert_eq!( @@ -152,7 +220,7 @@ mod tests { NamespaceId::new(1), table_id, &TransitionPartitionId::new(table_id, &PartitionKey::from("hello there")), - Uuid::nil(), + ObjectStoreId::from_uuid(Uuid::nil()), ); let path = pfp.object_store_path(); assert_eq!( @@ -161,4 +229,99 @@ mod tests { /00000000-0000-0000-0000-000000000000.parquet", ); } + + #[test] + fn parquet_file_path_parsed_from_object_store_path() { + let object_store_id = uuid::Uuid::new_v4(); + + // valid + let path = format!("1/2/4/{}.parquet", object_store_id); + let pfp = ParquetFilePath::try_from(&path); + assert_matches!( + pfp, + Ok(res) if res == ParquetFilePath::new( + NamespaceId::new(1), + TableId::new(2), + &TransitionPartitionId::new( + TableId::new(2), + &PartitionKey::from("4"), + ), + ObjectStoreId::from_uuid(object_store_id), + ), + "should parse valid path, instead found {:?}", pfp + ); + + // namespace_id errors + let path = format!("2/4/{}.parquet", object_store_id); + let pfp = ParquetFilePath::try_from(&path); + assert_matches!( + pfp, + Err(e) if matches!(e, object_store::path::Error::EmptySegment { .. }), + "should error when missing part, instead found {:?}", pfp + ); + let path = format!("bad/2/4/{}.parquet", object_store_id); + let pfp = ParquetFilePath::try_from(&path); + assert_matches!( + pfp, + Err(e) if matches!(e, object_store::path::Error::InvalidPath { .. }), + "should error when invalid namespace_id, instead found {:?}", pfp + ); + + // table_id errors + let path = format!("1/4/{}.parquet", object_store_id); + let pfp = ParquetFilePath::try_from(&path); + assert_matches!( + pfp, + Err(e) if matches!(e, object_store::path::Error::EmptySegment { .. }), + "should error when missing part, instead found {:?}", pfp + ); + let path = format!("1/bad/4/{}.parquet", object_store_id); + let pfp = ParquetFilePath::try_from(&path); + assert_matches!( + pfp, + Err(e) if matches!(e, object_store::path::Error::InvalidPath { .. }), + "should error when invalid table_id, instead found {:?}", pfp + ); + + // namespace_id errors + let path = format!("2/4/{}.parquet", object_store_id); + let pfp = ParquetFilePath::try_from(&path); + assert_matches!( + pfp, + Err(e) if matches!(e, object_store::path::Error::EmptySegment { .. 
}), + "should error when missing part, instead found {:?}", pfp + ); + let path = format!("bad/2/4/{}.parquet", object_store_id); + let pfp = ParquetFilePath::try_from(&path); + assert_matches!( + pfp, + Err(e) if matches!(e, object_store::path::Error::InvalidPath { .. }), + "should error when invalid namespace_id, instead found {:?}", pfp + ); + + // partition_id errors + let path = format!("1/2/{}.parquet", object_store_id); + let pfp = ParquetFilePath::try_from(&path); + assert_matches!( + pfp, + Err(e) if matches!(e, object_store::path::Error::EmptySegment { .. }), + "should error when missing part, instead found {:?}", pfp + ); + + // object_store_id errors + let path = "1/2/4".to_string(); + let pfp = ParquetFilePath::try_from(&path); + assert_matches!( + pfp, + Err(e) if matches!(e, object_store::path::Error::EmptySegment { .. }), + "should error when missing part, instead found {:?}", pfp + ); + let path = "1/2/4/bad".to_string(); + let pfp = ParquetFilePath::try_from(&path); + assert_matches!( + pfp, + Err(e) if matches!(e, object_store::path::Error::InvalidPath { .. }), + "should error when invalid object_store_id, instead found {:?}", pfp + ); + } } diff --git a/parquet_file/src/metadata.rs b/parquet_file/src/metadata.rs index fc612e4b959..3e304a8c467 100644 --- a/parquet_file/src/metadata.rs +++ b/parquet_file/src/metadata.rs @@ -89,9 +89,9 @@ use base64::{prelude::BASE64_STANDARD, Engine}; use bytes::Bytes; use data_types::{ - ColumnId, ColumnSet, ColumnSummary, CompactionLevel, InfluxDbType, NamespaceId, - ParquetFileParams, PartitionKey, StatValues, Statistics, TableId, Timestamp, - TransitionPartitionId, + ColumnId, ColumnSet, ColumnSummary, CompactionLevel, CompactionLevelProtoError, InfluxDbType, + NamespaceId, ObjectStoreId, ParquetFileParams, PartitionHashId, PartitionId, PartitionKey, + StatValues, Statistics, TableId, Timestamp, }; use generated_types::influxdata::iox::ingester::v1 as proto; use iox_time::Time; @@ -108,6 +108,7 @@ use parquet::{ statistics::Statistics as ParquetStatistics, }, schema::types::SchemaDescriptor as ParquetSchemaDescriptor, + thrift::TSerializable, }; use prost::Message; use schema::{ @@ -116,9 +117,7 @@ use schema::{ }; use snafu::{ensure, OptionExt, ResultExt, Snafu}; use std::{convert::TryInto, fmt::Debug, mem, sync::Arc}; -use thrift::protocol::{ - TCompactInputProtocol, TCompactOutputProtocol, TOutputProtocol, TSerializable, -}; +use thrift::protocol::{TCompactInputProtocol, TCompactOutputProtocol, TOutputProtocol}; use uuid::Uuid; /// Current version for serialized metadata. @@ -218,6 +217,9 @@ pub enum Error { #[snafu(display("Field missing while parsing IOx metadata: {}", field))] IoxMetadataFieldMissing { field: String }, + #[snafu(display("Cannot parse timestamp from parquet metadata: {}", e))] + IoxInvalidTimestamp { e: String }, + #[snafu(display("Cannot parse IOx metadata from Protobuf: {}", source))] IoxMetadataBroken { source: Box, @@ -234,7 +236,7 @@ pub enum Error { #[snafu(display("{}: `{}`", source, compaction_level))] InvalidCompactionLevel { - source: Box, + source: CompactionLevelProtoError, compaction_level: i32, }, } @@ -251,7 +253,7 @@ pub type Result = std::result::Result; pub struct IoxMetadata { /// The uuid used as the location of the parquet file in the OS. /// This uuid will later be used as the catalog's ParquetFileId - pub object_store_id: Uuid, + pub object_store_id: ObjectStoreId, /// Timestamp when this file was created. 
pub creation_timestamp: Time, @@ -313,7 +315,7 @@ impl IoxMetadata { } /// Convert to protobuf v3 message. - pub(crate) fn to_protobuf(&self) -> std::result::Result, prost::EncodeError> { + pub fn to_protobuf(&self) -> std::result::Result, prost::EncodeError> { let sort_key = self.sort_key.as_ref().map(|key| proto::SortKey { expressions: key .iter() @@ -326,7 +328,7 @@ impl IoxMetadata { }); let proto_msg = proto::IoxMetadata { - object_store_id: self.object_store_id.as_bytes().to_vec(), + object_store_id: self.object_store_id.get_uuid().as_bytes().to_vec(), creation_timestamp: Some(self.creation_timestamp.date_time().into()), namespace_id: self.namespace_id.get(), namespace_name: self.namespace_name.to_string(), @@ -345,7 +347,7 @@ impl IoxMetadata { } /// Read from protobuf message - fn from_protobuf(data: &[u8]) -> Result { + pub fn from_protobuf(data: &[u8]) -> Result { // extract protobuf message from bytes let proto_msg = proto::IoxMetadata::decode(data) .map_err(|err| Box::new(err) as _) @@ -372,11 +374,13 @@ impl IoxMetadata { }); Ok(Self { - object_store_id: parse_uuid(&proto_msg.object_store_id)?.ok_or_else(|| { - Error::IoxMetadataFieldMissing { - field: "object_store_id".to_string(), - } - })?, + object_store_id: ObjectStoreId::from_uuid( + parse_uuid(&proto_msg.object_store_id)?.ok_or_else(|| { + Error::IoxMetadataFieldMissing { + field: "object_store_id".to_string(), + } + })?, + ), creation_timestamp, namespace_id: NamespaceId::new(proto_msg.namespace_id), namespace_name, @@ -399,7 +403,7 @@ impl IoxMetadata { /// the catalog should get valid values out-of-band. pub fn external(creation_timestamp_ns: i64, table_name: impl Into>) -> Self { Self { - object_store_id: Default::default(), + object_store_id: ObjectStoreId::from_uuid(Uuid::nil()), creation_timestamp: Time::from_timestamp_nanos(creation_timestamp_ns), namespace_id: NamespaceId::new(1), namespace_name: "external".into(), @@ -413,8 +417,8 @@ impl IoxMetadata { } /// verify uuid - pub fn match_object_store_id(&self, uuid: Uuid) -> bool { - uuid == self.object_store_id + pub fn match_object_store_id(&self, id: ObjectStoreId) -> bool { + id == self.object_store_id } /// Create a corresponding iox catalog's ParquetFile @@ -434,7 +438,8 @@ impl IoxMetadata { /// [`RecordBatch`]: arrow::record_batch::RecordBatch pub fn to_parquet_file( &self, - partition_id: TransitionPartitionId, + partition_id: PartitionId, + partition_hash_id: Option, file_size_bytes: usize, metadata: &IoxParquetMetaData, column_id_map: F, @@ -486,6 +491,7 @@ impl IoxMetadata { namespace_id: self.namespace_id, table_id: self.table_id, partition_id, + partition_hash_id, object_store_id: self.object_store_id, min_time, max_time, @@ -534,8 +540,7 @@ fn decode_timestamp_from_field( let date_time = value .context(IoxMetadataFieldMissingSnafu { field })? 
.try_into() - .map_err(|e| Box::new(e) as _) - .context(IoxMetadataBrokenSnafu)?; + .map_err(|e: &str| Error::IoxInvalidTimestamp { e: e.to_string() })?; Ok(Time::from_date_time(date_time)) } @@ -985,11 +990,11 @@ mod tests { }; use data_types::CompactionLevel; use datafusion_util::{unbounded_memory_pool, MemoryStream}; - use schema::builder::SchemaBuilder; + use schema::{builder::SchemaBuilder, TIME_DATA_TIMEZONE}; #[test] fn iox_metadata_protobuf_round_trip() { - let object_store_id = Uuid::new_v4(); + let object_store_id = ObjectStoreId::new(); let sort_key = SortKeyBuilder::new().with_col("sort_col").build(); @@ -1018,7 +1023,7 @@ mod tests { #[tokio::test] async fn test_metadata_from_parquet_metadata() { let meta = IoxMetadata { - object_store_id: Default::default(), + object_store_id: ObjectStoreId::new(), creation_timestamp: Time::from_timestamp_nanos(42), namespace_id: NamespaceId::new(1), namespace_name: "bananas".into(), @@ -1101,7 +1106,11 @@ mod tests { } fn to_timestamp_array(timestamps: &[i64]) -> ArrayRef { - let array: TimestampNanosecondArray = timestamps.iter().map(|v| Some(*v)).collect(); + let array = timestamps + .iter() + .map(|v| Some(*v)) + .collect::() + .with_timezone_opt(TIME_DATA_TIMEZONE()); Arc::new(array) } } diff --git a/parquet_file/src/serialize.rs b/parquet_file/src/serialize.rs index 5a25f07df7f..5cec2fb6c94 100644 --- a/parquet_file/src/serialize.rs +++ b/parquet_file/src/serialize.rs @@ -213,7 +213,7 @@ mod tests { record_batch::RecordBatch, }; use bytes::Bytes; - use data_types::{CompactionLevel, NamespaceId, TableId}; + use data_types::{CompactionLevel, NamespaceId, ObjectStoreId, TableId}; use datafusion::parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use datafusion_util::{unbounded_memory_pool, MemoryStream}; use iox_time::Time; @@ -222,7 +222,7 @@ mod tests { #[tokio::test] async fn test_encode_stream() { let meta = IoxMetadata { - object_store_id: Default::default(), + object_store_id: ObjectStoreId::new(), creation_timestamp: Time::from_timestamp_nanos(42), namespace_id: NamespaceId::new(1), namespace_name: "bananas".into(), diff --git a/parquet_file/src/storage.rs b/parquet_file/src/storage.rs index c520e3bd03b..69798b2abbc 100644 --- a/parquet_file/src/storage.rs +++ b/parquet_file/src/storage.rs @@ -122,7 +122,7 @@ impl ParquetExecInput { .clone() .with_metadata(Default::default()), ); - + let statistics = Statistics::new_unknown(&schema); let base_config = FileScanConfig { object_store_url: self.object_store_url.clone(), file_schema: schema, @@ -132,13 +132,12 @@ impl ParquetExecInput { range: None, extensions: None, }]], - statistics: Statistics::default(), + statistics, projection: None, limit: None, table_partition_cols: vec![], // Parquet files ARE actually sorted but we don't care here since we just construct a `collect` plan. 
output_ordering: vec![], - infinite_source: false, }; let exec = ParquetExec::new(base_config, None, None); let exec_schema = exec.schema(); @@ -204,7 +203,7 @@ impl ParquetStorage { pub fn test_df_context(&self) -> SessionContext { // set up "fake" DataFusion session let object_store = Arc::clone(&self.object_store); - let session_ctx = SessionContext::with_config(iox_session_config()); + let session_ctx = SessionContext::new_with_config(iox_session_config()); register_iox_object_store(session_ctx.runtime_env(), self.id, object_store); session_ctx } @@ -298,6 +297,7 @@ impl ParquetStorage { last_modified: Default::default(), size: file_size, e_tag: None, + version: None, }, } } @@ -326,10 +326,10 @@ pub enum ProjectionError { mod tests { use super::*; use arrow::{ - array::{ArrayRef, BinaryArray, Int64Array, StringArray}, + array::{ArrayRef, Int64Array, IntervalMonthDayNanoArray, StringArray}, record_batch::RecordBatch, }; - use data_types::{CompactionLevel, NamespaceId, PartitionId, TableId}; + use data_types::{CompactionLevel, NamespaceId, ObjectStoreId, PartitionId, TableId}; use datafusion::common::DataFusionError; use datafusion_util::{unbounded_memory_pool, MemoryStream}; use iox_time::Time; @@ -442,13 +442,13 @@ mod tests { #[tokio::test] async fn test_schema_check_fail_different_types() { - let batch = RecordBatch::try_from_iter([("a", to_binary_array(&["value"]))]).unwrap(); - let other_batch = RecordBatch::try_from_iter([("a", to_int_array(&[1]))]).unwrap(); + let batch = RecordBatch::try_from_iter([("a", to_interval_array(&[123456]))]).unwrap(); + let other_batch = RecordBatch::try_from_iter([("a", to_int_array(&[123456]))]).unwrap(); let schema = batch.schema(); assert_schema_check_fail( other_batch, schema, - "Error during planning: Cannot cast file schema field a of type Int64 to table schema field of type Binary", + "Error during planning: Cannot cast file schema field a of type Int64 to table schema field of type Interval(MonthDayNano)", ).await; } @@ -584,8 +584,8 @@ mod tests { Arc::new(array) } - fn to_binary_array(strs: &[&str]) -> ArrayRef { - let array: BinaryArray = strs.iter().map(|s| Some(*s)).collect(); + fn to_interval_array(vals: &[i128]) -> ArrayRef { + let array: IntervalMonthDayNanoArray = vals.iter().map(|v| Some(*v)).collect(); Arc::new(array) } @@ -598,7 +598,7 @@ mod tests { ( TransitionPartitionId::Deprecated(PartitionId::new(4)), IoxMetadata { - object_store_id: Default::default(), + object_store_id: ObjectStoreId::new(), creation_timestamp: Time::from_timestamp_nanos(42), namespace_id: NamespaceId::new(1), namespace_name: "bananas".into(), diff --git a/parquet_file/tests/metadata.rs b/parquet_file/tests/metadata.rs index 658d4dc756f..cfdf3ee855c 100644 --- a/parquet_file/tests/metadata.rs +++ b/parquet_file/tests/metadata.rs @@ -5,8 +5,8 @@ use arrow::{ record_batch::RecordBatch, }; use data_types::{ - ColumnId, CompactionLevel, NamespaceId, PartitionId, PartitionKey, TableId, Timestamp, - TransitionPartitionId, + ColumnId, CompactionLevel, NamespaceId, ObjectStoreId, PartitionHashId, PartitionId, + PartitionKey, TableId, Timestamp, TransitionPartitionId, }; use datafusion_util::{unbounded_memory_pool, MemoryStream}; use iox_time::Time; @@ -18,6 +18,7 @@ use parquet_file::{ }; use schema::{ builder::SchemaBuilder, sort::SortKey, InfluxColumnType, InfluxFieldType, TIME_COLUMN_NAME, + TIME_DATA_TIMEZONE, }; #[tokio::test] @@ -52,7 +53,7 @@ async fn test_decoded_iox_metadata() { // And the metadata the batch would be encoded with if it came through the 
// IOx write path. let meta = IoxMetadata { - object_store_id: Default::default(), + object_store_id: ObjectStoreId::new(), creation_timestamp: Time::from_timestamp_nanos(42), namespace_id: NamespaceId::new(1), namespace_name: "bananas".into(), @@ -193,7 +194,7 @@ async fn test_empty_parquet_file_panic() { // And the metadata the batch would be encoded with if it came through the // IOx write path. let meta = IoxMetadata { - object_store_id: Default::default(), + object_store_id: ObjectStoreId::new(), creation_timestamp: Time::from_timestamp_nanos(42), namespace_id: NamespaceId::new(1), namespace_name: "bananas".into(), @@ -285,7 +286,7 @@ async fn test_decoded_many_columns_with_null_cols_iox_metadata() { let sort_key = SortKey::from_columns(sort_key_data); let partition_id = TransitionPartitionId::Deprecated(PartitionId::new(4)); let meta = IoxMetadata { - object_store_id: Default::default(), + object_store_id: ObjectStoreId::new(), creation_timestamp: Time::from_timestamp_nanos(42), namespace_id: NamespaceId::new(1), namespace_name: "bananas".into(), @@ -371,10 +372,11 @@ async fn test_derive_parquet_file_params() { // IOx write path. let table_id = TableId::new(3); let partition_key = PartitionKey::from("potato"); - let partition_id = TransitionPartitionId::new(table_id, &partition_key); + let partition_hash_id = PartitionHashId::new(table_id, &partition_key); + let partition_id = TransitionPartitionId::Deterministic(partition_hash_id.clone()); let meta = IoxMetadata { - object_store_id: Default::default(), + object_store_id: ObjectStoreId::new(), creation_timestamp: Time::from_timestamp_nanos(1234), namespace_id: NamespaceId::new(1), namespace_name: "bananas".into(), @@ -412,9 +414,14 @@ async fn test_derive_parquet_file_params() { ("some_field".into(), ColumnId::new(1)), ("time".into(), ColumnId::new(2)), ]); - let catalog_data = meta.to_parquet_file(partition_id, file_size, &iox_parquet_meta, |name| { - *column_id_map.get(name).unwrap() - }); + let partition_id = PartitionId::new(1); + let catalog_data = meta.to_parquet_file( + partition_id, + Some(partition_hash_id), + file_size, + &iox_parquet_meta, + |name| *column_id_map.get(name).unwrap(), + ); // And verify the resulting statistics used in the catalog. 
// @@ -438,7 +445,11 @@ fn to_string_array(strs: &[&str]) -> ArrayRef { } fn to_timestamp_array(timestamps: &[i64]) -> ArrayRef { - let array: TimestampNanosecondArray = timestamps.iter().map(|v| Some(*v)).collect(); + let array = timestamps + .iter() + .map(|v| Some(*v)) + .collect::() + .with_timezone_opt(TIME_DATA_TIMEZONE()); Arc::new(array) } diff --git a/parquet_to_line_protocol/Cargo.toml b/parquet_to_line_protocol/Cargo.toml index ced9d53f4bc..a1e6b7fe5ca 100644 --- a/parquet_to_line_protocol/Cargo.toml +++ b/parquet_to_line_protocol/Cargo.toml @@ -5,6 +5,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] datafusion = { workspace = true } datafusion_util = { path = "../datafusion_util" } @@ -14,10 +17,9 @@ num_cpus = "1.16.0" object_store = { workspace = true } parquet_file = { path = "../parquet_file" } schema = { path = "../schema" } -tokio = "1.32" -snafu = "0.7" +tokio = "1.35" +snafu = "0.8" workspace-hack = { version = "0.1", path = "../workspace-hack" } - [dev-dependencies] mutable_batch_lp = { path = "../mutable_batch_lp" } diff --git a/parquet_to_line_protocol/src/batch.rs b/parquet_to_line_protocol/src/batch.rs index 1b0b3c22f95..734628ed64d 100644 --- a/parquet_to_line_protocol/src/batch.rs +++ b/parquet_to_line_protocol/src/batch.rs @@ -192,7 +192,7 @@ mod tests { } #[test] - #[should_panic = "Error parsing line protocol: LineProtocol { source: FieldSetMissing, line: 1 }"] + #[should_panic = "Error parsing line protocol: PerLine { lines: [LineProtocol { source: FieldSetMissing, line: 1 }] }"] fn no_fields() { round_trip("my_no_tag_measurement_name,tag=4 1000"); } diff --git a/parquet_to_line_protocol/src/lib.rs b/parquet_to_line_protocol/src/lib.rs index ed8c6a73d05..9efebb6dee9 100644 --- a/parquet_to_line_protocol/src/lib.rs +++ b/parquet_to_line_protocol/src/lib.rs @@ -207,7 +207,7 @@ impl ParquetFileReader { ) -> Result { let runtime = Arc::new(RuntimeEnv::default()); let session_config = iox_session_config(); - let session_state = SessionState::with_config_rt(session_config, runtime); + let session_state = SessionState::new_with_config_rt(session_config, runtime); // Keep metadata so we can find the measurement name let format = ParquetFormat::new().with_skip_metadata(Some(false)); @@ -219,7 +219,7 @@ impl ParquetFileReader { .await .context(InferringSchemaSnafu)?; - let session_ctx = SessionContext::with_state(session_state); + let session_ctx = SessionContext::new_with_state(session_state); Ok(Self { object_store, @@ -237,21 +237,22 @@ impl ParquetFileReader { /// read the parquet file as a stream pub async fn read(&self) -> Result { + let file_schema = self.schema(); + let statistics = Statistics::new_unknown(&file_schema); let base_config = FileScanConfig { object_store_url: self.object_store_url.clone(), - file_schema: self.schema(), + file_schema, file_groups: vec![vec![PartitionedFile { object_meta: self.object_meta.clone(), partition_values: vec![], range: None, extensions: None, }]], - statistics: Statistics::default(), + statistics, projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![], - infinite_source: false, }; // set up enough datafusion context to do the real read session diff --git a/partition/Cargo.toml b/partition/Cargo.toml new file mode 100644 index 00000000000..4ec967cf9a4 --- /dev/null +++ b/partition/Cargo.toml @@ -0,0 +1,37 @@ +[package] +name = "partition" +version.workspace = true +authors.workspace = true +edition.workspace = true 
+license.workspace = true + +[lints] +workspace = true + +[dependencies] +arrow = { workspace = true } +chrono = { version = "0.4", default-features = false } +data_types = { path = "../data_types" } +hashbrown = { workspace = true } +mutable_batch = { path = "../mutable_batch" } +percent-encoding = "2.3.1" +schema = { path = "../schema" } +thiserror = "1.0.56" +unicode-segmentation = "1.10.1" +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] +assert_matches = "1.5.0" +criterion = { version = "0.5", default-features = false, features = [ + "rayon", +] } +generated_types = { path = "../generated_types" } +mutable_batch_lp = { path = "../mutable_batch_lp" } +paste = "1.0.14" +proptest = { version = "1.4.0", default-features = false } +rand = "0.8" +test_helpers = { path = "../test_helpers" } + +[[bench]] +name = "partitioner" +harness = false diff --git a/partition/benches/partitioner.rs b/partition/benches/partitioner.rs new file mode 100644 index 00000000000..21a2f9f7865 --- /dev/null +++ b/partition/benches/partitioner.rs @@ -0,0 +1,246 @@ +use std::path::Path; + +use criterion::{ + criterion_group, criterion_main, measurement::WallTime, BatchSize, BenchmarkGroup, BenchmarkId, + Criterion, Throughput, +}; +use data_types::partition_template::TablePartitionTemplateOverride; +use generated_types::influxdata::iox::partition_template::v1::{self as proto, Bucket}; +use partition::partition_batch; +use schema::Projection; + +fn partitioner_benchmarks(c: &mut Criterion) { + let mut group = c.benchmark_group("partitioner"); + + //////////////////////////////////////////////////////////////////////////// + // A medium batch. + bench( + &mut group, + "tag_hit", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("env".to_string())), + }], + "test_fixtures/lineproto/prometheus.lp", + ); + + bench( + &mut group, + "tag_miss", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("bananas".to_string())), + }], + "test_fixtures/lineproto/prometheus.lp", + ); + + bench( + &mut group, + "YYYY-MM-DD strftime", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat( + "%Y-%m-%d".to_string(), + )), + }], + "test_fixtures/lineproto/prometheus.lp", + ); + + bench( + &mut group, + "long strftime", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("%Y-%C-%y-%m-%b-%B-%h-%d-%e-%a-%A-%w-%u-%U-%W-%G-%g-%V-%j-%D-%x-%F-%v-%H-%k-%I-%l-%P-%p-%M-%S-%f-%.f-%.3f-%.6f-%.9f-%3f-%6f-%9f-%R-%T-%X-%r-%Z-%z-%:z-%::z-%:::z-%c-%+-%s-%t-%n-%%".to_string())), + }], + "test_fixtures/lineproto/prometheus.lp", + ); + + bench( + &mut group, + "hash bucket on tag", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(Bucket { + tag_name: "env".to_string(), + num_buckets: 100, + })), + }], + "test_fixtures/lineproto/prometheus.lp", + ); + + //////////////////////////////////////////////////////////////////////////// + // A large batch. 
+ bench( + &mut group, + "tag_hit", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("host".to_string())), + }], + "test_fixtures/lineproto/metrics.lp", + ); + + bench( + &mut group, + "tag_miss", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("bananas".to_string())), + }], + "test_fixtures/lineproto/metrics.lp", + ); + + bench( + &mut group, + "YYYY-MM-DD strftime", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat( + "%Y-%m-%d".to_string(), + )), + }], + "test_fixtures/lineproto/metrics.lp", + ); + + bench( + &mut group, + "long strftime", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("%Y-%C-%y-%m-%b-%B-%h-%d-%e-%a-%A-%w-%u-%U-%W-%G-%g-%V-%j-%D-%x-%F-%v-%H-%k-%I-%l-%P-%p-%M-%S-%f-%.f-%.3f-%.6f-%.9f-%3f-%6f-%9f-%R-%T-%X-%r-%Z-%z-%:z-%::z-%:::z-%c-%+-%s-%t-%n-%%".to_string())), + }], + "test_fixtures/lineproto/metrics.lp", + ); + + bench( + &mut group, + "hash bucket on tag", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(Bucket { + tag_name: "host".to_string(), + num_buckets: 100, + })), + }], + "test_fixtures/lineproto/metrics.lp", + ); + + //////////////////////////////////////////////////////////////////////////// + // A small batch. + bench( + &mut group, + "tag_hit", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("location".to_string())), + }], + "test_fixtures/lineproto/temperature.lp", + ); + + bench( + &mut group, + "tag_miss", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("bananas".to_string())), + }], + "test_fixtures/lineproto/temperature.lp", + ); + + bench( + &mut group, + "YYYY-MM-DD strftime", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat( + "%Y-%m-%d".to_string(), + )), + }], + "test_fixtures/lineproto/temperature.lp", + ); + + bench( + &mut group, + "long strftime", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("%Y-%C-%y-%m-%b-%B-%h-%d-%e-%a-%A-%w-%u-%U-%W-%G-%g-%V-%j-%D-%x-%F-%v-%H-%k-%I-%l-%P-%p-%M-%S-%f-%.f-%.3f-%.6f-%.9f-%3f-%6f-%9f-%R-%T-%X-%r-%Z-%z-%:z-%::z-%:::z-%c-%+-%s-%t-%n-%%".to_string())), + }], + "test_fixtures/lineproto/temperature.lp", + ); + + bench( + &mut group, + "hash bucket on tag", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(Bucket { + tag_name: "location".to_string(), + num_buckets: 100, + })), + }], + "test_fixtures/lineproto/temperature.lp", + ); + + group.finish(); +} + +fn bench( + group: &mut BenchmarkGroup<'_, WallTime>, + template_name: &str, + partition_template: Vec, + file_path: &str, // Relative to the crate root +) { + // Un-normalise the path, adjusting back to the crate root. 
+ let file_path = format!("{}/../{}", env!("CARGO_MANIFEST_DIR"), file_path); + let path = Path::new(&file_path); + let partition_template = TablePartitionTemplateOverride::try_new( + Some(proto::PartitionTemplate { + parts: partition_template, + }), + &Default::default(), + ) + .unwrap(); + + // Read the benchmark data + let data = std::fs::read_to_string(path).unwrap(); + let row_count = data.chars().filter(|&v| v == '\n').count(); + + // Generate the mutable batch partitioner input + let mutable_batch_input: Vec<_> = mutable_batch_lp::lines_to_batches(&data, 42) + .unwrap() + .into_iter() + .map(|(_table_name, batch)| batch) + .collect(); + + // Generate the record batch partitioner input + let record_batch_input: Vec<_> = mutable_batch_input + .iter() + .map(|batch| batch.to_arrow(Projection::All).unwrap()) + .collect(); + + group.throughput(Throughput::Elements(row_count as _)); + group.bench_function( + BenchmarkId::new( + format!("{template_name} (mutable batch)"), + path.file_name().unwrap().to_str().unwrap(), + ), + |b| { + b.iter_batched( + || mutable_batch_input.clone(), + |input| { + for batch in input { + partition_batch(&batch, &partition_template).for_each(drop); + } + }, + BatchSize::NumIterations(1), + ) + }, + ); + group.bench_function( + BenchmarkId::new( + format!("{template_name} (record batch)"), + path.file_name().unwrap().to_str().unwrap(), + ), + |b| { + b.iter_batched( + || record_batch_input.clone(), + |input| { + for batch in input { + partition_batch(&batch, &partition_template).for_each(drop); + } + }, + BatchSize::NumIterations(1), + ) + }, + ); +} + +criterion_group!(benches, partitioner_benchmarks); +criterion_main!(benches); diff --git a/partition/src/bucket.rs b/partition/src/bucket.rs new file mode 100644 index 00000000000..6e7df804119 --- /dev/null +++ b/partition/src/bucket.rs @@ -0,0 +1,49 @@ +use data_types::partition_template; + +#[derive(Debug)] +pub(super) struct BucketHasher { + num_buckets: u32, + last_assigned_bucket: Option, +} + +impl BucketHasher { + pub(super) fn new(num_buckets: u32) -> Self { + Self { + num_buckets, + last_assigned_bucket: None, + } + } + + /// Assign a bucket for the provided `tag_value` using the [`BucketHasher`]s + /// configuration. + pub(super) fn assign_bucket(&mut self, tag_value: &str) -> u32 { + let bucket = partition_template::bucket_for_tag_value(tag_value, self.num_buckets); + self.last_assigned_bucket = Some(bucket); + bucket + } + + /// The last bucket assigned by the [`BucketHasher`]. + pub(super) fn last_assigned_bucket(&self) -> Option { + self.last_assigned_bucket + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_last_assigned_bucket() { + let mut bucketer = BucketHasher::new(10); + assert_eq!(bucketer.last_assigned_bucket, None); + + assert_eq!(bucketer.assign_bucket("foo"), 6); + assert_eq!(bucketer.last_assigned_bucket, Some(6)); + + assert_eq!(bucketer.assign_bucket("bat"), 5); + assert_eq!(bucketer.last_assigned_bucket, Some(5)); + + assert_eq!(bucketer.assign_bucket("qux"), 5); + assert_eq!(bucketer.last_assigned_bucket, Some(5)); + } +} diff --git a/partition/src/filter.rs b/partition/src/filter.rs new file mode 100644 index 00000000000..099c900b771 --- /dev/null +++ b/partition/src/filter.rs @@ -0,0 +1,145 @@ +//! Functions for filtering rows from a [`MutableBatch`] +//! +//! 
The returned ranges can then be used with `MutableBatch::extend_from_range` + +use crate::Batch; +use mutable_batch::MutableBatch; +use std::ops::Range; + +/// Given a [`MutableBatch`] a time predicate and a set of row ranges, returns the row +/// indexes that pass the predicate +/// +/// # Panic +/// +/// Panics if `batch` does not contain a time column of the correct type +pub(crate) fn filter_time<'a, F>( + batch: &'a MutableBatch, + ranges: &'a [Range], + mut predicate: F, +) -> Vec> +where + F: FnMut(i64) -> bool, +{ + let col_data = batch.time_column().expect("time column"); + + // Time column is not nullable so can skip checking mask + let mut ret = vec![]; + for range in ranges { + let offset = range.start; + ret.extend( + filter_slice(&col_data[range.clone()], &mut predicate) + .map(|r| (r.start + offset)..(r.end + offset)), + ) + } + ret +} + +fn filter_slice<'a, T, F>( + col_data: &'a [T], + predicate: &'a mut F, +) -> impl Iterator> + 'a +where + T: Copy, + F: 'a + FnMut(T) -> bool, +{ + let mut range: Range = 0..0; + let mut values = col_data.iter(); + + std::iter::from_fn(move || loop { + match values.next() { + Some(value) if predicate(*value) => { + range.end += 1; + continue; + } + // Either finished or predicate failed + _ if range.start != range.end => { + let t = range.clone(); + range.end += 1; + range.start = range.end; + return Some(t); + } + // Predicate failed and start == end + Some(_) => { + range.start += 1; + range.end += 1; + } + None => return None, + } + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use mutable_batch::writer::Writer; + use rand::prelude::*; + + fn make_rng() -> StdRng { + let seed = rand::rngs::OsRng.next_u64(); + println!("Seed: {seed}"); + StdRng::seed_from_u64(seed) + } + + #[test] + fn test_filter_slice() { + let collected: Vec<_> = + filter_slice(&[0, 1, 2, 3, 4, 5, 6], &mut |x| x != 1 && x != 4).collect(); + assert_eq!(collected, vec![0..1, 2..4, 5..7]); + + let collected: Vec<_> = + filter_slice(&[0, 1, 2, 3, 4, 5, 6], &mut |x| x == 1 || x == 2 || x == 6).collect(); + assert_eq!(collected, vec![1..3, 6..7]) + } + + #[test] + fn test_filter_fuzz() { + let mut rng = make_rng(); + let data: Vec<_> = std::iter::from_fn(|| Some(rng.next_u32())) + .take(1000) + .collect(); + + let mut predicate = |x: u32| x & 1 == 0; + + let indexes: Vec<_> = filter_slice(&data, &mut predicate).flatten().collect(); + + let expected: Vec<_> = data + .iter() + .enumerate() + .filter_map(|(idx, x)| match predicate(*x) { + true => Some(idx), + false => None, + }) + .collect(); + + assert_eq!(indexes, expected); + } + + #[test] + fn test_filter_batch() { + let mut batch = MutableBatch::new(); + let mut rng = make_rng(); + let data: Vec<_> = std::iter::from_fn(|| Some(rng.next_u32() as i64)) + .take(1000) + .collect(); + + let ranges = &[0..87, 90..442, 634..800]; + let mut predicate = |x: i64| x & 1 == 0; + + let mut writer = Writer::new(&mut batch, 1000); + writer.write_time("time", data.iter().cloned()).unwrap(); + writer.commit(); + + let actual: Vec<_> = filter_time(&batch, ranges, &mut predicate) + .into_iter() + .flatten() + .collect(); + + let expected: Vec<_> = ranges + .iter() + .flat_map(|r| r.clone()) + .filter(|idx| predicate(data[*idx])) + .collect(); + + assert_eq!(actual, expected); + } +} diff --git a/partition/src/lib.rs b/partition/src/lib.rs new file mode 100644 index 00000000000..d542bba454e --- /dev/null +++ b/partition/src/lib.rs @@ -0,0 +1,1704 @@ +//! Functionality for partitioning data based on a partition template. +//! +//! 
The partitioning template, derived partition key format, and encodings are
+//! described in detail in the [`data_types::partition_template`] module.
+
+mod bucket;
+mod filter;
+mod strftime;
+mod traits;
+
+use std::{borrow::Cow, num::NonZeroUsize, ops::Range};
+
+use data_types::{
+    partition_template::{
+        TablePartitionTemplateOverride, TemplatePart, ENCODED_PARTITION_KEY_CHARS,
+        MAXIMUM_NUMBER_OF_TEMPLATE_PARTS, PARTITION_KEY_DELIMITER, PARTITION_KEY_MAX_PART_LEN,
+        PARTITION_KEY_PART_TRUNCATED, PARTITION_KEY_VALUE_EMPTY_STR, PARTITION_KEY_VALUE_NULL_STR,
+    },
+    PartitionKey,
+};
+use hashbrown::HashMap;
+use mutable_batch::{MutableBatch, WritePayload};
+use percent_encoding::utf8_percent_encode;
+use thiserror::Error;
+use unicode_segmentation::UnicodeSegmentation;
+
+pub use self::traits::{Batch, PartitioningColumn, TimeColumnError};
+use self::{bucket::BucketHasher, strftime::StrftimeFormatter};
+
+/// An error generating a partition key for a row.
+#[allow(missing_copy_implementations)]
+#[derive(Debug, Error, PartialEq, Eq, Clone)]
+pub enum PartitionKeyError {
+    /// The partition template defines a [`Template::TimeFormat`] part, but the
+    /// provided strftime formatter is invalid.
+    #[error("invalid strftime format in partition template")]
+    InvalidStrftime,
+
+    /// The partition template defines a [`Template::TagValue`] part, but the
+    /// column type is not "tag".
+    #[error("tag value partitioner does not accept input columns of type {0}")]
+    TagValueNotTag(String),
+
+    /// A "catch all" error for when a formatter returns [`std::fmt::Error`],
+    /// which contains no context.
+    #[error("partition key generation error")]
+    FmtError(#[from] std::fmt::Error),
+}
+
+/// Returns an iterator identifying consecutive ranges for a given partition key
+pub fn partition_batch<'a, T>(
+    batch: &'a T,
+    template: &'a TablePartitionTemplateOverride,
+) -> impl Iterator<Item = (Result<String, PartitionKeyError>, Range<usize>)> + 'a
+where
+    T: Batch,
+{
+    let parts = template.len();
+    if parts > MAXIMUM_NUMBER_OF_TEMPLATE_PARTS {
+        panic!(
+            "partition template contains {} parts, which exceeds the maximum of {} parts",
+            parts, MAXIMUM_NUMBER_OF_TEMPLATE_PARTS
+        );
+    }
+
+    range_encode(partition_keys(batch, template.parts()))
+}
+
+/// A [`TablePartitionTemplateOverride`] is made up of one or more
+/// [`TemplatePart`]s that are rendered and joined together by
+/// [`PARTITION_KEY_DELIMITER`] to form a single partition key.
+///
+/// To avoid allocating intermediate strings, and performing column lookups for
+/// every row, each [`TemplatePart`] is converted to a [`Template`].
+///
+/// [`Template::fmt_row`] can then be used to render the template for that
+/// particular row to the provided string, without performing any additional
+/// column lookups.
+#[derive(Debug)]
+#[allow(clippy::large_enum_variant)]
+enum Template<'a, T: PartitioningColumn> {
+    TagValue(&'a T, Option<&'a T::TagIdentityKey>),
+    TimeFormat(&'a [i64], StrftimeFormatter<'a>),
+    Bucket(&'a T, BucketHasher, Option<&'a T::TagIdentityKey>),
+
+    /// This batch is missing a partitioning tag column.
+    MissingTag,
+}
+
+impl<'a, T> Template<'a, T>
+where
+    T: PartitioningColumn,
+{
+    /// Renders this template to `out` for the row `idx`.
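+    ///
+    /// As a side effect this updates the per-part "last seen" state (the tag
+    /// identity key, the last rendered timestamp, or the last assigned hash
+    /// bucket) that [`Template::is_identical`] consults to skip re-rendering
+    /// identical consecutive keys.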
+ fn fmt_row( + &mut self, + out: &mut W, + idx: usize, + ) -> Result<(), PartitionKeyError> { + match self { + Template::TagValue(col, last_key) if col.is_valid(idx) => { + let this_key = col + .get_tag_identity_key(idx) + .ok_or_else(|| PartitionKeyError::TagValueNotTag(col.type_description()))?; + + // Update the "is identical" tracking key for this new, + // potentially different key. + *last_key = Some(this_key); + + out.write_str(encode_key_part(col.get_tag_value(this_key).unwrap()).as_ref())? + } + Template::TimeFormat(t, fmt) => fmt.render(t[idx], out)?, + Template::Bucket(col, bucketer, last_key) if col.is_valid(idx) => { + let this_key = col + .get_tag_identity_key(idx) + .ok_or_else(|| PartitionKeyError::TagValueNotTag(col.type_description()))?; + let this_value = col.get_tag_value(this_key).unwrap(); + let bucket = bucketer.assign_bucket(this_value); + + // Update the "is identical" tracking key for this new, + // potentially different key. + *last_key = Some(this_key); + + write!(out, "{bucket}")? + } + // Either a tag that has no value for this given row index, or the + // batch does not contain this tag at all. + Template::TagValue(_, last_key) => { + // This row doesn't have a tag value, which should be carried + // forwards to be checked against the next row. + *last_key = None; + out.write_str(PARTITION_KEY_VALUE_NULL_STR)? + } + // Either a tag that has no value for this given row index, or the + // batch does not contain this tag at all. + Template::Bucket(_, _, last_key) => { + // This row doesn't have a tag value, which should be carried + // forwards to be checked against the next row. + *last_key = None; + out.write_str(PARTITION_KEY_VALUE_NULL_STR)? + } + Template::MissingTag => out.write_str(PARTITION_KEY_VALUE_NULL_STR)?, + } + + Ok(()) + } + + /// Returns true if the partition key generated by `self` for `idx` will be + /// identical to the last generated key. + fn is_identical(&mut self, idx: usize) -> bool { + match self { + Template::TagValue(col, last_key) if col.is_valid(idx) => { + let this_key = match col.get_tag_identity_key(idx) { + Some(key) => key, + // This is an error, but for the purposes of identical checks, + // it is treated as not identical, causing the error to be + // raised when formatting is attempted. + None => return false, + }; + + // Check if the key matches the last key, indicating the same value is going to + // be rendered. + last_key.map(|v| v == this_key).unwrap_or_default() + } + Template::TimeFormat(t, fmt) => { + // Check if the last value matches the current value, after + // optionally applying the precision reduction optimisation. + fmt.equals_last(t[idx]) + } + Template::Bucket(col, fmt, last_key) if col.is_valid(idx) => { + // To perform an equality check for `idx` when it is a + // `Bucket` template part we must check in order: + // + // 1. If this dictionary key is the same as the + // previous + // 2. If the assigned bucket is the same as the + // previous + // + // While just checking the bucket is correct, checking + // the dictionary key first avoids unnecessary throwaway + // hashing work. + let this_key = match col.get_tag_identity_key(idx) { + Some(key) => key, + // This is an error, but for the purposes of identical checks, + // it is treated as not identical, causing the error to be + // raised when formatting is attempted. 
+ None => return false, + }; + + match last_key { + Some(v) if this_key == *v => true, + Some(_) => { + col.get_tag_value(this_key) + .map(|this_value| { + // Grab the last assigned bucket, assign + // a bucket for the current value and + // check for equality. + fmt.last_assigned_bucket() + .map(|last_bucket| last_bucket == fmt.assign_bucket(this_value)) + .unwrap_or_default() + }) + .unwrap_or_default() + } + None => false, + } + } + // The last row did not contain this key, and neither does this. + Template::TagValue(_, None) | Template::Bucket(_, _, None) => true, + // The last row did contain a key, but this one does not (therefore + // it differs). + Template::TagValue(_, Some(_)) | Template::Bucket(_, _, Some(_)) => false, + // The batch does not contain this tag at all - it always matches + // with the previous row. + Template::MissingTag => true, + } + } +} + +fn encode_key_part(s: &str) -> Cow<'_, str> { + // Encode reserved characters and non-ascii characters. + let as_str: Cow<'_, str> = utf8_percent_encode(s, &ENCODED_PARTITION_KEY_CHARS).into(); + + match as_str.len() { + 0 => Cow::Borrowed(PARTITION_KEY_VALUE_EMPTY_STR), + 1..=PARTITION_KEY_MAX_PART_LEN => as_str, + _ => { + // This string exceeds the maximum byte length limit and must be + // truncated. + // + // Truncation of unicode strings can be tricky - this implementation + // avoids splitting unicode code-points nor graphemes. See the + // partition_template module docs in data_types before altering + // this. + + // Preallocate the string to hold the long partition key part. + let mut buf = String::with_capacity(PARTITION_KEY_MAX_PART_LEN); + + // This is a slow path, re-encoding the original input string - + // fortunately this is an uncommon path. + // + // Walk the string, encoding each grapheme (which includes spaces) + // individually, tracking the total length of the encoded string. + // Once it hits 199 bytes, stop and append a #. + + let mut bytes = 0; + s.graphemes(true) + .map(|v| Cow::from(utf8_percent_encode(v, &ENCODED_PARTITION_KEY_CHARS))) + .take_while(|v| { + bytes += v.len(); // Byte length of encoded grapheme + bytes < PARTITION_KEY_MAX_PART_LEN + }) + .for_each(|v| buf.push_str(v.as_ref())); + + // Append the truncation marker. + buf.push(PARTITION_KEY_PART_TRUNCATED); + + assert!(buf.len() <= PARTITION_KEY_MAX_PART_LEN); + + Cow::Owned(buf) + } + } +} + +/// Returns an iterator of partition keys for the given table batch. +/// +/// This function performs deduplication on returned keys; the returned iterator +/// yields [`Some`] containing the partition key string when a new key is +/// generated, and [`None`] when the generated key would equal the last key. +fn partition_keys<'a, T>( + batch: &'a T, + template_parts: impl Iterator>, +) -> impl Iterator>> + 'a +where + T: Batch, +{ + // Extract the timestamp data. 
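+    // Note that `time_column()` fails for an empty batch, which is what makes
+    // the "row 0 always exists" reasoning further below hold.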
+ let time = batch.time_column().expect("error reading time column"); + + // Convert TemplatePart into an ordered array of Template + let mut template = template_parts + .map(|v| match v { + TemplatePart::TagValue(col_name) => batch + .column(col_name) + .map_or_else(|| Template::MissingTag, |v| Template::TagValue(v, None)), + TemplatePart::TimeFormat(fmt) => { + Template::TimeFormat(time, StrftimeFormatter::new(fmt)) + } + TemplatePart::Bucket(col_name, num_buckets) => batch.column(col_name).map_or_else( + || Template::MissingTag, + |v| Template::Bucket(v, BucketHasher::new(num_buckets), None), + ), + }) + .collect::>(); + + // Track the length of the last yielded partition key, and pre-allocate the + // next partition key string to match it. + // + // In the happy path, keys of consistent sizes are generated and the + // allocations reach a minimum. If the keys are inconsistent, at best a + // subset of allocations are eliminated, and at worst, a few bytes of memory + // is temporarily allocated until the resulting string is shrunk down. + let mut last_len = 5; + + // The first row in a batch must always be evaluated to produce a key. + // + // Row 0 is guaranteed to exist, otherwise attempting to read the time + // column above would have caused a panic (no rows -> no time column). + let first = std::iter::once(Some(evaluate_template(&mut template, &mut last_len, 0))); + + // The subsequent rows in a batch may generate the same key, and therefore a + // dedupe check is used before allocating & populating the partition key. + let rest = (1..batch.num_rows()).map(move |idx| { + // Check if this partition key is going to be different from the + // last, short-circuiting the check if it is. + if template.iter_mut().all(|t| t.is_identical(idx)) { + return None; + } + + Some(evaluate_template(&mut template, &mut last_len, idx)) + }); + + first.chain(rest) +} + +/// Evaluate the partition template against the row indexed by `idx`. +/// +/// # Panics +/// +/// This method panics if `idx` exceeds the number of rows in the batch. +fn evaluate_template( + template: &mut [Template<'_, T>], + last_len: &mut usize, + idx: usize, +) -> Result { + let mut buf = String::with_capacity(*last_len); + let template_len = template.len(); + + // Evaluate each template part for this row + for (col_idx, col) in template.iter_mut().enumerate() { + // Evaluate the formatter for this template part against the row. + col.fmt_row(&mut buf, idx)?; + + // If this isn't the last element in the template, insert a field + // delimiter. + if col_idx + 1 != template_len { + buf.push(PARTITION_KEY_DELIMITER); + } + } + + *last_len = buf.len(); + Ok(buf) +} + +/// Takes an iterator of [`Option`] and merges identical consecutive elements +/// together. +/// +/// Any [`None`] yielded by `iterator` is added to the range for the previous +/// [`Some`]. 
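+/// For example (illustrative values only, mirroring the unit tests below):
+///
+/// ```text
+/// input:  Some(5), None, Some(5), Some(7)
+/// output: (5, 0..3), (7, 3..4)
+/// ```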
+fn range_encode<T, I>(mut iterator: I) -> impl Iterator<Item = (T, Range<usize>)>
+where
+    I: Iterator<Item = Option<T>>,
+    T: Eq,
+{
+    let mut last: Option<I::Item> = None;
+    let mut range: Range<usize> = 0..0;
+    std::iter::from_fn(move || loop {
+        match (iterator.next(), last.take()) {
+            // The iterator yields a NULL/identical value and there is a prior value
+            (Some(None), Some(v)) => {
+                range.end += 1;
+                last = Some(v);
+            }
+            // The iterator yields a value, and the last value matches
+            (Some(cur), Some(next)) => match cur == next {
+                true => {
+                    range.end += 1;
+                    last = Some(next);
+                }
+                false => {
+                    let t = range.clone();
+                    range.start = range.end;
+                    range.end += 1;
+                    last = Some(cur);
+                    return Some((next.unwrap(), t));
+                }
+            },
+            // There is no last value
+            (Some(cur), None) => {
+                range.end += 1;
+                last = Some(cur);
+            }
+            (None, Some(next)) => return Some((next.unwrap(), range.clone())),
+            (None, None) => return None,
+        }
+    })
+}
+
+/// An error partitioning a batch.
+#[allow(missing_copy_implementations)]
+#[derive(Debug, Error, PartialEq, Eq, Clone)]
+pub enum PartitionWriteError {
+    /// An error deriving the partition key from the partition key template.
+    #[error("{0}")]
+    PartitionKey(#[from] PartitionKeyError),
+
+    /// An error accessing the time column.
+    #[error("{0}")]
+    TimeColumn(#[from] TimeColumnError),
+}
+
+/// A [`MutableBatch`] with a non-zero set of row ranges to write
+#[derive(Debug)]
+pub struct PartitionWrite<'a> {
+    batch: &'a MutableBatch,
+    ranges: Vec<Range<usize>>,
+    min_timestamp: i64,
+    max_timestamp: i64,
+    row_count: NonZeroUsize,
+}
+
+impl<'a> PartitionWrite<'a> {
+    /// Create a new [`PartitionWrite`] with the entire range of the provided batch
+    ///
+    /// # Panic
+    ///
+    /// Panics if the batch has no rows
+    pub fn new(batch: &'a MutableBatch) -> Result<Self, PartitionWriteError> {
+        let row_count = NonZeroUsize::new(batch.rows()).unwrap();
+        let time = batch.time_column()?;
+        let (min_timestamp, max_timestamp) = min_max_time(time);
+
+        // This `allow` can be removed when this issue is fixed and released:
+        //
+        #[allow(clippy::single_range_in_vec_init)]
+        Ok(Self {
+            batch,
+            ranges: vec![0..batch.rows()],
+            min_timestamp,
+            max_timestamp,
+            row_count,
+        })
+    }
+
+    /// Returns the minimum timestamp in the write
+    pub fn min_timestamp(&self) -> i64 {
+        self.min_timestamp
+    }
+
+    /// Returns the maximum timestamp in the write
+    pub fn max_timestamp(&self) -> i64 {
+        self.max_timestamp
+    }
+
+    /// Returns the number of rows in the write
+    pub fn rows(&self) -> NonZeroUsize {
+        self.row_count
+    }
+
+    /// Returns a [`PartitionWrite`] containing just the rows of `Self` that pass
+    /// the provided time predicate, or None if no rows pass
+    pub fn filter(&self, predicate: impl Fn(i64) -> bool) -> Option<PartitionWrite<'a>> {
+        let mut min_timestamp = i64::MAX;
+        let mut max_timestamp = i64::MIN;
+        let mut row_count = 0_usize;
+
+        // Construct a predicate that lets us inspect the timestamps as they are filtered
+        let inspect = |t| match predicate(t) {
+            true => {
+                min_timestamp = min_timestamp.min(t);
+                max_timestamp = max_timestamp.max(t);
+                row_count += 1;
+                true
+            }
+            false => false,
+        };
+
+        let ranges: Vec<_> = filter::filter_time(self.batch, &self.ranges, inspect);
+        let row_count = NonZeroUsize::new(row_count)?;
+
+        Some(PartitionWrite {
+            batch: self.batch,
+            ranges,
+            min_timestamp,
+            max_timestamp,
+            row_count,
+        })
+    }
+
+    /// Create a collection of [`PartitionWrite`] indexed by partition key
+    /// from a [`MutableBatch`] and [`TablePartitionTemplateOverride`]
+    pub fn partition(
+        batch: &'a MutableBatch,
+        partition_template: &TablePartitionTemplateOverride,
+    ) ->
Result, PartitionWriteError> { + use hashbrown::hash_map::Entry; + let time = batch.time_column()?; + + let mut partition_ranges = HashMap::new(); + for (partition, range) in partition_batch(batch, partition_template) { + let row_count = NonZeroUsize::new(range.end - range.start).unwrap(); + let (min_timestamp, max_timestamp) = min_max_time(&time[range.clone()]); + + match partition_ranges.entry(PartitionKey::from(partition?)) { + Entry::Vacant(v) => { + v.insert(PartitionWrite { + batch, + ranges: vec![range], + min_timestamp, + max_timestamp, + row_count, + }); + } + Entry::Occupied(mut o) => { + let pw = o.get_mut(); + pw.min_timestamp = pw.min_timestamp.min(min_timestamp); + pw.max_timestamp = pw.max_timestamp.max(max_timestamp); + pw.row_count = NonZeroUsize::new(pw.row_count.get() + row_count.get()).unwrap(); + pw.ranges.push(range); + } + } + } + Ok(partition_ranges) + } +} + +impl<'a> WritePayload for PartitionWrite<'a> { + fn write_to_batch(&self, batch: &mut MutableBatch) -> mutable_batch::Result<()> { + batch.extend_from_ranges(self.batch, &self.ranges) + } +} + +fn min_max_time(col: &[i64]) -> (i64, i64) { + let mut min_timestamp = i64::MAX; + let mut max_timestamp = i64::MIN; + for t in col { + min_timestamp = min_timestamp.min(*t); + max_timestamp = max_timestamp.max(*t); + } + (min_timestamp, max_timestamp) +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use super::*; + + use assert_matches::assert_matches; + use chrono::{format::StrftimeItems, DateTime, Datelike, Days, TimeZone, Utc}; + use data_types::partition_template::{ + build_column_values, test_table_partition_override, ColumnValue, + }; + use mutable_batch::{writer::Writer, MutableBatch}; + use proptest::{prelude::*, prop_compose, proptest, strategy::Strategy}; + use rand::prelude::*; + use schema::{Projection, TIME_COLUMN_NAME}; + use test_helpers::assert_error; + + #[test] + fn return_err_if_no_time_column() { + let batch = MutableBatch::new(); + let table_partition_template = Default::default(); + assert_error!( + PartitionWrite::partition(&batch, &table_partition_template), + PartitionWriteError::TimeColumn(TimeColumnError::NotFound), + ); + } + + fn make_rng() -> StdRng { + let seed = rand::rngs::OsRng.next_u64(); + println!("Seed: {seed}"); + StdRng::seed_from_u64(seed) + } + + /// Reproducer for https://github.com/influxdata/idpe/issues/17765 + #[test] + fn test_equals_last() { + let ts = [ + 1686756903736785920, // last_eq=false, render, set last_ptr + 42, // last_eq=false, render, set last_ptr + 1686756903736785920, // last_eq=false, re-use, don't change last_ptr + 1686756903736785920, // last_eq=false, re-use, don't change last_ptr + 42, // last_eq=true (wrong), re-use + ]; + + let mut batch = MutableBatch::new(); + let mut writer = Writer::new(&mut batch, ts.len()); + + writer.write_time("time", ts.into_iter()).unwrap(); + writer.commit(); + + let keys = + generate_denormalised_keys(&batch, TablePartitionTemplateOverride::default().parts()) + .unwrap(); + + assert_eq!( + keys, + &[ + "2023-06-14", + "1970-01-01", + "2023-06-14", + "2023-06-14", + "1970-01-01", + ] + ); + } + + /// Generates a vector of partition key strings, or an error. + /// + /// This function normalises the de-duplicated output of + /// [`partition_keys()`], returning the last observed key when the dedupe + /// [`partition_keys()`] process returns [`None`]. 
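+    ///
+    /// This keeps the fixture assertions readable: every row maps to an
+    /// explicit key string rather than a `None` "same as previous" marker.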
+ fn generate_denormalised_keys<'a, 'b: 'a, T: Batch>( + batch: &'b T, + template_parts: impl Iterator>, + ) -> Result, PartitionKeyError> { + let mut last_ret = None; + partition_keys(batch, template_parts) + .map(|v| match v { + Some(this) => { + last_ret = Some(this.clone()); + this + } + None => last_ret + .as_ref() + .expect("must have observed prior key") + .clone(), + }) + .collect::, _>>() + } + + /// A fixture test asserting the default partition key format, derived from + /// the default partition key template. + #[test] + fn test_default_fixture() { + let mut batch = MutableBatch::new(); + let mut writer = Writer::new(&mut batch, 1); + + writer.write_time("time", vec![1].into_iter()).unwrap(); + writer + .write_tag("region", Some(&[0b00000001]), vec!["bananas"].into_iter()) + .unwrap(); + writer.commit(); + + let template_parts = + TablePartitionTemplateOverride::try_new(None, &Default::default()).unwrap(); + let keys: Vec<_> = partition_keys(&batch, template_parts.parts()) + .map(|v| v.expect("non-identical consecutive keys")) + .collect::, _>>() + .unwrap(); + + assert_eq!(keys, vec!["1970-01-01".to_string()]) + } + + #[test] + #[should_panic(expected = r#"error reading time column: NotFound"#)] + fn test_zero_sized_batch() { + let batch = MutableBatch::new(); + + let template_parts = test_table_partition_override(vec![ + TemplatePart::TimeFormat("%Y-%m-%d %H:%M:%S"), + TemplatePart::TagValue("region"), + TemplatePart::TagValue("bananas"), + ]); + + let keys: Vec<_> = partition_batch(&batch, &template_parts).collect::>(); + assert_eq!(keys, vec![]) + } + + #[test] + fn test_range_encode() { + let collected: Vec<_> = + range_encode(vec![5, 5, 5, 7, 2, 2, 3].into_iter().map(Some)).collect(); + assert_eq!(collected, vec![(5, 0..3), (7, 3..4), (2, 4..6), (3, 6..7)]) + } + + #[test] + fn test_range_encode_sparse() { + let collected: Vec<_> = + range_encode(vec![Some(5), None, None, Some(7), Some(2), None, Some(3)].into_iter()) + .collect(); + assert_eq!(collected, vec![(5, 0..3), (7, 3..4), (2, 4..6), (3, 6..7)]) + } + + #[test] + fn test_range_encode_fuzz() { + let mut rng = make_rng(); + let original: Vec<_> = std::iter::from_fn(|| Some(rng.next_u32() % 20)) + .take(1000) + .collect(); + + let rle: Vec<_> = range_encode(original.iter().cloned().map(Some)).collect(); + + let mut last_range = rle[0].1.clone(); + for (_, range) in &rle[1..] 
{ + assert_eq!(range.start, last_range.end); + assert_ne!(range.start, range.end); + last_range = range.clone(); + } + + let hydrated: Vec<_> = rle + .iter() + .flat_map(|(v, r)| std::iter::repeat(*v).take(r.end - r.start)) + .collect(); + + assert_eq!(original, hydrated) + } + + #[test] + fn test_partition() { + let mut batch = MutableBatch::new(); + let mut writer = Writer::new(&mut batch, 5); + + writer + .write_time("time", vec![1, 2, 3, 4, 5].into_iter()) + .unwrap(); + + writer + .write_tag( + "region", + Some(&[0b00001010]), + vec!["west", "east"].into_iter(), + ) + .unwrap(); + writer + .write_tag( + "device", + Some(&[0b00001110]), + vec![ + "97c953a1-70e6-4569-80e4-59d1f49ec3fa", + "f1aac284-b8a1-4938-acf3-52a3d516ca14", + "420bb984-4d1e-48ec-bbfc-10825fbf3221", + ] + .into_iter(), + ) + .unwrap(); + + let template_parts = [ + TemplatePart::TimeFormat("%Y-%m-%d %H:%M:%S"), + TemplatePart::TagValue("region"), + TemplatePart::Bucket("device", 10), + TemplatePart::TagValue("bananas"), // column not present + ]; + + writer.commit(); + + let keys: Vec<_> = partition_keys(&batch, template_parts.clone().into_iter()) + .map(|v| v.expect("non-identical consecutive keys")) + .collect::, _>>() + .unwrap(); + + assert_eq!( + keys, + vec![ + "1970-01-01 00:00:00|!|!|!".to_string(), + "1970-01-01 00:00:00|west|6|!".to_string(), + "1970-01-01 00:00:00|!|4|!".to_string(), + "1970-01-01 00:00:00|east|5|!".to_string(), + "1970-01-01 00:00:00|!|!|!".to_string() + ] + ); + + let record_batch = batch.to_arrow(Projection::All).unwrap(); + + let keys: Vec<_> = partition_keys(&record_batch, template_parts.into_iter()) + .map(|v| v.expect("non-identical consecutive keys")) + .collect::, _>>() + .unwrap(); + + assert_eq!( + keys, + vec![ + "1970-01-01 00:00:00|!|!|!".to_string(), + "1970-01-01 00:00:00|west|6|!".to_string(), + "1970-01-01 00:00:00|!|4|!".to_string(), + "1970-01-01 00:00:00|east|5|!".to_string(), + "1970-01-01 00:00:00|!|!|!".to_string() + ] + ); + } + + #[test] + fn test_bucket_fixture() { + let mut bucketer = BucketHasher::new(10); + assert_eq!(bucketer.assign_bucket("foo"), 6); + assert_eq!(bucketer.last_assigned_bucket(), Some(6)); + assert_eq!(bucketer.assign_bucket("bat"), 5); + assert_eq!(bucketer.last_assigned_bucket(), Some(5)); + assert_eq!(bucketer.assign_bucket("qux"), 5); + assert_eq!(bucketer.last_assigned_bucket(), Some(5)); + } + + #[test] + fn test_sparse_representation() { + let mut batch = MutableBatch::new(); + let mut writer = Writer::new(&mut batch, 7); + + writer + .write_time( + "time", + vec![ + 1, + 1, + 1, + 1, + 1685971961464736000, + 1685971961464736000, + 1685971961464736000, + ] + .into_iter(), + ) + .unwrap(); + + writer + .write_tag( + "region", + Some(&[0b01111111]), + vec![ + "platanos", "platanos", "platanos", "platanos", "platanos", "platanos", + "bananas", + ] + .into_iter(), + ) + .unwrap(); + + writer + .write_tag( + "device", + Some(&[0b01111111]), + vec!["foo", "bat", "qux", "bat", "foo", "foo", "foo"].into_iter(), // `bat` and `qux` both go to bucket 5, so those 3 values should yield the same key + ) + .unwrap(); + + let template_parts = [ + TemplatePart::TimeFormat("%Y-%m-%d %H:%M:%S"), + TemplatePart::TagValue("region"), + TemplatePart::Bucket("device", 10), + TemplatePart::TagValue("bananas"), // column not present + ]; + + writer.commit(); + + let mut iter = partition_keys(&batch, template_parts.into_iter()); + + assert_eq!( + iter.next().unwrap(), + Some(Ok("1970-01-01 00:00:00|platanos|6|!".to_string())) + ); + assert_eq!( + 
iter.next().unwrap(), + Some(Ok("1970-01-01 00:00:00|platanos|5|!".to_string())) + ); + assert_eq!(iter.next().unwrap(), None); + assert_eq!(iter.next().unwrap(), None); + assert_eq!( + iter.next().unwrap(), + Some(Ok("2023-06-05 13:32:41|platanos|6|!".to_string())) + ); + assert_eq!(iter.next().unwrap(), None); + assert_eq!( + iter.next().unwrap(), + Some(Ok("2023-06-05 13:32:41|bananas|6|!".to_string())) + ); + } + + #[test] + fn partitioning_on_fields_panics() { + let mut batch = MutableBatch::new(); + let mut writer = Writer::new(&mut batch, 5); + + writer + .write_time("time", vec![1, 2, 3, 4, 5].into_iter()) + .unwrap(); + + writer + .write_string( + "region", + Some(&[0b00001010]), + vec!["west", "east"].into_iter(), + ) + .unwrap(); + + let template_parts = [TemplatePart::TagValue("region")]; + + writer.commit(); + + let got: Result, _> = generate_denormalised_keys(&batch, template_parts.into_iter()); + assert_matches::assert_matches!(got, Err(PartitionKeyError::TagValueNotTag(_))); + } + + #[test] + fn bucketing_on_fields_panics() { + let mut batch = MutableBatch::new(); + let mut writer = Writer::new(&mut batch, 5); + + writer + .write_time("time", vec![1, 2, 3, 4, 5].into_iter()) + .unwrap(); + + writer + .write_string( + "region", + Some(&[0b00001010]), + vec!["west", "east"].into_iter(), + ) + .unwrap(); + + let template_parts = [TemplatePart::Bucket("region", 10)]; + + writer.commit(); + + let got: Result, _> = generate_denormalised_keys(&batch, template_parts.into_iter()); + assert_matches::assert_matches!(got, Err(PartitionKeyError::TagValueNotTag(_))); + } + + fn identity<'a, T>(s: T) -> ColumnValue<'a> + where + T: Into>, + { + ColumnValue::Identity(s.into()) + } + + fn prefix<'a, T>(s: T) -> ColumnValue<'a> + where + T: Into>, + { + ColumnValue::Prefix(s.into()) + } + + fn year(y: i32) -> ColumnValue<'static> { + ColumnValue::Datetime { + begin: Utc.with_ymd_and_hms(y, 1, 1, 0, 0, 0).unwrap(), + end: Utc.with_ymd_and_hms(y + 1, 1, 1, 0, 0, 0).unwrap(), + } + } + + fn bucket(bucket_id: u32) -> ColumnValue<'static> { + ColumnValue::Bucket(bucket_id) + } + + // Generate a test that asserts the derived partition key matches + // "want_key", when using the provided "template" parts and set of "tags". + // + // Additionally validates that the derived key is reversible into the + // expected set of "want_reversed_tags" from the original inputs. + macro_rules! test_partition_key { + ( + $name:ident, + template = $template:expr, // Array/vec of TemplatePart + tags = $tags:expr, // Array/vec of (tag_name, value) tuples + want_key = $want_key:expr, // Expected partition key string + want_reversed_tags = $want_reversed_tags:expr // Array/vec of (tag_name, value) reversed from $tags + ) => { + paste::paste! { + #[test] + fn []() { + let mut batch = MutableBatch::new(); + let mut writer = Writer::new(&mut batch, 1); + + let template = $template.into_iter().collect::>(); + let template = test_table_partition_override(template); + + // Timestamp: 2023-05-29T13:03:16Z + writer + .write_time("time", vec![1685365396931384064].into_iter()) + .unwrap(); + + for (col, value) in $tags { + let v = String::from(value); + writer + .write_tag(col, Some(&[0b00000001]), vec![v.as_str()].into_iter()) + .unwrap(); + } + + writer.commit(); + + // Generate the full set of partition keys, inserting the + // last observed value when the next key is identical to + // normalise the values. 
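+                    // Each case writes a single row, so exactly one key is
+                    // expected back.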
+ let keys = generate_denormalised_keys(&batch, template.parts()) + .unwrap(); + assert_eq!(keys, vec![$want_key.to_string()], "generated key differs"); + + // Reverse the encoding. + let reversed = build_column_values(&template, &keys[0]); + + // Expect the tags to be (str, ColumnValue) for the + // comparison + let want: Vec<(&str, ColumnValue<'_>)> = $want_reversed_tags + .into_iter() + .collect(); + + let got = reversed.collect::>(); + assert_eq!(got, want, "reversed key differs"); + } + } + }; + } + + test_partition_key!( + simple, + template = [ + TemplatePart::TimeFormat("%Y"), + TemplatePart::TagValue("a"), + TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 5), + ], + tags = [ + ("a", "bananas"), + ("b", "are_good"), + ("c", "for_test_strings") + ], + want_key = "2023|bananas|are_good|1", + want_reversed_tags = [ + (TIME_COLUMN_NAME, year(2023)), + ("a", identity("bananas")), + ("b", identity("are_good")), + ("c", bucket(1)), + ] + ); + + test_partition_key!( + non_ascii, + template = [ + TemplatePart::TimeFormat("%Y"), + TemplatePart::TagValue("a"), + TemplatePart::TagValue("b"), + ], + tags = [("a", "bananas"), ("b", "plátanos")], + want_key = "2023|bananas|pl%C3%A1tanos", + want_reversed_tags = [ + (TIME_COLUMN_NAME, year(2023)), + ("a", identity("bananas")), + ("b", identity("plátanos")), + ] + ); + + test_partition_key!( + single_tag_template_tag_not_present, + template = [TemplatePart::TagValue("a")], + tags = [("b", "bananas")], + want_key = "!", + want_reversed_tags = [] + ); + + test_partition_key!( + single_bucket_template_tag_not_present, + template = [TemplatePart::Bucket("a", 10)], + tags = [("b", "bananas")], + want_key = "!", + want_reversed_tags = [] + ); + + test_partition_key!( + single_tag_template_tag_empty, + template = [TemplatePart::TagValue("a")], + tags = [("a", "")], + want_key = "^", + want_reversed_tags = [("a", identity(""))] + ); + + test_partition_key!( + single_bucket_template_tag_empty, + template = [TemplatePart::Bucket("a", 10)], + tags = [("a", "")], + want_key = "0", + want_reversed_tags = [("a", bucket(0))] + ); + + test_partition_key!( + missing_tag, + template = [ + TemplatePart::TagValue("a"), + TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10) + ], + tags = [("a", "bananas")], + want_key = "bananas|!|!", + want_reversed_tags = [("a", identity("bananas"))] + ); + + test_partition_key!( + unambiguous, + template = [ + TemplatePart::TimeFormat("%Y"), + TemplatePart::TagValue("a"), + TemplatePart::TagValue("b"), + TemplatePart::TagValue("c"), + TemplatePart::TagValue("d"), + TemplatePart::TagValue("e"), + ], + tags = [("a", "|"), ("b", "!"), ("d", "%7C%21%257C"), ("e", "^")], + want_key = "2023|%7C|%21|!|%257C%2521%25257C|%5E", + want_reversed_tags = [ + (TIME_COLUMN_NAME, year(2023)), + ("a", identity("|")), + ("b", identity("!")), + ("d", identity("%7C%21%257C")), + ("e", identity("^")) + ] + ); + + test_partition_key!( + truncated_char_reserved, + template = [TemplatePart::TagValue("a")], + tags = [("a", "#")], + want_key = "%23", + want_reversed_tags = [("a", identity("#"))] + ); + + // Keys < 200 bytes long should not be truncated. + test_partition_key!( + truncate_length_199, + template = [TemplatePart::TagValue("a")], + tags = [("a", "A".repeat(199))], + want_key = "A".repeat(199), + want_reversed_tags = [("a", identity("A".repeat(199)))] + ); + + // Keys of exactly 200 bytes long should not be truncated. 
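+    // (These fixtures assume PARTITION_KEY_MAX_PART_LEN is 200 bytes: the
+    // truncated cases below keep 199 bytes of encoded payload plus the
+    // trailing `#` marker.)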
+ test_partition_key!( + truncate_length_200, + template = [TemplatePart::TagValue("a")], + tags = [("a", "A".repeat(200))], + want_key = "A".repeat(200), + want_reversed_tags = [("a", identity("A".repeat(200)))] + ); + + // Keys > 200 bytes long should be truncated to exactly 200 bytes, + // terminated by a # character. + test_partition_key!( + truncate_length_201, + template = [TemplatePart::TagValue("a")], + tags = [("a", "A".repeat(201))], + want_key = format!("{}#", "A".repeat(199)), + want_reversed_tags = [("a", prefix("A".repeat(199)))] + ); + + // A key ending in an encoded sequence that does not cross the cut-off point + // is preserved. + // + // This subtest generates a key of: + // + // `A..%` + // ^ cutoff + // + // Which when encoded, becomes: + // + // `A..%25` + // ^ cutoff + // + // So the entire encoded sequence should be preserved. + test_partition_key!( + truncate_encoding_sequence_ok, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}%", "A".repeat(197)))], + want_key = format!("{}%25", "A".repeat(197)), // Not truncated + want_reversed_tags = [("a", identity(format!("{}%", "A".repeat(197))))] + ); + + // A key ending in an encoded sequence should not be split. + // + // This subtest generates a key of: + // + // `A..%` + // ^ cutoff + // + // Which when encoded, becomes: + // + // `A..% 25` (space added for clarity) + // ^ cutoff + // + // Where naive slicing would result in truncating an encoding sequence and + // therefore the whole encoded sequence should be truncated. + test_partition_key!( + truncate_encoding_sequence_truncated_1, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}%", "A".repeat(198)))], + want_key = format!("{}#", "A".repeat(198)), // Truncated + want_reversed_tags = [("a", prefix("A".repeat(198)))] + ); + + // A key ending in an encoded sequence should not be split. + // + // This subtest generates a key of: + // + // `A..%` + // ^ cutoff + // + // Which when encoded, becomes: + // + // `A..%2 5` (space added for clarity) + // ^ cutoff + // + // Where naive slicing would result in truncating an encoding sequence and + // therefore the whole encoded sequence should be truncated. + test_partition_key!( + truncate_encoding_sequence_truncated_2, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}%", "A".repeat(199)))], + want_key = format!("{}#", "A".repeat(199)), // Truncated + want_reversed_tags = [("a", prefix("A".repeat(199)))] + ); + + // A key ending in a unicode code-point should never be split. + // + // This subtest generates a key of: + // + // `A..🍌` + // ^ cutoff + // + // Which when encoded, becomes: + // + // `A..%F0%9F%8D%8C` + // ^ cutoff + // + // Therefore the entire code-point should be removed from the truncated + // output. + // + // This test MUST NOT fail, or an invalid UTF-8 string is being generated + // which is unusable in languages (like Rust). + // + // Advances the cut-off to ensure the position within the code-point doesn't + // affect the output. 
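+    // (🍌 percent-encodes to the 12-byte sequence %F0%9F%8D%8C, so with 194 to
+    // 196 leading "A"s the encoded emoji would straddle the 200-byte cut-off
+    // and is therefore dropped in full.)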
+ test_partition_key!( + truncate_within_code_point_1, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}🍌", "A".repeat(194)))], + want_key = format!("{}#", "A".repeat(194)), + want_reversed_tags = [("a", prefix("A".repeat(194)))] + ); + test_partition_key!( + truncate_within_code_point_2, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}🍌", "A".repeat(195)))], + want_key = format!("{}#", "A".repeat(195)), + want_reversed_tags = [("a", prefix("A".repeat(195)))] + ); + test_partition_key!( + truncate_within_code_point_3, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}🍌", "A".repeat(196)))], + want_key = format!("{}#", "A".repeat(196)), + want_reversed_tags = [("a", prefix("A".repeat(196)))] + ); + + // A key ending in a unicode grapheme should never be split. + // + // This subtest generates a key of: + // + // `A..நிbananas` + // ^ cutoff + // + // Which when encoded, becomes: + // + // `A..நிbananas` (within a grapheme) + // ^ cutoff + // + // Therefore the entire grapheme (நி) should be removed from the truncated + // output. + // + // This is a conservative implementation, and may be relaxed in the future. + // + // This first test asserts that a grapheme can be included, and then + // subsequent tests increment the cut-off point by 1 byte each time. + test_partition_key!( + truncate_within_grapheme_0, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}நிbananas", "A".repeat(181)))], + want_key = format!("{}%E0%AE%A8%E0%AE%BF#", "A".repeat(181)), + want_reversed_tags = [("a", prefix(format!("{}நி", "A".repeat(181))))] + ); + test_partition_key!( + truncate_within_grapheme_1, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}நிbananas", "A".repeat(182)))], + want_key = format!("{}#", "A".repeat(182)), + want_reversed_tags = [("a", prefix("A".repeat(182)))] + ); + test_partition_key!( + truncate_within_grapheme_2, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}நிbananas", "A".repeat(183)))], + want_key = format!("{}#", "A".repeat(183)), + want_reversed_tags = [("a", prefix("A".repeat(183)))] + ); + test_partition_key!( + truncate_within_grapheme_3, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}நிbananas", "A".repeat(184)))], + want_key = format!("{}#", "A".repeat(184)), + want_reversed_tags = [("a", prefix("A".repeat(184)))] + ); + test_partition_key!( + truncate_within_grapheme_4, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}நிbananas", "A".repeat(185)))], + want_key = format!("{}#", "A".repeat(185)), + want_reversed_tags = [("a", prefix("A".repeat(185)))] + ); + test_partition_key!( + truncate_within_grapheme_5, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}நிbananas", "A".repeat(186)))], + want_key = format!("{}#", "A".repeat(186)), + want_reversed_tags = [("a", prefix("A".repeat(186)))] + ); + test_partition_key!( + truncate_within_grapheme_6, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}நிbananas", "A".repeat(187)))], + want_key = format!("{}#", "A".repeat(187)), + want_reversed_tags = [("a", prefix("A".repeat(187)))] + ); + test_partition_key!( + truncate_within_grapheme_7, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}நிbananas", "A".repeat(188)))], + want_key = format!("{}#", "A".repeat(188)), + want_reversed_tags = [("a", prefix("A".repeat(188)))] + ); + test_partition_key!( + truncate_within_grapheme_8, + template = 
[TemplatePart::TagValue("a")], + tags = [("a", format!("{}நிbananas", "A".repeat(189)))], + want_key = format!("{}#", "A".repeat(189)), + want_reversed_tags = [("a", prefix("A".repeat(189)))] + ); + test_partition_key!( + truncate_within_grapheme_9, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}நிbananas", "A".repeat(190)))], + want_key = format!("{}#", "A".repeat(190)), + want_reversed_tags = [("a", prefix("A".repeat(190)))] + ); + + // As above, but the grapheme is the last portion of the generated string + // (no trailing bananas). + test_partition_key!( + truncate_grapheme_identity, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}நி", "A".repeat(182)))], + want_key = format!("{}%E0%AE%A8%E0%AE%BF", "A".repeat(182)), + want_reversed_tags = [("a", identity(format!("{}நி", "A".repeat(182))))] + ); + + /// A test using an invalid strftime format string. + #[test] + fn test_invalid_strftime() { + let mut batch = MutableBatch::new(); + let mut writer = Writer::new(&mut batch, 1); + + writer.write_time("time", vec![1].into_iter()).unwrap(); + writer + .write_tag("region", Some(&[0b00000001]), vec!["bananas"].into_iter()) + .unwrap(); + writer.commit(); + + let template = [TemplatePart::TimeFormat("%3F")] + .into_iter() + .collect::>(); + let template = test_table_partition_override(template); + + let ret = partition_keys(&batch, template.parts()) + .map(|v| v.expect("non-identical consecutive keys")) + .collect::, _>>(); + + assert_matches!(ret, Err(PartitionKeyError::InvalidStrftime)); + } + + #[test] + #[should_panic( + expected = "partition template contains 9 parts, which exceeds the maximum of 8 parts" + )] + fn test_too_many_parts() { + let template = test_table_partition_override( + std::iter::repeat(TemplatePart::TagValue("bananas")) + .take(9) + .collect(), + ); + + let _ = partition_batch(&MutableBatch::new(), &template); + } + + // These values are arbitrarily chosen when building an input to the + // partitioner. + + // Arbitrary tag names are selected from this set of candidates (to ensure + // there's always some overlap, rather than truly random strings). + const TEST_TAG_NAME_SET: &[&str] = &["A", "B", "C", "D", "E", "F"]; + + // Arbitrary template parts are selected from this set. + const TEST_TEMPLATE_PARTS: &[TemplatePart<'static>] = &[ + TemplatePart::TimeFormat("%Y|%m|%d!-string"), + TemplatePart::TimeFormat("%Y|%m|%d!-%%bananas"), + TemplatePart::TimeFormat("%Y/%m/%d"), + TemplatePart::TimeFormat("%Y-%m-%d"), + TemplatePart::TagValue(""), + TemplatePart::TagValue("A"), + TemplatePart::TagValue("B"), + TemplatePart::TagValue("C"), + TemplatePart::TagValue("tags!"), + TemplatePart::TagValue("%tags!"), + TemplatePart::TagValue("my_tag"), + TemplatePart::TagValue("my|tag"), + TemplatePart::TagValue("%%%%|!!!!|"), + TemplatePart::Bucket("D", 10), + TemplatePart::Bucket("E", 100), + TemplatePart::Bucket("F", 1000), + ]; + + prop_compose! { + /// Yields a vector of up to [`MAXIMUM_NUMBER_OF_TEMPLATE_PARTS`] unique + /// template parts, chosen from [`TEST_TEMPLATE_PARTS`]. + fn arbitrary_template_parts()(set in proptest::collection::vec( + proptest::sample::select(TEST_TEMPLATE_PARTS), + (1, MAXIMUM_NUMBER_OF_TEMPLATE_PARTS) // Set size range + )) -> Vec> { + let mut set = set; + set.dedup_by(|a, b| format!("{a:?}") == format!("{b:?}")); + set + } + } + + prop_compose! { + /// Yield a HashMap of between 1 and 10 (column_name, random string + /// value) with tag names chosen from [`TEST_TAG_NAME_SET`]. 
+ fn arbitrary_tag_value_map()(v in proptest::collection::hash_map( + proptest::sample::select(TEST_TAG_NAME_SET).prop_map(ToString::to_string), + any::(), + (1, 10) // Set size range + )) -> HashMap { + v + } + } + + prop_compose! { + /// Yield a Vec containing an identical timestamp run of random length, + /// up to `max_run_len`, + fn arbitrary_timestamp_run(max_run_len: usize)(v in 0_i64..i64::MAX, run_len in 1..max_run_len) -> Vec { + let mut x = Vec::with_capacity(run_len); + x.resize(run_len, v); + x + } + } + + /// Yield a Vec of timestamp values that more accurately model real + /// timestamps than pure random selection. + /// + /// Runs of identical timestamps are generated with + /// [`arbitrary_timestamp_run()`], which are then shuffled to produce a list + /// of timestamps with limited repeats, sometimes consecutively. + fn arbitrary_timestamps() -> impl Strategy> { + proptest::collection::vec(arbitrary_timestamp_run(6), 10..100) + .prop_map(|v| v.into_iter().flatten().collect::>()) + .prop_shuffle() + } + + enum ExpectedColumnValue { + String(String), + TSRange(DateTime, DateTime), + Bucket(u32), + } + + impl ExpectedColumnValue { + fn expect_string(&self) -> &String { + match self { + Self::String(s) => s, + Self::TSRange(_, _) => panic!("expected string, got TS range"), + Self::Bucket(_) => panic!("expected string, got bucket id"), + } + } + + fn expect_ts_range(&self) -> (DateTime, DateTime) { + match self { + Self::String(_) => panic!("expected TS range, got string"), + Self::TSRange(b, e) => (*b, *e), + Self::Bucket(_) => panic!("expected TS range, got bucket id"), + } + } + + fn expect_bucket_id(&self) -> u32 { + match self { + Self::String(_) => panic!("expected bucket id, got string"), + Self::TSRange(_, _) => panic!("expected bucket id, got TS range"), + Self::Bucket(bucket_id) => *bucket_id, + } + } + } + + proptest! { + /// A property test that asserts a write comprised of an arbitrary + /// subset of [`TEST_TAG_NAME_SET`] with randomised values, that is + /// partitioned using a partitioning template arbitrarily selected from + /// [`TEST_TEMPLATE_PARTS`], can be reversed to the full set of tags + /// and/or hash-bucket IDs via [`build_column_values()`]. + #[test] + fn prop_reversible_mapping( + template in arbitrary_template_parts(), + tag_values in arbitrary_tag_value_map(), + ts in 0_i64..i64::MAX, + ) { + let mut batch = MutableBatch::new(); + let mut writer = Writer::new(&mut batch, 1); + + let template = template.clone().into_iter().collect::>(); + let template = test_table_partition_override(template); + + writer + .write_time("time", vec![ts].into_iter()) + .unwrap(); + + for (col, value) in &tag_values { + writer + .write_tag(col.as_str(), Some(&[0b00000001]), vec![value.as_str()].into_iter()) + .unwrap(); + } + + writer.commit(); + let keys: Vec<_> = generate_denormalised_keys(&batch, template.parts()) + .unwrap(); + assert_eq!(keys.len(), 1); + + // Reverse the encoding. + let reversed: Vec<(&str, ColumnValue<'_>)> = build_column_values(&template, &keys[0]).collect(); + + // Build the expected set of reversed tags by filtering out any + // NULL tags (preserving empty string values). + let ts = Utc.timestamp_nanos(ts); + let want_reversed: Vec<(&str, ExpectedColumnValue)> = template.parts().filter_map(|v| match v { + TemplatePart::TagValue(col_name) if tag_values.contains_key(col_name) => { + // This tag had a (potentially empty) value wrote and should + // appear in the reversed output. 
+ Some((col_name, ExpectedColumnValue::String(tag_values.get(col_name).unwrap().to_string()))) + } + TemplatePart::TimeFormat("%Y/%m/%d" | "%Y-%m-%d") => { + let begin = Utc.with_ymd_and_hms(ts.year(), ts.month(), ts.day(), 0, 0, 0).unwrap(); + let end = begin + Days::new(1); + Some((TIME_COLUMN_NAME, ExpectedColumnValue::TSRange(begin, end))) + } + TemplatePart::Bucket(col_name, num_buckets) if tag_values.contains_key(col_name) => { + // Hash-bucketing is not fully-reversible from value to + // tag-name (intentionally so, it makes it much simpler to + // implement). + // + // The test must assign buckets as they are when the + // partition key is rendered. + let want_bucket = BucketHasher::new(num_buckets).assign_bucket(tag_values.get(col_name).unwrap()); + Some((col_name, ExpectedColumnValue::Bucket(want_bucket))) + } + _ => None, + }).collect(); + + assert_eq!(want_reversed.len(), reversed.len()); + + for ((want_col, want_val), (got_col, got_val)) in want_reversed.iter().zip(reversed.iter()) { + assert_eq!(got_col, want_col, "column names differ"); + + match got_val { + ColumnValue::Identity(_) => { + // An identity is both equal to, and a prefix of, the + // original value. + let want_val = want_val.expect_string(); + assert_eq!(got_val, &want_val, "identity values differ"); + assert!( + got_val.is_prefix_match_of(want_val), + "prefix mismatch; {:?} is not a prefix of {:?}", + got_val, + want_val, + ); + }, + ColumnValue::Prefix(_) => { + let want_val = want_val.expect_string(); + assert!( + got_val.is_prefix_match_of(want_val), + "prefix mismatch; {:?} is not a prefix of {:?}", + got_val, + want_val, + ); + }, + ColumnValue::Datetime{..} => { + let (want_begin, want_end) = want_val.expect_ts_range(); + match got_val { + ColumnValue::Datetime{begin, end} => { + assert_eq!(want_begin, *begin); + assert_eq!(want_end, *end); + } + _ => panic!("expected datatime column value but got: {:?}", got_val) + } + }, + ColumnValue::Bucket(got_bucket_id) => { + let want_bucket_id = want_val.expect_bucket_id(); + assert_eq!(*got_bucket_id, want_bucket_id); + } + }; + } + } + + /// A property test that asserts the partitioner tolerates (does not + /// panic) randomised, potentially invalid strftime formatter strings. + #[test] + fn prop_arbitrary_strftime_format(fmt in any::()) { + let mut batch = MutableBatch::new(); + let mut writer = Writer::new(&mut batch, 1); + + // This sequence causes chrono's formatter to panic with a "do not + // use this" message... + // + // This is validated to not be part of the formatter (among other + // invalid sequences) when constructing a template from the user + // input/proto. + // + // Uniquely this causes a panic, whereas others do not - so it must + // be filtered out when fuzz-testing that invalid sequences do not + // cause a panic in the key generator. + prop_assume!(!fmt.contains("%#z")); + + // Generate a single time-based partitioning template with a + // randomised format string. 
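+            // The single fixed timestamp written below keeps the fuzzing
+            // focused on the format string rather than on the time value.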
+ let template = vec![ + TemplatePart::TimeFormat(&fmt), + ]; + let template = test_table_partition_override(template); + + // Timestamp: 2023-05-29T13:03:16Z + writer + .write_time("time", vec![1685365396931384064].into_iter()) + .unwrap(); + + writer + .write_tag("bananas", Some(&[0b00000001]), vec!["great"].into_iter()) + .unwrap(); + + writer.commit(); + let ret = partition_keys(&batch, template.parts()) + .map(|v| v.expect("non-identical consecutive keys")) + .collect::, _>>(); + + // The is allowed to succeed or fail under this test (but not + // panic), and the returned error/value must match certain + // properties: + match ret { + Ok(v) => { assert_eq!(v.len(), 1); }, + Err(e) => { assert_matches!(e, PartitionKeyError::InvalidStrftime); }, + } + } + + // Drives the strftime formatter through the "front door", using the + // same interface as a user would call to partition data. This validates + // the integration between the various formatters, range encoders, + // dedupe, etc. + #[test] + fn prop_strftime_integration( + times in arbitrary_timestamps(), + format in prop_oneof![ + Just("%Y-%m-%d"), // Default scheme + Just("%s") // Unix seconds, to drive increased cache miss rate in strftime formatter + ] + ) { + use std::fmt::Write; + + let mut batch = MutableBatch::new(); + let mut writer = Writer::new(&mut batch, times.len()); + let row_count = times.len(); + + let template = test_table_partition_override(vec![TemplatePart::TimeFormat(format)]); + + writer + .write_time("time", times.clone().into_iter()) + .unwrap(); + + writer.commit(); + + let fmt = StrftimeItems::new(format); + let iter = partition_batch(&batch, &template); + + let mut observed_rows = 0; + + // For each partition key and the calculated row range + for (key, range) in iter { + let key = key.unwrap(); + + observed_rows += range.len(); + + // Validate all rows in that range render to the same timestamp + // value as the partition key when using the same format, using + // a known-good formatter. + for ts in ×[range] { + // Generate the control string. + let mut control = String::new(); + let _ = write!( + control, + "{}", + Utc.timestamp_nanos(*ts) + .format_with_items(fmt.clone()) + ); + assert_eq!(control, key); + } + } + + assert_eq!(observed_rows, row_count); + } + } +} diff --git a/partition/src/strftime.rs b/partition/src/strftime.rs new file mode 100644 index 00000000000..bd5230035d7 --- /dev/null +++ b/partition/src/strftime.rs @@ -0,0 +1,415 @@ +use std::fmt::Write; + +use chrono::{format::StrftimeItems, TimeZone, Utc}; + +use crate::PartitionKeyError; + +use super::encode_key_part; + +/// The number of nanoseconds in 1 day, definitely recited from memory. +const DAY_NANOSECONDS: i64 = 86_400_000_000_000; + +/// The default YMD formatter spec. +const YMD_SPEC: &str = "%Y-%m-%d"; + +/// A FIFO ring buffer, holding `N` lazily initialised slots. +/// +/// This is optimised for low values of `N` (where N*T covers a few cache lines) +/// as it performs an O(n) linear search. +#[derive(Debug)] +struct RingBuffer { + buf: [Option; N], + + /// Index into to the last wrote value. + last_idx: usize, +} + +impl Default for RingBuffer +where + T: Default, +{ + fn default() -> Self { + Self { + buf: [(); N].map(|_| Default::default()), // default init for non-const type + last_idx: N - 1, + } + } +} + +impl RingBuffer +where + T: Default, +{ + /// Return a mutable reference to the next slot to be overwrote. This method + /// initialises the slot if it has not been previously used. 
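+    ///
+    /// Slots are recycled in FIFO order, so the slot handed back is always the
+    /// oldest (or a not-yet-used) entry in the buffer.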
+ /// + /// This is like an "insert" operation, but allows the caller to re-use the + /// contents of the slot to minimise allocations. + /// + /// This is an O(1) operation. + fn next_slot(&mut self) -> &mut T { + // Advance the next slot pointer + self.last_idx += 1; + self.last_idx %= N; + + let v = self.buf[self.last_idx].get_or_insert_with(Default::default); + + v + } + + /// Drop the last buffer entry. + /// + /// This may cause spurious cache misses due to the short-circuiting search + /// observing an empty element, potentially before non-empty elements. + fn drop_last(&mut self) { + self.buf[self.last_idx] = None; + } + + /// Find the first initialised slot that causes `F` to evaluate to true, + /// returning the slot contents. + /// + /// This is a O(n) linear search operation, which for small N can be as + /// fast, or faster, than a hashmap lookup by key. + fn find(&self, f: F) -> Option<&'_ T> + where + F: Fn(&T) -> bool, + { + for v in &self.buf { + let v = v.as_ref()?; + if f(v) { + return Some(v); + } + } + None + } +} + +/// A strftime-like formatter of epoch timestamps with nanosecond granularity. +/// +/// # Deferred Errors +/// +/// If the provided stftime formatter is invalid, an +/// [`PartitionKeyError::InvalidStrftime`] error is raised during the formatting +/// call to [`StrftimeFormatter::render()`] and not during initialisation. This +/// is a limitation of the underlying library. +/// +/// # Caching +/// +/// It is very common for batches of writes to contain multiple measurements +/// taken at the same timestamp; for example, a periodic scraper of metric +/// values will assign a single timestamp for the entire batch of observations. +/// +/// To leverage this reuse of timestamps, this type retains a cache of the 5 +/// most recently observed distinct timestamps to avoid recomputing the same +/// formatted string for each repeat occurrence. +/// +/// In the best case, this reduces N row formats down to a single format +/// operation, and in the worst case, it changes the memory overhead from "rows" +/// to "rows + 5" which amortises nicely as batch sizes increase. If more than 5 +/// timestamps are observed, the existing buffer allocations are reused when +/// computing the replacement values. +/// +/// # `YYYY-MM-DD` Reduction Specialisation +/// +/// The default (and therefore most common) formatting spec is "%Y-%m-%d", as +/// this is the IOx default partitioning template. The vast majority of writes +/// will utilise this format spec. +/// +/// Because this spec is so common, a special case optimisation is utilised for +/// it: for any given timestamp, first normalise the value by reducing the +/// precision such that the timestamp is rounded down to the nearest whole day +/// before further processing. +/// +/// This removes all the sub-day variance (hours, minutes, seconds, etc) from +/// the value, without changing the formatter output (it still produces the same +/// string). This in turn causes any timestamp from the same day to be a cache +/// hit with any prior value for the same day, regardless of "time" portion of +/// the timestamp. +/// +/// Combined with the above cache, this raises the cache hit rate to ~100% for +/// write batches that span less than 6 days, effectively amortising the cost of +/// timestamp formatting to O(1) for these very common batches. +#[derive(Debug)] +pub(super) struct StrftimeFormatter<'a> { + /// The strftime formatter definition. 
+ /// + /// NOTE: the value below is UNVALIDATED - if the input strftime format + /// contains invalid formatter directives, then the error is deferred until + /// formatting a timestamp. + format: StrftimeItems<'a>, + + /// As an optimisation, when this formatter is using the default YYYY-MM-DD + /// partitioning template, timestamps are normalised to per-day granularity, + /// preventing variances in the timestamp of less-than 1 day from causing a + /// miss in the cached "values". + /// + /// This optimisation massively increases the reuse of cached, pre-formatted + /// strings. + is_ymd_format: bool, + + /// A set of 5 most recently added timestamps, and the formatted string they + /// map to. + values: RingBuffer<5, (i64, String)>, + + /// The last observed timestamp. + /// + /// This value changes each time a timestamp is returned to the user, either + /// from the cache of pre-generated strings, or by generating a new one, and + /// MUST always track the last timestamp given to + /// [`StrftimeFormatter::render()`]. + last_ts: Option<i64>, +} + +impl<'a> StrftimeFormatter<'a> { + /// Initialise a new [`StrftimeFormatter`] with the given strftime-like + /// format string. + /// + /// The exact formatter specification is [documented here]. + /// + /// If the formatter contains an invalid spec, an error is raised when + /// formatting. + /// + /// [documented here]: + /// https://docs.rs/chrono/latest/chrono/format/strftime/index.html + pub(super) fn new(format: &'a str) -> Self { + let mut is_default_format = false; + if format == YMD_SPEC { + is_default_format = true; + } + + Self { + format: StrftimeItems::new(format), + is_ymd_format: is_default_format, + values: RingBuffer::default(), + last_ts: None, + } + } + + /// Format `timestamp` to the format spec provided during initialisation, + /// writing the result to `out`. + pub(super) fn render<W>(&mut self, timestamp: i64, mut out: W) -> Result<(), PartitionKeyError> + where + W: std::fmt::Write, + { + // Optionally apply the default format reduction optimisation. + let timestamp = self.maybe_reduce(timestamp); + + // Retain this timestamp as the last observed timestamp. + self.last_ts = Some(timestamp); + + // Check if this timestamp has already been rendered. + if let Some(v) = self.values.find(|(t, _v)| *t == timestamp) { + // It has! Re-use the existing formatted string. + out.write_str(&v.1)?; + return Ok(()); + } + + // Obtain a mutable reference to the next item to be replaced, re-using + // the string buffer within it to avoid allocating (or initialising it + // if it was not yet initialised). + let buf = self.values.next_slot(); + + // Reset the slot value + buf.0 = timestamp; + buf.1.clear(); + + // Format the timestamp value into the slot buffer. + if write!( + buf.1, + "{}", + Utc.timestamp_nanos(timestamp) + .format_with_items(self.format.clone()) // Cheap clone of refs + ) + .is_err() + { + // The string buffer may be empty, or contain partially rendered + // output before the error was raised. + // + // Remove this entry from the cache to prevent there being a mapping + // of `timestamp` to an empty or partially rendered string. + self.values.drop_last(); + return Err(PartitionKeyError::InvalidStrftime); + }; + + // Encode any reserved characters in this new string. + buf.1 = encode_key_part(&buf.1).to_string(); + + // Render this new value to the caller's buffer + out.write_str(&buf.1)?; + + Ok(()) + } + + /// Reduce the precision of the timestamp iff using the default "%Y-%m-%d" + /// formatter string, returning a value rounded down to the nearest whole day.
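A short, self-contained sketch (not part of this patch) of the day-reduction arithmetic described above, using the 2023-05-29T13:03:16Z timestamp from the earlier test to show how every timestamp within a day collapses to the same cached "%Y-%m-%d" entry:

```rust
/// Nanoseconds per day, mirroring the `DAY_NANOSECONDS` constant above.
const DAY_NANOSECONDS: i64 = 86_400_000_000_000;

/// Round a nanosecond-precision timestamp down to the start of its UTC day,
/// matching the behaviour of `maybe_reduce` for the default "%Y-%m-%d" spec.
fn reduce_to_day(timestamp: i64) -> i64 {
    if timestamp < DAY_NANOSECONDS {
        return timestamp;
    }
    timestamp - (timestamp % DAY_NANOSECONDS)
}

fn main() {
    // 2023-05-29T13:03:16Z, as used in the partitioner test above.
    let ts = 1_685_365_396_931_384_064_i64;
    // Reduces to 2023-05-29T00:00:00Z, so every row from that day re-uses the
    // same cached "2023-05-29" string.
    assert_eq!(reduce_to_day(ts), 1_685_318_400_000_000_000);
}
```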
+ /// + /// If the formatter is not this special-case value, `timestamp` is returned + /// unchanged. + fn maybe_reduce(&self, timestamp: i64) -> i64 { + if !self.is_ymd_format { + return timestamp; + } + // Don't map timestamps less than the value we would subtract. + if timestamp < DAY_NANOSECONDS { + return timestamp; + } + timestamp - (timestamp % DAY_NANOSECONDS) + } + + /// Returns true if the output of rendering `timestamp` will match the last + /// rendered timestamp, after optionally applying the precision reduction + /// optimisation. + pub(crate) fn equals_last(&self, timestamp: i64) -> bool { + // Optionally apply the default format reduction optimisation. + let timestamp = self.maybe_reduce(timestamp); + + self.last_ts.map(|v| v == timestamp).unwrap_or_default() + } +} + +#[cfg(test)] +mod tests { + use assert_matches::assert_matches; + use data_types::partition_template::{TablePartitionTemplateOverride, TemplatePart}; + use proptest::prelude::*; + + use super::*; + + #[test] + fn test_default_formatter() { + let template = TablePartitionTemplateOverride::default(); + let expect = template.parts().collect::>(); + + // If this assert fails (and it probably shouldn't!) then you may want + // to consider changing the special case optimisation above. + assert_matches!(expect.as_slice(), &[TemplatePart::TimeFormat(YMD_SPEC)]); + } + + #[test] + fn test_never_empty() { + let mut fmt = StrftimeFormatter::new(""); + + let mut buf = String::new(); + fmt.render(42, &mut buf).expect("should render string"); + assert!(!buf.is_empty()); + assert_eq!(buf, "^"); + } + + #[test] + fn test_incomplete_formatter() { + let mut fmt = StrftimeFormatter::new("%"); + + let mut buf = String::new(); + let got = fmt.render(42, &mut buf); + assert_matches!(got, Err(PartitionKeyError::InvalidStrftime)); + } + + #[test] + fn test_incomplete_formatter_removes_bad_mapping() { + let mut fmt = StrftimeFormatter::new("%s"); + + let mut buf = String::new(); + fmt.render(42, &mut buf).unwrap(); + + assert_matches!( + fmt.values.buf.as_slice(), + [Some((42, _)), None, None, None, None] + ); + + // This obviously isn't possible through normal usage, but to trigger + // the "failed to render" code path, reach in and tweak the formatter to + // cause it to fail. + fmt.format = StrftimeItems::new("%"); + + // Trigger the "cannot format" code path + fmt.render(4242, &mut buf).expect_err("invalid formatter"); + + // And ensure the ring buffer was left in a clean state + assert_matches!( + fmt.values.buf.as_slice(), + [Some((42, _)), None, None, None, None] + ); + } + + #[test] + fn test_uses_ring_buffer() { + let mut fmt = StrftimeFormatter::new("%H"); + let mut buf = String::new(); + + fmt.render(42, &mut buf).expect("should render string"); + fmt.render(42, &mut buf).expect("should render string"); + fmt.render(42, &mut buf).expect("should render string"); + fmt.render(12345, &mut buf).expect("should render string"); + fmt.render(42, &mut buf).expect("should render string"); + + // Assert the above repetitive values were deduped in the cache. + assert_matches!( + fmt.values.buf.as_slice(), + [Some((42, _)), Some((12345, _)), None, None, None] + ); + assert_eq!(fmt.values.last_idx, 1); + } + + const FORMATTER_SPEC_PARTS: &[&str] = &[ + "%Y", "%m", "%d", "%H", "%m", "%.9f", "%r", "%+", "%t", "%n", "%A", "%c", + ]; + + prop_compose! { + /// Yield an arbitrary formatter spec selected from + /// [`FORMATTER_SPEC_PARTS`] delimited by a random character. 
+ fn arbitrary_formatter_spec()( + delimiter in any::(), + v in proptest::collection::vec( + proptest::sample::select(FORMATTER_SPEC_PARTS).prop_map(ToString::to_string), + (0, 10) // Set size range + )) -> String { + v.join(&delimiter.to_string()) + } + } + + fn default_formatter_spec() -> impl Strategy { + Just(YMD_SPEC.to_string()) + } + + proptest! { + /// The [`StrftimeFormatter`] is a glorified wrapper around chrono's + /// formatter, therefore this test asserts the following property: + /// + /// For any timestamp and formatter, the output of this type must + /// match the output of chrono's formatter, after key encoding. + /// + /// Validating this asserts correctness of the wrapper itself, assuming + /// chrono's formatter produces correct output. Note the encoding is + /// tested in the actual partitioner module. + #[test] + fn prop_differential_validation( + timestamps in prop::collection::vec(any::(), 1..100), + format in prop_oneof![arbitrary_formatter_spec(), default_formatter_spec(), any::()], + ) { + let mut fmt = StrftimeFormatter::new(&format); + let items = StrftimeItems::new(&format); + + for ts in timestamps { + // Generate the control string. + let mut control = String::new(); + let _ = write!( + control, + "{}", + Utc.timestamp_nanos(ts) + .format_with_items(items.clone()) + ); + let control = encode_key_part(&control); + + // Generate the test string. + let mut test = String::new(); + if fmt.render(ts, &mut test).is_err() { + // Any error results in the key not being used, so any + // differences are inconsequential. + continue; + } + + assert_eq!(control, test); + } + } + } +} diff --git a/partition/src/traits.rs b/partition/src/traits.rs new file mode 100644 index 00000000000..439e2a67ef9 --- /dev/null +++ b/partition/src/traits.rs @@ -0,0 +1,61 @@ +mod mutable_batch; +mod record_batch; + +use thiserror::Error; + +/// An error accessing the time column of a batch. +#[allow(missing_copy_implementations)] +#[derive(Debug, Error, PartialEq, Eq, Clone)] +pub enum TimeColumnError { + /// The batch did not have a time column. + #[error("No time column found")] + NotFound, +} + +/// The behavior a column in a batch needs to have to be partitioned +pub trait PartitioningColumn: std::fmt::Debug { + /// The type of a thing that can be used to identify whether a tag has changed or not; may or + /// may not be the actual tag + type TagIdentityKey: ?Sized + PartialEq; + + /// Whether the value at the given row index is valid or NULL + fn is_valid(&self, idx: usize) -> bool; + + /// The raw packed validity bytes. + /// + /// The validity mask MUST follow the Arrow specification for validity masks + /// (). + fn valid_bytes(&self) -> &[u8]; + + /// Get the identity of the tag at the given row index. + /// + /// The return value is only valid if `is_valid(idx)` for the same `idx` + /// returns true. 
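To illustrate what these traits enable (an illustrative sketch only, not code from this patch; the import path and visibility of `Batch`/`TimeColumnError` are assumptions), partitioning helpers can be written once and then used with both `MutableBatch` and Arrow `RecordBatch` via the impls added below:

```rust
// Hypothetical consumer; assumes the traits are re-exported at these paths.
use partition::{Batch, TimeColumnError};

/// Return the (min, max) timestamps of a batch, generically over any type
/// implementing `Batch` (e.g. `MutableBatch` or `RecordBatch`).
fn time_range<B: Batch>(batch: &B) -> Result<Option<(i64, i64)>, TimeColumnError> {
    let times = batch.time_column()?;
    let min = times.iter().copied().min();
    let max = times.iter().copied().max();
    Ok(min.zip(max))
}
```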
+ fn get_tag_identity_key(&self, idx: usize) -> Option<&Self::TagIdentityKey>; + + /// Get the value of the tag that has the given identity + fn get_tag_value<'a>(&'a self, tag_identity_key: &'a Self::TagIdentityKey) -> Option<&'a str>; + + /// A string describing this column's data type; used in error messages + fn type_description(&self) -> String; +} + +/// Behavior of a batch of data used by partitioning code +pub trait Batch { + /// The type of this batch's columns + type Column: PartitioningColumn; + + /// How many rows are in this batch + fn num_rows(&self) -> usize; + + /// The column in the batch with the given name, if any + fn column(&self, column: &str) -> Option<&Self::Column>; + + /// Return the values in the time column in this batch. Return an error if the batch has no + /// time column. + /// + /// # Panics + /// + /// If a time column exists but its data isn't the expected type, this function will panic. + fn time_column(&self) -> Result<&[i64], TimeColumnError>; +} diff --git a/partition/src/traits/mutable_batch.rs b/partition/src/traits/mutable_batch.rs new file mode 100644 index 00000000000..981740df4dc --- /dev/null +++ b/partition/src/traits/mutable_batch.rs @@ -0,0 +1,60 @@ +use super::{Batch, PartitioningColumn, TimeColumnError}; +use mutable_batch::{ + column::{Column as MutableBatchColumn, ColumnData}, + MutableBatch, +}; +use schema::TIME_COLUMN_NAME; + +impl PartitioningColumn for MutableBatchColumn { + type TagIdentityKey = i32; + + fn is_valid(&self, idx: usize) -> bool { + self.valid_mask().get(idx) + } + + fn valid_bytes(&self) -> &[u8] { + self.valid_mask().bytes() + } + + fn get_tag_identity_key(&self, idx: usize) -> Option<&Self::TagIdentityKey> { + debug_assert!(PartitioningColumn::is_valid(self, idx)); + match self.data() { + ColumnData::Tag(col_data, _, _) => Some(&col_data[idx]), + _ => None, + } + } + + fn get_tag_value<'a>(&'a self, tag_identity_key: &'a Self::TagIdentityKey) -> Option<&'a str> { + match self.data() { + ColumnData::Tag(_, dictionary, _) => dictionary.lookup_id(*tag_identity_key), + _ => None, + } + } + + fn type_description(&self) -> String { + self.influx_type().to_string() + } +} + +impl Batch for MutableBatch { + type Column = MutableBatchColumn; + + fn num_rows(&self) -> usize { + self.rows() + } + + fn column(&self, column: &str) -> Option<&Self::Column> { + self.column(column).ok() + } + + fn time_column(&self) -> Result<&[i64], TimeColumnError> { + let time_column = self + .column(TIME_COLUMN_NAME) + .map_err(|_| TimeColumnError::NotFound)?; + + match &time_column.data() { + ColumnData::I64(col_data, _) => Ok(col_data), + x => unreachable!("expected i64 got {}", x), + } + } +} diff --git a/partition/src/traits/record_batch.rs b/partition/src/traits/record_batch.rs new file mode 100644 index 00000000000..57f0dff9cea --- /dev/null +++ b/partition/src/traits/record_batch.rs @@ -0,0 +1,82 @@ +use super::{Batch, PartitioningColumn, TimeColumnError}; +use arrow::{ + array::{Array, DictionaryArray, StringArray, TimestampNanosecondArray}, + datatypes::{DataType, Int32Type}, + record_batch::RecordBatch, +}; +use schema::TIME_COLUMN_NAME; +use std::sync::Arc; + +impl PartitioningColumn for Arc { + type TagIdentityKey = str; + + fn is_valid(&self, idx: usize) -> bool { + Array::is_valid(&self, idx) + } + + fn valid_bytes(&self) -> &[u8] { + self.nulls() + .expect("this RecordBatch's Array should be nullable") + .validity() + } + + fn get_tag_identity_key(&self, idx: usize) -> Option<&Self::TagIdentityKey> { + 
debug_assert!(PartitioningColumn::is_valid(self, idx)); + match self.data_type() { + DataType::Utf8 => self + .as_any() + .downcast_ref::() + .map(|col_data| col_data.value(idx)), + DataType::Dictionary(key, value) + if key.as_ref() == &DataType::Int32 && value.as_ref() == &DataType::Utf8 => + { + let dict = self + .as_any() + .downcast_ref::>() + .expect("should have gotten a DictionaryArray"); + + let values = dict + .values() + .as_any() + .downcast_ref::() + .expect("should have gotten a StringArray"); + Some(values.value(dict.key(idx)?)) + } + _ => None, + } + } + + fn get_tag_value<'a>(&'a self, tag_identity_key: &'a Self::TagIdentityKey) -> Option<&'a str> { + Some(tag_identity_key) + } + + fn type_description(&self) -> String { + self.data_type().to_string() + } +} + +impl Batch for RecordBatch { + type Column = Arc; + + fn num_rows(&self) -> usize { + self.num_rows() + } + + fn column(&self, column: &str) -> Option<&Self::Column> { + self.column_by_name(column) + } + + fn time_column(&self) -> Result<&[i64], TimeColumnError> { + let time_column = self + .column_by_name(TIME_COLUMN_NAME) + .ok_or(TimeColumnError::NotFound)?; + + Ok(time_column + .as_any() + .downcast_ref::() + .expect("time column was an unexpected type") + .values() + .inner() + .typed_data()) + } +} diff --git a/predicate/Cargo.toml b/predicate/Cargo.toml index ffd92b17078..5e5c828deb9 100644 --- a/predicate/Cargo.toml +++ b/predicate/Cargo.toml @@ -5,18 +5,21 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] -arrow = { workspace = true, features = ["prettyprint"] } +arrow = { workspace = true } chrono = { version = "0.4", default-features = false } data_types = { path = "../data_types" } datafusion = { workspace = true } datafusion_util = { path = "../datafusion_util" } -itertools = "0.11" +itertools = "0.12" observability_deps = { path = "../observability_deps" } query_functions = { path = "../query_functions"} schema = { path = "../schema" } -snafu = "0.7" -sqlparser = "0.37.0" +snafu = "0.8" +sqlparser = { workspace = true } workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] diff --git a/predicate/src/delete_expr.rs b/predicate/src/delete_expr.rs index de7b1916464..fc241f0a032 100644 --- a/predicate/src/delete_expr.rs +++ b/predicate/src/delete_expr.rs @@ -126,9 +126,9 @@ pub(crate) fn df_to_scalar( #[cfg(test)] mod tests { - use std::{ops::Not, sync::Arc}; + use std::ops::Not; - use arrow::datatypes::Field; + use arrow::datatypes::DataType; use test_helpers::assert_contains; use super::*; @@ -194,24 +194,17 @@ mod tests { #[test] fn test_unsupported_scalar_value() { - let scalar = datafusion::scalar::ScalarValue::List( - Some(vec![]), - Arc::new(Field::new( - "field", - arrow::datatypes::DataType::Float64, - true, - )), - ); + let array = datafusion::scalar::ScalarValue::new_list(&[], &DataType::Float64); + let scalar = datafusion::scalar::ScalarValue::List(array); let res = df_to_scalar(scalar); assert_contains!(res.unwrap_err().to_string(), "unsupported scalar value:"); } #[test] fn test_unsupported_scalar_value_in_expr() { - let expr = col("foo").eq(lit(datafusion::scalar::ScalarValue::new_list( - Some(vec![]), - arrow::datatypes::DataType::Float64, - ))); + let arr = + datafusion::scalar::ScalarValue::new_list(&[], &arrow::datatypes::DataType::Float64); + let expr = col("foo").eq(lit(datafusion::scalar::ScalarValue::List(arr))); let res = df_to_expr(expr); 
assert_contains!(res.unwrap_err().to_string(), "unsupported scalar value:"); } diff --git a/predicate/src/lib.rs b/predicate/src/lib.rs index 16fa16d026c..5dd9591ce60 100644 --- a/predicate/src/lib.rs +++ b/predicate/src/lib.rs @@ -24,9 +24,9 @@ use datafusion::{ common::tree_node::{TreeNodeVisitor, VisitRecursion}, error::DataFusionError, logical_expr::{binary_expr, BinaryExpr}, - prelude::{col, lit_timestamp_nano, Expr}, + prelude::{col, Expr}, }; -use datafusion_util::{make_range_expr, AsExpr}; +use datafusion_util::{lit_timestamptz_nano, make_range_expr, AsExpr}; use observability_deps::tracing::debug; use rpc_predicate::VALUE_COLUMN_NAME; use schema::TIME_COLUMN_NAME; @@ -188,8 +188,8 @@ impl Predicate { // time_expr = NOT(start <= time_range <= end) // Equivalent to: (time < start OR time > end) let time_expr = col(TIME_COLUMN_NAME) - .lt(lit_timestamp_nano(range.start())) - .or(col(TIME_COLUMN_NAME).gt(lit_timestamp_nano(range.end()))); + .lt(lit_timestamptz_nano(range.start())) + .or(col(TIME_COLUMN_NAME).gt(lit_timestamptz_nano(range.end()))); match expr { None => expr = Some(time_expr), @@ -301,7 +301,7 @@ impl Predicate { /// Add an exprestion "time > retention_time" pub fn with_retention(mut self, retention_time: i64) -> Self { - let expr = col(TIME_COLUMN_NAME).gt(lit_timestamp_nano(retention_time)); + let expr = col(TIME_COLUMN_NAME).gt(lit_timestamptz_nano(retention_time)); self.exprs.push(expr); self } @@ -458,19 +458,14 @@ impl TreeNodeVisitor for RowBasedVisitor { | Expr::Not(_) | Expr::OuterReferenceColumn(_, _) | Expr::Placeholder { .. } - | Expr::QualifiedWildcard { .. } | Expr::ScalarFunction { .. } | Expr::ScalarSubquery(_) - | Expr::ScalarUDF { .. } | Expr::ScalarVariable(_, _) | Expr::SimilarTo { .. } | Expr::Sort { .. } | Expr::TryCast { .. } - | Expr::Wildcard => Ok(VisitRecursion::Continue), - Expr::AggregateFunction { .. } - | Expr::AggregateUDF { .. } - | Expr::GroupingSet(_) - | Expr::WindowFunction { .. } => { + | Expr::Wildcard { .. } => Ok(VisitRecursion::Continue), + Expr::AggregateFunction { .. } | Expr::GroupingSet(_) | Expr::WindowFunction { .. } => { self.row_based = false; Ok(VisitRecursion::Stop) } diff --git a/predicate/src/rpc_predicate/column_rewrite.rs b/predicate/src/rpc_predicate/column_rewrite.rs index c58914fa957..a4cdf72016c 100644 --- a/predicate/src/rpc_predicate/column_rewrite.rs +++ b/predicate/src/rpc_predicate/column_rewrite.rs @@ -6,7 +6,10 @@ use schema::{InfluxColumnType, Schema}; /// Logic for rewriting expressions from influxrpc that reference non /// existent columns, or columns that are not tags, to NULL. -pub fn missing_tag_to_null(schema: &Schema, expr: Expr) -> DataFusionResult> { +pub(crate) fn missing_tag_to_null( + schema: &Schema, + expr: Expr, +) -> DataFusionResult> { Ok(match expr { Expr::Column(col) if !tag_column_exists(schema, &col)? 
=> Transformed::Yes(lit_null()), expr => Transformed::No(expr), diff --git a/predicate/src/rpc_predicate/field_rewrite.rs b/predicate/src/rpc_predicate/field_rewrite.rs index bcf0299196f..94cc4db4138 100644 --- a/predicate/src/rpc_predicate/field_rewrite.rs +++ b/predicate/src/rpc_predicate/field_rewrite.rs @@ -7,7 +7,7 @@ use arrow::record_batch::RecordBatch; use datafusion::common::tree_node::{TreeNode, TreeNodeVisitor, VisitRecursion}; use datafusion::common::DFSchema; use datafusion::error::{DataFusionError, Result as DataFusionResult}; -use datafusion::optimizer::utils::split_conjunction_owned; +use datafusion::logical_expr::utils::split_conjunction_owned; use datafusion::physical_expr::create_physical_expr; use datafusion::physical_expr::execution_props::ExecutionProps; use datafusion::physical_plan::ColumnarValue; diff --git a/query_functions/Cargo.toml b/query_functions/Cargo.toml index 5fec6b2e46d..4585bad1218 100644 --- a/query_functions/Cargo.toml +++ b/query_functions/Cargo.toml @@ -5,18 +5,21 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] -arrow = { workspace = true, features = ["prettyprint"] } +arrow = { workspace = true } chrono = { version = "0.4", default-features = false } datafusion = { workspace = true } once_cell = "1" regex = "1" -regex-syntax = "0.7.4" +regex-syntax = "0.8.1" schema = { path = "../schema" } -snafu = "0.7" +snafu = "0.8" workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] -itertools = "0.11.0" -tokio = { version = "1.32", features = ["macros", "parking_lot"] } datafusion_util = { path = "../datafusion_util" } +itertools = "0.12.0" +tokio = { version = "1.35", features = ["macros", "parking_lot"] } diff --git a/query_functions/src/coalesce_struct.rs b/query_functions/src/coalesce_struct.rs index 0892e721e14..b33920db4d9 100644 --- a/query_functions/src/coalesce_struct.rs +++ b/query_functions/src/coalesce_struct.rs @@ -40,7 +40,7 @@ //! d: {a: 2, b: 3}, //! } //! ``` -use std::sync::Arc; +use std::{any::Any, sync::Arc}; use arrow::{ array::{Array, StructArray}, @@ -49,10 +49,8 @@ use arrow::{ }; use datafusion::{ common::cast::as_struct_array, - error::DataFusionError, - logical_expr::{ - ReturnTypeFunction, ScalarFunctionImplementation, ScalarUDF, Signature, Volatility, - }, + error::{DataFusionError, Result}, + logical_expr::{ScalarUDF, ScalarUDFImpl, Signature, Volatility}, physical_plan::ColumnarValue, prelude::Expr, scalar::ScalarValue, @@ -62,11 +60,25 @@ use once_cell::sync::Lazy; /// The name of the `coalesce_struct` UDF given to DataFusion. pub const COALESCE_STRUCT_UDF_NAME: &str = "coalesce_struct"; -/// Implementation of `coalesce_struct`. -/// -/// See [module-level docs](self) for more information. 
-pub static COALESCE_STRUCT_UDF: Lazy> = Lazy::new(|| { - let return_type: ReturnTypeFunction = Arc::new(move |arg_types| { +#[derive(Debug)] +struct CoalesceStructUDF { + signature: Signature, +} + +impl ScalarUDFImpl for CoalesceStructUDF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + COALESCE_STRUCT_UDF_NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { if arg_types.is_empty() { return Err(DataFusionError::Plan(format!( "{COALESCE_STRUCT_UDF_NAME} expects at least 1 argument" @@ -83,10 +95,10 @@ pub static COALESCE_STRUCT_UDF: Lazy> = Lazy::new(|| { } } - Ok(Arc::new(first_dt.clone())) - }); + Ok(first_dt.clone()) + } - let fun: ScalarFunctionImplementation = Arc::new(move |args: &[ColumnarValue]| { + fn invoke(&self, args: &[ColumnarValue]) -> Result { #[allow(clippy::manual_try_fold)] args.iter().enumerate().fold(Ok(None), |accu, (pos, arg)| { let Some(accu) = accu? else {return Ok(Some(arg.clone()))}; @@ -106,11 +118,11 @@ pub static COALESCE_STRUCT_UDF: Lazy> = Lazy::new(|| { return Ok(Some(ColumnarValue::Scalar(scalar_coalesce_struct(scalar1, scalar2)))); } (ColumnarValue::Scalar(s), ColumnarValue::Array(array2)) => { - let array1 = s.to_array_of_size(array2.len()); + let array1 = s.to_array_of_size(array2.len())?; (array1, Arc::clone(array2)) } (ColumnarValue::Array(array1), ColumnarValue::Scalar(s)) => { - let array2 = s.to_array_of_size(array1.len()); + let array2 = s.to_array_of_size(array1.len())?; (array1, array2) } (ColumnarValue::Array(array1), ColumnarValue::Array(array2)) => { @@ -123,14 +135,16 @@ pub static COALESCE_STRUCT_UDF: Lazy> = Lazy::new(|| { })?.ok_or_else(|| DataFusionError::Plan(format!( "{COALESCE_STRUCT_UDF_NAME} expects at least 1 argument" ))) - }); - - Arc::new(ScalarUDF::new( - COALESCE_STRUCT_UDF_NAME, - &Signature::variadic_any(Volatility::Immutable), - &return_type, - &fun, - )) + } +} + +/// Implementation of `coalesce_struct`. +/// +/// See [module-level docs](self) for more information. +pub static COALESCE_STRUCT_UDF: Lazy> = Lazy::new(|| { + Arc::new(ScalarUDF::from(CoalesceStructUDF { + signature: Signature::variadic_any(Volatility::Immutable), + })) }); /// Recursively fold [`Array`]s. @@ -181,10 +195,7 @@ fn scalar_coalesce_struct(scalar1: ScalarValue, scalar2: &ScalarValue) -> Scalar /// /// See [module-level docs](self) for more information. 
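The recurring pattern in this upgrade is replacing `ScalarUDF::new` plus the `ReturnTypeFunction`/`ScalarFunctionImplementation` closures with a type implementing `ScalarUDFImpl`, wrapped via `ScalarUDF::from`. A condensed sketch of that shape (illustrative only; the "identity" UDF below is made up and not part of this patch):

```rust
use std::{any::Any, sync::Arc};

use arrow::datatypes::DataType;
use datafusion::{
    error::Result,
    logical_expr::{ScalarUDF, ScalarUDFImpl, Signature, Volatility},
    physical_plan::ColumnarValue,
};

#[derive(Debug)]
struct IdentityUdf {
    signature: Signature,
}

impl ScalarUDFImpl for IdentityUdf {
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn name(&self) -> &str {
        "identity" // hypothetical UDF name
    }

    fn signature(&self) -> &Signature {
        &self.signature
    }

    // The return type is computed from the argument types, replacing the old
    // `ReturnTypeFunction` closure.
    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
        Ok(arg_types[0].clone())
    }

    // The body replaces the old `ScalarFunctionImplementation` closure; here
    // it simply passes the single argument through unchanged.
    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
        Ok(args[0].clone())
    }
}

fn identity_udf() -> Arc<ScalarUDF> {
    Arc::new(ScalarUDF::from(IdentityUdf {
        signature: Signature::any(1, Volatility::Immutable),
    }))
}
```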
pub fn coalesce_struct(args: Vec) -> Expr { - Expr::ScalarUDF(datafusion::logical_expr::expr::ScalarUDF { - fun: Arc::clone(&COALESCE_STRUCT_UDF), - args, - }) + COALESCE_STRUCT_UDF.call(args) } #[cfg(test)] @@ -193,13 +204,13 @@ mod tests { datatypes::{Field, Fields, Schema}, record_batch::RecordBatch, }; + use datafusion::prelude::SessionContext; use datafusion::{ assert_batches_eq, common::assert_contains, prelude::{col, lit}, scalar::ScalarValue, }; - use datafusion_util::context_with_table; use super::*; @@ -217,9 +228,9 @@ mod tests { assert_case_ok( [ - ColumnarValue::Array(ScalarValue::UInt64(None).to_array()), - ColumnarValue::Array(ScalarValue::UInt64(Some(1)).to_array()), - ColumnarValue::Array(ScalarValue::UInt64(Some(2)).to_array()), + ColumnarValue::Array(ScalarValue::UInt64(None).to_array().unwrap()), + ColumnarValue::Array(ScalarValue::UInt64(Some(1)).to_array().unwrap()), + ColumnarValue::Array(ScalarValue::UInt64(Some(2)).to_array().unwrap()), ], &DataType::UInt64, ["+-----+", "| out |", "+-----+", "| 1 |", "+-----+"], @@ -228,7 +239,9 @@ mod tests { assert_case_ok( [ColumnarValue::Array( - ScalarValue::Struct(None, fields.clone()).to_array(), + ScalarValue::Struct(None, fields.clone()) + .to_array() + .unwrap(), )], &dt, ["+-----+", "| out |", "+-----+", "| |", "+-----+"], @@ -237,7 +250,11 @@ mod tests { assert_case_ok( [ - ColumnarValue::Array(ScalarValue::Struct(None, fields.clone()).to_array()), + ColumnarValue::Array( + ScalarValue::Struct(None, fields.clone()) + .to_array() + .unwrap(), + ), ColumnarValue::Array( ScalarValue::Struct( Some(vec![ @@ -246,9 +263,14 @@ mod tests { ]), fields.clone(), ) - .to_array(), + .to_array() + .unwrap(), + ), + ColumnarValue::Array( + ScalarValue::Struct(None, fields.clone()) + .to_array() + .unwrap(), ), - ColumnarValue::Array(ScalarValue::Struct(None, fields.clone()).to_array()), ColumnarValue::Array( ScalarValue::Struct( Some(vec![ @@ -263,7 +285,8 @@ mod tests { ]), fields.clone(), ) - .to_array(), + .to_array() + .unwrap(), ), ], &dt, @@ -302,7 +325,11 @@ mod tests { ]), fields.clone(), )), - ColumnarValue::Array(ScalarValue::Struct(None, fields.clone()).to_array()), + ColumnarValue::Array( + ScalarValue::Struct(None, fields.clone()) + .to_array() + .unwrap(), + ), ], &dt, [ @@ -323,21 +350,21 @@ mod tests { .await; assert_case_err( - [ColumnarValue::Array(ScalarValue::Struct(None, fields.clone()).to_array()), ColumnarValue::Array(ScalarValue::Struct(None, fields_b.clone()).to_array())], + [ColumnarValue::Array(ScalarValue::Struct(None, fields.clone()).to_array().unwrap()), ColumnarValue::Array(ScalarValue::Struct(None, fields_b.clone()).to_array().unwrap())], &dt, "Error during planning: coalesce_struct expects all arguments to have the same type, but first arg is" ) .await; assert_case_err( - [ColumnarValue::Array(ScalarValue::Struct(None, fields.clone()).to_array()), ColumnarValue::Scalar(ScalarValue::Struct(None, fields_b.clone()))], + [ColumnarValue::Array(ScalarValue::Struct(None, fields.clone()).to_array().unwrap()), ColumnarValue::Scalar(ScalarValue::Struct(None, fields_b.clone()))], &dt, "Error during planning: coalesce_struct expects all arguments to have the same type, but first arg is" ) .await; assert_case_err( - [ColumnarValue::Scalar(ScalarValue::Struct(None, fields.clone())), ColumnarValue::Array(ScalarValue::Struct(None, fields_b.clone()).to_array())], + [ColumnarValue::Scalar(ScalarValue::Struct(None, fields.clone())), ColumnarValue::Array(ScalarValue::Struct(None, fields_b.clone()).to_array().unwrap())], 
&dt, "Error during planning: coalesce_struct expects all arguments to have the same type, but first arg is" ) @@ -391,7 +418,8 @@ mod tests { RecordBatch::try_from_iter(cols.into_iter())? }; - let ctx = context_with_table(rb); + let ctx = SessionContext::new(); + ctx.register_batch("t", rb).unwrap(); let df = ctx.table("t").await?; let df = df.select(vec![coalesce_struct( vals.iter() diff --git a/query_functions/src/gapfill.rs b/query_functions/src/gapfill.rs index fe288f10320..a47fececf77 100644 --- a/query_functions/src/gapfill.rs +++ b/query_functions/src/gapfill.rs @@ -22,11 +22,11 @@ use std::sync::Arc; use arrow::datatypes::{DataType, Field, TimeUnit}; use datafusion::{ - error::DataFusionError, + error::{DataFusionError, Result}, logical_expr::{ - BuiltinScalarFunction, ReturnTypeFunction, ScalarFunctionImplementation, ScalarUDF, - Signature, TypeSignature, Volatility, + BuiltinScalarFunction, ScalarUDF, ScalarUDFImpl, Signature, TypeSignature, Volatility, }, + physical_plan::ColumnarValue, }; use once_cell::sync::Lazy; use schema::InfluxFieldType; @@ -34,6 +34,35 @@ use schema::InfluxFieldType; /// The name of the date_bin_gapfill UDF given to DataFusion. pub const DATE_BIN_GAPFILL_UDF_NAME: &str = "date_bin_gapfill"; +#[derive(Debug)] +struct DateBinGapFillUDF { + signature: Signature, +} + +impl ScalarUDFImpl for DateBinGapFillUDF { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn name(&self) -> &str { + DATE_BIN_GAPFILL_UDF_NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)) + } + + fn invoke(&self, _args: &[ColumnarValue]) -> Result { + Err(DataFusionError::NotImplemented(format!( + "{DATE_BIN_GAPFILL_UDF_NAME} is not yet implemented" + ))) + } +} + /// (Non-)Implementation of date_bin_gapfill. /// This function takes arguments identical to `date_bin()` but /// works in conjunction with the logical optimizer rule @@ -45,19 +74,48 @@ pub(crate) static DATE_BIN_GAPFILL: Lazy> = Lazy::new(|| { // We don't want this to be optimized away before we can give a helpful error message signatures.volatility = Volatility::Volatile; - let return_type_fn: ReturnTypeFunction = - Arc::new(|_| Ok(Arc::new(DataType::Timestamp(TimeUnit::Nanosecond, None)))); - Arc::new(ScalarUDF::new( - DATE_BIN_GAPFILL_UDF_NAME, - &signatures, - &return_type_fn, - &unimplemented_scalar_impl(DATE_BIN_GAPFILL_UDF_NAME), - )) + Arc::new(ScalarUDF::from(DateBinGapFillUDF { + signature: signatures, + })) }); /// The name of the locf UDF given to DataFusion. pub const LOCF_UDF_NAME: &str = "locf"; +#[derive(Debug)] +struct LocfUDF { + signature: Signature, +} + +impl ScalarUDFImpl for LocfUDF { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn name(&self) -> &str { + LOCF_UDF_NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + if arg_types.is_empty() { + return Err(DataFusionError::Plan(format!( + "{LOCF_UDF_NAME} should have at least 1 argument" + ))); + } + Ok(arg_types[0].clone()) + } + + fn invoke(&self, _args: &[ColumnarValue]) -> Result { + Err(DataFusionError::NotImplemented(format!( + "{LOCF_UDF_NAME} is not yet implemented" + ))) + } +} + /// (Non-)Implementation of locf. /// This function takes a single argument of any type and /// produces a value of the same type. 
It is @@ -66,18 +124,48 @@ pub const LOCF_UDF_NAME: &str = "locf"; /// an implementation since it will be consumed by the logical optimizer rule /// `HandleGapFill`. pub(crate) static LOCF: Lazy> = Lazy::new(|| { - let return_type_fn: ReturnTypeFunction = Arc::new(|args| Ok(Arc::new(args[0].clone()))); - Arc::new(ScalarUDF::new( - LOCF_UDF_NAME, - &Signature::any(1, Volatility::Volatile), - &return_type_fn, - &unimplemented_scalar_impl(LOCF_UDF_NAME), - )) + Arc::new(ScalarUDF::from(LocfUDF { + signature: Signature::any(1, Volatility::Volatile), + })) }); /// The name of the interpolate UDF given to DataFusion. pub const INTERPOLATE_UDF_NAME: &str = "interpolate"; +#[derive(Debug)] +struct InterpolateUDF { + signature: Signature, +} + +impl ScalarUDFImpl for InterpolateUDF { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn name(&self) -> &str { + INTERPOLATE_UDF_NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + if arg_types.is_empty() { + return Err(DataFusionError::Plan(format!( + "{INTERPOLATE_UDF_NAME} should have at least 1 argument" + ))); + } + Ok(arg_types[0].clone()) + } + + fn invoke(&self, _args: &[ColumnarValue]) -> Result { + Err(DataFusionError::NotImplemented(format!( + "{INTERPOLATE_UDF_NAME} is not yet implemented" + ))) + } +} + /// (Non-)Implementation of interpolate. /// This function takes a single numeric argument and /// produces a value of the same type. It is @@ -86,7 +174,6 @@ pub const INTERPOLATE_UDF_NAME: &str = "interpolate"; /// an implementation since it will be consumed by the logical optimizer rule /// `HandleGapFill`. pub(crate) static INTERPOLATE: Lazy> = Lazy::new(|| { - let return_type_fn: ReturnTypeFunction = Arc::new(|args| Ok(Arc::new(args[0].clone()))); let signatures = [ InfluxFieldType::Float, InfluxFieldType::Integer, @@ -107,34 +194,35 @@ pub(crate) static INTERPOLATE: Lazy> = Lazy::new(|| { ] .into(), )]), + TypeSignature::Exact(vec![DataType::Struct( + vec![ + Field::new("value", influx_type.into(), true), + Field::new( + "time", + DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())), + true, + ), + ] + .into(), + )]), ] }) .collect(); - Arc::new(ScalarUDF::new( - INTERPOLATE_UDF_NAME, - &Signature::one_of(signatures, Volatility::Volatile), - &return_type_fn, - &unimplemented_scalar_impl(INTERPOLATE_UDF_NAME), - )) + Arc::new(ScalarUDF::from(InterpolateUDF { + signature: Signature::one_of(signatures, Volatility::Volatile), + })) }); -fn unimplemented_scalar_impl(name: &'static str) -> ScalarFunctionImplementation { - Arc::new(move |_| { - Err(DataFusionError::NotImplemented(format!( - "{name} is not yet implemented" - ))) - }) -} - #[cfg(test)] mod test { use arrow::array::{ArrayRef, Float64Array, TimestampNanosecondArray}; use arrow::record_batch::RecordBatch; use datafusion::common::assert_contains; use datafusion::error::Result; - use datafusion::prelude::{col, lit_timestamp_nano, Expr}; + use datafusion::prelude::{col, Expr, SessionContext}; use datafusion::scalar::ScalarValue; - use datafusion_util::context_with_table; + use datafusion_util::lit_timestamptz_nano; + use schema::TIME_DATA_TIMEZONE; use std::sync::Arc; fn date_bin_gapfill(stride: Expr, source: Expr, origin: Expr) -> Expr { @@ -150,13 +238,18 @@ mod test { #[tokio::test] async fn date_bin_gapfill_errs() -> Result<()> { - let times = Arc::new(TimestampNanosecondArray::from(vec![Some(1000)])); + let times = Arc::new( + TimestampNanosecondArray::from(vec![Some(1000)]) + 
.with_timezone_opt(TIME_DATA_TIMEZONE()), + ); let rb = RecordBatch::try_from_iter(vec![("time", times as ArrayRef)])?; - let ctx = context_with_table(rb); + let ctx = SessionContext::new(); + ctx.register_batch("t", rb).unwrap(); + let df = ctx.table("t").await?.select(vec![date_bin_gapfill( lit_interval_milliseconds(360_000), col("time"), - lit_timestamp_nano(0), + lit_timestamptz_nano(0), )])?; let res = df.collect().await; let expected = "date_bin_gapfill is not yet implemented"; @@ -175,7 +268,8 @@ mod test { async fn locf_errs() { let arg = Arc::new(Float64Array::from(vec![100.0])); let rb = RecordBatch::try_from_iter(vec![("f0", arg as ArrayRef)]).unwrap(); - let ctx = context_with_table(rb); + let ctx = SessionContext::new(); + ctx.register_batch("t", rb).unwrap(); let df = ctx .table("t") .await @@ -198,7 +292,8 @@ mod test { async fn interpolate_errs() { let arg = Arc::new(Float64Array::from(vec![100.0])); let rb = RecordBatch::try_from_iter(vec![("f0", arg as ArrayRef)]).unwrap(); - let ctx = context_with_table(rb); + let ctx = SessionContext::new(); + ctx.register_batch("t", rb).unwrap(); let df = ctx .table("t") .await diff --git a/query_functions/src/lib.rs b/query_functions/src/lib.rs index 34586b8dc11..658a5fafb5b 100644 --- a/query_functions/src/lib.rs +++ b/query_functions/src/lib.rs @@ -12,6 +12,7 @@ clippy::dbg_macro, unused_crate_dependencies )] +#![allow(unreachable_pub)] // Workaround for "unused crate" lint false positives. use workspace_hack as _; @@ -34,6 +35,9 @@ mod regex; /// Selector Functions pub mod selectors; +/// Sleep function. +mod sleep; + /// window_bounds expressions mod window; @@ -41,10 +45,12 @@ pub mod gapfill; /// Function registry mod registry; +mod to_timestamp; pub use crate::regex::clean_non_meta_escapes; pub use crate::regex::REGEX_MATCH_UDF_NAME; pub use crate::regex::REGEX_NOT_MATCH_UDF_NAME; +pub use crate::sleep::SLEEP_UDF_NAME; /// Return an Expr that invokes a InfluxRPC compatible regex match to /// determine which values satisfy the pattern. 
Equivalent to: @@ -117,7 +123,7 @@ mod test { record_batch::RecordBatch, }; use datafusion::{assert_batches_eq, prelude::col}; - use datafusion_util::context_with_table; + use schema::TIME_DATA_TIMEZONE; use std::sync::Arc; use super::*; @@ -132,7 +138,8 @@ mod test { )]) .unwrap(); - let ctx = context_with_table(batch); + let ctx = SessionContext::new(); + ctx.register_batch("t", batch).unwrap(); let result = ctx .table("t") .await @@ -165,7 +172,8 @@ mod test { )]) .unwrap(); - let ctx = context_with_table(batch); + let ctx = SessionContext::new(); + ctx.register_batch("t", batch).unwrap(); let result = ctx .table("t") .await @@ -187,14 +195,18 @@ mod test { async fn test_make_window_bound_expr() { let batch = RecordBatch::try_from_iter(vec![( "time", - Arc::new(TimestampNanosecondArray::from(vec![Some(1000), Some(2000)])) as ArrayRef, + Arc::new( + TimestampNanosecondArray::from(vec![Some(1000), Some(2000)]) + .with_timezone_opt(TIME_DATA_TIMEZONE()), + ) as ArrayRef, )]) .unwrap(); let each = WindowDuration::Fixed { nanoseconds: 100 }; let every = WindowDuration::Fixed { nanoseconds: 200 }; - let ctx = context_with_table(batch); + let ctx = SessionContext::new(); + ctx.register_batch("t", batch).unwrap(); let result = ctx .table("t") .await diff --git a/query_functions/src/regex.rs b/query_functions/src/regex.rs index f153a432149..2e3feae1239 100644 --- a/query_functions/src/regex.rs +++ b/query_functions/src/regex.rs @@ -201,11 +201,11 @@ mod test { record_batch::RecordBatch, util::pretty::pretty_format_batches, }; + use datafusion::prelude::SessionContext; use datafusion::{ error::DataFusionError, prelude::{col, lit, Expr}, }; - use datafusion_util::context_with_table; use std::sync::Arc; use super::*; @@ -338,7 +338,8 @@ mod test { ]) .unwrap(); - let ctx = context_with_table(rb); + let ctx = SessionContext::new(); + ctx.register_batch("t", rb).unwrap(); let df = ctx.table("t").await.unwrap(); let df = df.filter(op).unwrap(); diff --git a/query_functions/src/registry.rs b/query_functions/src/registry.rs index a4f920db1c6..609b83f93c1 100644 --- a/query_functions/src/registry.rs +++ b/query_functions/src/registry.rs @@ -7,7 +7,7 @@ use datafusion::{ }; use once_cell::sync::Lazy; -use crate::{gapfill, regex, window}; +use crate::{gapfill, regex, sleep, to_timestamp, window}; static REGISTRY: Lazy = Lazy::new(IOxFunctionRegistry::new); @@ -24,11 +24,13 @@ impl IOxFunctionRegistry { impl FunctionRegistry for IOxFunctionRegistry { fn udfs(&self) -> HashSet { [ + to_timestamp::TO_TIMESTAMP_FUNCTION_NAME, gapfill::DATE_BIN_GAPFILL_UDF_NAME, gapfill::LOCF_UDF_NAME, gapfill::INTERPOLATE_UDF_NAME, regex::REGEX_MATCH_UDF_NAME, regex::REGEX_NOT_MATCH_UDF_NAME, + sleep::SLEEP_UDF_NAME, window::WINDOW_BOUNDS_UDF_NAME, ] .into_iter() @@ -38,11 +40,13 @@ impl FunctionRegistry for IOxFunctionRegistry { fn udf(&self, name: &str) -> DataFusionResult> { match name { + to_timestamp::TO_TIMESTAMP_FUNCTION_NAME => Ok(to_timestamp::TO_TIMESTAMP_UDF.clone()), gapfill::DATE_BIN_GAPFILL_UDF_NAME => Ok(gapfill::DATE_BIN_GAPFILL.clone()), gapfill::LOCF_UDF_NAME => Ok(gapfill::LOCF.clone()), gapfill::INTERPOLATE_UDF_NAME => Ok(gapfill::INTERPOLATE.clone()), regex::REGEX_MATCH_UDF_NAME => Ok(regex::REGEX_MATCH_UDF.clone()), regex::REGEX_NOT_MATCH_UDF_NAME => Ok(regex::REGEX_NOT_MATCH_UDF.clone()), + sleep::SLEEP_UDF_NAME => Ok(sleep::SLEEP_UDF.clone()), window::WINDOW_BOUNDS_UDF_NAME => Ok(window::WINDOW_BOUNDS_UDF.clone()), _ => Err(DataFusionError::Plan(format!( "IOx FunctionRegistry does not contain function 
'{name}'" diff --git a/query_functions/src/selectors/internal.rs b/query_functions/src/selectors/internal.rs index 63c6f042bb2..a136cc1dc32 100644 --- a/query_functions/src/selectors/internal.rs +++ b/query_functions/src/selectors/internal.rs @@ -214,7 +214,7 @@ impl Selector { let time_arr = arrow::compute::nullif( time_arr, &arrow::compute::kernels::cmp::neq( - &self.value.to_array_of_size(time_arr.len()), + &self.value.to_array_of_size(time_arr.len())?, &value_arr, )?, )?; diff --git a/query_functions/src/sleep.rs b/query_functions/src/sleep.rs new file mode 100644 index 00000000000..1995c405ce2 --- /dev/null +++ b/query_functions/src/sleep.rs @@ -0,0 +1,94 @@ +use std::{any::Any, sync::Arc}; + +use arrow::datatypes::{DataType, TimeUnit}; +use datafusion::{ + error::{DataFusionError, Result}, + logical_expr::{ScalarUDF, ScalarUDFImpl, Signature, Volatility}, + physical_plan::ColumnarValue, +}; +use once_cell::sync::Lazy; + +/// The name of the "sleep" UDF given to DataFusion. +pub const SLEEP_UDF_NAME: &str = "sleep"; + +#[derive(Debug)] +struct SleepUDF { + signature: Signature, +} + +impl ScalarUDFImpl for SleepUDF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + SLEEP_UDF_NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Null) + } + + fn invoke(&self, _args: &[ColumnarValue]) -> Result { + Err(DataFusionError::Internal( + "sleep function should have been replaced by optimizer pass to avoid thread blocking" + .to_owned(), + )) + } +} + +/// Implementation of "sleep" +pub(crate) static SLEEP_UDF: Lazy> = Lazy::new(|| { + Arc::new(ScalarUDF::from(SleepUDF { + signature: Signature::uniform( + 1, + vec![ + DataType::Null, + DataType::Duration(TimeUnit::Second), + DataType::Duration(TimeUnit::Millisecond), + DataType::Duration(TimeUnit::Millisecond), + DataType::Duration(TimeUnit::Microsecond), + DataType::Duration(TimeUnit::Nanosecond), + DataType::Float32, + DataType::Float64, + ], + Volatility::Volatile, + ), + })) +}); + +#[cfg(test)] +mod tests { + use datafusion::{ + common::assert_contains, + logical_expr::LogicalPlanBuilder, + physical_plan::common::collect, + prelude::{lit, SessionContext}, + scalar::ScalarValue, + }; + + use super::*; + + #[tokio::test] + async fn test() { + let ctx = SessionContext::new(); + let plan = LogicalPlanBuilder::empty(true) + .project([SLEEP_UDF.call(vec![lit(ScalarValue::Null)]).alias("sleep")]) + .unwrap() + .build() + .unwrap(); + let plan = ctx.state().create_physical_plan(&plan).await.unwrap(); + let err = collect(plan.execute(0, ctx.task_ctx()).unwrap()) + .await + .unwrap_err(); + + assert_contains!( + err.to_string(), + "sleep function should have been replaced by optimizer pass" + ); + } +} diff --git a/query_functions/src/to_timestamp.rs b/query_functions/src/to_timestamp.rs new file mode 100644 index 00000000000..4df6c0bcb52 --- /dev/null +++ b/query_functions/src/to_timestamp.rs @@ -0,0 +1,85 @@ +//! Implementation of `to_timestamp` function that +//! overrides the built in version in DataFusion because the semantics changed +//! upstream: +//! +//! +//! 
See for more details +use std::sync::Arc; + +use arrow::datatypes::DataType; +use arrow::datatypes::TimeUnit; +use datafusion::common::internal_err; +use datafusion::error::Result; +use datafusion::logical_expr::ScalarUDFImpl; +use datafusion::logical_expr::Signature; +use datafusion::physical_expr::datetime_expressions; +use datafusion::physical_expr::expressions::cast_column; +use datafusion::{ + error::DataFusionError, + logical_expr::{ScalarUDF, Volatility}, + physical_plan::ColumnarValue, +}; +use once_cell::sync::Lazy; + +/// The name of the function +pub const TO_TIMESTAMP_FUNCTION_NAME: &str = "to_timestamp"; + +#[derive(Debug)] +struct ToTimestampUDF { + signature: Signature, +} + +impl ScalarUDFImpl for ToTimestampUDF { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn name(&self) -> &str { + TO_TIMESTAMP_FUNCTION_NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + if args.len() != 1 { + return internal_err!("to_timestamp expected 1 argument, got {}", args.len()); + } + + match args[0].data_type() { + // call through to arrow cast kernel + DataType::Int64 | DataType::Timestamp(_, _) => cast_column( + &args[0], + &DataType::Timestamp(TimeUnit::Nanosecond, None), + None, + ), + DataType::Utf8 => datetime_expressions::to_timestamp_nanos(args), + dt => internal_err!("to_timestamp does not support argument type '{dt}'"), + } + } +} + +/// Implementation of to_timestamp +pub(crate) static TO_TIMESTAMP_UDF: Lazy> = Lazy::new(|| { + Arc::new(ScalarUDF::from(ToTimestampUDF { + signature: Signature::uniform( + 1, + vec![ + DataType::Int64, + DataType::Timestamp(TimeUnit::Nanosecond, None), + DataType::Timestamp(TimeUnit::Microsecond, None), + DataType::Timestamp(TimeUnit::Millisecond, None), + DataType::Timestamp(TimeUnit::Second, None), + DataType::Utf8, + ], + Volatility::Immutable, + ), + })) +}); + +// https://github.com/apache/arrow-datafusion/pull/7844 diff --git a/query_functions/src/window.rs b/query_functions/src/window.rs index db6058efd18..7196b0aefc4 100644 --- a/query_functions/src/window.rs +++ b/query_functions/src/window.rs @@ -1,7 +1,7 @@ mod internal; pub use internal::Duration; -use schema::TIME_DATA_TYPE; +use schema::{TIME_DATA_TIMEZONE, TIME_DATA_TYPE}; use std::sync::Arc; @@ -158,7 +158,9 @@ fn window_bounds(arg: &dyn Array, every: WindowDuration, offset: WindowDuration) }) }); - let array = values.collect::(); + let array = values + .collect::() + .with_timezone_opt(TIME_DATA_TIMEZONE()); Arc::new(array) as ArrayRef } @@ -264,26 +266,20 @@ mod tests { #[test] fn test_window_bounds() { - let input: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![ - Some(100), - None, - Some(200), - Some(300), - Some(400), - ])); + let input: ArrayRef = Arc::new( + TimestampNanosecondArray::from(vec![Some(100), None, Some(200), Some(300), Some(400)]) + .with_timezone_opt(TIME_DATA_TIMEZONE()), + ); let every = WindowDuration::from_nanoseconds(200); let offset = WindowDuration::from_nanoseconds(50); let bounds_array = window_bounds(&input, every, offset); - let expected_array: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![ - Some(250), - None, - Some(250), - Some(450), - Some(450), - ])); + let expected_array: ArrayRef = Arc::new( + TimestampNanosecondArray::from(vec![Some(250), None, Some(250), Some(450), Some(450)]) + .with_timezone_opt(TIME_DATA_TIMEZONE()), 
+ ); assert_eq!( &expected_array, &bounds_array, diff --git a/schema/Cargo.toml b/schema/Cargo.toml index 2b5a49c8c5e..0e595b344ec 100644 --- a/schema/Cargo.toml +++ b/schema/Cargo.toml @@ -6,10 +6,14 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] -arrow = { workspace = true, features = ["prettyprint"] } +arrow = { workspace = true } hashbrown = { workspace = true } -indexmap = { version = "2.0", features = ["std"] } +indexmap = { version = "2.1", features = ["std"] } observability_deps = { path = "../observability_deps" } -snafu = "0.7" +snafu = "0.8" workspace-hack = { version = "0.1", path = "../workspace-hack" } +once_cell = "1" diff --git a/schema/src/lib.rs b/schema/src/lib.rs index c7c814813e8..08fc697efa9 100644 --- a/schema/src/lib.rs +++ b/schema/src/lib.rs @@ -33,6 +33,7 @@ use arrow::datatypes::{ use hashbrown::HashSet; use crate::sort::SortKey; +use once_cell::sync::OnceCell; use snafu::{OptionExt, Snafu}; /// The name of the timestamp column in the InfluxDB datamodel @@ -44,16 +45,21 @@ pub const INFLUXQL_MEASUREMENT_COLUMN_NAME: &str = "iox::measurement"; pub const INFLUXQL_METADATA_KEY: &str = "iox::influxql::group_key::metadata"; /// The Timezone to use for InfluxDB timezone (should be a constant) +// TODO: Start Epic Add timezone support to IOx #18154 +// https://github.com/influxdata/idpe/issues/18154 #[allow(non_snake_case)] pub fn TIME_DATA_TIMEZONE() -> Option> { - // TODO: we should use the "UTC" timezone as that is what the - // InfluxDB data model timestamps are relative to. However, - // DataFusion doesn't currently do a great job with such - // timezones so punting for now - //Some(String::from("UTC")); - None + _TIME_DATA_TIMEZONE + .get_or_init(|| { + std::env::var("INFLUXDB_IOX_TIME_DATA_TIMEZONE") + .map_or_else(|_| None, |v| Some(v.into())) + }) + .clone() } +// TODO: refactor TIME_DATA_TIMEZONE() into a lazy static +static _TIME_DATA_TIMEZONE: OnceCell>> = OnceCell::new(); + /// the [`ArrowDataType`] to use for InfluxDB timestamps #[allow(non_snake_case)] pub fn TIME_DATA_TYPE() -> ArrowDataType { @@ -783,7 +789,7 @@ macro_rules! assert_column_eq { pub(crate) mod test_util { use super::*; - pub fn make_field( + pub(crate) fn make_field( name: &str, data_type: arrow::datatypes::DataType, nullable: bool, diff --git a/schema/src/sort.rs b/schema/src/sort.rs index 52a8ad2652b..7d4c412a53c 100644 --- a/schema/src/sort.rs +++ b/schema/src/sort.rs @@ -251,6 +251,18 @@ impl SortKey { } } +impl From for Vec { + fn from(val: SortKey) -> Self { + val.columns.iter().map(|(id, _)| id.to_string()).collect() + } +} + +impl From> for SortKey { + fn from(val: Vec) -> Self { + Self::from_columns(val) + } +} + // Produces a human-readable representation of a sort key that looks like: // // "host, region DESC, env NULLS FIRST, time" @@ -288,20 +300,26 @@ pub fn compute_sort_key<'a>( let primary_key = schema.primary_key(); let cardinalities = distinct_counts(batches, &primary_key); + let sort_key = sort_key_from_cardinalities(&cardinalities); - let mut cardinalities: Vec<_> = cardinalities.into_iter().collect(); + debug!(?primary_key, ?sort_key, "computed sort key"); + sort_key +} + +/// Given columns and their cardinalities (the number of distinct values in the data), sort the +/// columns by cardinality and turn that ordering into a [`SortKey`], with the time column always +/// appearing last. 
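A worked example of the ordering `sort_key_from_cardinalities` produces (a sketch under assumed crate paths, not code from this patch): columns are sorted by ascending cardinality with ties broken by name, and `time` is always appended last.

```rust
use std::collections::HashMap;

// Assumed import paths; the function and `SortKey` live in schema/src/sort.rs.
use schema::sort::{sort_key_from_cardinalities, SortKey};

fn main() {
    // Hypothetical distinct-value counts for a table's primary-key columns.
    let cardinalities: HashMap<String, usize> = HashMap::from([
        ("region".to_string(), 10),
        ("host".to_string(), 250),
        ("env".to_string(), 3),
    ]);

    let key: SortKey = sort_key_from_cardinalities(&cardinalities);

    // Lowest cardinality first, time forced to the end, using the
    // `From<SortKey> for Vec<String>` conversion added in this patch.
    let columns: Vec<String> = key.into();
    assert_eq!(columns, vec!["env", "region", "host", "time"]);
}
```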
+pub fn sort_key_from_cardinalities(cardinalities: &HashMap) -> SortKey { + let mut cardinalities: Vec<_> = cardinalities.iter().collect(); // Sort by (cardinality, column_name) to have deterministic order if same cardinality cardinalities.sort_by_cached_key(|x| (x.1, x.0.clone())); let mut builder = SortKeyBuilder::with_capacity(cardinalities.len() + 1); for (col, _) in cardinalities { - builder = builder.with_col(col) + builder = builder.with_col(col.as_str()) } builder = builder.with_col(TIME_COLUMN_NAME); - let sort_key = builder.build(); - - debug!(?primary_key, ?sort_key, "computed sort key"); - sort_key + builder.build() } /// Takes batches of data and the columns that make up the primary key. Computes the number of @@ -310,7 +328,7 @@ pub fn compute_sort_key<'a>( fn distinct_counts<'a>( batches: impl Iterator, primary_key: &[&str], -) -> HashMap { +) -> HashMap { let mut distinct_values_across_batches = HashMap::with_capacity(primary_key.len()); for batch in batches { @@ -324,14 +342,7 @@ fn distinct_counts<'a>( distinct_values_across_batches .into_iter() - .map(|(column, distinct_values)| { - let count = distinct_values - .len() - .try_into() - .expect("usize -> u64 overflow"); - - (column, count) - }) + .map(|(column, distinct_values)| (column, distinct_values.len())) .collect() } @@ -404,7 +415,7 @@ pub fn adjust_sort_key_columns( let existing_columns_without_time = catalog_sort_key .iter() .map(|(col, _opts)| col) - .filter(|&col| TIME_COLUMN_NAME != col.as_ref()) + .filter(|col| TIME_COLUMN_NAME != col.as_ref()) .cloned(); let new_columns: Vec<_> = primary_key .iter() diff --git a/service_common/Cargo.toml b/service_common/Cargo.toml index a4d83333b0a..ec328aa0b67 100644 --- a/service_common/Cargo.toml +++ b/service_common/Cargo.toml @@ -5,19 +5,12 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # In alphabetical order -async-trait = "0.1.73" -bytes = "1.5" +arrow = { workspace = true } datafusion = { workspace = true } executor = { path = "../executor" } -iox_query = { path = "../iox_query" } -iox_query_influxql = { path = "../iox_query_influxql" } -iox_query_influxrpc = { path = "../iox_query_influxrpc" } -flightsql = { path = "../flightsql" } -metric = { path = "../metric" } -parking_lot = "0.12" -predicate = { path = "../predicate" } tonic = { workspace = true } -trace = { path = "../trace" } -tracker = { path = "../tracker" } workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/service_common/src/error.rs b/service_common/src/error.rs index 7e0924e70d3..f9a5b2ecf3c 100644 --- a/service_common/src/error.rs +++ b/service_common/src/error.rs @@ -26,8 +26,8 @@ pub fn datafusion_error_to_tonic_code(e: &DataFusionError) -> tonic::Code { match e { DataFusionError::ResourcesExhausted(_) => tonic::Code::ResourceExhausted, // Map as many as possible back into user visible (non internal) errors - DataFusionError::SQL(_) - | DataFusionError::SchemaError(_) + DataFusionError::SQL(_, _) + | DataFusionError::SchemaError(_, _) // Execution, ArrowError and ParquetError might be due to an // internal error (e.g. some sort of IO error or bug) or due // to a user input error (e.g. 
you can get an Arrow error if @@ -37,7 +37,7 @@ pub fn datafusion_error_to_tonic_code(e: &DataFusionError) -> tonic::Code { // classify them as InvalidArgument so the user has a chance // to see them | DataFusionError::Execution(_) - | DataFusionError::ArrowError(_) + | DataFusionError::ArrowError(_, _) | DataFusionError::ParquetError(_) // DataFusion most often returns "NotImplemented" when a // particular SQL feature is not implemented. This @@ -99,7 +99,7 @@ mod test { ); let e = ParserError::ParserError(s.clone()); - do_transl_test(DataFusionError::SQL(e), tonic::Code::InvalidArgument); + do_transl_test(DataFusionError::SQL(e, None), tonic::Code::InvalidArgument); do_transl_test( DataFusionError::NotImplemented(s.clone()), diff --git a/service_common/src/lib.rs b/service_common/src/lib.rs index 23c420663ee..5b055ec8602 100644 --- a/service_common/src/lib.rs +++ b/service_common/src/lib.rs @@ -18,37 +18,8 @@ use workspace_hack as _; mod error; -pub mod planner; -pub mod test_util; - -use std::sync::Arc; - -use async_trait::async_trait; -use iox_query::QueryNamespace; -use trace::span::Span; -use tracker::InstrumentedAsyncOwnedSemaphorePermit; - -/// Trait that allows the query engine (which includes flight and storage/InfluxRPC) to access a -/// virtual set of namespaces. -/// -/// The query engine MUST ONLY use this trait to access the namespaces / catalogs. -#[async_trait] -pub trait QueryNamespaceProvider: std::fmt::Debug + Send + Sync + 'static { - /// Abstract namespace. - type Db: QueryNamespace; - - /// Get namespace if it exists. - /// - /// System tables may contain debug information depending on `include_debug_info_tables`. - async fn db( - &self, - name: &str, - span: Option, - include_debug_info_tables: bool, - ) -> Option>; - - /// Acquire concurrency-limiting sempahore - async fn acquire_semaphore(&self, span: Option) -> InstrumentedAsyncOwnedSemaphorePermit; -} pub use error::datafusion_error_to_tonic_code; + +// Included to avoid arrow in workspace-hack crate +use arrow as _; diff --git a/service_grpc_flight/Cargo.toml b/service_grpc_flight/Cargo.toml index b78a03b670b..73386f61658 100644 --- a/service_grpc_flight/Cargo.toml +++ b/service_grpc_flight/Cargo.toml @@ -5,6 +5,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # Workspace dependencies, in alphabetical order authz = { path = "../authz" } @@ -14,21 +17,24 @@ flightsql = { path = "../flightsql" } generated_types = { path = "../generated_types" } observability_deps = { path = "../observability_deps" } iox_query = { path = "../iox_query" } +iox_query_influxql = { path = "../iox_query_influxql" } +iox_query_params = { path = "../iox_query_params" } service_common = { path = "../service_common" } +tower_trailer = { path = "../tower_trailer"} trace = { path = "../trace"} trace_http = { path = "../trace_http"} tracker = { path = "../tracker" } # Crates.io dependencies, in alphabetical order -arrow = { workspace = true, features = ["prettyprint"] } -arrow-flight = { workspace = true, features=["flight-sql-experimental"] } +arrow = { workspace = true } +arrow-flight = { workspace = true } bytes = "1.5" futures = "0.3" -prost = "0.11" +prost = { workspace = true } serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.107" -snafu = "0.7" -tokio = { version = "1.32", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] } +serde_json = "1.0.111" +snafu = "0.8" +tokio = { version = "1.35", features = 
["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] } tonic = { workspace = true } workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/service_grpc_flight/src/keep_alive.rs b/service_grpc_flight/src/keep_alive.rs index 38f470e515c..0a1836cd9ab 100644 --- a/service_grpc_flight/src/keep_alive.rs +++ b/service_grpc_flight/src/keep_alive.rs @@ -136,23 +136,29 @@ use arrow::{ ipc::writer::{DictionaryTracker, IpcDataGenerator, IpcWriteOptions}, record_batch::RecordBatch, }; -use arrow_flight::{error::FlightError, FlightData}; +use arrow_flight::FlightData; use futures::{stream::BoxStream, Stream, StreamExt}; use observability_deps::tracing::{info, warn}; use tokio::time::{Interval, MissedTickBehavior}; /// Keep alive underlying response stream by sending regular empty [`RecordBatch`]es. -pub struct KeepAliveStream { - inner: BoxStream<'static, Result>, +pub(crate) struct KeepAliveStream +where + E: 'static, +{ + inner: BoxStream<'static, Result>, } -impl KeepAliveStream { +impl KeepAliveStream +where + E: 'static, +{ /// Create new keep-alive wrapper from the underlying stream and the given interval. /// /// The interval is measured from the last message -- which can either be a "real" message or a keep-alive. - pub fn new(s: S, interval: Duration) -> Self + pub(crate) fn new(s: S, interval: Duration) -> Self where - S: Stream> + Send + 'static, + S: Stream> + Send + 'static, { let mut ticker = tokio::time::interval(interval); ticker.set_missed_tick_behavior(MissedTickBehavior::Delay); @@ -194,8 +200,11 @@ impl KeepAliveStream { } } -impl Stream for KeepAliveStream { - type Item = Result; +impl Stream for KeepAliveStream +where + E: 'static, +{ + type Item = Result; fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { self.inner.poll_next_unpin(cx) @@ -203,9 +212,12 @@ impl Stream for KeepAliveStream { } /// Inner state of [`KeepAliveStream`] -struct State { +struct State +where + E: 'static, +{ /// The underlying stream that is kept alive. - inner: BoxStream<'static, Result>, + inner: BoxStream<'static, Result>, /// A [`Schema`] that was already received from the stream. /// @@ -274,13 +286,13 @@ fn build_empty_batch_msg(schema: Option<&SchemaRef>) -> Option { } #[cfg(test)] -pub mod test_util { +pub(crate) mod test_util { use std::time::Duration; use futures::{stream::BoxStream, Stream, StreamExt}; /// Ensure that there is a delay between steam responses. 
- pub fn make_stream_slow(s: S, delay: Duration) -> BoxStream<'static, S::Item> + pub(crate) fn make_stream_slow(s: S, delay: Duration) -> BoxStream<'static, S::Item> where S: Send + Stream + Unpin + 'static, { @@ -296,7 +308,9 @@ pub mod test_util { #[cfg(test)] mod tests { use arrow::{array::Int64Array, datatypes::Field}; - use arrow_flight::{decode::FlightRecordBatchStream, encode::FlightDataEncoderBuilder}; + use arrow_flight::{ + decode::FlightRecordBatchStream, encode::FlightDataEncoderBuilder, error::FlightError, + }; use datafusion::assert_batches_eq; use futures::TryStreamExt; use test_helpers::maybe_start_logging; @@ -376,6 +390,6 @@ mod tests { s }; - (panic_on_stream_timeout(s, Duration::from_millis(250))) as _ + panic_on_stream_timeout(s, Duration::from_millis(250)) } } diff --git a/service_grpc_flight/src/lib.rs b/service_grpc_flight/src/lib.rs index d842556aa58..5e345c7287c 100644 --- a/service_grpc_flight/src/lib.rs +++ b/service_grpc_flight/src/lib.rs @@ -17,15 +17,19 @@ )] use keep_alive::KeepAliveStream; +use planner::Planner; +use tower_trailer::{HeaderMap, Trailers}; // Workaround for "unused crate" lint false positives. use workspace_hack as _; mod keep_alive; +mod planner; mod request; use arrow::error::ArrowError; use arrow_flight::{ encode::FlightDataEncoderBuilder, + error::FlightError, flight_descriptor::DescriptorType, flight_service_server::{FlightService as Flight, FlightServiceServer as FlightServer}, Action, ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightEndpoint, FlightInfo, @@ -35,20 +39,24 @@ use authz::{extract_token, Authorizer}; use data_types::NamespaceNameError; use datafusion::{error::DataFusionError, physical_plan::ExecutionPlan}; use flightsql::FlightSQLCommand; -use futures::{ready, Stream, StreamExt, TryStreamExt}; +use futures::{ready, stream::BoxStream, Stream, StreamExt, TryStreamExt}; use generated_types::influxdata::iox::querier::v1 as proto; -use iox_query::{exec::IOxSessionContext, QueryCompletedToken, QueryNamespace}; +use iox_query::{ + exec::IOxSessionContext, + query_log::{QueryCompletedToken, QueryLogEntry, StatePermit, StatePlanned}, + QueryNamespaceProvider, +}; use observability_deps::tracing::{debug, info, warn}; use prost::Message; use request::{IoxGetRequest, RunQuery}; -use service_common::{datafusion_error_to_tonic_code, planner::Planner, QueryNamespaceProvider}; +use service_common::datafusion_error_to_tonic_code; use snafu::{OptionExt, ResultExt, Snafu}; use std::{ fmt::Debug, pin::Pin, - sync::Arc, + sync::{Arc, Mutex}, task::Poll, - time::{Duration, Instant}, + time::Duration, }; use tonic::{ metadata::{AsciiMetadataValue, MetadataMap}, @@ -63,13 +71,27 @@ use tracker::InstrumentedAsyncOwnedSemaphorePermit; /// /// See /// for discussion on adding support to FlightSQL itself. -const IOX_FLIGHT_SQL_DATABASE_HEADERS: [&str; 4] = [ +const IOX_FLIGHT_SQL_DATABASE_REQUEST_HEADERS: [&str; 4] = [ "database", // preferred "bucket", "bucket-name", "iox-namespace-name", // deprecated ]; +/// Trailer that describes the duration (in seconds) for which a query was queued due to concurrency limits. +const IOX_FLIGHT_QUEUE_DURATION_RESPONSE_TRAILER: &str = "x-influxdata-queue-duration-seconds"; + +/// Trailer that describes the duration (in seconds) of the planning phase of a query. +const IOX_FLIGHT_PLANNING_DURATION_RESPONSE_TRAILER: &str = + "x-influxdata-planning-duration-seconds"; + +/// Trailer that describes the duration (in seconds) of the execution phase of a query. 
+const IOX_FLIGHT_EXECUTION_DURATION_RESPONSE_TRAILER: &str = + "x-influxdata-execution-duration-seconds"; + +/// Trailer that describes the duration (in seconds) the CPU(s) took to compute the results. +const IOX_FLIGHT_COMPUTE_DURATION_RESPONSE_TRAILER: &str = "x-influxdata-compute-duration-seconds"; + /// In which interval should the `DoGet` stream send empty messages as keep alive markers? const DO_GET_KEEP_ALIVE_INTERVAL: Duration = Duration::from_secs(5); @@ -127,7 +149,7 @@ pub enum Error { Planning { namespace_name: String, query: String, - source: service_common::planner::Error, + source: planner::Error, }, #[snafu(display("Error while planning Flight SQL : {}", source))] @@ -488,82 +510,80 @@ where { /// Implementation of the `DoGet` method async fn run_do_get( - &self, + server: Arc, span_ctx: Option, external_span_ctx: Option, - permit: InstrumentedAsyncOwnedSemaphorePermit, - query: RunQuery, - namespace_name: String, - is_debug: bool, - ) -> Result>, tonic::Status> { - let db = self - .server + request: IoxGetRequest, + log_entry: &mut Option>, + ) -> Result, tonic::Status> { + let IoxGetRequest { + database, + query, + params, + is_debug, + } = request; + let namespace_name = database.as_str(); + + let db = server .db( - &namespace_name, + namespace_name, span_ctx.child_span("get namespace"), is_debug, ) .await - .context(DatabaseNotFoundSnafu { - namespace_name: &namespace_name, - })?; + .context(DatabaseNotFoundSnafu { namespace_name })?; + + //TODO: add structured logging for parameterized queries https://github.com/influxdata/influxdb_iox/issues/9626 + let query_completed_token = db.record_query( + external_span_ctx.as_ref().map(RequestLogContext::ctx), + query.variant(), + Box::new(query.to_string()), + ); + + *log_entry = Some(Arc::clone(query_completed_token.entry())); + + // Log after we acquire the permit and are about to start execution + info!( + %namespace_name, + %query, + trace=external_span_ctx.format_jaeger().as_str(), + variant=query.variant(), + "DoGet request", + ); let ctx = db.new_query_context(span_ctx); - let (query_completed_token, physical_plan) = match &query { - RunQuery::Sql(sql_query) => { - let token = db.record_query( - external_span_ctx.as_ref().map(RequestLogContext::ctx), - "sql", - Box::new(sql_query.clone()), - ); - let plan = Planner::new(&ctx) - .sql(sql_query) - .await - .context(PlanningSnafu { - namespace_name: &namespace_name, - query: query.to_string(), - })?; - (token, plan) - } - RunQuery::InfluxQL(sql_query) => { - let token = db.record_query( - external_span_ctx.as_ref().map(RequestLogContext::ctx), - "influxql", - Box::new(sql_query.clone()), - ); - let plan = Planner::new(&ctx) - .influxql(sql_query) - .await - .context(PlanningSnafu { - namespace_name: &namespace_name, - query: query.to_string(), - })?; - (token, plan) - } - RunQuery::FlightSQL(msg) => { - let token = db.record_query( - external_span_ctx.as_ref().map(RequestLogContext::ctx), - "flightsql", - Box::new(msg.to_string()), - ); - let plan = Planner::new(&ctx) - .flight_sql_do_get(&namespace_name, db, msg.clone()) - .await - .context(PlanningSnafu { - namespace_name: &namespace_name, - query: query.to_string(), - })?; - (token, plan) - } + let physical_plan = match &query { + RunQuery::Sql(sql_query) => Planner::new(&ctx) + .sql(sql_query, params) + .await + .with_context(|_| PlanningSnafu { + namespace_name, + query: query.to_string(), + })?, + RunQuery::InfluxQL(sql_query) => Planner::new(&ctx) + .influxql(sql_query, params) + .await + .with_context(|_| 
PlanningSnafu { + namespace_name, + query: query.to_string(), + })?, + RunQuery::FlightSQL(msg) => Planner::new(&ctx) + .flight_sql_do_get(namespace_name, db, msg.clone(), params) + .await + .with_context(|_| PlanningSnafu { + namespace_name, + query: query.to_string(), + })?, }; + let query_completed_token = query_completed_token.planned(Arc::clone(&physical_plan)); let output = GetStream::new( + server, ctx, physical_plan, namespace_name.to_string(), &query, query_completed_token, - permit, ) .await?; @@ -572,7 +592,7 @@ where let output = output.map(move |res| { if let Err(e) = &res { info!( - %namespace_name, + %database, %query, trace=external_span_ctx.format_jaeger().as_str(), %e, @@ -582,7 +602,7 @@ where res }); - Ok(Response::new(Box::pin(output) as TonicStream)) + Ok(Box::pin(output) as TonicStream) } } @@ -613,9 +633,12 @@ where request: Request, ) -> Result, tonic::Status> { let external_span_ctx: Option = request.extensions().get().cloned(); + // technically the trailers layer should always be installed but for testing this isn' always the case, so lets + // make this optional + let trailers: Option = request.extensions().get().cloned(); let span_ctx: Option = request.extensions().get().cloned(); let authz_token = get_flight_authz(request.metadata()); - let mut is_debug = has_debug_header(request.metadata()); + let debug_header = has_debug_header(request.metadata()); let ticket = request.into_inner(); // attempt to decode ticket @@ -625,15 +648,12 @@ where info!(%e, "Error decoding Flight API ticket"); }; - let request = request?; - let namespace_name = request.database(); - let query = request.query(); - is_debug |= request.is_debug(); + let request = request?.add_debug_header(debug_header); - let perms = match query { - RunQuery::FlightSQL(cmd) => flightsql_permissions(namespace_name, cmd), + let perms = match request.query() { + RunQuery::FlightSQL(cmd) => flightsql_permissions(request.database(), cmd), RunQuery::Sql(_) | RunQuery::InfluxQL(_) => vec![authz::Permission::ResourceAction( - authz::Resource::Database(namespace_name.to_string()), + authz::Resource::Database(request.database().to_string()), authz::Action::Read, )], }; @@ -642,51 +662,49 @@ where .await .map_err(Error::from)?; - let permit = self - .server - .acquire_semaphore(span_ctx.child_span("query rate limit semaphore")) - .await; - - // Log after we acquire the permit and are about to start execution - let start = Instant::now(); - info!( - %namespace_name, - %query, - trace=external_span_ctx.format_jaeger().as_str(), - variant=query.variant(), - "DoGet request", - ); - - let response = self - .run_do_get( - span_ctx, - external_span_ctx.clone(), - permit, - query.clone(), - namespace_name.to_string(), - is_debug, - ) - .await; + // `run_do_get` may wait for the semaphore. In this case, we shall send empty "keep alive" messages already. So + // wrap the whole implementation into the keep alive stream. + // + // Also note that due to the keep alive mechanism, we cannot send any headers back because they might come + // after a keep alive message and therefore aren't headers. gRPC metadata can only be sent at the very beginning + // (headers) or at the very end (trailers). We shall use trailers. 
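A minimal client-side sketch of consuming these duration trailers (not part of this change): it assumes a `tonic::Streaming<FlightData>` obtained from the Flight client's `do_get`, and that the tower_trailer layer is installed on the server so the values actually arrive as gRPC trailers.

    use arrow_flight::FlightData;
    use futures::StreamExt;
    use tonic::{Status, Streaming};

    // Drain the DoGet stream, then read the duration trailers defined above.
    async fn read_query_durations(mut stream: Streaming<FlightData>) -> Result<(), Status> {
        while let Some(msg) = stream.next().await {
            let _flight_data = msg?; // decode / collect as usual
        }
        // gRPC trailers only become available once the stream has ended.
        if let Some(trailers) = stream.trailers().await? {
            for key in [
                "x-influxdata-queue-duration-seconds",
                "x-influxdata-planning-duration-seconds",
                "x-influxdata-execution-duration-seconds",
                "x-influxdata-compute-duration-seconds",
            ] {
                if let Some(value) = trailers.get(key) {
                    println!("{key}: {}", value.to_str().unwrap_or("<non-ascii>"));
                }
            }
        }
        Ok(())
    }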
+ let server = Arc::clone(&self.server); + let mut log_entry = None; + let response = Self::run_do_get( + server, + span_ctx, + external_span_ctx.clone(), + request.clone(), + &mut log_entry, + ) + .await; if let Err(e) = &response { info!( - %namespace_name, - %query, + %request.database, + %request.query, trace=external_span_ctx.format_jaeger().as_str(), %e, "Error running DoGet", ); } else { - let elapsed = Instant::now() - start; debug!( - %namespace_name, - %query, + %request.database, + %request.query, trace=external_span_ctx.format_jaeger().as_str(), - ?elapsed, - "Completed DoGet request", + "Planned DoGet request", ); } - response + + let md = QueryResponseMetadata { log_entry }; + let md_captured = md.clone(); + if let Some(trailers) = trailers { + trailers.add_callback(move |trailers| md_captured.write_trailers(trailers)); + } + + let stream = response?; + + Ok(Response::new(Box::pin(stream) as _)) } async fn handshake( @@ -919,7 +937,7 @@ fn cmd_from_descriptor(flight_descriptor: FlightDescriptor) -> Result Result { let mut found_header_keys: Vec = vec![]; - for key in IOX_FLIGHT_SQL_DATABASE_HEADERS { + for key in IOX_FLIGHT_SQL_DATABASE_REQUEST_HEADERS { if metadata.contains_key(key) { found_header_keys.push(key.to_string()); } @@ -982,25 +1000,32 @@ fn has_debug_header(metadata: &MetadataMap) -> bool { .unwrap_or_default() } -/// Wrapper over a FlightDataEncodeStream that adds IOx specfic -/// metadata and records completion -struct GetStream { - inner: KeepAliveStream, +struct PermitAndToken { #[allow(dead_code)] permit: InstrumentedAsyncOwnedSemaphorePermit, - query_completed_token: QueryCompletedToken, + query_completed_token: QueryCompletedToken, +} + +/// Wrapper over a FlightDataEncodeStream that adds IOx specific +/// metadata and records completion +struct GetStream { + inner: BoxStream<'static, Result>, + permit_state: Arc>>, done: bool, } impl GetStream { - async fn new( + async fn new( + server: Arc, ctx: IOxSessionContext, physical_plan: Arc, namespace_name: String, query: &RunQuery, - query_completed_token: QueryCompletedToken, - permit: InstrumentedAsyncOwnedSemaphorePermit, - ) -> Result { + query_completed_token: QueryCompletedToken, + ) -> Result + where + S: QueryNamespaceProvider, + { let app_metadata = proto::AppMetadata {}; let schema = physical_plan.schema(); @@ -1017,22 +1042,45 @@ impl GetStream { tonic::Status::new(code, e.to_string()).into() }); - // setup inner stream - let inner = FlightDataEncoderBuilder::new() + // acquire token (after planning) + let permit_state: Arc>> = Default::default(); + let permit_state_captured = Arc::clone(&permit_state); + let permit_span = ctx.child_span("query rate limit semaphore"); + let query_results = futures::stream::once(async move { + let permit = server.acquire_semaphore(permit_span).await; + let query_completed_token = query_completed_token.permit(); + *permit_state_captured.lock().expect("not poisened") = Some(PermitAndToken { + permit, + query_completed_token, + }); + query_results + }) + .flatten(); + + // setup encoding stream + let encoded = FlightDataEncoderBuilder::new() .with_schema(schema) .with_metadata(app_metadata.encode_to_vec().into()) .build(query_results); - // add keep alive - let inner = KeepAliveStream::new(inner, DO_GET_KEEP_ALIVE_INTERVAL); + // keep-alive + let inner = KeepAliveStream::new(encoded, DO_GET_KEEP_ALIVE_INTERVAL).boxed(); Ok(Self { inner, - permit, - query_completed_token, + permit_state, done: false, }) } + + #[must_use] + fn finish_stream(&self) -> Option> { + 
self.permit_state + .lock() + .expect("not poisened") + .take() + .map(|state| state.query_completed_token) + } } impl Stream for GetStream { @@ -1052,27 +1100,78 @@ impl Stream for GetStream { None => { self.done = true; // if we get here, all is good - self.query_completed_token.set_success(); + if let Some(token) = self.finish_stream() { + token.success(); + } } Some(Ok(data)) => { return Poll::Ready(Some(Ok(data))); } Some(Err(e)) => { self.done = true; + if let Some(token) = self.finish_stream() { + token.fail(); + } return Poll::Ready(Some(Err(e.into()))); } } } } } + +/// Header/trailer data added to query responses. +#[derive(Debug, Clone)] +struct QueryResponseMetadata { + log_entry: Option>, +} + +impl QueryResponseMetadata { + fn write_trailer_duration(md: &mut HeaderMap, key: &'static str, d: Option) { + let Some(d) = d else { return }; + + md.insert( + key, + d.as_secs_f64().to_string().parse().expect("always valid"), + ); + } + + fn write_trailers(&self, md: &mut HeaderMap) { + let Some(log_entry) = &self.log_entry else { + return; + }; + + Self::write_trailer_duration( + md, + IOX_FLIGHT_QUEUE_DURATION_RESPONSE_TRAILER, + log_entry.permit_duration(), + ); + Self::write_trailer_duration( + md, + IOX_FLIGHT_PLANNING_DURATION_RESPONSE_TRAILER, + log_entry.plan_duration(), + ); + Self::write_trailer_duration( + md, + IOX_FLIGHT_EXECUTION_DURATION_RESPONSE_TRAILER, + log_entry.execute_duration(), + ); + Self::write_trailer_duration( + md, + IOX_FLIGHT_COMPUTE_DURATION_RESPONSE_TRAILER, + log_entry.compute_duration(), + ); + } +} + #[cfg(test)] mod tests { use arrow_flight::sql::ProstMessageExt; use async_trait::async_trait; use authz::Permission; use futures::Future; + use iox_query::test::TestDatabaseStore; use metric::{Attributes, Metric, U64Gauge}; - use service_common::test_util::TestDatabaseStore; + use test_helpers::maybe_start_logging; use tokio::pin; use tonic::metadata::{MetadataKey, MetadataValue}; @@ -1111,10 +1210,13 @@ mod tests { .to_vec() .into(), }; - let streaming_resp1 = service + let mut streaming_resp1 = service .do_get(tonic::Request::new(ticket.clone())) .await - .unwrap(); + .unwrap() + .into_inner(); + streaming_resp1.next().await.unwrap().unwrap(); // schema (planning) + streaming_resp1.next().await.unwrap().unwrap(); // record batch (execution) assert_semaphore_metric( &test_storage.metric_registry, @@ -1132,10 +1234,13 @@ mod tests { 1, ); - let streaming_resp2 = service + let mut streaming_resp2 = service .do_get(tonic::Request::new(ticket.clone())) .await - .unwrap(); + .unwrap() + .into_inner(); + streaming_resp2.next().await.unwrap().unwrap(); // schema (planning) + streaming_resp2.next().await.unwrap().unwrap(); // record batch (execution) assert_semaphore_metric( &test_storage.metric_registry, @@ -1154,7 +1259,13 @@ mod tests { ); // 3rd request is pending - let fut = service.do_get(tonic::Request::new(ticket.clone())); + let mut streaming_resp3 = service + .do_get(tonic::Request::new(ticket.clone())) + .await + .unwrap() + .into_inner(); + streaming_resp3.next().await.unwrap().unwrap(); // schema (planning) + let fut = streaming_resp3.next(); // record batch (execution) pin!(fut); assert_fut_pending(&mut fut).await; @@ -1176,7 +1287,7 @@ mod tests { // free permit drop(streaming_resp1); - let streaming_resp3 = fut.await; + fut.await.unwrap().unwrap(); assert_semaphore_metric( &test_storage.metric_registry, @@ -1227,6 +1338,7 @@ mod tests { }; } + #[track_caller] fn assert_semaphore_metric(registry: &metric::Registry, name: &'static str, expected: 
u64) { let actual = registry .get_instrument::>(name) @@ -1262,6 +1374,8 @@ mod tests { #[tokio::test] async fn do_get_authz() { + maybe_start_logging(); + let test_storage = Arc::new(TestDatabaseStore::default()); test_storage.db_or_create("bananas").await; diff --git a/service_grpc_flight/src/planner.rs b/service_grpc_flight/src/planner.rs new file mode 100644 index 00000000000..9c6caf49e7f --- /dev/null +++ b/service_grpc_flight/src/planner.rs @@ -0,0 +1,113 @@ +//! Query planner wrapper for use in IOx services +use std::sync::Arc; + +use bytes::Bytes; +use datafusion::{ + arrow::datatypes::SchemaRef, error::DataFusionError, physical_plan::ExecutionPlan, +}; +use flightsql::{FlightSQLCommand, FlightSQLPlanner}; +use iox_query::{exec::IOxSessionContext, frontend::sql::SqlQueryPlanner, QueryNamespace}; + +pub(crate) use datafusion::error::{DataFusionError as Error, Result}; +use iox_query_influxql::frontend::planner::InfluxQLQueryPlanner; +use iox_query_params::StatementParams; + +/// Query planner that plans queries on a separate threadpool. +/// +/// Query planning was, at time of writing, a single threaded affair. In order +/// to avoid tying up the tokio executor that is handling API requests, IOx plan +/// queries using a separate thread pool. +#[derive(Debug)] +pub(crate) struct Planner { + /// Executors (whose threadpool to use) + ctx: IOxSessionContext, +} + +impl Planner { + /// Create a new planner that will plan queries using the provided context + pub(crate) fn new(ctx: &IOxSessionContext) -> Self { + Self { + ctx: ctx.child_ctx("Planner"), + } + } + + /// Plan a SQL query against the data in a namespace, and return a + /// DataFusion physical execution plan. + pub(crate) async fn sql( + &self, + query: impl AsRef + Send, + params: StatementParams, + ) -> Result> { + let planner = SqlQueryPlanner::new(); + let query = query.as_ref(); + let ctx = self.ctx.child_ctx("planner sql"); + let params = params.into_df_param_values(); + + planner.query(query, params, &ctx).await + } + + /// Plan an InfluxQL query against the data in `database`, and return a + /// DataFusion physical execution plan. 
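A small usage sketch of the relocated planner (not part of this change): it reuses the imports at the top of this file, `ctx` and `params` are assumed to come from the surrounding request handling as in `run_do_get`, and the `$host` placeholder is purely illustrative.

    // Plan a parameterized SQL query on the planner's child context, which
    // keeps the planning work off the request-handling executor.
    async fn plan_sql_with_params(
        ctx: &IOxSessionContext,
        params: StatementParams,
    ) -> Result<Arc<dyn ExecutionPlan>> {
        Planner::new(ctx)
            .sql("SELECT temp FROM cpu WHERE host = $host", params)
            .await
    }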
+ pub(crate) async fn influxql( + &self, + query: impl AsRef + Send, + params: impl Into + Send, + ) -> Result> { + let planner = InfluxQLQueryPlanner::new(); + let query = query.as_ref(); + let ctx = self.ctx.child_ctx("planner influxql"); + let params = params.into(); + + planner.query(query, params, &ctx).await + } + + /// Creates a plan for a `DoGet` FlightSQL message, as described on + /// [`FlightSQLPlanner::do_get`], on a separate threadpool + pub(crate) async fn flight_sql_do_get( + &self, + namespace_name: impl AsRef + Send, + namespace: Arc, + cmd: FlightSQLCommand, + params: StatementParams, + ) -> Result> { + let namespace_name = namespace_name.as_ref(); + let ctx = self.ctx.child_ctx("planner flight_sql_do_get"); + let params = params.into_df_param_values(); + + FlightSQLPlanner::do_get(namespace_name, namespace, cmd, params, &ctx) + .await + .map_err(DataFusionError::from) + } + + /// Creates a plan for a `DoAction` FlightSQL message, as described on + /// [`FlightSQLPlanner::do_action`], on a separate threadpool + pub(crate) async fn flight_sql_do_action( + &self, + namespace_name: impl Into + Send, + namespace: Arc, + cmd: FlightSQLCommand, + ) -> Result { + let namespace_name = namespace_name.into(); + let ctx = self.ctx.child_ctx("planner flight_sql_do_get"); + + FlightSQLPlanner::do_action(namespace_name, namespace, cmd, &ctx) + .await + .map_err(DataFusionError::from) + } + + /// Returns the [`SchemaRef`] to be included in the response to a + /// `GetFlightInfo` FlightSQL message as described on + /// [`FlightSQLPlanner::get_schema`], on a separate threadpool. + pub(crate) async fn flight_sql_get_flight_info_schema( + &self, + namespace_name: impl Into + Send, + cmd: FlightSQLCommand, + ) -> Result { + let namespace_name = namespace_name.into(); + let ctx = self.ctx.child_ctx("planner flight_sql_get_flight_info"); + + FlightSQLPlanner::get_schema(namespace_name, cmd, &ctx) + .await + .map_err(DataFusionError::from) + } +} diff --git a/service_grpc_flight/src/request.rs b/service_grpc_flight/src/request.rs index 35e10e4c368..eeafd6f6519 100644 --- a/service_grpc_flight/src/request.rs +++ b/service_grpc_flight/src/request.rs @@ -2,13 +2,17 @@ use arrow_flight::Ticket; use bytes::Bytes; + use flightsql::FlightSQLCommand; use generated_types::google::protobuf::Any; use generated_types::influxdata::iox::querier::v1 as proto; use generated_types::influxdata::iox::querier::v1::read_info::QueryType; + +use iox_query_params::StatementParams; use observability_deps::tracing::trace; use prost::Message; use serde::Deserialize; + use snafu::{ResultExt, Snafu}; use std::fmt::{Debug, Display, Formatter}; @@ -18,12 +22,18 @@ pub enum Error { Invalid, #[snafu(display("Invalid ticket content: {}", msg))] InvalidContent { msg: String }, + #[snafu(display("Unknown query type. 
Expected 'sql' or 'influxql', got {}", query_type))] + InvalidQueryType { query_type: String }, #[snafu(display("Invalid Flight SQL ticket: {}", source))] FlightSQL { source: flightsql::Error }, - #[snafu(display("Invalid Protobuf: {}", source))] - Decode { source: prost::DecodeError }, + #[snafu(display("Protobuf decoding error: {}", source))] + DecodeProtobuf { source: prost::DecodeError }, + #[snafu(display("JSON parse error: {}", source))] + DecodeJson { source: serde_json::Error }, + #[snafu(display("Invalid params: {}", source))] + DecodeParams { source: iox_query_params::Error }, } -pub type Result = std::result::Result; +pub(crate) type Result = std::result::Result; /// AnyError is an internal error that contains the result of attempting /// to decode a protobuf "Any" message. This is separate from Error so @@ -90,15 +100,34 @@ enum AnyError { /// "query_type": "influxql" /// } /// ``` +/// +/// ## Query parameters +/// +/// You can bind parameters to the query by using `$placeholder` syntax within the query and +/// supplying the parameter values via the `params` object. For example: +/// +/// ```json +/// { +/// "database": "my_db", +/// "sql_query": "SELECT a, b, c FROM my_table WHERE id = $id AND name = $name", +/// "query_type": "sql", +/// "params": { +/// "id": 1234, +/// "name": "alice" +/// } +/// } +/// ``` +/// #[derive(Debug, PartialEq, Clone)] -pub struct IoxGetRequest { - database: String, - query: RunQuery, - is_debug: bool, +pub(crate) struct IoxGetRequest { + pub(crate) database: String, + pub(crate) query: RunQuery, + pub(crate) params: StatementParams, + pub(crate) is_debug: bool, } #[derive(Debug, PartialEq, Clone)] -pub enum RunQuery { +pub(crate) enum RunQuery { /// Unparameterized SQL query Sql(String), /// InfluxQL @@ -110,7 +139,7 @@ pub enum RunQuery { } impl RunQuery { - pub fn variant(&self) -> &'static str { + pub(crate) fn variant(&self) -> &'static str { match self { Self::Sql(_) => "sql", Self::InfluxQL(_) => "influxql", @@ -134,16 +163,23 @@ impl IoxGetRequest { "type.googleapis.com/influxdata.iox.querier.v1.ReadInfo"; /// Create a new request to run the specified query - pub fn new(database: impl Into, query: RunQuery, is_debug: bool) -> Self { + pub(crate) fn new(database: impl Into, query: RunQuery, is_debug: bool) -> Self { Self { database: database.into(), query, + params: StatementParams::default(), is_debug, } } + /// Merges result of the gRPC debug header into the is_debug field of this request using boolean or logic + pub(crate) fn add_debug_header(mut self, debug_header: bool) -> Self { + self.is_debug |= debug_header; + self + } + /// try to decode a ReadInfo structure from a Token - pub fn try_decode(ticket: Ticket) -> Result { + pub(crate) fn try_decode(ticket: Ticket) -> Result { // decode ticket IoxGetRequest::decode_protobuf_any(ticket.ticket.clone()) .or_else(|e| { @@ -170,19 +206,23 @@ impl IoxGetRequest { } /// Encode the request as a protobuf Ticket - pub fn try_encode(self) -> Result { + pub(crate) fn try_encode(self) -> Result { let Self { database, query, + params, is_debug, } = self; + let params: Vec = params.into(); + let read_info = match query { RunQuery::Sql(sql_query) => proto::ReadInfo { database, sql_query, query_type: QueryType::Sql.into(), flightsql_command: vec![], + params, is_debug, }, RunQuery::InfluxQL(influxql) => proto::ReadInfo { @@ -191,6 +231,7 @@ impl IoxGetRequest { sql_query: influxql, query_type: QueryType::InfluxQl.into(), flightsql_command: vec![], + params, is_debug, }, 
RunQuery::FlightSQL(flightsql_command) => proto::ReadInfo { @@ -201,6 +242,7 @@ impl IoxGetRequest { .try_encode() .context(FlightSQLSnafu)? .into(), + params, is_debug, }, }; @@ -217,8 +259,10 @@ impl IoxGetRequest { } /// See comments on [`IoxGetRequest`] for details of this format - fn decode_json(ticket: Bytes) -> Result { - let json_str = String::from_utf8(ticket.to_vec()).map_err(|_| "Not UTF8".to_string())?; + fn decode_json(ticket: Bytes) -> Result { + let json_str = String::from_utf8(ticket.to_vec()).map_err(|_| Error::InvalidContent { + msg: "Not UTF8".to_string(), + })?; /// This represents ths JSON fields #[derive(Deserialize, Debug)] @@ -229,6 +273,8 @@ impl IoxGetRequest { // If query type is not supplied, defaults to SQL query_type: Option, #[serde(default = "Default::default")] + params: StatementParams, + #[serde(default = "Default::default")] is_debug: bool, } @@ -236,18 +282,15 @@ impl IoxGetRequest { database, sql_query, query_type, + params, is_debug, - } = serde_json::from_str(&json_str).map_err(|e| format!("JSON parse error: {e}"))?; + } = serde_json::from_str(&json_str).context(DecodeJsonSnafu)?; let query = if let Some(query_type) = query_type { match query_type.as_str() { "sql" => RunQuery::Sql(sql_query), "influxql" => RunQuery::InfluxQL(sql_query), - _ => { - return Err(format!( - "unknown query type. Expected 'sql' or 'influxql', got {query_type}'" - )) - } + _ => return InvalidQueryTypeSnafu { query_type }.fail(), } } else { // default to SQL @@ -257,6 +300,7 @@ impl IoxGetRequest { Ok(Self { database, query, + params, is_debug, }) } @@ -276,7 +320,7 @@ impl IoxGetRequest { /// See comments on [`IoxGetRequest`] for details of this format fn decode_protobuf(ticket: Bytes) -> Result { - let read_info = proto::ReadInfo::decode(ticket).context(DecodeSnafu)?; + let read_info = proto::ReadInfo::decode(ticket).context(DecodeProtobufSnafu)?; let query_type = read_info.query_type(); let proto::ReadInfo { @@ -285,6 +329,7 @@ impl IoxGetRequest { query_type: _, flightsql_command, is_debug, + params, } = read_info; Ok(Self { @@ -320,30 +365,26 @@ impl IoxGetRequest { RunQuery::FlightSQL(cmd) } }, + params: params.try_into().context(DecodeParamsSnafu)?, is_debug, }) } - pub fn database(&self) -> &str { + pub(crate) fn database(&self) -> &str { self.database.as_ref() } - pub fn query(&self) -> &RunQuery { + pub(crate) fn query(&self) -> &RunQuery { &self.query } - - pub fn is_debug(&self) -> bool { - self.is_debug - } } - #[cfg(test)] mod tests { + use super::*; use arrow_flight::sql::CommandStatementQuery; use assert_matches::assert_matches; use generated_types::influxdata::iox::querier::v1::read_info::QueryType; - - use super::*; + use iox_query_params::{params, StatementParams}; #[test] fn json_ticket_decoding_compatibility() { @@ -369,22 +410,52 @@ mod tests { impl TestCase { fn new_sql(json: &'static str, expected_database: &str, query: &str) -> Self { + Self::new_sql_with_params( + json, + expected_database, + query, + StatementParams::default(), + ) + } + + fn new_sql_with_params( + json: &'static str, + expected_database: &str, + query: &str, + params: impl Into, + ) -> Self { Self { json, expected: IoxGetRequest { database: String::from(expected_database), query: RunQuery::Sql(String::from(query)), + params: params.into(), is_debug: false, }, } } fn new_influxql(json: &'static str, expected_database: &str, query: &str) -> Self { + Self::new_influxql_with_params( + json, + expected_database, + query, + StatementParams::default(), + ) + } + + fn 
new_influxql_with_params( + json: &'static str, + expected_database: &str, + query: &str, + params: impl Into, + ) -> Self { Self { json, expected: IoxGetRequest { database: String::from(expected_database), query: RunQuery::InfluxQL(String::from(query)), + params: params.into(), is_debug: false, }, } @@ -518,6 +589,55 @@ mod tests { "my_otherdb", "SHOW DATABASES;", ), + // query parameter cases + TestCase::new_sql_with_params( + r#" + { + "bucket": "my_db", + "sql_query": "SELECT $1, $2, $3, $4, $5;", + "query_type": "sql", + "params": { + "1": null, + "2": true, + "3": "string", + "4": 1234, + "5": 12.34 + } + }"#, + "my_db", + "SELECT $1, $2, $3, $4, $5;", + params! { + "1" => (), + "2" => true, + "3" => "string", + "4" => 1234_u32, + "5" => 12.34 + }, + ), + TestCase::new_influxql_with_params( + r#" + { + "bucket": "my_db", + "sql_query": "SELECT $1, $2, $3, $4, $5;", + "query_type": "influxql", + "params": { + "1": null, + "2": true, + "3": "string", + "4": 1234, + "5": 12.34 + } + }"#, + "my_db", + "SELECT $1, $2, $3, $4, $5;", + params! { + "1" => (), + "2" => true, + "3" => "string", + "4" => 1234_u32, + "5" => 12.34 + }, + ), ]; for TestCase { json, expected } in cases { @@ -557,6 +677,32 @@ mod tests { assert_matches!(e, Error::Invalid); } + #[test] + fn json_ticket_decoding_invalid_params() { + let ticket = make_json_ticket( + r#" + { + "bucket": "my_db", + "sql_query": "SELECT $1, $2, $3, $4, $5;", + "query_type": "influxql", + "params": ["foo", "bar"] + }"#, + ); + let e = IoxGetRequest::try_decode(ticket).unwrap_err(); + assert_matches!(e, Error::Invalid); + + let ticket = make_json_ticket( + r#" + { + "bucket": "my_db", + "sql_query": "SELECT $1, $2, $3, $4, $5;", + "query_type": "influxql", + "params": null + }"#, + ); + let e = IoxGetRequest::try_decode(ticket).unwrap_err(); + assert_matches!(e, Error::Invalid) + } #[test] fn proto_ticket_decoding_unspecified() { let ticket = make_proto_ticket(&proto::ReadInfo { @@ -564,6 +710,7 @@ mod tests { sql_query: "SELECT 1".to_string(), query_type: QueryType::Unspecified.into(), flightsql_command: vec![], + params: vec![], is_debug: false, }); @@ -580,6 +727,7 @@ mod tests { sql_query: "SELECT 1".to_string(), query_type: QueryType::Sql.into(), flightsql_command: vec![], + params: vec![], is_debug: false, }); @@ -595,6 +743,7 @@ mod tests { sql_query: "SELECT 1".to_string(), query_type: QueryType::InfluxQl.into(), flightsql_command: vec![], + params: vec![], is_debug: false, }); @@ -610,6 +759,7 @@ mod tests { sql_query: "SELECT 1".into(), query_type: 42, // not a known query type flightsql_command: vec![], + params: vec![], is_debug: false, }); @@ -627,6 +777,7 @@ mod tests { query_type: QueryType::Sql.into(), // can't have both sql_query and flightsql flightsql_command: vec![1, 2, 3], + params: vec![], is_debug: false, }); @@ -642,6 +793,7 @@ mod tests { query_type: QueryType::InfluxQl.into(), // can't have both sql_query and flightsql flightsql_command: vec![1, 2, 3], + params: vec![], is_debug: false, }); @@ -657,6 +809,7 @@ mod tests { query_type: QueryType::FlightSqlMessage.into(), // can't have both sql_query and flightsql flightsql_command: vec![1, 2, 3], + params: vec![], is_debug: false, }); @@ -682,6 +835,7 @@ mod tests { sql_query: "SELECT 1".to_string(), query_type: QueryType::Unspecified.into(), flightsql_command: vec![], + params: vec![], is_debug: false, }); @@ -698,6 +852,7 @@ mod tests { sql_query: "SELECT 1".to_string(), query_type: QueryType::Sql.into(), flightsql_command: vec![], + params: vec![], is_debug: 
false, }); @@ -713,6 +868,7 @@ mod tests { sql_query: "SELECT 1".to_string(), query_type: QueryType::InfluxQl.into(), flightsql_command: vec![], + params: vec![], is_debug: false, }); @@ -728,6 +884,7 @@ mod tests { sql_query: "SELECT 1".into(), query_type: 42, // not a known query type flightsql_command: vec![], + params: vec![], is_debug: false, }); @@ -745,6 +902,7 @@ mod tests { query_type: QueryType::Sql.into(), // can't have both sql_query and flightsql flightsql_command: vec![1, 2, 3], + params: vec![], is_debug: false, }); @@ -760,6 +918,7 @@ mod tests { query_type: QueryType::InfluxQl.into(), // can't have both sql_query and flightsql flightsql_command: vec![1, 2, 3], + params: vec![], is_debug: false, }); @@ -775,6 +934,7 @@ mod tests { query_type: QueryType::FlightSqlMessage.into(), // can't have both sql_query and flightsql flightsql_command: vec![1, 2, 3], + params: vec![], is_debug: false, }); @@ -797,6 +957,7 @@ mod tests { let request = IoxGetRequest { database: "foo_blarg".into(), query: RunQuery::Sql("select * from bar".into()), + params: StatementParams::default(), is_debug: false, }; @@ -812,6 +973,7 @@ mod tests { let request = IoxGetRequest { database: "foo_blarg".into(), query: RunQuery::Sql("select * from bar".into()), + params: StatementParams::default(), is_debug: true, }; @@ -827,6 +989,7 @@ mod tests { let request = IoxGetRequest { database: "foo_blarg".into(), query: RunQuery::InfluxQL("select * from bar".into()), + params: StatementParams::default(), is_debug: false, }; @@ -847,6 +1010,7 @@ mod tests { let request = IoxGetRequest { database: "foo_blarg".into(), query: RunQuery::FlightSQL(cmd), + params: StatementParams::default(), is_debug: false, }; diff --git a/service_grpc_testing/Cargo.toml b/service_grpc_testing/Cargo.toml index 3f3ef9279f7..659799e759a 100644 --- a/service_grpc_testing/Cargo.toml +++ b/service_grpc_testing/Cargo.toml @@ -5,6 +5,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] generated_types = { path = "../generated_types" } observability_deps = { path = "../observability_deps" } diff --git a/sharder/Cargo.toml b/sharder/Cargo.toml index e402d971f74..66d88536e29 100644 --- a/sharder/Cargo.toml +++ b/sharder/Cargo.toml @@ -5,6 +5,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] data_types = { path = "../data_types" } mutable_batch = { path = "../mutable_batch" } diff --git a/sharder/benches/sharder.rs b/sharder/benches/sharder.rs index 515052b0015..303a9b254a2 100644 --- a/sharder/benches/sharder.rs +++ b/sharder/benches/sharder.rs @@ -97,7 +97,7 @@ where } fn benchmark_scenario( - group: &mut BenchmarkGroup, + group: &mut BenchmarkGroup<'_, WallTime>, bench_name: &str, table: &str, namespace: &NamespaceName<'_>, diff --git a/sqlx-hotswap-pool/Cargo.toml b/sqlx-hotswap-pool/Cargo.toml index a7dda82ee08..a85ee6edf48 100644 --- a/sqlx-hotswap-pool/Cargo.toml +++ b/sqlx-hotswap-pool/Cargo.toml @@ -9,8 +9,11 @@ license.workspace = true # Prevent this from being published to crates.io! 
publish = false +[lints] +workspace = true + [dependencies] -sqlx = { version = "0.7.1", features = ["runtime-tokio-rustls", "postgres", "json", "tls-rustls"] } +sqlx = { version = "0.7.3", features = ["runtime-tokio-rustls", "postgres", "json", "tls-rustls"] } either = "1.9.0" futures = "0.3" workspace-hack = { version = "0.1", path = "../workspace-hack" } @@ -18,4 +21,4 @@ workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] dotenvy = "0.15.7" rand = { version = "0.8", features = ["small_rng"] } -tokio = { version = "1.32", features = ["rt-multi-thread", "macros", "parking_lot"] } +tokio = { version = "1.35", features = ["rt-multi-thread", "macros", "parking_lot"] } diff --git a/test_fixtures/README.md b/test_fixtures/README.md new file mode 100644 index 00000000000..568e1f6ada1 --- /dev/null +++ b/test_fixtures/README.md @@ -0,0 +1,26 @@ +# Test fixtures + +This directory contains files that may be useful for testing purposes. + +If you add a new file to this directory, please add a brief description of it here. + +# Top-level files + +- `000000000000005-000000002.tsm.gz` - Used in testing the `influxdb_tsm` crate. +- `cpu_usage.tsm.gz` - Used in testing the `influxdb_tsm` crate. +- `cpu.parquet` - Parquet file generated by IOx to be imported in tests where data loaded is needed. + +# lineproto directory + +- `air_and_water.lp` +- `metrics.lp` +- `prometheus.lp` +- `read_filter.lp.gz` +- `temperature.lp` + +# parquet directory + +- `influxql_log_*.parquet` - Data exported from TSM for the purposes of testing bulk ingest. Notably + NOT generated with IOx. +- `sql_query_log_*.parquet` - Data exported from TSM for the purposes of testing bulk ingest. + Notably NOT generated with IOx. diff --git a/test_fixtures/parquet/influxql_log_1.parquet b/test_fixtures/parquet/influxql_log_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..8c4b04e31dc360148c674ce305b0c835e2a11c10 GIT binary patch literal 309561 zcmeF437i~Nwg1Dus{D~42*^0HlZ3jr_5qY+l0YCKAqo33)S3)QW`>yw5O`5!2W1iU ziK4QvBA|e%C?JAtB8sRWBBBT`JP;LJ(f{|}s=--0-8KE;^()dHBPUf|>#ck4*?(u= z)cIoK+7s{Ip1-pvKC$-137bxsaOJf-Uf~^n#k;<`^4Y5ty>!J@S1bDE=+oCL`uR1l zyg^a#$yYS}dsfl!9(LAqivHl}uRX8mamQWy8%5ve{PK5-w(o!9o%=#J ze{kSiie7W*$(mmA+9CI;?W=Bj=zc}_nzqgZiZ1{9*$*jdz4-jYik|<_-yc!*iXXH; zs^~A*zw*Wdpm!f}_05VtFmcB(DSFsF*WIG%GuLi&tD@Vk`NOX$+II7wG(F^BeP2`C zpL+ejH2v)fzrI6l&)sY*2di+HX|47k&&ff7UMJJ#6z%z<|^|H_XO3|(6t@EOyCw}#^ZyyK!w*4bbU%uSA zPi=b}T%+kRTb=M7wf(o_{#(;8@3!IrwJpB#Z%tqA+WbMaeaXgedsxxIIm*3{^-ZJ;-&DHj8uRQxMMNfTr>n#-h z;U5my^vIRryVdq-a@kgjUhv>{;rj6v@7nYeC&r4-`1Qk@x|dv@s_l=hvDG3)Uq1io z#fnb&$~wm=y7^td({y^@;8L~ieP^p>iXLQ7x&3X>!541)s-j!3`25!t-D%6Oe_hdc z9WnVEiq4$#eN8{RVA)-2`0)?KkHdMcp0mxK7dDUB9R4_ntWF2DQE6$1c`%yKfwOliGghp-*c1 zyItG&-+}FeUUKIt8lC=4O}nm|yg+U5`0xGf4pZ;MK5~WqP-N|;tv;S`plEz47GjG?%&t+p_hIzOKrdH z{@3?abkR@$YoVfZ?TyfB&W@9_WK=0 z*Z;yvo}$M*JJ_MsL;DP0?R`=fS@#dcoE&ysqdc zcX?vMROn5w{$w3RcbWBv^%VWYd9Q7#=q306_HBwzz2UL{r|46i8=Sci`p*jw)pYJR zW}dCKU%fk{i|6g9x4*RSuRo&ppSt#ontrc)laH$H*Y;TdJVpO}*T$MY@}U#gIgIUF z-n;d>icT`$qv?vtQ`cA9pZUpYns%IYnWh&+AK6^(pLxsWnto8euIbD3H{DY0?|t#6 zy{}z*#k+p?kEb>L$w5!dQrj=C7}!_Q-Tv4+ThZTMc-I_77fqZsSJC%QeMHmmtatSR zYWr&Q11I^s)oq!A~{) zypQeI_B>kC1K+duI%<2`^XqDQ?&$}wr?%Jq#z~s)f8FsLs_pOo_7Y7mxV7Ua53$|- z^^ZKM=#Qp<{wYNtKWe+5D|*q~E1y;Ls15)6D@DJs^_jm{bkgqs{2xX4e`)JKDmuO6 z-LES8`A_{}*B?O-o4VZ;MLYj+ou;qfeyO3h@7?r$LeYoTI8`dT<2fCcqCeZ?E=~V= z!}E^Xen)z$tLW_8_t@XbeCTuWg!kX)EvDc+1e|W9+H{I!wyD$Clsq1ahzRyK_U3SXp8*L#MKXLbE zr=IzawrOW=G4=A(&VJ`k^RGSRTbG~yk#`y40~bxb;*4{*^7?=C#J8?E^Zaf1I%WMW 
zr(Jp0$KJQkg*zR3&y{C?e8+j0@AK(tSDo|8T@U&4;wSI9>LZ`p?dW^X+OqTNkAB)* z{MfaJe*5ZkFLwKW`M{?;uQ~5?yPxpK-#q#4YtFyq11GP!!Mmqld%_YO`refNq_z0Z=bsV zrc3Vn(1f)%+%ViCBObW^@(*{JxKDVIK3d2flR0{pYuJ zp0o9=FJJlK$9GzA{Sg~5tP>`#y`?>2-A%UKV(r21Wy!<|n?JGk=FjiS-yMFr{SFhh z-lD?S@G9(67n(R_;*{AF*VKyhJGpdUe{XDv$;tA`QPMNmA1s|L_}-gM+-&&QerJWF zwy3m|3$DfgpICbh{y%Z!{HfPUdXCE;`loAb{?j#f*q3KMq|)iDxF%0MVd8|@>OZLa zkWD6T@}I{Wx+T91-Ef_xzrVLXyXCyi=k2gHH*}s@TfhJKuC#%^QfF_RkNTT&wS_DC zR&lkRChj!!Z8l*Tjns3J$V}5D_KYyJ$9|YiMjx`pm_3z0$vVCLi)KyFM(BYxHh*A^ z9j2>~GUn_`*W+_+I&ssX2VI97q`}ewUTO5SSx4{RA|sWbW0T&&E@oYnbPWW{`j#ef z_DE-}vH2Nm>@Y<=(x{7Y^R|g?LpR@;s~)>N>0i;`yL>R|=j_qXu<`Jpw-~Wi`H9x< z?_HXIo}1U${N^=w*h4+Vh!aO&hG%;F#J3MUlVX6ufn{SKWi9q@q5f0(#v2U`_Vx#h zlK)JP6E{C`hh5a&hn>m2F5RBLvnM_dSHKRNAI#$LG&+SDbC1d{9PDH!`|QKHf9;%C?`# z=@$8Qqx`r#^XvS4W_H}A`S^PrW%M^kXY_tAqbv5#=rJdp ztpBm|(b@Kq`($*3d|bBA-`DHO_P=~sM*r?-bnAQ^mULv>{rPymoIm$r`St#ufAOCKW@+G#ZLM6?#$=a^Z7hlbKvaC5^5?wg=atj{JLlC4bN3^Y!JhoFDGb$Km$;du@`BtC`bt z^XI-IAIJUj`4b@kLB~^)ckvVCLi}}re??Okn_nJ`E$+8 z*OeXe@9O94*K_&vzd9v5?-Tj;ui7iyzA?Z5;V)#{+x|AAFX!)hYCdoOo{#g+`MBPe zKcCZ|9lufjTm$+1x;6j4pUR)>#QeK`DxZhj=ihIe{QCRm^XKFF^?&JQ*Ikg$a{io`pa1Rry5Gsa|5;{s+>ZHqZ_nTRxtyO)$mi=B`Fy=Azt6+@`Tx2! zJAc3Yytn7_^^yGhot%F+FJCY0{P=h0&+))9+4=MH-{#eyq&jXY+jj1IK2^y-#LzmX*#%+;Ya8t=!&U-1{-}%>B37{rEpT_k&x`oV)D@J~UySO|7%$y|?qsO}3wZ>sbf9 zZ}0QlW?X2UJ%9WCK4m12-Fo(d9p_%M*Xf(u=PcZL!8P-)nt$6l2kv_CO-J8%q5YAA zCLex#->)CL?IQ>8)^*p(>%7zX=piDy|AOroeC4Bunu{L2bjHV>a}TqZ{N(23hhI7O zaJToFyH9_od)^Vg{rvYX+wYp5ZSFYkrJsN6BRlrK=bItoT_60{zt-4z zr@pptowDvb+Gk(e_ug-x@%C+}eRQW|+wVJP^WDPlUVH5O?mzE6A2?;>o%^?Y@WLJD zUOsyR^=aNPdf$kjv1_V3v5<5m1-XwSbmlE1Tu zVn>^Bqrf*@FBP^af=D`^qxOYP;25dpnxQ4aAhp##k@$||*{L7+u^pwU+UJF)>j>Ko z%_N9o>FIM_SEP|}4BIvW_NskpAUt2VL1OtyWCX6-XPHST!!UM)>sL-^8+UJYdaxIzo zX%eQf>8pK4D#9@3z}PWFl<0f-VQQLYEc}4C@jQK8C>`Iheb*4C6?%?-H#do7YK<+wTWRWAv4~vLFf6stb}Yw`uH#2moCrtXBjCj&#>EYz z)N}&`RAlDO?^;znc|?p|JeOcH=ZKQ>o64BEFC1S-y`Y z?#24t*owKe5vPXlg_fo7VMeiM@ilFk265~q>f9uZY>QzJ4BplW^|<)H5ju$#r;#5d zsip5>Cax`Q%MdOjA7t`+8DBVoFMzK4uslSg)3UO8gK)$_S-;)v9dN#gU|hJHRZ0G{ty!cP*$dI@)xz86@Mmlk@SGMtVX zxlUqQ27{K(aR$$DctKYLg27hLXPbulN^pb>V#ZeOb6iU?k(U^W;qnF5zA$q55DcLy zjWEgPPz*=I!d_s+LFkH%FMZF69AQ`vpTN)f!;TDQP8bAX9NWI9=an5sG72ouu`J=b z*}PA!D2)s^@N7Pe2=losQg|qIB3Fc#sh=fvV`eN1LK65XS5(iJI80KH=SwZq2yH!2 zLoanhn-zx96Hzt~xMRR>&oTp()k9w|5z=Hf#%2`qRDqs$hTJ#Q?`8$D%gSe%zMH67r^X;)^+_ySeH3o) zW<2Nyj_D>caP64ctJhdVvX;iK;|6?}FwpaW<$zNy>3Vj;qv*Njz|g{BlvzLFY<*uA zE0HqAy~J`X78G?~UoV>hpALrcZ9NYX;adUgXc8qXzlMHa7PBDrVGuq>C|rFmQ<>$- z<^vcpD~%olmYfh?j~!1sv72f>@&cEoJ#r=2hzv*Xb5iDHYC9&LJ61q_J;zfFm>>y) zP}+gM9@7rl!H0DaG;yS##kZO9FdLIRAW)~rz;kR{CaE8?Ao@%;^=?k+*@18Q7PE_I z&+h9AhKZFja#^wCY^m>*=-)3b`9m8a)QP;CQH*q7rkQ!|>n}2+u zAP^?QWl51{vWGPYVHfcrjGy%N9FJI;-M|hlmIR0AQP=ZeXxn6BrUBy|>2DP9BvBZ| z?BtEIZ-hJyB&-%%hR@BlY_x+oftiy~FwiX0>Uxon&`J3op5q!a&gP5FKP3W}NT*u% z^m|0e*w`~Y#CKvi`rQJ-dVx4$ihGgg=()>bd{_r5fCgZMdQ4^P@abG0gP~z2sq0B6 zK!gTH;7d48sxqM&t{bx{;8` z(ZtU9(vDb>B9|*TK^kTH20Ud4xPY10EWG-=2|M&)bC%yAag6Mnv$Qk1f#+I=EnSg~ zf#N7ua>HY#3mv`B2_#FBWZs*x%U?aFW{?K9WN8+uX9@j$tiXH=A)LTO#mL{6d7Ov< z4Z*P@C)0fb7H7p5reQ`hvGwyY4W)|+W|nj1Or}@~`U^rH8O>+N=Dq2A5sMD|Wq2~z zuMk+CFOl;WT*rcNo+(j-EZJ#}BT))|Qt*+BveO8ssyZo<@% zKGH4mTs?MVcaZ`Rr&VQt0PXdV=e+j-XJm`snpE^%52;&vfD_GFTMUxvtG!&{WhEjQmVz zO-z^>?Z-gzN=-ZCEyeO_f_O}gfPqox8n(qWiVzhRs%JK@B57Kz5oRDl(~JWBJrM2= z7vu^Kt9!;M+%z_nu4^$`tTT>&W;0~&*|vwY2oOqoJxF*a;Tb^`CK5eE?=zX-Y@~cUl=@6g zCsw2cwM^7Kl1!$+yP*|?mKUU9Y#N%+la!fe zV#CkY0}qWoVpR(qri;+y$9IfTA8pC3lsvh*uLoPA>Y%ov-6MYVGusFwbQe1{nRMCr z@O_3QP2ov&EOWst2K4oj{b@Ulz|lTV}X0jo-j<-qlj 
zQ<-jSM*+)7f)owCFmbe=7+6Y!K?_4qNDW7i3zt!|!}3ill76-h3C|X+S3ahe%eng9 z7<1D@ElVON=D+&g5|`O5T%-!TZf5!78kwQWl~zKpU4+|k2FFeAbrq2 z6NXXkL%N}5`&gYOEaqoxJHjtPbfXhE%y&I5n5ODmo6KA!h1B!Zve4$xty0&t5bgRn z7BfFZxk79UW{Bp&I1Z6wG5S6OD}_ETVjWK{rLmwOBMjBE*sRxFDs?=8OfvPjNY)1Q z06PpNYfq-@VUorsnn8%@cTHc9b7aB&XsGN!h=zLpA?_JLll3J)bMs^bFZ;8ENC^^nT;RaYp>Us{ktL??6 zWy8@32(>T39>X|FP!|}6T<3BzC>SdbT`GnX)o~cJF`75B!b?$T^|?-}OwN>9A7cu| z`upM%K-yb@VX;=&jvfQVrZOy0by&HQY`!31e6$~CX@LC5WEqAZvMII$lPQ8gQ1@kB zMN(qum^KDNw${b5n$MUdHu{M$^;lslnNp%Q`2nhHCO?$aXE2$fzDSrd>Uvg^pngS4 zj=>L^-7PHxB@g*&;(Lg#Z2V9#QI=y)^jV?xdcX`qsX$R;6cfyLbv-YJv4n$)YdEY| z`dLsI)Ijs(2nr(`Q`A_@e-A-{rkbz2%s&@_knrhP1v1-(p~YIj5rXpV#<_ju3)EIr zN5oN#Gey4#0-v8Tuv}%cXJZwr1)8PBOr(nes*d9w9A)mJe46lBw%#Wad1TsrT!}%b z&*ewX3Yk;V43jupo8pi+GfadMvM}R))Y%k%HOL(Z5a{}SmGdfs{ZpH78D#zqW>CsX zpmaYW6I0JW)*Rkn8mMq;8PojBznlcS7x|90nE6#4d>!x%o+}6ADbU{|!j^FbUZ)tX zD$eaKb(|_Wi+4qN2Gl;}t8G|;ivJ*79G=$=O!Q*Q)88B`D`0s>iDoS$i$a}?B9FLd%3$=OEbFmP`3l%4 za3_ucbX0X58lKWCaKX8(vYvhx4?W6_EVN=42HVu*!UBedk1d1xhrOnGA0y4f?SpA$ zyI$@K!oh&8hUOr!&rvtkJse)bfL)N97R;r;k@9J{R>+s|gv3*kJ#)yX#b`u6SuCyk zT+{Ru{9YKLc=aMZrZF}*7K9AoSVUBIoR%}VRB$-(KmBeI-#f(hhiMkEX6boqb217; z;;@V-yq>xTPk^O@%)%Zqos72xN`2r3I8~Ia6?)x3PlK;8ZWuF(i%*@)62eWf?oE_$ z79_PVl3ZGSauyy#WI7kWp*JGmS@F}{=EO2$2RC#jqA`<$d|MB`#KK|8$8Vy?07VUU z!C}WL&k(77GDS2wC?^Ti0GX(th2`7kSrK?X24uzyI16mFHy`dtQ`P4haU$Uw4@*q> zQ1#d&8d>6e*bjvU8(rTQlMZ_Ur8>axn2)o;bH!ZVK{AP4e@3n=Js0E657F|~xv&7T z2!#p963dyMrxr$&(wxi`>5yc0Z6eYDPmn|cxhDQ*buQ~DrcxAJo^lRn_eD`gKx2>z zysHR&b)1I;kT}?+hXWP1Rr^>Ow9W0}KMiqSs(lth-ic8zJO>$^%U|A!G51_v1{aJT z156R&CdfF{KTJ|BX9Cu^z`-Ixi$+z}?}3|@)rA3xVodim@09HkS)ihL`gEF&v)zJ1?r zYw!stta;dznl5|kw7044JI*^-ebxyVe)lhHEBfVNvHtzeQ#V^vZLjrNPt()e9#fkW z4q9-7{{6zu{-C$dTj}ZTN53gGm4CTc(@i(|q5ggIC%>xc8VB#7_s`$y08P6$-bmBm z|N0P3_xSKJn!f$T4fOqHY_N@{zkmEEnl8KGZ2g?!mv7O>4P11wrtf*zd-eW<=3Dx> zN&od#y}xJDg_{29gSTn=i64JT)An1R*L3b(**N@o;yrr%#ohi#(;0VNuIWo}ze>}S zzJ9i*|2ld{P2bo1c1<7OaaT=0*S?jerybv}=|5g-)AXT}uhev{kFBriWk+Vm^}i!K zf98h{)X&|$|6c2>0a|kB1N!-wKX$#QC;jSA`n=mNyI5~;xBCwI`y714`TD%O?|)wJ zf8l>tD4^O#1u{U zf3ic<3-`TG(?=gZMbq})Z2X_w@J#(4U%&n!eVsr2Z4nF2pYwxf^zW~4nV?jLTZ z=@-o9n%=(NHk!Wr*~c{9_2b$0Kc3F+^UYsR)cddf@#i%C@B6aruQ_{7y}i>N3-vhs z;lB>o+snW6oTkV9)6;a1O|tQQ{=ld7^KSd|4fOsSw%jHcu~`e4oD0wO@Qf({pY-S<|l^+@r_u{#!FXzVOhD&mKH1yzp|UA$ycw}zdw5OIhuZY^H1pg`=%LReSPxNdjFYcuGDnB zrBCbMullvGpX14`-l@0$cIhLUUj5hOHT~$>^E7?ps%*ae%zjC4KNwx6>C(Y$UR`(N(Dyra?QES~WBP~nbARKPTj>35)Bd85yXBQNHUB+5JG;-e`(}LI z`}WuM?;HL5H~RQxN8O~i@7m*T{oQu?`9^yC_xB#8>ADB4&@_1DB2Bw?&A-PMpVr%t z-@C7->tDFLrg#4?`yS_?v6a5Qd?g#l&;9sD{rhv4(Da4glQrG=wdXfhH{SW<@78kR zrkAeN^p7`Y-)rBE|DxaT(9gb4AHT!tzNY=VZKCNb_kCH@&1PoLb>t1d(A$q5oAJkA z?)sj7udP0qt#8{LnH~Sg6aUuxzw%JVPyc;D#t&N_lkw-@wttVl&S&n-#^aA4`k>x^ z;l+#(mVEp)z5SOpr)s+C!`o&M^H`=@>M zR!s*kzd+NepS)Sq&nGA8?{U@9+4ubW=YFsE|NHK5YWisW6HV_w@mNh?dhElRUjK<_ zHEsXfj{5vt-t$FG_uS?fz5h#=sp-#`Mw*_o<}&^J{o7~T+e|!dT{X~0t&^=2ryTMt z{rhv1GPz-2mwnGuugSjmUE4mRuXD2VNliEVb*4YO<7e4;edb4}>-}5L%D(@k`;XMy zublS@O_R1zkK2Q1OxE}L(vOePa_Z&5-|Fpeta-4ea?)*@zP!=Z`uJ0xSx>L?J$K)( zuXFdDz4g31C|}2SpEOJFf8nxB?tD}nptleB#kQLETzkHz4}SeoO|M-m(DbSYexv7E z*Pou$+ov6p$@_1gk@3kR_rF~~$7VsI>Aa3ZH1+q4_3{7u@lQ3izkQCT>;ENNCq8oG z#(KMwR=xfZ6K%2nkQ&4@sMqXXC!5LC?s9r9bbw`Jdk?*ctnAyow(T zRr;ax+tvTyNLf0JZ+KGsLJEP}JK80d-ee&sOLL)Nbx5cpv8(>Y^w#R$hNR0SCNM;x0YQla>9K~|U1-l2;*CbiE{r8VsxJP-8JL*xqqcJ5?ob+^wrSu6(;VWdGpe?w&=*_x7K_E&BV|0ZsaPqs93@d!pW$ zBf8Sw{$&BzZV&qUmUc(_J$5@`^0I+NeL=K@lcL_9K`NKJ2C1sa&W#o??^%*?^ZbeR zcGp1niTo}M{!D7;f%{i*^nw|)XLK%XJEDDha$I+m%H??!Y zUNgwa5;F)%O`kfIpAOz<#{3y=#|2B(JNGBSvYudB(za*Yj`M{)YWEo{@z${Y3t{lT&U 
zHM|Bd;2Ne$QV?NrVqf~tq;@g1pD+e)7(3g)a#H(1@ACdg4NLEm#e;)=yIE8GDP}uA z%ufdTdV2Ptku%X|1@f_+iY)04zq*=V(QF!DpY!WDxFcVZ7?IN}JsJ&^Qw=f5Y*x(5eGUSw$3 zu%puMWNADw?6HV8;Hl&x)b|}_x7(CCAGAuNG>be|0YT< zNk+FIbHc5}T!L(=5{fQ-`98wmmj&62WC!v945ZiL)%nYuLZpFM6`5h=JdGs?qZjnxfp;%A=gAPdIoh9rClr3aQwH#AM@c|v#v`O6}KoD7g6>;-EfuU%{W z>=llx8fhB#sx80B!MDo8U5%Ars6*5RNT)J)31yO%j=L!TAj=WKTFCTLH4G&5nl)`N zq!&rKP#vV`g(nI0;qt{{7nuuV0G zTY$T$jB-iNG~MwFuhFH9E$$lbH=JG~>#MQch3q1|$VJhrq#KfzRX*vZ91wU`3v!o* zzg5~rEH8lrRb`~vb#T|HqTr_Cu3D-gMl!i;0YEi-!B4Y^tFU zJUalw6jaVo8cb70SZ+d22M{ON+Pu!&{*0h^CcX zHfk1V39H~Oiz!?--l8@~3Zt;(0_DRLHJuuYyQm2T2f=j4&t0RjlUm$0oV#qu-R+r)~a5Mm%aO-dxjz%GCan}g$n%=zJ zMK%=R9TmBYD&P`YU8+W#;>My@mrW(JC2K{K;VzxKn4r)7G))pIC4#?1vW2>OdTrWXqe^}XN}-2r&(1( z9B(9P*p&?~@_I{Xa5;G4-NG6ni-TXp5sy#Nz?)G|{?ll<^A>Ln=dEeYYi?1MfihIl z+@gX?meAY++!bWzqRtGkxFjh!D_0&`%i039pIQLBjwsjYz5y-f8o^x68|7m1fuW7@ zQo)x&ku(ymbmo?c^TjV}Zc%qa#F@I2>P5*ITy#YM{u|8O@khBvci?Do*KqFYY~K6~ zobXui70bFL7z9k>vT+xfmh>aR5-j8>q(nvGHLhH#Ft1hxr~#&iF@lR5iHv77c^a0CX3ZaaMRdydW9SP;g4${DNfiUujoSF*k!^ z6eS$W##c50E-D}k@)iC>+qZ}wA6$#CRQ{JsXM;Li@T2=dwHRxNv0PbIAzXbK%d`|= zvR9F@RO_D-GL|rCJOf5=LC&H;i-KwoFX1n0vlhs~AljSbR|rS9Rcjj78kr71>4^3d z=k@nS^A!40*YvpyW=!vzI%n>I`!DR8ci^0PU2s{gL&%V-bqMNZtwWT{KgXj&dMZE< z;2x@1y}Zdxu7xAhpb`+34>D9GIFJ7&>4}$IsbcVQ1)C3PQbHzIzEyzBi}ELXyeJ$c zqHQKk3aEgp8Guzy%x%>*dHhB&O&&dq9?$e!4`P~}HVu#0mY!CVdY6brI4&?jmHf7* zK+h_l-xl42ZsCO!@@VXewx}xOtLF^a2DlS+(k6Lt{G650l-i>UJxa$5%kI?bU&M>u7%RoO4 z97Q+85U)SPxFjVWo*sXUKkWhePUvV`#vAgmPjOhMTRDk9hbDw&{qq)~Ae zP;B7z`gQn96F>4I8AQ?viN2&xi?ec%j){w)IXR1#$>e~lfZ|xrlFGqRKAGjw^%yk5 zqMT)@nmVUuq)~Gg4FqYvT!+0U!`QP!5!sQOnr_f?>6ETkMY(jEHPR?e+B&bI(o^~h z8LDi0sV~*%Dw<4y-AQX*lW<7&R6KaxF>%?_h_{YXPZ`H1$prX=Qa_5K=Fv;vLbfLm z&<&L1C9*xGYgrUpR@RVJ|CsXeIN0T;bc<#aP@Zl0E?(Q}r7D`m0RpnjdV0jTZsLg4 zkxAl{P++t|o*7I6>#JilX)2y{0TBSzrlNvOGC>IyWYC0FC-#C0GJYx2Rsr{^EjzIj zRGMEMHDDnlg>#Iu@gyOG$ZHzQ^Q|O1okp^FnPev=kYwV)$rWs1_`igZuAZY}m_=Ef zU^Wm8z%^8popy(Cr&t{60Sr@za|y_pFp$0zSxFpOY4ar8R&m>EkOcBP?FK5PB}x|( zD*aR2mtz@=fJhmUP!tTgX`n3^%3@deLPE9BsL5D1Ekq5Xo^{xrQJUIVoB@zxAtj%! 
z4A&v_5g4mUy|%(7w_3QBlXNUol(4wca;Ru2ULK-A@$5p31x_p-4eF6tu1s77mw@^U z+sDo0v@Ia98tFzUmBexky0%mfhu9>VmCubrU=$p`qN<=n^_ZoiA+;qI_8Ndeb{)|W zFTqcW2RE=JS+2rp8f&i>W4VUktP)Fgd$KFzWzkJT<(rl*20@{SSG=2=t+7PO9Iwe( z4(OUdr@3{czfmRZn33xw0D@BW)I8~L1sJQ5&Wln>EajXvE0<(oeU*{1fd3GMB^GHv z5~!dejD;fSfa6+6t&>9uq2Z8t=88a-R<$Hnj_^pg1g}Y*7o=n(u~y*~6c(vnWs_Kn zRb0sv7jj;JjOk%T7GW$n0w@eohvVNi4PVB-Ylkw3DbsB+)*Fl^nlzbHdDMyJRmRaED5gu18`4G8-h8ml%oR#(v9rG2|5l)4q{;US(1R zU7}cYX|5Q@QknbZlUQ_kC+oRro);CyBzhH51zpFXR@wCGu(84jzcSe$K-3vw60~fr zVT{$RaV)HL17}zz#u82`p;|1@r&zZzVAVoKN%16r_9BMoR;-DoL|esS%VuL)o@!@NG(b$^7#?(NtUs^4YJhF0!!W0~ z7@G=Mqz-p`$QVY{9a;{s628?uiOKV3cPjXRxcY zcX`iX_9!a`29ssey9bu2J1-x|o@3sDVs>v3Pd#o?S6?*xz=g@OzOGJv;NJZ~-{S6Q zpfl)?hwbj<{9&8(dXG=~7cB4VTe@P{-Wh|7<>;HJ6Nf+aoZi93Nq^Wo^78YOMco5~ z{VQh0yi0f59r0GZJ^F#T=$I#;9t;MfUT|pF(BllX3t5;f>hB&9V0=w6#8hJnnGj}G zRU$j#P3^n|&+-TNLxtucG?n5vt;wq;N3@d)MMjuGXjy_Dll@uATGYF|KT6s=+IyEQ z9vtl3&6?s*G28iJelpP4+cS_X=suC3RUaMo*%?hfVtdC)?R`s`GV$_$J)-R$v|a@N zys`zX(DF+dzNdREZOA~*D&$%sCkEZyCgbe2TI-%i+m zLaD|{1vEbLn>^Kk#T!3Ff*nRO3M`!k?7B|#m{P5k@2$J?RgELA<}05CsG4D}s+AOJ z!R1@{h{T|y7Dj(!HDjG5=<#ZGjnBdakgDNoJ*}itD@r574QRa)h9dM^8s8D^ElYQp zIbB9ew`yY2>b~w-FuElxx6t!26HB=68936=`3jr5Ko8K(VCBj`)m-?9A%i_oPcPgc z5~&~K--}!kqC>Y9zShDw!jD+Oh0m%Q)L6|IJ`%COlv051l}g|6)t7JK1F(;dr$r52 z<+cOiwt&8ex<~snkU7S0=caD#$UxX`68I@TzSgqWTJ}o6>{SI0U(J_2i`sD${E14- zo)o04m2cT2frG3=g^pD)v;n#dY4rsbK3*8W479cJFMNsPC%{KaI|bDzwA0qY*IM{W zzwlK}yIjo|K2ReCiy!vvSba}50w|%rXVHLzo$)Vx zp_jU$ZKM`Z2O?^s-YBie##=VBQN8`|YQFHpeH3w}8UBJ1ETDrn= zctO<^>MylCe85n6nEg&2*|7w31J*tQ+wu@p^HH#09 zCA-(Ehzw~NG@vYI>nhD~WHO7%nIp1UIAlaNi_$VjWwUU|(5|6u7FOOd*(}5=q|e${ z(AbsIq#2kCTu4ow4|prjHKkRD&X*7piRs>IMj?4ELGv^~Z|Qt*S)I>r=HLd5PWnb= zFCXr|G6Kren9L)Ex#+?NTrFtYJ_}#4)~X9P+z#nap^Lr3->~ZN^0};cjuShP4=R)6 zH4o~<$jXYLGJ$zhI~VLl9=@O~hL#3BLHbn!|n_e=$k5>-egtmeg!?%I}F*s$VT`{)dXm<&79GwgZ_dooUfAPK!zzgx{@GguDA48syLJF5nT8!E;c%wMK$d8Q{T z21bNO#cTQa%cuH}reOv7%X54@0Tg^(Rg=F!sx?p)>+lW+z8}SQY=$wt`+#R}ns{)q zH0WQJmCsUpn+(qGT{Po3s)Mp!g-dxm=gv{SoT2LAQHJ+Obri>AUw?0u3=HTy?L!%7 zZ~qEi2iu3fsq2bo^`yOe&pb+ksXiW|!XAt-!~#UNs&{M*&(EDdea8H@X@|BAb}viX zrY`7gn>}mJtc7g~H=NI1u=W&J3KJ67PgU2ILVJ)vL{!o(J(_fA9j=JjgRzxOHOhen zU-N@1k{{v5$`2ml%2JOo%9W*F_&zTr8eSY{x`a0Aod%4(=GmaHBbG zXnTlxhVj#|8poMv>gj+7}k`AG##R$m#hP#$+AN~3=I%M zs5%l>OWSLFJY07Vb7lz6TY%MRY~#xg1N0q4DdA z!icD26$uz7cqYTZrW6yFl6h_&$puyryS5R~G2Ts7|K_IgoYXSNO37jkjBK~+4u`J+w@IYG*JY+ZJht)1|L>chFQ)OyePC~CYf&h z4CcE5HD`$oTssZ{)@#Wyl_z2wofsR$*Q%ysHI%&?EWRuWgqSH*?&k?fw4Ct9e=sJo zcgl#a;aa%iMN#fj{bO+{){0%jT{JlMkYM#>VHwhP!-1${{H> zo}J*UZ?!MZo%L^suc~c;-oorfK&wU=nkX@~_mz9sk%4O$^l&J5Rh2O!_RKLO z;?BzC8x?mRwreON#;!3DF~k>meZ?gh002f2{Se@1&{xP+9KU|(P||_FG6}*kwxcFO z3#jVjC?SJ2SU(h44W^^Yi7M!a1WrpRzbp$^pItPA7zLOvKQ(h3m44`uqL06E{LDqO z6OqQ0dL^WqxM9>Z##cf0&}mxlh6!&uG@q}`Uf`jY&$~f)RNF6(bz`9NnLZx+n%)ig z3+Y91VI2$>SY~7g;JV6C_;&MvSr?ODW@(&<4VGTYpG@PeNi<@iokQSRVdruy+F|H#(84 zas$VB5Ah0kCDaZ*gb{Ef1^ElOIou9Fl2@;MQQD#J)8Wr`>yTb{90jRw#13dhp>SL3 z;b`|pRaCIIuzRCsmnY%lktS=Q5`STPl@S@oHGQh2i&|ggERyOAf7R9wX&_Fl-K)d5 zL4+fS$gd)GBaUU#Jh{Hbw8QdnSEIE<1IwQ_Y8APQIgHv^HtmoEMEZTR9u(3Jsqkh2 zA@)^0#~0~k0yaa_Yp)LJ<;OBKqd0bma>Q=hJgp>09LRYxe_%oGMpBJ zjVO#JIjDIV%mzfqa4WkvOh8adD8N7+@I~P&70SCT(mkq2moo+fCsLY*S4U;ElhQ&u zB@7UIM&z0;`)fFZ(R0kI+IDSqw!ciW?L1w&I97tu+_#L#188x+G>dZ>2}?9p#sWLM zO8$oCFwm@MYb2dI0$`N-5dKO6;RT_zgO&vbN>#2QmW#g{Y=IF-^5}4YH*PF{35qpJ z=%2@)5+Jn~R1ir{!N62iVAay2GWL?fvj~6H!Co}4c0&``ODD8M9JL(u!vz@NMN^wr zLo|^EG*YkZ;s8d59Hg?jIKX-EC^;_5VFJC)wreGDWE`gIF;PG3P!UNa3uR;_USb5E z6*o_UZxt8fFacJc{EbEnFe;-gfLTPs&pLYl7?#(<}jbE z4BI3_s#=U$GZ+FJU@ph4BZtEYJv*R$%%Ue4vF7G+A{G;1(@W!{Z?FKfC{+YoYZVq4 
zxg8~BFVz?oM5%%TjHs4_n-nK~O%u$bV89oqSqDY{swLGa#LyxaV3W92qB7b9YuXAh zVBl>vH!7-#!Y`!&qt~vA1{dWph1V|>2zPA`Qvmv8RMnv(I#Eb=FTDZiVonl$^O#^o z;>^m6SRNy+!Et8blT6ijy0Q?%IxC?Y5mgMWuV|c^n6ynO0yko99TDJEY*N3DJgLM7 zBUo{AN;iryY#tDfLOP;To+8Tz>xjTb=&Xp!5{$ft66%OR@{l-LJkCtEvXuVHm4{Z1 zf=M|A=Oh?6W*v~Dw&x~p1j3D~Wj33rJzs71QqWxG2{JdBz0kEB`m|I|qNgURUC4LfQ~0W%o%7*$1G^fYt#2{KntL-aZ_=7PHb`5u3k z6ay6m!lX{ck|Ir7NQfg0uu}0?gVjUw7O=m3q}13TGkT*2pI5;C%jgNv74GwEJ7kFEs_~cdS=h9WDQJLEXk(L?Q9X-Z zGZ@}=0Dml_4)J9fNfJa6SD*@vYL}*Q(H9e6-6iOvf+2q9iojzz(e_FdTgJO~YM9yc?F; z>CG&ShrYq$%QA3YlMGc+e3`TtD<6L$Z-J~)>Boh#=qUwo)Zw+f8;Zkh1U4^ShyTzs zoRlz|WuyUB%u!209IYUhM}jq&!vNtR6G~}YV--Y~h!BFJY!VCrUt(ZIB^ZUt^x6gE zrh1OA;x9tiDg~pC2v~r3J_>^v2E(!0L?AcC^uuYTkzWng4^`T@Z>V0AW4Q~n)GwiX zgI-ez8HEl~NPf|e$;Lxk#Qb78am>0hq#`2-$R>6}%kXg_x-A89c!m@csXPqUVDTle zz&sj=RAeyXOt|UFCcZ36Lvg(n_Y&u2u72-u}TOj7XXe9Q%sRmSK^qg2f^2l{3MF$LeJe2)~lwmz6v-uB(ao|h%c(K z904E(?Z0&ga6u&3MZcjY-UE8s%l5BCrt~iFkCOI|_TD9n2M7Chv!?h{%yxd5pA7W% z_6#Hox=-Zi+5l~qf+Sl703trc5^9GQ2#7?13u=dOi)(v@xJy+p8)iMZ8-YmpXr@8K zb1$MDQj5F(Q~L#Xl|?yhICn`&pC7!3m1LKcRvCML*(O-;0(G{K?d7=S|GOm4uQILo z4cX<9LIs9lU8Yw8?jgV(X+YRJY}Hnd2z*)7&9a1+aa|#(=vb@&OltpMmtORpQJ^n!-mT)Nu&bOs2!5iF3F%R827@JXCuE>uI#Gn*-+dCBO0{+sw1N}j&a7Pfk%XmtVPj0 z%3&ev%bsShuHp+<^Q86#0Mc|140cDr%$(HTw>0SKikJ7ZueYbYgQ*G*v{^ANN<~=< zGMG9n$GFJ>ZRk`o3NmVFdkCzC?IHY{AxN8HyM`S#1~N(k+UQ_-q0lm(LzCEEE&B@= z%knmmVB>di*l}pmp3pbE#J57Hd04D~^UxI2%j24FxB!#XRIUQCm-W1a&O?ALZ0cx= z3NUKEqyr*$^|V99T_#GWkVYMdHclYZ*p_J$nX${?rcn=z3NW`c3Sy%ZH(Visi>{ow zK_PP~6F0!b@QRjH;?<*6q=@-NML$q#u2+YDBQk?Duw|T*6y=fR*fjiAOnQ|^J#4u2 zA}5bvYDMoMVxoj4DVSfG+oh0hNG1jVGljT|1QwvM>nZpWfr8o~W)PTO>^4!_VIl6C zQ5G!h#wYbUluTlNRpKrZIwZDR*?fl}%>z$URCeJxRGAc2&<=-u^EgWBZb=+){FYZh zr{zQhW{l%8a?(~(?4yAuZFSD94pL-n#T4j=-Y70V|_JPewn1rgT%sY9h-P4 zFmg)hJA_;4SE=gM3YlNjVIiSwAOhIBn*)9K5RUaIh*skJVIkj^hv``*uuGy;74(d=5ZYsM~kt9wl^NRJ#o(FJJ z`M8TTcoiuwD!b725J}bDhMK!v!-6%aSsp)ksai@iA%)Bf3^F%bO}<8GhIUy{up2JB zXz*o_X;(RALpj4r$Xzgis~nw$yc!lcDRigkz*$Sy0tXJ z|76TtEN`Vrh-$Fx!j7l=u!D%IV0Y0{u7sK)*iXczigOq6tCDoygQ{fAXR?cW6jH$I z(F_eEj#3eYZfJ(mHe0f5gzR#f)@Miv)mL>GmADJ8lO-(ovccIX8t?)$*C-;lK-30T z#P{n!0&tx;RedQzvNvoaYMyXcA)Cu-UW_PUpl}aV<}6x@lu&NbX`Kv{qBfTcgg3Bg z1++peOyI?-f*!xk6<9J26FcUwABI88qcK8mx#bBZH&`naD22Xls`{!El4$v0GQ0!~ z`M?il^A{>?dC+Ex+Fa73_^OCU0}y%-Dd*Qwh5~#&oy(ob4ie9bWh)VC1b2DmDMD#9 zcac*>u14iVC_x*!5^|SM0*Q~SrI1I1qxwR6LyBic`#*WVk>3;Kg)UDM|-m@&O;>YTX;?!T~W-hp%Gb>Z2lbqE=W zwGP38t96J9UC*39cTU?8?R$&84xT!d;&E2MYL1n?#X-T+<;jA<{v=qoW82jIr?(Au zFH72vSZ)|*vS-_N#;AP}`$XH^`O|02Z<}^#cJR~%oo%yc&6%~Z%~s&`IJWxKQ0VGq zF!3W;xO8R)6+r!8P-7Je~rT|^iwpm${dD=uzW&PO5`FsVNn(%9mbU^Fz{?mG3t@H zMQ)^Dj|u2`p$I8Bi|AG5wZv%o#HfQ~x;e$D&6+||V&$wb4Ac^Od4zBHk|nwzlT(dO zEt?G%cr`?yOpe0g21-R8a+GSfGz#rR7y-F-O@vikTaE%BhzxCBIf|3fqv)-oF{*q( zB@9u3+GmfWEC&{&^@W2gwWc`=<_J}Qbp_Z%KcN{V_wfSL^}}X$rPi_%|1Zr^Ouec; zr#D$yE|EDZLHqYG@G2?GM8{QIn6mkE>Ah$=aCt#FN{A72pokci9A{YI+KpdVPDPp; zrU$gE!4*XF1S1QGQC~E*7?mnQt=%V;96q#eE+dDL)#(;49;a51Bv4g6epD@URO;ZR ztRrNSq;Mu#1yK@4GBlMpZ$YrEZ)p;@ccj76fn-wqjQ;-K{`QWZx~xY|Llo+V~*%bd;6CKT)RE!>s#6#=_~Jc!sKNGi~53S2`5FpJ%dTlVAtS^ zKGoG(?OD9MXUVW9*4teJx~;SJpiFA#f%{i*^nw|)XLK&aWjYQHt6D(*Cd;rAIedVg zBoe2p@NtRnkpPt)8Rwr4Nhd+ucJmN^^(X6J=2M!$Ow&Q zjY^hTQ6-DMN#iCa9#ZAXti z%Je4&7RRf@8cobZIl@#E1ve##jpo_Xm9yZeV|tl2+N8R&smg>Lg{U-ES5^@RgjMM{ zOaTSzjEO=iMpWREl&8P~pQ&n%n*J2m_5hi2L>+MxlHtuTO`O0^B;De9>t7oXCtgKM(552Bh&By0RW zUBkBMcM`=(U?t?!G*2s`Djuh4B<~T%b{ff8CGs?xKtwpYSa2(5`YGo{olj~l_UeTcY8sOR&RgewKFa-(1H5Fu!L@sIJb{;>AhvX-^ zagxv$*zmB0n})?}Dae8Xmjpj51A1)DT&x|;c5 zdDoVnoe$B5S_vhq_PD6&I4^}H0IvLN=G$Aes3diqsi 
zfmYu{Cfq5mG*!ORShbX5kTQZRAU%J5-X+2|CDf3~URKFQMOhrTFx}k>n4Z1^c#2Xa#xFe6 z*s#oqwIrbtN943J94Pkw!L+I@V4}0- zGd(2%SeyF7g5s0(CDQjDQvItLpY+@Dr`U9|a_Q+^2ev~((3N7hAPN)dC2{jGd3ECx z>B)-MK(9`TC_jZ?Mnqc5(@mOt6y+>ei9`iVRIS=O=PY0j9a?0KUo$mr znZi;j@P1?+h0O~1T;9|MrhscZ4dn3>d2TKGM-iK3v(~&*XHeCgLSjfLb zeuhO0I+A~@1wJz#S3M&Q>ITQp4`ehK+Y! z?xfb3NQ+Oe zflOW^@o9P<`3;rJ(}V=UuYBT@w0bN=Rqscv67fPF~ zK-wt_kb0Yj$7_mD!eb5OtP+V&8lnS{vI@*jQfNvT6(JD_fZ*cJTUYpGAdpyJ&3UUe zQ^&(XAw{2hHR-qcA@)LB}sY!755^-*j;?r9l`qkMs7e5}{E{?l-B9Y*|93 zF;xu1nkHhcd`vEhsp9KpT`I)nX!@pOnKi0UGbX28i0&A&juJUo+(J*b{4}LGXV^UT zGX<37MtX0{q$Gpr0k~J?yj$E)B{VwG_)m4iPTd;{}75nDkl(S2-szF?^rhJ=c*x8A-0TLfzB?R?jV*T7Z|s z0CcDbW2p>yn)%kDm;#6y`$lL>y6(~zyCt#mUYv@t8tJPn6Jxn95eXH-8yk?Ih)I+% z>IfPDEia2!j95ryN7CPDVJ%fq!4v@FQ3qeqX)$rVG!#|}T3*m4Pk~07nn8-t z1Kzdex_HBN(a@lEi4;L%PO87oDl9Bdl^>MNbwTE(0-`GDx^NVXqbTCKAidS|Y-u~= zXDqT&KtuNfFG)fXC9M`+Lz*DHTAH*hg3fSw*jB+RLbfJjk-Dh7B4!;DD{^gHdZB}-UB*c)TQ*j1V}W1Nq>M$T z0sm7qauu9{Otlg+7RdKtlwlPXl34U}#fG$2uKe@b6DFM0zOZ|l>iR2zLz^t*WN9um zNL=(fOlu1MQKcB0bvQ2&`EDHB%p$;rDUxj(#+o;~bHQLRm~y63Y>RcQQ40I3740^ie&zqyKJ-aswmd>Nk zvuf(x*}J@FFng301B1!3>D>cM)SZ_PWY010Kry>Fh^HR6sH-m;ec-}mSzlMDK5*~; zpl@+^G|(CJ$HR7aa{jQ*dA-Lc{R@`UXnMu4y)yMPS^69}~FzN+|b`3pF+syfM=g_WtVX~;ddqB8`J;e}H zjVWaR8dbs2vl9*qmM%{g4E86%vK`x|?mxY45cX+1Vmaa`*|TjsW7NKgeWGpd{OL31 zw@o`VJ9z4X&bHaJ=FD2y23{e5V2!?j`mX`i{@ZBp;F(DD$GhPe;K1lVliJ15e#(4Y z*Q9z*{aJ~>r+0aOq!_q&$>PDmzTK=T{uHyFALb_meZ4&c$pYQATSev7XJ<6|i0V## zOPMn9@_yY9yuHJuha-p-6^ouE$f+fag{Zy|wkR445uoZgLSh2fT=#G-5ma^R0Yo=b zQoYqvr7|*tw6*TBlC{>oVT)@S*S)Hm-s-;YfwzuUS5e&)q}`QJ-BUGShzK6JkiJI+ zic4>Xl`CJZrSGW*E7ZQ$gNNccG;9n=vZW7%3F=Tw-)pUVBb3TAu6tE|va9*JXOjmJ>WY?`VPOsuK!naV|NyKWu9 z#5e}S-wGlhNHL%6vev@aTKG!8@Kr_cS$5Qurva zv`V<{`Q#Pr6oG>Jo=JG!0Qj?}p{p>B@cfE8$_N50@coGN7)shx!)cznw^m@|tsB^= znuorcuX`$CMtG}eWQDVqwQrv8Ouj zkR)kEB(ygVn0Ra9d+RQIRSQN}^M#M>4FHu@)pdnL7RugAxbS(v1d%0H(7|Vt9!Akl z5q*!Ee9}-v9Uc5|7KlUyz7e=Cc5d?&3$&KKx9+l66}WdbU-nFT6I1m{nrVe)52LMw zr5plCaZ~zO-~|m`7t9K^@YM`#5Y?bWie8)J&p8Je&GZ5V^5Vq&w|R;=S_|J>cHt9M z@#0qZg^%zknP-(3K0zj783P+sr5UOlW}$_TScS<#SwP`)=_w0S3uTzNzl zd?dfa_NH)Uj+Aa43SU4YGb%PLFSSKX%|_EKe4~pmhDrqHP3>H;*9^8g%7>5W&!7w(4@kXq|$l|G8hH# zswisZ%EM~&7sCr!OdX!25D4r<*a=M%1Rd*}Cq;H-&C&?&G7Q`3l<*ea#H-cuHkP{x zk@yspsy_|jF5M~Kaw>wl0QGKD;8M1PB)W>wQ?IDM0w630X;*-|Ofr|K#5L)JRb%-swRUu_;aXztV4iN!XA?89!o0;Oebibc0WZKtZFUX zYAwJV*Czm58LXBr>ZJ^u7!>Iiw!d`4G{7>8sEDvG-o|?3#dO+{UYt6vq%s8HmzDvx zV6R~UtVI0PSoelXxp45IRuo`{WtOn+0M{$(nCkOfC@e{A6_8t#e2m&EqN9>AKsFnH zK88yLSQPjkIO}el_;%CqS5*l%%spSVxM%g2V4kXVuAJyeK%j&smw|X zNgSq#07LJjmdUHb2BWikoEUW1qi+x0)SIWGu_}WN6Jb@WhF5C_qh~SLtrfAcgypI> zjivK(&@3E7sUU-S2FUUftj(J7FMRoy07$Ejj&X_as0JaX59qo@zvZS8V8uN2CE~6I zD~JYw%Tj4uV?7+8vMXq&vT+wR0f4U;MV>=Co5YN(Iyi*UA;lDmGJNS`s*PVgw5$}I zCp%7ElSp>bl3pXTH>SE(Yamx==>^t>ivt@!Mg{(obZjgie<>x543459FhbFka~mXv z9=d85V$NZzX$Xm-?)W*(v&e&Y1I#e0<*B4^8u?XSKQucHR-C5w&tv$j7B>~$8wzfL zAIc`Z1Ub}XbQN`P;H@xy7w^Ww1(-|qJay~vZp3jI8Td6k18fB;nn!$9Qw~eVT8&i< zEmaIkFYAhe3%s=wCia@xQLb$j4Sms-#}o#dVU^sCOm6v-%6$TXb!di>A!G_fyb(u+ z5hTrnP*+uMz2P{V>Q*fhU!65Wq<~32P(|5A8#V>4T{hVTBoM||QOnEs;TQ>j)f8R6 z^nBAb>re~rNCeniDMqZ57-lPNUK?J9HG|Y%r)nYcYRy{Y4%1SjqR}PEcqpM-NS9G0 zMPZOZRdQpY6&_SeEA(m4BP{9E;gXjkq%JzK!%)V;ikl}bs+v|njRcuDx74qv>H8nG!u83L&6ntt;X_gB>k zGoj^HEh1m7g%%SEC^4s^)DnoZ5=t$pIw4TySyUzTOcU=1*@iV-g^IgC1g8?b4)%(y zC{jRc%JJ&Uuz6Z{*OXgjVy*_uExLA4WvI}t#`@t2f0Z%OP#Sasp#5=CSD^xIb$n`3 zx~eOMFqf-ZZ_#jS{C;@4R8#9^nt=$6C~W1*w~vl=O)F0&WrK6&G2#WvQ^j0fmUD%u zSvH3TW#xq6fCDI~9r`xaD=tkfYRWIdTn_lj^*{mGARGIBU?f&zfqvJ@md|_|_B0&Q zvP6j*4Q4Oe03r|Y5L6IgSS%%U7ozRqkuMxaw#f5xlybPrQWU?#lBzYT!Aj#hj$Ma8 zJ|+E$9^Ry9MJat|n 
z1QbH$T4l5_S;9;xP(gs%P#8OfGI?1)a0c4A4Oc0?-rz7^SOCH`eg!dg9mD?r?R^P+ zTxGfbq?8(@LRlsxQv8?L3}|U{_9ZHGvb0Un@@{AvE%ye>ExM0Nq$*U@PQ7O z2Pj}e;cVR$!Xsb)v{z7IyoJ|ZXfUbjBAl$z-WRW-=Ueb%8!(f>q~+K90*{CWi_MSr zf{&&F+L1@wI?>Yjgy8UoXlpiah9@%bS9=8|x>0!TrNMjxde2k7;0Z#l1xEH$K(m>Tdzl&#D=cbIz!2OJ8PUf0)fYT2 zOHI;;<2;}<+ zZJ?i!h#5uB0~Y0|js=aXSt@iF($hubFoRz{I2Fo;moB7*beV0|ImAU{qUxm#pBZ~x0AbJ~z z5WD~yhg74lkAvb6lNRz1x%8JT&jR|FM_yma19>P~25+hsZz-xN+le9t8%fxTc$@=p zfeQ)+K*4=4L4!AsdPy(!C7@U(hy)v~hl&5hvIejcV26IB3-L&i6%mZz_i|K=1I;{d z38HF9R(Lq(efVBty$~Q&RAEDhKo%-U(|U>*F!ii)O5l7i0gNcDTE<`KPzHgD0@UG{ z>}wR+V7d!&LkObCk6*eJHDSt{B}%er8k}B)^bYARAb&!uyMT}7MWCX*bQg+tgV;Pz zQ_;MErMrCz7wag5uA#o9iv=V#^cWq#bQ!vBOO9&W2+eb*UgS2+=$t3ki{XVF0<*yZ zC+;a;x)MZgLs;^FHv86H66?qd%P>2h7rG0SX{<+# zAO)90tGhrc0aj?$8?nI?A$HIVL!zS@M_WoLBgQMA(=ddl1~WXbHpAw;U9Y$XrXr8o%% zmZ7PD;85kQBCp|a!!SU2jRFTXkmXTOT;^pM3Yg3T7&ByT#Zi*w)9->^2$)kgO5Su( zLxONg8j6+2?{-N7>XSvu01DCs$xbolHDnmd;H3=?*s!qqE&f&fK%wwn2GHw5EfzwYe&g{7EuscHhu0<6h~g|sMXw=7 z);4~-PPxy*KCIG(s37VmSvNcn-36T< zK$syy=HsaUXC*f1{*ygoNV*FRiGZ{}9KjS!-7s`XDeA6_ zcs!&lyp|!r1q^)Zsk>ND;ot=|P=|&591;b-f*NqD(cZ+5Wrzk|-~q?k$8Q;$8fxKW z8~rDB-UeqvQFmqPF0C%SmLU*$Fo(RPi-Pq5lr99=fn)b6UEr<(5*fNH8w`qc7pmD% ztDO%iuVmYfs+pFlJ4mcqMb$M-b!nkhUA!*CGUSoe@^HJr@fW~UGPnmhPVyVnKuHOD z-+@{q8w|=7E)DgM2x{aJGDI3hvJwWmfGM(NN#UvO@)gw3!drcUs37KS&!C2Yo+tqf z$U|P>8{x+M8HVi5*ZmlVD8hk12aeJ48-@-#;Ua`Tudp>j*$n`lDh~_-pE;MW4G(k>isPM8C+bl{~ zrgX(a8;}UfR z@cb+m(qLv}9uQtZ)>vUOQp#FF2c|JJx@#;P)LedcEl^P4UsbYJcFpCN8=Y(IrTt0U z=sIayd{O!nA#Q(W+H`KjIuq;oX^Wfd8=9xpu5@>fFRh!lpmAYi%QR}Y zfG@w-NE%&zl{M#Aa+NjUJOcnn0w5jzdq?+a0L_CSfEj{)4qzJ;5$s~x81P*H3}tAC zpjDm#)1;wWiC_FKib)_Xpt}CmSbg#aB?~ki3Xo8FScP_ilT}MIdD}pPOg^kW*C}UK zp4XJ@F`L+n-B!PNX+wQmeBt6{i(1;6mMv^*gLRl|7pS;gyFgTP?ZPmPbD9?~oOWg< zWSk)#*-M_aTf_mDahVC~STYX80?UB0PiCtllJWn8R!6`_Ae-%}JOz%J)dDBu3MVSO z@8V5C69Fz!pgvo23b_iJd=#VQ5mx~pRf4x|kW!(}Tn$f7z=!tCiXlG0k`*BBiF_LL zC+8Vb0T~7(v<{kNY2Z>M7CK*MDMMBP%|9^k%iMCM>u43B=aCLTA+*j}oMuTjl&EBc z#w;D44#=g`{>93(kRl!+<%tG_u*z9S3J=c{!W6)$DO!eslH0H62`ml(kVvO5%O&Dq zFG2+Ahy^Aq!Rr*{H8=yv{cnpq(pKCZ4uGaKs?Fi(ZU+s#~@tbEnc(*RW`%Z z<;jQYR0=3q0dhM&7&+F zAQ|8QbU|?xQ5B_P&^CZUTjYWZDC0p=p2X9`2)4YGCpeFT7g}P)=1430rG7vlijx`j zLr(39Bo(;%xO||dvMop^=y||pDTrXV4oz$+r}ositZ0G*Y?cN{dkQEX039p>Hy)BD zgJC3qk_CN;CcJ(tByF zfJHn=%2N>-i==1$E~^L%AQ+EleZ>L)@Q)Wl)E4FO*$re-oh1pZOheCyP%~jAV1IF< zBkP8l5W<5O`oJV+SSmZLi5Ub`keu{1ka-Hp_!j9fIoQr$r%N4Dbvy&M+WVn;S^~HUAQfYR@OR^ z18kTENVI??z>Df#KP-UqIf#HE9&Da0*6b_ksmZd&i3sXscRiV4VLu&8D-RqyI+Qp% zFFA-H@q#UewR>HFv!1|$>4i+r1jzRU2&&YByLACurIe+@EM@;x}%Kgb8xD zD6`Bd;xJyZE2fX}>rND^0!%M(1}{3IY=#GH`vG!2RW7WqCwiF(gkSK8Aag``YKF|?kfvrt4xBI~ zKcfW;&7IW%#&pn+nn!NZr+Ey{MFpJWKEk%(gkxnpTt0rM;2477=x8Wm8jcZGdYwR< z$Dq_KU~nlw<}p$nAogX(=4cVe7Bzq#u&4qZr1>?Ec`%DW%FzTjdo3DVV3Zv*BJ@1W zV^b6i6tbbonZdaCwPMtEC~6zhN^Sv^8vwd^`kkm94`Mz9T@S(J=vS~bFfjr{=)>;> zg9upcG$oHBPut{q6TT;uv}2oAcm(SM>q zc945SPz#B~3sAySk{F+fcLPHtJjDPa@g(%gLO)%*@bQAa;b??wWAg2KgUjSjQ%bNf2g#?d`)e7g-SgOQv}P$W zB1;hR2Yzy(M8~hO9H2!L3DYuoCt+}EF?cZ~SXv?3Edr#mcog3N&?S%2NR4=rgC<$3 z3{)gKEBZ7B1sP1@sHDmcq9oZuwLU^4l9Gp;C4tS~OxUK&)2bCi?t`)gha#kI7O0a% zFzR}_S)g2D#YaICEf5k*NNoF*EEE)oD5dogaYtKtuwjW(9zYNc9O0@d8EB@8ibpX# z1qeCg7pP$lr(mIGR7P887=2#x3v{6go?x+tBCLJ8Psx%1zTo5>WwB6M;w%m}9j60U zHhyzZc6dWYUtL);(2IssirB?;hYD2+DRvP|`6w9FAwZ)g3rXT2^5~;WE`a}jRTk1q zJa82Q1NR&niJV{Ag62V$HI<-07H9IpOhIhn(1b>=k(5~)V?_ornlygF(i5C!ClC_^ zcu})7J2Zl2bQ;O7KI#f%^!3^O3wqic*4f?t?y8Xr4RryjRZH!4cCw>?W1CIK)-7JR zwBP8rQ;SUt>M|Ec0VKQa{`Eb{HH}RRjlMN>g47xzW7Q&er-gC!qsN7tjBdNLt*)o5 zYi)OjY4msWbhojKtaWcSwKq;cdoNnsm9UeGoqA(qi_yKt-KTDCGKptvTh?bMQ|AQr 
zl-+M9+vc^jG-aG^q0!OJj%!=cvwi^{wUc&Vy0Nd{?rK}uW4TYAzGnIs(-%!2?Ot`B zk+jwuNqfv|EKXW>a_A~n>XzNxnb*qviNLl-kwcXZqCwc_mO9C@MCZKl2% zcbArAM|*qfTbS3?G4c@h1S8iCK5hCAx>8?K--tU+eNE}FGu=(7ZsF3|OL<=B&@dI~ z!EA<5U02@$?HVQ`vg6%S2q9ji5|FE7J7lyJ5|Ba9SbqT9h6xyfE zDP38DV<@T+0JR(;T+|f=E&=il>~kQ63+;*Fa(Z`0K&y@bMxA+J`B1Rvw+s?3(|MGB zAfIz@E_+Cq0C*EbF%M{u0$dMb;gW==LaHN%huW%W4;Ag9LfS+Bni)Gt_7JG;b?}XP zCJNEUP!D1TUK1E_*=J-3xE4{71rUy|%xMs*=qM|o8~XS?Sk*$%RZ4);36(u)aU2>C zcBGFw%-u+f7oWB4?LfO5K{ALG=54TSyGM8kSdRc2MCdvUf6On^8KtTMh+GE-cIGsP zG$1!Y$Cig|NeS@yYDR+58FH2w8rdpZLq%)oUttXucDfFdHG~e!ERW5Xh&+h-PO_G zKl~)=RT+CZ9d@VH*JiCvQVh1TMuPc<+GC&Akc6onm5u`gIif@AERF-1P*=4)CTs)& zberiBOSMGZ3=b7SR%57zRcD~n3}T&GNwN-Na|pG{Xtd0GIvdRH4qyWliE&veesT0* z0fLnS2Ay5m%^`>v3d$77ZXVGgi!(S=H3dTllaAsP?IAb$GE(@8*|K7`Oc=~?=94Y^ zSIyW#vWU=*hLL1?0vtek#s|=Ym5}s6{jE=n2mv32h(K(ZBRqsD88P^r85=BGsKOW+I5T^;3^TS46e&1?@~Eqm;llE=V=IEi+o)0q&6GWIQXZBq z8#0Lato^$VVuJ|5NyNw)IC*rw3=JI_M{LmIuLuwEAY1U8Fw;d1ZB`&(bFxH;)EZ8j z1(Y@@AbO!N#teY4#T_;Ye&qlIVba8j^udK{c|)SWjju4N}+-$w1=*MTbu z88?&pAelFiacq3pu^x9W3?7)CK3nYeWJe#bvhp6jp2I(amKB5P9&kWG&Vs_gl+a|o z22kxt0-ayUAvuJ50^U*ri#2K^=yOUXKe!%^a1k$d*Zap|DlCG|UL*WEcq!PWdOPt7 z4kd+9JSZY+?TK=hFfsTHv@>gFcpgi_3_L86gK^)LHdB!df{OxDo&%tX*4yBW{sJBu%}UY<~in35XGSFf%iO$l!EM&^OyPm z!wpgeM-DSc(*|nV)bub&afly-Q~*&u;5vSxRU*jjfc6E6VqMOOR6~n)4mSGu!LWv4 zB9SPnwwMsXNfn;ZS#c&U&ZMcCG%Kgy2YV*XX^;XyCxaQq)3XO59?K>71q`8zuFD8T z`tQ;Va};&z0|T|WrcqgvG}K1tL*-%$NIbAkIy|Oro-6wGhSGjnSepnleVSHt{*gXHf~Js5%y}psX_2IcT$}%IRox3@h1JUj&we z2R#wc!8wymj3!N&u=DtRde!7Hhayj}qEy}Bi_>V)7#)rYIQWfGJuBkrAfHB6B;+|w z_w?y8|1daI(56wqAX&2BZ){XRHyVU_L7S8_$;9aTMcAWx@Ux<8lcrj%r(wbY$!mBT z%oNShqB-iCqk2}Rm4kd91%gPI(X!1Ooh>7eAHazebb;uAi&$}$Ho$T8}>!8_~|hF?fJDNtAgs;T(3M-i|TWkCbz>9bkt1=Si+nI$4R3cS9Q z59kR;$w1$Ot{hk~Cp=T>$jYR2!An!TZt3g>NtO8qkojxb3CEiZ^T6`?XCc@GodKF| zdsVtHm<5Vp*vEgROHMcpB1H)3^WJ$}^a>B4rGkP%bdzR1Z+)d8G}fz#gMei#$Rz=j zc(ne4ah3-@ThY-L!7&oLZ5Xhm!sGB|6)^BoaCHL8HS4*oSc!vKzyuz7JTTRIxdJFM z@LNe%NEN0dq=N1tKIIF&H9*}A`C64rfua0?v5ateM5Z~*7DPwZOdXzpV}^$wdU(a` zF!dEk=@Q}ij;X%JU8tE;eL=cJq{(4zjdrwoMPqsDpgG!#pz+Z1ex*x8b6CNb_5$pR zXN61S&!1Zf6R9ZHvsN=4B~k2nInq3=%@)f#sI`}XR*!fuyyOd2*{s+&X!4~qe7A3a zipGlqSk?V#FIksWpq=yRD66PCIyIxMELdACE<}W*zO+~N)|Um9t^#W>UPH1TU0S`Q z3vmQFfYJq~7HDzbAX;S41Rm0r+xL=z69eFO{K`w=09LRN)=(T(L6hsygo%B+UI7xX z0>?HO_z>2pXEs~lV21@zc?pnv<`en~Z2-;=?d8upWTlv(Y0X1>nTnwr7OX?d)DtLQ zFACU*;9)~ptwWIm_YUkWPyMBHK@88!L@@6+&8f*Ei@E!;4q1two>wWItU9Wp*cL;A zqxGd38u{{P9cImc9n99D03tsbp#o3o!rKPD7Bt@r=BzIG?tO(8#f%Esk9CM1EucTp z^Uz)r%WkNaB|4^R*;X+!PqB@m*r600`6{q^$b-2a9=)gc#o#RgG!KE$K{o-v+6#$I z1umu!`2uIHC?c_#hx=voj)88HKs6y}VCdmt56|j;4W*X(tUQVZ)n5gcFA&!w*CQf` z;Srn1`vS8uaQX|0ETVw$<$l3l1KX~Wy%(5TBMY;DBIXe3(ejWl4&XV%F)fI_pqRzj zVTSJoZlU_DaQ%WxSAk=PBFfWwR6Ti07XoQPlrCO|KZQH?C0(qGD)``G9%e^IG4aCp zfW()Shj>Y%X>y$6I6Mj;Bq=;4Pkz)_umr5o>I=R&3Y{7G?graj3$p7WdSI4y4xoSm zX@VdG3|hYeh9W;56c;%J45ZKip9vB#<$;XSEQ1q7Qx!B+(}+rVh=TkIm>AmVp(LXf zhw1nf&f9VE{L(WlDE&=?Z z6&`9>KLRF+LOiqq4ouoQJadE#JTw^m{QwFW&q)mB?l-~#z%IH5Vbyv z&|jzw3!r=z z=lQe{Lzdps9l+*Ii%Zx}c<_h*+^^tBSHaCghFFli!yBxhdGMr*wP@Eg$*+B=s)#s9 z=q!~}eSxY9;>GcKWWY2_*G)w*G)ppdTrf1=mml@T?8D#)Sb^TfC$iL9F3~S(>J+U`G)1@y9?=t^v;>@`j1vWq1@? 
zZhKJ@>*Mi&y1e$Kj9VNIUU_w_W^&IsNJ#Us*lgR`s9@l?bv$FNln}5-B5e`rPs}9y>IpN58Zy*m#fcyXiG)i z9hcuAuHJj+%7^dx!i{S0E05n$_q8wH^r;PR{pR_Hzjnn}Pun!%u#eW?dF8EjTaNz3 zsvq2W)opXHtZe*B{ashze){!7#|uBW>zc1EzNL2SM;pF=?OjXnY`$yNBVWJn?iITe zPkg1}?l0YQ#-8MFUwGv1FYju5@WR86pR@D&Z;-mN6_#ZtlKeYAux%Y0nzwbAD?>ghLdvE&A`Zr&B;>NkVZvM~n-+k-1FFv;G ztKa+g$RQt|I&b$a-~W8+F`qp1@!hvRe94rL&)+uh8`~ebtXw?jrN_T<+hbQutGje+ z<9$1xxaQ=gUqACOe6x}0F&3?``h|A_A;A_tzjlZ(S$g_vs`G5_M!=K;Ki$GLM%n|$Bw@K z4zsVa=KRXuPNO>$_ZcD+B04eJ8|_^XJt7)8IubdNX+3;qPjJWEUc*X!eq7=%KbjiN z)b=YuI(odbqh~LOPNrWk zeJA)v>2qq_eV)`)MJA=zr*GNgvC@a+d@MXF`_<}){c0acJ=Sug6V{UZT$fKMyL>|I z-{@BxeRi#M^g4R1W246|h)$r#Vz=G1W_5pmFK!s?Gheew?uPQzh&@xk0nFk->upkB z&)TG!dW|kGDZ@L&F~e=t!FXGyM5m;$oBrBHU$r#7seJIS;a5(-#zzdg8g)HuyHn=l zl_h0Ymc&k>Z)Eg|w82wmo*Z}fU={wqfQ7D@q&`2DqhlZY-t#o}@S8uve%^oX{{s64 zB1?{YmtKRnZdy&-Kl{|NB&#d`rqQ_OqP=Xl$f}EX()LT|zDdi^KBdv}CA>)E$&HWD zIQQ&lX!~X5_tW_HLor%^{co1g*x!CQjc3U#X?*JE=g@eK_F=l-w*C`nY&mg1jkABg zmG1KwfBh!ySNqzHG(IrrL|Xsq#S-nO{%sGf|J9{e)A-PH-=?vIyPn2VCcIAL+lRkE z<6Y-IMB~+4|3KrBf7wps>xtWGJnNWCX*_1vNi-h+lOt$+ZR-pgkL^94#=4g(X*|E- zL>d==V+V~h|LXoeuixbEH^0d}{;Z!Zqx=5R1+z)KJ~8 z(fZ!c{)NT|j;f~dn&Uo2;?# zZ9o2!)wI4j@n;&jb1(T2yYL-HE~MrAe>t7@`}3q9(KtzS-=jC3b7=eD8Yj^BsqIhF z_Rp>S4&C>?@9wAN-#+3#uhqPTmTweZrtLpk@1EzGcii`;WnMXL|KoFWX#DtH3uwIg zvG?5LDx);Mw|Om%mwomG8u#tnOXIfJ+~w7=t+d><*i|8)JoRQ;E~%`b@sHO{pt0@4 z?tAg(f1E+ft^a(LMkA@yID6GB8qaz9MY`WNR9B8p??0W^-*mxeXncC{?`eBygZn<$ zynYm|FZ+9<2B!DqVX3mO`-i>68}WYrLW&g&)avqa(LR-Q)vD7cDldANl%_d%inbBXguzTkJ4DmEvE7B z-#ePdV=7&n=aINOPqrYI!mjh+%nG`Xcfa{J+OKvFGgu;#b8dF8vv~X4wEo4j|C>hp zg5T2qpWEfi*R7i$WV=VMJ#jK^-%zoF#{PRY(zvqf1{(SE-QQz&(p~QU=X_ef>ls(> zUaKgj^IrMT@icBY^(Qnwe5re#6RzQD|0nO+L}N+S>yz1a|MPbrCB4v6eLIb}9C8P3 zzy0mM(erKC?8;sAX`R;pWy2I2fB*h{G=An*_r8vP|7BV}^wnc&oKW%wX`bU%Y^ppQv)@neXnuh?cM2 z?iw#=9Omx-?A9Y`z4n+Z4@dmnJ@4lZok8nwG^f(IKH|#HcMgA;mM4DNov)pL{FIiT zxg|ospZ%=+p1xD=%K5g*-L$>@26tZDeE%dr>u=1Y>wN9{ zHKeE1N8R`R&ljwq^6(+X~-kA}5rHX9n(Z7!7`HNK?HKC%sbjO`1 z-KL#!+j01_X3hF5-s-&VxM-xJEE1v3$Mhz9ERLUPubpWkoS8H_XYv*0UTf0_l~iCt zV*jxf`zPWwe%9blrf`m?*rw<>wxw~2gzT{@eG%+_C}m>PW5$Nbl!#6CpoTSRRg@1~ z%-HRa{=Tk?a`tb=%F^kdV8VolFE2f3!zTJOWAhY_vrLCa=iP*uK%XBSx9C{t8(_+k zZ1c8ZDr&~=rDf0Wp;O>&yZ*R?^iIa+*`68qaijz*%DqqFKWXn|+)n97YgC z=G0d9)udhUF*c`N*^JF87qcSmpk{1x^%BmG6Vud#KQlL{rx_`yH-2!nsVFGep)@4QEewx)J|P>c4b{(P<`+LZe9 zQ6qYF`YIPbRua80y>Rb4B|}^GUN$}Y%CyuW7e6t5=x%=Sql|^Op2}F*w|9E<%=ESQ zO^;fszrX(e^wf!?6Dr<6cHU$one18L)@QD^yNqZI;hd^S`NV$UH99XpYU-!)(D=?K zS4}8C#O_{K6mwlX(PO!jmRJO1##{+^`KZfBl%BJT3YRE0x&5_9kU@WFF;44#(lJ(-0ihwqxs zL#3}yyJd#T*f)eDGs8>ns^Os-_rj^TNao2#1T0d=4x4#d3anyk#rwy^{y1amV8D)k z6n9d7ZARH^Gh)B1ij^Wzm!4aPCS1*9T_jTW@)xV3Fik=gRJ-`i}+kH*kvp1DB#kO##&9 zn7$)rS6)#SEi2oyxhgt668kAXbkNY7G<4uLKDLQZ9e4*FxYIrG4PoSg?4=kwaE}n% zCZrC0fDZhrd*CK{=s@>cW~abBC#vg zq4SPgt=6+QG?wM>o0m;;N_+iG52SIvGiUdxG61WxGpa z4{EWCw5hGb6wz?Cgwt)hl6}h;T~ZZY6p7vOsiD(l%8+~3^UBVPJy#RkSu?ejsV~xD zIK!9d40~~gPex+<+lOAsk<#X#W#5dleKTT@8nKH$jn^}+?8*l#ns9J%JQtFK}@{GH24 zyYBmO=2bEc^31EiMT+ftl3gY3TMlJjy;HtrJLxL-T~1fI@ac@J47c&!%ed)RIA5{9 z{)Am7?W1O1X7oCk3Xjrd-gGas`#LjG>>(--*@Rcui`o|&>pz3YUn2(J8%SdGxqp% z>@sN&dnmH!9Xl$!@@n=QZ-+y1YNYIk&rXLk6f0$a9Lt>X^2zb$=EW;&T!*!M&*9bV z!^Ioo@%R$_$3CaW zz40ElJcI4Fa6JyITNcN;47Oa~j^%oeU2y}ubYnYCS8KB6Mz*||-AY{#{)s#6A9i=C ze>XIpS6#h{eFOt#j4us;lWQ+Tb39yNbu>HLtVg&gpBOw|wKu*5=-XAYo;_ zR^3!xO?%a{hh(3Ix;bo^d)|!dYMuS%e%zBTW(%g+yxN+(ynodSnOg#S-b6Px*0<=@ z)!H()ojo{xmMmjmiuRdK%xcDmH}%l@1O>)rQ%)b~N>TQ03yv53cel5tP< zOS0cjeSGkK<}Kn9;{4UCgyqZZ*4oZ^6MG-BykCoHJN+PLm)@89{@1YYYwYhozKm_B zgEF6A#QW7++qY_k`(!KF`?G=mPQTB!wM){!XFR@?ZC}q`wc2%Nmo>kceMyb=SefO1 
z=h1e|FY)-Q*7+OptnPDSIho^n-E_S**t53N?4G~QT(fB1s=4g?^(%4FEYDlt$hOzA zN13~PBfh7N_=~;8%h+UA-yUybzbihCsrin6|H1cl$ujo~VS8Xd9O4Rq`+Kjo+S$q_ zsg+vl6z%RZz2xk@aX)n(L#J}zryjOHy=qu?KkMD8Wcl)D+ydLbmNQoHm^awxGoOi+ kSM(Z5MAZ9dw)ae1XD9oRbC|}1L0}da*@n5kDc;QgFW+ZSv$TCqJ+s{#!ixnJp(b?9$M%?{xWuxBq$L?lWFIkLefh{2S9_e=X?cXYZJy z&u{8o!1UOsA7r}a+y7;{_)nkfqtCza#1EOCvRC87bo|sy)0pnD-+@dY|MyI$&mJ?2 z>B6-N(-%7|rp+hJ7N1|+%=F1GwlaNe{S6=f2d;O9{}rY$Tzd!8mp^nj(`%KdnI7Ev zJk#G^G38Ji|4L7a>E5e+rqibW57WC}?D{=@e(Fa~WBR>YPG`Dv=4__hul_mHhV~bk z?t0nFO!xP1`9J#Jh5LM+ssGIf{y@ji{_QzxubYFx~E}*PKhoT?gFCbiz@BzW105UpfHi4_&?Ovwe+JFCMI{*AW3e)@E6!eju zc6^J@?|AecOh4VZ57YmC?*OLP?|M4Z=^uQW>C_J%@HUPA+fJWiy2bOCG2QLYKV*8z z&-Xm>419jUgt<)5Ir|)@M=$vd(_g*sKBljH?GdIe6Mw*Tvi%dL);nuXqVJ#gi9V*g ze69CQIu75vj_Kdtyi3sEJ;bzYhliOSc=o@U-rl;!-)a0mZ{LsUIS=ruUtHFVmirf64UWZ@k3xjES!^-S3ynpEJGb_g~-Q zQ@GxroNXo&op_R0!J7k?H@N`X{FEm0uUhCj?=y31d`G4(8* zSLA;(U4O{dd(iRcJGNzd-pUly^M5BX-Tn1rnLd~4W4h&Ig1*%JSw6nG@%aC2v&Buj zJ6}GD>0_U}j%n-Cv;RxyzwzL94cmgYFTZ~R(KGwsvjtJ@smqw2dwIu1I{w#JUSoPy zr}UZaaJ=ht7Sn0JJ&NfK(zX}S`Gvb&&-BX^9$;QVqt~W7#<&Cd0eP;jrm~Q)U!$0ZsQ`T(7bgw5r%=D;BzsU67N58~$ z?h76NqHzn?e{=GVpdb6#GfW5lzcAhH6N3KpmcQ}wQFp$@^b31ToI>MoJnjQbFHW^E z)xLct(}TbE-7nGSdro+S=|6Yc`gS@#;DVi)PW_tB^w^_4rvEtMG^S^17c-qTdz&xQ z_@7PLnd#^540hiM$N%@e)l5J6zr9Qsesh57{8K;4^jj-WW2*h;Or|$Kcp=lh?|F&o zZ~yUYrtQZ*F-X^Y)91B!nEt?OIF-iz^5aXGzW6{p(<9&Tnf~8D zKgLwP=|4ZAaWh`u`KLsGbld`kbJ$S10 zG=2XM|5`O;XV62g>0;WI>1MkA*L_SczIK4=Y2hcB_U|yr^qF^l#&oY=edqxC{u}!p z#dPx4lkeOL|E*+$i~ibaasw={y^fxg9Wr%N)(C58XxL^+!xodK9(LVnr%#-;%k-lk zxbE~b-?LNW!pjf4{*1G>o2smN=7HFLdcn>|}@{p|J4=k7A&*zewY@r?^U+qmcpEw`<|`H0Jv^mE_6?UFApyKe4zd(QjP zrMI=;+IHKq-}};Kx39R<{r(r`-Tt|+eB>LM-#z!e+dqHDst3;4=6&qhK?|Jc|^WL}Mt5@CI_u7wd`^Y0-z51Ic z{^jNGU%B9pYwrK}+kgGtFCMw$+HZfdVdA!XExhx(2hW_e>j5htz4Q9-oU_xuhhMeu zt{cAh>8a|+o`3YN8y~qS)qMV5i|)SZ`TI-e5b;JQ;W{vYR_yr0sl|lKAP^tpnIK|?PC+BJT_tf7EJc&qVJWu z5+;g;wwPBNroHv#Ns}k>q;Pd!hBPa>A3n_9ZL1)Y>B)%uEuXS{|GjWGgGW?xv2FNb z%{_kfxJT%M9VhpexnRM2Q4GmeCB?UV{Ly8f|rtq7k zu7x+V%j8|MOK2-xJajtyf^zfQia(tyNbyW|>giwApK-ikRiD$<+Zp&`Qs+;Ya{h$< zXVG#gxBzaSnw-ksetTT;gn=M)a;9gXKgg7QO55iSnwoc1Jf|%)J)O}@x@W?adnW9E zFwGAo9~1P{+Xa2HP0+8sDd^6}2>Qnp1^uZj=xx#cj(&zjOhd!TKN9CVqwhByA&&3) z>lXZvkM)b=>yH)G_`0CS+$HFrqIu5zyf}VwfuPqMID!B1%1_1dou>=>#;Fkv1idwy z&($9l<9`}GuOOPoXCSud&x`BD_$kqI`fi6fzGuFmACBhNdH;v`#Gbo|-?#Ormh%s} z<s=hJkN>wsoL{}{{`bpV@!{tSdPS$8Ex#1>51$pZYptM-pCb9y&~V!_V%dg<>!SDD8$H*jBl`Xo z{Bgqr(et$q7svCq6yG~I`reA4i1Uk=3HpK6g5DhQ(E(?O`@OS;nD5Si7N2i9SI{pX zE$E}sy6lYR`B?Nk-yRU3zxaxHpI?jC<>Zzv_=Bq=e%L;mw{^3a-wDxrKR4o|kFFEf zxhnb{&z~WVooKy(FdBDWv|gTy=AlIQ-+OB@fBz-%y_2H%_WWhy{7#WvI3^UIr@txI zNx;_vz*i}52 z1C|MT@U!BJ9g*Dl0;dRk-WO>S#kWiAMCv+ z{p*m&w&oMJM!)B0(Ykypk`u=~DBimpt-G1a#5%cbrMS*((Q`d?7rtG?E$e=`6@7Bt zsoU~IzVfsf_vJ$c{qBbZZ9PfQxhHPH|7iKLIDR9NcNa&$>-k8|U)C%>Z*c^j8GU|f zG>?Bq>;9L~_{W|R<9-+M_ubJto^h}^e{Lja4*rZdzB!VE@3f2K^P}}S80oQ-BmRBn zF!8w^@#QPgI@u!{x7UZo=Z{D7z~~UqWm4}p{K1>i?|bYeG4AE#1+@?V5)WK>f;e6i ztzTFu|GO3bD>nBhWmU}7N!i7Lufn{qrHl8$>sl^W&N_VZfh+FxpSt_z37?;J+_z5K zX5Vo8E0pl@>lWP@?r^0s@%hCkPn>q)Rd)aHpSibri)HUWWUs&f<#)fH_M?3cJAKO? 
z8ap0+@&hgB?bhV}^wFpGU3l>am)vmaIrH{A;`37Q<9p6)pSJAU!_L}8Kfhx}`xjep zZu{E#N8SFuuO9c{`TMP%(xiW_^M!{lJL2cFj=TR8|K4KC;%6lHd!JcipFQ)6yMy%Q zdf(r-KUPcs#5(f!@Q-^MyF7bOSMztt={x0tvu;my?{#nH_Q$Sn{`}GxX0JQ&XZQ5I z_1@#oJgN7U*RL>7JMNM#-rV|`iLd_Vnr%Muk4dlnza#(SJ<@fD9Wrs7 zDH_X)As27Q;H1EVG21ebar)Ce-NGQgb=s6$r|rM~1F!_AmUuT-iF54jz4+0|FMf1r zzho!8LrJwQ*A2taH9W`k3_52RreZmgWcrHcShh^(?7;L?-3(ks^Fv(^>71^aN}yU^ zU}=hG1{$C9G|7-G*S0Lf&}^H|X|gH1mZ>U|ud0@*&^g7jU0-!=#WfwtQB6AMS)r!L z_&|lLV|o&uGkwRgP1|&B%~LHupmR>(OS+oDG*YZ>kSF}{$!VLHxuA}%` zXxV|LIg-Wmb6sCi9anN>*;7=@qH&UD1hyThQfSD!Z}a?&KzAM6QH=m+ZPlZ3UKr@Q z?mLPsE1u=>^<*RPj6hawUAHw^;<*H7AZv2y2byiknYP2AA+1K*HkRdPaprkJUt zyPg{aI)e4txmt;%Q4NGwpMYH+27;v0C zOL84Y^KD<_cNv7PiC^W|k`}0@&GRz@UkMZ+Kf%N_Ren~w=Y)=7nyP1OLG(L(T~R#K zmsHaa4ObG+-ZMhW2xZUELQT_jnv1U3nyVVBs(FSHxcp2l)xu-(WvmR(lMH@VR;X%f zC}YJLwyVoLmr%2v06)zLeZwB+W&_dxeh-N3grD+nb|@okf@XKS`= zsJf-AW}rKscyGF|yPBlBo{eAW@bxU!3$e&;S@A>LkobCz+7L~79 zx+9yOC7ylg`L3bsiYJ?arSe>iAg~O@GBh2FKvQ^}ugR{W(p!^EUDx>eg_wyWtDdQ0 zg*!H%(-c*9a3Kv#$;Jxi_pAG6pqRcK%Dx%|{9GhY$DI{$gRfeS!SnMqO}5>@bX-pX zKk@tZ@Q(0?G)Grd%@S+PQq3@MRs0xS)^T}_JF;Uqjwxf(;BJrCkm1OhAxo}~MeIuk z{|>xnTUA0Zy98D;#2T^<*Oz_6bag4Tb^fkt`I2S1TIf3Xy2%Sq&T$62f&|3F3gCOK9k_ z3C_R+^?gWBdVb(NJWD^Yd^~Q)=XWG&3UPv_`e0~VVfp7Mo@5%3I=-&jro`U`V_Bi& zLM}Pr9-Zfml_uGa=DCV%$c8NBh66?jY)Bhhhn(f-5(K7=_W*f-E9gR=8aCd!rMbSP zghmkZ`^7KwASS>wU>rFVbH-1QAzVyH4L!%^d+0uvU#Nr*M3ZId{61`~MMVvC%f^FO zLm_t^N3~=L?B&`J=KR}al_u?aJ|vi~iT4{QTA=$7Rf>sUChg!1Ems;*;N-ExGuz3g(r3 zKQ#Fs24sY%8?xogO6W;E7md`at(i~*&=t`-Q1GlmGn6De2`%8?8+y7T>z1K9fg{Nd zKW8upuC79X;ITSFCx?!zIW{DsWb0C(@OMoJJzNMs)o@4)(eH5l(6O*4><~{Q5VGC1 zEP7owq*&nKVbi$4#mlvPJG7zw!RmC*^+FGe&A{7)(iHR4G%J*BtVYOsO%l(=z`D~h zGfP$#=x!lT!32=wrf=x5Fl3geuJ1uZYnp*)VBF5o7c$Y&F*zxeA!A`4*t{n3#zQ7Db};9s#pUs z2%)8XP3TJ)W_bUq;!2pDuSWa^VQ9kwcHGdFgk2097Yjv$)V3`X_AIXh7`LVa6H#%o zkPJSjLaaa|$XLu8sZttehCU32K(cJZ0B`YgwmcUy9Sa=uFg#%sD$pjfuakxfv9S1^ zD?XmFid6x#(^T|mt-V+yD5ep*t)Q36w6a2 zBQy=$b_^^Bnjh>}IS3t6|1frheIMdY<2B(5t_sD^@=Ap)R>8YaAdq%4^Cb*0Ck((! 
zU>naAHmhaof#rFmSZu5u{w^LQ%n$Gp)*JRSc%6b*J**Q`f(K&z!Ulm3(o{TjmDt@@ z_?fzpAz*9RATZxy(y8mW4ZVaxZ3vd2I11sJrqR^6&PrN20p?XqjSFM z!N3fCUBjY*NTqWS0x+HfnG78XlcaO5>A~W0e0Vv51i``A(+wT(%zjT~rB0Q(Z62wIJ<=Xv-zl%ZqbZyCaOfqMcwR)z6| zWeJ0p#%aDJk^O*E&=$VneQ=NoiwPRo4WRmHoCW0rjbMhdf|Ve6S_3oUXWHOE&`>;k z0~RBc8^jEZYq2&|7-gXYugQfraK$|&H}njc@w$Z7g!M)DFauH?Dw*JTfWn@HPXO74 z?^-Z%O^fHOTcjsLhTQGHXbd6e)j#USUz~f z@WtVl&|DnIcv!3&Tw=?H=|$)0@xp^3n-U^h`0;p^8hJP1T@%&?&rgEF3OuYU9gKrz zL*F$>0J@sydN5?IfO!im$bmhAw?n3=!RHhM4xIun_hc}QkZa`P84A2&IHoGhFMd`U zxLLvPfWfUnn2Y(rB!s7+$Oa5vAv?)PlRO*70If-r?_t0i$17AJGO!i|&v=>-_Xvw0 z^F@e6*jKU*(<;Pj)**6)&lGAto*iVV1+A)tyw(sP0H!{@I1T%T97i>jmN3jymk>Zf*yrID`FmsaeMYDjnrG-nwur=@A33^OovguUj%D83Au}y2%>734@^(kCPI$Gvkpmy!Q8Qpke>?_ zScs1vWj61IJ6D0?#Udw1d$?=MZW`;0sm<8Lx6zA`!L#heHp*Ii>G{$q{0NQHdxco;A`G!iBYugPUZ~o5Qs6LNld!klAk-D8Ra1gr$=3_PZ-(bV zL_)0z8(4)`im(cv0E}u_HRbnLb}CdWQ+Fy<=$Lbj_gltwN?kT?Wq4IpSiz zzqRysrZ3<089qPr%JX=?^1r`b&l||M|K|Q3>4AKDx662=^YPz*B}K=^zSDOndi1V) zy`SiZRzCh7qBAG$xEs+QE&UVkZ=d<(_I#c5yzlVm?I(VYH{F&VylfX5ckdIoh~sPb z;_E;6)Tg$k^J|}2&f|AD;UM03{z=;{e4V`)PUZ2bNf&KL@#S@(D~+pSzXM-?!b3 z{QExLzc2r;2fLr(NtfAq5&_IT}azV0r+x`vqALHvh`lZYH_~wJf#`?@F=f)n1z%75#npWO=l;Eb z$Di>9kvOsSb}#Vxu6mKpNvzY#C!p(gCsCP_py#w`UQ{MZ~CQ7Z`*ST(`i$G&opS-ov(kU{t_SG za*@bK>0fpqAOG)pk2AgVgtK|xpWXd+zV3{P3e&@mdyzli>db4IzI^;PeE!F`-_FOs zo+c7O9(-;cpC7zv9@E`+*o*0Pf4qmsU3tYFOusen2Tb4E;bErlIb1x)*I#{zkN5sX z$n=6qPxJe{>UJR?e{=0O`F!(-?q$00;fZ|x_1iqj$6cB4GX43n;`hDt{x9=!<1SY) z{pt_iX1djn#dEs-`*-v4cW!x*=_TiAe7&b;uIKwcx8_Qw?=!ZZLIckD$`|?g0`qW| zA6tH3;^PDQ{>;zq^*vr>e!Jzeefj*H8(-qcNlGta)n^ZDI3y7~Llo|wq%tsGY z>(Gzzv3kH8e4k}czQf~xxzAVm^G{rLA|DU_s-KVNUM%F_v}->tuKUWZOfS6XdZx|o z86Nk}t{3oeZ^v3b{`My(@%4Z4=wp2Tt3N!J=@YMu`yTS@T0TDLYGKDLyz6Bi|NK=) zGyUp8%fx)Tci?e12I6~G`1c+= zOYrLn+xGJLj&)D)^?#ia_Ji?-Q~180XcE7(yj?vhyHux6%DzloX|o+&`_h!{dpn)( z{G_F~51jJ$P>D+Mi?K?+EEv=0e|BH`e<*F~5-gIBH+G=(rEzwnf&gBVtTZWlhosMz zrP+!-OH$#_BlSi;wsB_Tvdn6yJADc#Pc3SpZxobLU`z*<9Po#Q=}`EvT~s18&gRmE znT_+PtfX;v_dsXo%*G?B+Mq2!Jwac8D&1(&i7`5bxyi-RbSBE z*OOVbPM%eC1~C65H|i@;bAl0cX3aqN+5nS`eiPw%3&t(aJW;Sx61V*>dTRgA1Bh?i+ z-P2b#wy#O|_6C_$`x+6swj{N(_-C$coK<*s zWn*f^!g+1;FdR=Z-P?s}E_J#BoIfNrt$iUBmFw>rZ=q zv>b>DR{2rtUgK;81`$=$@WMn=U(QFeWzqBQ!`B1){j0p5fo{A>8RtBwyF2h#t#dlD z%o=CYu;`R~a({Ft6F7Mz__!~~q@&M+u5^EY{zbBhL+UFfVvBXTl&T;>?9T&s`T>C_BJbZAq?_u@SgwtIGsjiS1Zq-)U8FWkGq& zX0EIQu0;CGyMrr{`Uu7>r7LA5t#8^~d1vb zFY}dU^PE~JOQh0`JX?-Q*atb$rmC0XO9lDrn>Js@M`Mw*3Gt&>p)CuGL^m^LDaIV- zXe=_z-W_adpkh}stdeZ0pk!f_b}cWCRhI?|S&;EOe05n^roEXfOL1jR$R}S}&Y4>a zWr@nHk>|=Wi1!$%-9jm5DW*hL>bpMPW1>6}h3{1uv4ubho4K+MxKfvtcLi6XuoZQf z(1^twmni9x;Bs!lQP}v6OVs9~%1N(+D+{4YHgjbuuB_l%)+P>XBWt7_VztJ6;^B^U{LKivkOTk8`6Am zA5Q8iIyrh5Qev-x*5YL0(W_x<5+V zS+H=K;IEa9{XM;DPo(lRFI&1?Hq54uxvfp}=C&=EHlt)(53ngvQAdBONb6aX&Fbmt z4n`!aM^$68dH^YaqEQXQO7I}UJ9W56X`^&gg3w5m+oKZJ)ZkzeW<(t;Kx&5XQli){ z52b2TLpePor7#ifkI|R_JRt1Fk~@_Yl}zYQLgN-B z!cs~?UM4Jtqac+fj0CHWL=3iZWr5AP2CLgj@Vq;dW zA|tjCvvQm>DU!}3tCeinjMLr6uQYSeBBb*m556Z;nK_Zz-UpMHGq2Yvu{N#K*BA6J z^7E|Pti9{?bRj1kDcp;DL;}7T)ru_h;N(<0Iuoo(bvSE-rh#50%_C_&n>oL)QpO&p zGf&UroT=z{tZM z=I~W;A^Pn2=59(}pWoSYVhV90FX-z_Ek~(|<95b)sO5Rufld%y*|K=i(s^0V&Ex78 zwJ9~1^!yUI@6%H3sIF|}A=HioUr!w}jc3)8UYGWR%tkOT9Nvh#3+(5v88cB7lXFu# zEFoZqMzKudsurpyO7{%(&A{3jW*rNlZ{ifV48S{6f;udAc?nY}p_L6-jv^Hf;J^YM z&ZFYAIOQTH(BcICCz%Nu9nfFI=;y{VIwIh}E3~VKLKQ+vk8egF%_x-8w4|f0sbyYS zw%w%MUSLuJuU#sgV?_F@{@h-2)O8sXOQv)Qb#id}zhuQ>~wo79dk^EV=KpE}%2B?N?QFlzezn5b}hCw0SSYFo?DX6gpd z6~b0ZE$Q(GI)jq_Yx|r9sVK)G&x0gL1(A>##htR>F5yGs zwTx#QK{Sk1Y&t%8g_JE|Ntaa9GFm3op_&FVD4L-pw4(u1N?Q68@pd%8`%S3-;Y)+U 
zwnnv+2E)+On8cVSShNLQJ^evyVc>Kk{=1~7JB^q$6`HosS(X!;&i_RSGF1ddQRNyW z{mOmW42qG87|Vd%0Yg=UMnNez(7m;iiA>Z@168HTTv*zpDcBWKDak)?TZjh5MUq+wK1HG{?=c`d|h$A6oGHlCBtAa0DPF}8P7 z(_7mY)lMNxY9)o-+&R$itx3^*+E4BS%1>7D=7e`i;nILMv;<3IpSgnG6}U#jPmmh< zoPH%0zZ08Zz;j$i|Ns0bE2>+B);^lO&n-j&p`&Pvwj%N~MHR4JSCM8duRU5o-Lpcs4~V zMf8EHf=>&ZWRGt?UD(<_Z_b$Vsi`z!5|v7XHY($J45|`zbQ-rQFS^(PruL`Aesu<8 zh$ISIPn?|rGXYVp|0Lj1^qZ8cD5NdySwFs6w5FDJZf-N}B1I+aMY~8FXG}MbFJl@> zk!WmbQn5i1_BYx(CJli>X%^^doXHDqXOFWA7{^(ioK@2~r*cLWscEz(mgS^%Nk9c6 z2@sQ?Mb5T`KHRomg_hPmfUv_;SJA2w&HKXfEdmybvZS#yrddh9n{>1Uq+!V@8KlD%z={ z3$do@|V78AA}wWJkkcx8=alLD{!Q+DImggY4x zC@3cC%TKt<$*jV976()k>KlBz>(ssN*uDku_hE&59l*Ipmbwdf@- zTXq#$mAYc1$v5DA&{f$8Tx)!DZQ*|Js0WW>?^4B31>BmA(yb#Nl#;2wzM^iG;f4VM zm9imS3?m`oyox8dhxpwJnO=F1zyiFBw2BO!hRme>hd^`87*Eg zw_Pj9n;LB%H5WJ0EH;(OTiFErRn(#iiuZNZh9g?cr6zp4gqH58YEcx^s>vazY$Sul zn!3!Uuz+-}n$$+kb^==pjnFqd)AYu-8U=sIW6Y&CF{!-CFllTU3HBY@8xq7zF_Y?A z;vO8M%A_PbNXIAArLv8|gM+`Wob1R2dYM~B|_yGI)*!ltDmzbk2EoLVO( zVo_`|6FPPkNIkY@yN0SG@zo4;$5Y0)C9+~}>(ab@%~57i0QF!Bn@-$KrgZ7bF1-z~ zDZnz9;!y=akqLQ}_NMcBw6G2tBjWa)NtGwqbGDS>fJg%+8!Y-*tQG~tA8jXA(HGv( zecjb0)%EPaSDbO~(dT4-jLNoi9+D+@T`-5TILmBFNBFcI4!}0wZHTmRj!_IqNLIx< zmq0xd@-q#~=VzR$Q6L?eXuL_D$oZ%AG;BuyB$l5of(k*4hT&Vqmg)Nv7hrnVO z=g$Aw^Rp2(;VcOUOE3cY{G02q8O?yE&&Mj%g zyj6IbqlYUXRI^OMcC~3!@p0q%5?~)F9Vp3{NQJ6LbqkLYGfv2t2tkp${a2j2MGpvx z8bJ)7Eo~2BJYSJQ-8C%79j}Sholajrg5bH@cEkbo3ag#UVdh2fnaQ96)G>q#$b%CQ zDtUBq15n;$Ynp-j;bOInn&i4v%ZBJ;nn*2!sYxwwQ(>!3o-OIKEFCE;mTpwB)#itR z?K`&RIIe3d#yIz$dlyzNm#BKLYeh$EYTttT#%v;Xo70>2(fr{gG;8Tzo44JyVJj^6 zqjG+ArpFCZbIMxW=q3sWv_e8Rw!^XmN+)!~16|pe$MqGlG)Gq$n1)Q1fv~ZQg@HjE z+{wb-qiLf9MnEFmgk-{icCQ8pw}=%6mTE%uRf9{fxrS%Bs^*6>x>tnG_>OjusxdGW zG%LX9+qlx_ipD{f*GsorEH4a(1E)KpA^>o?G2wK8iJ|=2F5Mv&ts0UFfsxtsBD*9( zKwYOSgrs|3D$E*4fsa=32!u&sqpnglt4K5t&}ZG(6h#S;Vg~@h8VZ4+YeToS!uiJXYX~B;_x3lrvc#C|jiC5v~gs*iBbL)f4KKa;|#%HAIR2scEJY3Pzsa_0(pUP(3Q zXmYj&?>~8n8-6zw=aX@K4M+k=fU1y#LR;2lPf;9{fk5Ppb6+~wd>$LV)`yJhavrnj zl-dz=_ff-|q0MOd-NDm3up|x3EDgLXQ}S#5xx^+4qZ8MQU?+;g<3En&Y1stU2=EAo zA0NkF6?!gep(*<=ipXkd4Ws+#26H%T0Ou!MkC{0fC5ewcPbbYTLHi3o)!seezU#Oe z?aZO4;h>eY!BKfa&TE3a&e7D=@I_JhoG{QurPOL=v|M&^flooXHdPdXDP|yh5-?5- zH-rIhgRR<8_Rtyljk80ivoDz0I4_gw$u!RHhK88gc;rBkIk^pv77`j7XMeo0C$rk= zPM_lRLu;%`<4cXG8iix7_?C}9y6oeMu+)Hx%PR-OgLOTta)aXi=t!0IdQ-(k48Rnr(j-Pj1_z#p8G@$w zH)A9)tkJDakE_qrC0)i!FWu6KAnr^;KuE%ymW6v2WPq76g^Kf%z<3y+$)fA!t3Di4pJ@D zun}?E8Mz_jK6s!#nN{m#2Ib3*geybZ6qgA264_1RQ&R`cIzZKBW7Yu&F-H#FBO8%o5J{_&wh%w8?o(r-;hR?4Ez+lzIZ1T=LDjemFGI0ok#=e?A*Z51)IUO*TaIX%cB9OsGjDN9Ipq;U++p>7k(s_Bdb2hiDMQutA@%6M+q^1klIuE0> z1n;FjfO|Cebo*%vI8ZCs)`70xl$^a|@~>13h|^sbrdO}o(8My?Z5DN-*2$i;Di{M zFHSee&8##nZ6cMBrLYE*14#jdhwP(Dy>BB~ie!AVDx~8OTYZkKdPu6YRmUE6Sy~iX z98<}fYaflxW?#}x{3TAh8;Q1SgQkJr4SNGhS+=L{$PrhUOOylLeHpgT30aIp5HhwN z2Kl6jQ|dA}v^WVCdx3SC5f2!d#HcKa!a$BMRESv!#zw}1h7R;q7)cs>@j3`rXkOsi zelY5cJ=R82E-Tlx24=`E+F;4#YDHKh!)lw!K9sD$c`6^Dtd^Cd@FY) zIKK$46XmCvr4#U|iZhuOr(vU2U~ZcbJV_%P25qtr4n7yl;0n-R>}o1rblo=t#q{M+ z_SGO5uN7}~v!kChOJ`|MrhiS2=FTZJ-?vi2PPIHg24&``G6$AAYE??9Z&jPeV^pfrh}sUj;~sdF<$rfBla#rbsXAt7*3>8 zFC|0Rnz&z*EHF71JiLm#kPZSj@n*R(|<(7#m1GxRq6z~ z5oQ9XC}j6WlDP(>6AjRj^+4SWN=XToYJKa^SwN-`ORr=Cf?nF-;8yVh0?i9GQ?8;u z-U9$6;2$+dS5RWFO7Q8%ajGD159CUTMc9<^oc^9nZbCynm=?5|F{d|!tp52Z(5nCv ztww#$Du6_nkMAYn{Gl;X-bQ}3e{=)^z^jlMR9X<|^%)j~K2MGWOF*y?1ZT%a4K&L{ zA%Iy$SP)h9Vc@DrNJR9&an;%h0@?`zHsXd_fLTRnaPzS(kB!`mwxFvAg{?@o?}Tr@ z1e=o(1++ogK4)2uM9822P)1Wx{t$T>?F)e$D6IYB{9(kAva$Bz2IEU6ERA}69Iq&dsm+cS7l_|gf~*l9zK)$gSy8L-@EzGP9LJQAniTkwH_p2i zzRMwNHj>{(kYY#$Cy(8aie5&}pRFrpcI!}|moUSL%=pm?=9>VQp#q0w_Bjx&QOWZ_ 
ztXkF~6=C*QgZ}Ku8t&riLFoFDG0xKzM>xSL>(i26ZdUR#nJA%nWn;m9oe1WY+kUHA z0u8gb2ruLt*Tdg{L;F8OyDF#B$SCrGH(7EE4%^O_Y9oA?Br_4&%O%PLVD@GMZrLOm zF)%I$mpqjQrLYR_k%)&?73t1{gEv&hiMO$Uds}?O%0OH;}XYS&8sk!qOccfATTonTjzy7lOriNCp>l)3B&Z}I5tmZi6ga>)CgD;T z-BuO2Pyl0hec3lmSC>LtAFpP2xuV$`*342jt%`Z4Wdc-VUo&f1u{&9m=8weGk}+ze zVxbFn!YDOx*o*97IEnZ+b7fYS@~}8QOQaBP0NQYceT)UkmK)-^ZoDKSoO%~{m&`N`_Iwa*msO$$R zG5lMSX>Qop2$=mNR&$$%YN-&2!!P?$5GD&9+jIg2A#91jDaPX-R*8(aO)0-MHBDA3 z;kj13fjz7om!T?^kr57Yash53O#0G&R}|o;lKWh5P&X3#l0(cSVty4e3Cv%41Dost z6hlJ<8daF>rsm3~;{n-EwL;Z5YRLSXN(uhFlh!x`ef_BgXsFzZtkM+v`Q{i3xej&i zPa1Y@emk}ApkiLyt18~?;(EId^A)P z8iBOO1=tiFIby(@pq#RzVTLH95S6~H*VcD}; z8-VJ8i;MtG!bY=IToE> zhH6j?;^q1Es#!5ms+<=A_@&5Y;VN796x3RzyCUU1$yQn3m>LB8V$0kV#F{Ooj9nlD z?}W_%Iz-a1x0F6$}cy;O01|z<^U@XqHxmQ-H?f5Tuf0py`9@o3*qH^76r; zGsZgx-liMR!zk3e91_Q`bbm`>RznPL$QaF;ydQU01RkZF9Za7^B4}MWD^rnCFAg zzDo+tu}2LJEE%P3bYzIsp#}y#Clc8bayEL%kOsafRs$=RY%6*-jaEWz|C=H5Tvf;S zHD~jNT0RObnv~WIg?7``zzmMw4%;@}*Pr(K2#vV6)9GI24+w0~#@W~`r2WD&J;>34 z7o8rGYT)AZ6qEgT*0&pW)G%!1G%$9((WwdcLm~}~K4s)MS8CUy%q9g4^iJEfszHmk zB@I9jeqi|kd~y6*f>_qGrCcI>Uf)PB9Y~@F!b?t_cuYg`Y9r`lP${mMsPZ(_(wQ#U zJFZLp3k>+gk#P-sc+kH}g@BkkTe4;$P>9Ctx?V+T7XV_=s0E7N4^xSnzOu0u zJIn_evyd3U#o^OecDK)4Jg>PUm4{oq49wJ;0q!NhEGa@hryU_}e063j5)4uFI>Mff z1|U0asNy^)mY#=$SsvB4d~lm!fGVZLAO#{CPyody_07mB`dHaLBei1TytaAxEi_dD zn46+}nh2_I=&Y9_ts0%BN0t+WFFYGiXnq3-Tv6zs0%+CtXrm%TD96esfQbxG8Jtln zf{uJ?9XdIziRpi$=iid zPG@Rz&Y1AoPAj>MCuu3xfg;Adx7f zSJAFQbFszlxH7P3LKV2#<5V>GkxcM{>5*x`HEJ6G??+G=)JI?SYGr{6Yh*+5O5U~_ z9h4OdLs>XjVZ-c$5i1B3;Ik+QfkU|$@4m}+T_Vh((J_jKlL`VI`-6jn7sc8LHnq;v ztHGc`17*o}G|yFB?9j zBUQ%bm_HD}=OC!-Mcw^CY)uvVAjsc|E($Ls-Myj7#*h=jBidVAO1mM$Z?|z|`w^yf zz^t+mlN(_dM6w{Jr3QoU=ejNYOe4odjxvpM&Uskv(OK~rgo`61YF2#OlWmM1@|Z@} zZ^5QLWJ*Q9MK++%0a{aX^H9-Dz}cXgOftX4z+NH6Ir9r+3Fw;KqCCGP8k*;~aQL*y zZQ;WG=xo+)kzpAXyDjL`ZX|A{MWB}`3plmHW^Rw9N=gokejB?0Ii)Vv_(N zy?XQwus2C@!Nh?Bg#!nbU}u>FseTWQysBxw5d@|Uh&k7B;3MeXctw)hLa~3fzv!vQZr<{7+Oh8{9Y`iBl`C&QG zWO`(%l37L0@hr{l+`} zLnUfRhTVW2UoN+HLEvOk?Y$Z22;~cVRlHEa3TQ#zPD(9 z@2WUQS=I0)s6KG`iX1Mo4G?|@4s+-^b}eIfc`6YqyPopmBMDJ+MP3OTeiKvSax82+ zf?G(;0~T>f9gf&r!mL@^i_4E|!OzVLX<?N-(%pmt7RF)rMYD z36(Y~)oDZg_E-i-h)~DANfmCl;y5bsYz;$qZQE6AnQFTU7}d+ut|-<3xMK(Qp89x4 zF>}yHGyG70LUOz3wjI%i^28V)Ih*O0LIdmaI@_+2%f}!yUdC!RHHDfM6|*bYxE}Sf zj_h-4N1sR^YX$?dHdKg`34IJ$I&?cwtI)?NEkFW`>-m-!>e_fmSRXYTn|$GN;p~kI zgjE8I*a+Sp(#~~;tB!u?i2o>7X&HbGYuL$khYvvjc<{*+$pBS_92*?`MZC(kblcLc zD)1o!MGJJ_w|xb0JCFwBG?}$Tq<~C#Nd4 zNv%|Tz7b+zwD+r#QVG9Wl}lRW3Oa|@BYRr_pH9l&)B`OMdt(Soy^*Sl{BsC{;YZX$phCIGQL{myAv1jTHWlA-Tvv4sP4Q(}+05Q!%ic*bp~iu6 zj_M5n5^~LWDeYaC$P}Y9LSbqv*4q|k)t zFuoalsGO_L%E?8)vk3j6RJA5SJTGouRGY|p^61-$?S`W;G_HgTyf15L$ydsFHvxi} z&^Ipy+zd8FkUVcsoQK6_x=^}JVbNBJOc%6=g1wzk1}IeXJ2?3BxCoqS$tbHwqsQSZ zc~}RU?zurA1!z93jaR_tO)di0nn`yRmy7Hx*G3AAdhiX-t8U3U__$cV15%|P&3BZo zlck-IwUL5CY3Q%Sv$l!u6evR(K5Ki}p8(>c|XsGRu)fiO7wi zSK3>PRpZd&%jdV$hJVLGi&J02%7xe{azuir1$A4@k$3@AOkeiNI?^y>-dRdVf!E+7oprylrMEO?2ZX+_^0O z>cH`$KV>_gnn-Jo2IOb4$!oG`a-&uU2JAFT1Nd6WA`b=K2ooCEG;Fl;P07R#2<>J)UZf-yBe0Dzn3x}hFuzE;EV86oumhICb5(B0RQS+!1{RdfcfJr{m} zj%m#%+9UP?Z>0_B2mL9L%njI;Ak)iSn!}#?zaW<*yL$Yfb7kWpr=|{?bpW?MDxXpo z!*Y4=UNp1-tp%yS8buG;I zZnGz{$=u{s7l4wGica9%BKcC56Qi%9u7YyDKUN{fOsGbHy4bp=RFPjoVL}~1B!No3 z3B6h>$2tZi=_nPlG6PshSp--7@(2jY$buUUo$}C2I|4kS2Ov<8 z5W*=u=fNhd6$Fx@lGLbdcuNu$a2lnnyiH_Z0~Y?w>oRE12?+Aejvf|)7uF=nQ?Vg-E>qgY#YrbTQenJlZ+NQ6+sik)r( z9U_aN&bpeZNG^pvl$6m8RLcww{vnppVMoC!Q>-c&T~Tz^BcKe$w;j2bk+odjs5j_E z=Oj4D!jvCzPET-I!RlTLo*XkS!J`eaE_(;2tu|9@RBne}^ic~wno-VosEWA-3v<<3 zo>d`OTrl=Vz^dwKBM_t76|`irQL$WdXG}&32-=h28c8@DraUxpf%l+cpQ&(rObpCZ 
z>wdHn95|*28l!6B;V2lWmQMaMP z=P8-Ej=o1pV+UF>uqnF*dBFo3l*jocf(JRLvTR?~CL!m}ylW=r$Bhy;LUkiw z1&(TEuN>*!GFq3G%t)6}Wr=1?$uzfyFkF%5M$#*ls^xPp4~*qsvICNY{}da@rLYbX zaWF;_2VWY`!74Vks>*G#B^P$O>I3o3_d~OW(ZBp?bzMi`0Yuz$qMusab0WH^hzLBk ztx_(K3<0Fh!M;>;(Ak$B$VFDn|AAoGp$)P;LsxKe`$FLLmSD9Qxk?L)APVVe+{A~Q%qc&qB>Luq%jMCAsjnK655imssFp*qxYa4w>&7}4* z1sc7n5&Rb<&BhH1RZ=R?KZ%P*&S+ zMB2$lDpZ*QO_dcYJU$f(78qBoP&L>P33s{4EY9b|qVz8eiPNSuiv-#fEy-nrVql3_ z(L|v*(mXFH$BM>m>J_2Sk|B|RpkP)}WNpH*cSAFjB-C-~0@(C~%ag z7@>bfcRt#G9tmV$MTTctksoc1!oz0Cz>zB2+V>^c8XBpjlY|V398yW5PGX4|QbU(P zM2kP4fFa>C30mb6>ztAR#a>ypt7@HST);6<)dR&SPLQ>_0| ziWO{{r1W6~1HFs~R#D5MKRxU`8(dLr^dh4vX%vY}&;qL$3D5-r-l4tPoQ;>Pk(*7- zB6Xj@?nPq|llGJnX;~G)U3j3^Ct+p8cB^6K&`~wVwmlnF6H=hm(CjT@+VKdXzo8e7 zTxO3Z36Op~BM&L$d<%Kx^(|60W6WV)w+(E`kQrKp5Tqurfhdml7RGuLgnBU~5tEvD z!bvkCA%}=b*T*s`_Fr_(swPeZL1(}zMe4QTSb<(cCLM-vS~PUb(kr5(rKl)^z18+L z4iYD`?cXS2vTRWV9B|IL$p1YJwJ*Hsz`!h9XAjVv9^irHY7=nZ4ur`!y|sN&j5`X3rJiI^h>A#k)Y?Wq6&UO-_{kr7Qf&;?dB4~q*QQ_JWiDq2A*b8;0;g(8E4ZSleYRR-&g3oL`av;c@+=fVH z(AVEM`{Rv0nbl5r`V^->-P64)jW0E#&8H3f-?x1H(PhfI@LinW6wGK!be!MWbK=Ok z#X}NcPLcOZc-ABv=EzN{A@jNtj}y=jHH1xdtjP@Y^~ViOHfjZ4WcGsPws}X!0T(meyHh*;LvGOsN)_ZY5`NX=<)#+n*t%_SQFH4L=nTfcR^DV1yIi_EKv_Wh7~QO1%-xw{Yn#fE(KTTyc9 z%I6Wb?PHTPbnHYSb+2TS20GLx4U!czVj&l1kFxO$yRw&tM#~$deCe^YyXc(%lXy{nSxJ1^m(0cRJL8o6i z4Wqi)7J@z_uMtO{En_2FsA~uIM;WeEliH!gy!mNSIKu{Kj%LdBOI{hzl7@ujHMI(WS@d;sEY~%Cbd*E4 z2p5Zg90!=jFdb^?JfuS{LpoGyx;&#yYkiPAI^F`JTJB03h;qxp)wKm@20YD+@2 zgZ28^1nfyq!mffnUEd3&5S}YC`O$r1y!UWv9c}1HQ)%jGZJJj;OHtK;=u|oV3GspK zfP9Z~43HyD#BDUabPlnM+*@d+dQ+6xNkW6O#5_smi-Uv5#q%V9v=DcxB1_R%03>DT zj*jgU8~ZY&tun{jKJ|mXPGr}w>hm%|(7no+>0jNe(A!x?#m(Zm^!J*cO!|~nnJz9; zO&vrmLmSVBVgYXkol*J*zs0t2aKvu{p1zRD&Xsw~vD{}reX+Lp8 z6;0XDX?1R5P4+TIrK~1Cm}3B-n`&>0`L#~_3=YF{Jil^g0x_wDbbGyuxT<2gimU0a zq&ThxRK?NOf<^fXV+x#H!_0)1vY81jYQ1D8lz@eyY>3u$H}X!h>5|!mh#Ux)6piAH zj+XSgv>#;3IKEW9Hxfk#Rp&Thbq_nEfiPCvk>ZGGUfS!UY@A*I_O9{=&}0^mrExa) z`OzJK(y5AN51`Pv6?hVB9KK~yELkfP9O@yQ!!%BBT7Du|~P3 z0$EO~2HX{fhN}mL82|_l!DVZFi=!MQ*VOd+Bb+svw57^?Xob|gZfqrG2D(cLf)aT~ zlyIe00Dp^@0wa+cNmk*Ar8dR7462O{B{UH#B?c7oZR*cu09rGnef8^dv)YSSej^h{z!_mXg>2 zBqT;_H`xS&rIyqSLEj{5l-e7s`_W&?G9XxnpGl?pK&g`Pm*rzS%c>#w4Bj76F$0e#*ic70P>eQyNSscr98eo-dqfBX%glK;Ku^i+8YLb)koJMGOgM$~x z^Bj6KBaoO54K*x_#9jdnRQ}FlFU>UPYRvnn?PTETf8kO1kc7 zrlI+&V>*f&)DYXIh|@NqxOr}#f6eQ8YZ5sh3#OD&IEGpitn}hw6}kaz;#?V443slh z8VJb&a%%YIDH0tiIm=SW&JX2L-!HDONO^8(YQ*5n5!DpMdc`Q^LuGuqsOA*H60u+k zIJKnvyfsc{b&waW%7&{;MX4xkovq*F?j@CQ(i6;}C{#A)ad7cOHx^f&0MlcF;2Kh{ zbXh970f)>FIFHHHy=qWZalQ)-lerMggNKb&5FAKy-0&e$7)+t`n!&*vDjMY^X)N?a z0G^*Zd`aV@7)do0&yhXTFdcJzhryP#%v(675U@xdTY{v{O(uIR9uig~q;i0Hl`;Yh zSgrUVNwA7yS_PbnttjToY+(b2Z0!R^6&L;?up}q$T%w{6A-MFs zcrHaZRp1X*;X0c};NY8J)X*Uq%JI#mLn+rehCxpDO1|;JfI3w}G8=Z&+*Ylry+Sr8 zUVmKsgK{XbW01|M8Gu7qY}&>xN;WenO)D8Y64&OlBhKV;WA-@9XE@HXUX~kkPUVa$ z;zr9NKO$j*H8M04G9!AbSm>KLe3fZwjtZ=?Pzg;lRP4?54=1JjpfpE;8Lg&{BPym= zZ&bCznrzf+^Z;OW83h4a$&DNtxTau`B)XA?jT$qemM<|Zg@OcMXvTNqHKT^co#@;- zd}h>49TE*jJ8(9w&>OGJ7$INf@L=teIHEGZm5sUCS@3d8r*M_>i7{9m!_19ZBhev!OLFVxJKhP!5F>PN9G+j5O z8uI@z;@4xnhg7y#!koF4cah?@b1R`_)OW9hSYFhiixs(a6Cni|hIOcm4J4r_*2RQZ z1zr5!*r+v(V&LBBRm759CDaLl-w!Rc0&{B!5tZ-af!*p5z49DznH{2E1ZDN(7CXL&2R$q@6Y3)j=ZM6l5V>YOHg~m3_$!Z9NP$NQ7}7 zU0yJ^T}5ZkIdGPb9vE! z5hGaxp|(VP3g{e4hrA@7Pst!a)9v9~(I$W$0Tuw>zbQDnqIK@rTD8cVNYFaBp6ps=!^fmKDWNs-fZIjU6buqFw{IK$EOADSC&rL4%?R5~`D{*sC>kbfnjO+tw8~ z1Y$|;G-zo~9W#?P+lZ*=MAQ+Ce9~lMgn<%e{@C?lHJy!6mD(AiY{}*x1k9-Nfz0dH zfv(=vP@}TsUm1#nmHP^Z=~}ZLW0KuwQMb1y)sjiC3*vqQ8qlN!IASJtXdd!G!XKQ! 
z*CxIc`;(WW*-aWeaMO4LrklO;>Y}1%ph~8piHQn>a;h$FznyHv+mN9ZM#VL*>O9mlbV$!oH1%T;L7013FRX9nn79!kgu zA8n_kemQmw9hEstThPZW*GL8b3iEnn7`L*4T*4`-l$3^rGE7J@v;&Q?lXc64F=v#F zGRV}^C?P{L8c_jD1Ffv6n2JD)@@!nHZfeLjMxGb8N-T9WNPk~98pYxa7thz zgR#+v)~W`2n}QT*yXYE@0xManp^03?!gZJcgIw*UR82LC6CNuP%dtK=-g=2_)P@(~$ZJzKkiKQfBF>B}cZbT1C9x z;inkNw(2Md-Pf>b`KG{^qsWg)HksQhS1O;{#40ur1HrCOX?9dllT2o9owX=O=On12 zgv?0!=m^ONm9s>_jNn-`n5d?G43O|cR5N0mK(c*2&&}3v7I<$ot=~DLgl$lMqoBeJ z-mTaM+)T8hy0=VSN9T%?6(IoLwk4SJNdyeG)`}c#v>b(a$_rWZusmx#$MViv;}nxC zFvc;e$QVb#H0oJk-6zVNp*mD({TOH?7Bseu3fn5&Y{z#bC6thP8p@`p)KKeh$^mQ9 zI)~3YOF!hx$LUa_~z-Lq`?hf+&I> z{x*^d++qdQTAJw~(Nx`xyxEPcQfU6(pflBml9B#lvSHl*kgAl2_=90cf6xV|Tw~)h zap#dwR*Q{0qZ1A=ZYh{&Oc zcJAg(Tpy9oZj{w%-iBKq(C&-+zX(%n_vRb7+nbjU=UudDj2qr2<<-s^dv z*I_tg_>2*Xa#bI44whuV@C1#TM|mX~;oN%Eog!>oYWNe%v?YcFn6~!@OENM|BG^r; zSE7n(O1Pyo8tG(GoEc-ctJx6H>KxDI0BIr)lIhI%<_E_tn<8*^&xU|RZxNuaC{f(I zXL2D?T9AhmXm$@AwJ)bZ_fz@lJler{{ z0Al(#g@U(LkEU@I&gi)zBvJuPK0?`}Bq0H3WE9jqqAQ(J9|DP0VMY;Dkt!U6yil2Q z*n+&xTCpHT^g{W(%h)wOG_)+ap|7jEFWIpz$*$4f-x-7LDxqG8N4p?DEhtcL@A}h% zJa{7|Ev)fMB{Jf~#3jK(r-K_oh+@2oQbtOF;W{O8l7)MVPYuonY_MxMpN@d@Siy!%<{+nz=Yc!Ktd7PW!b^x|!>$ z2p(i#i@= z19SkytpS)`Uf~h4@o$Mh`X2fyjMYk#SzJO>3bx0LB5)73T_leS?kPauyBzG8xzfJ{h_4^7s*7o|D++gB&c5kiOKoJ4yhx9PVa z%N-RGCLl7!U-hVkkjc>LwwtN%ZAH)@)&mU?H(xLjXyU@EUjZ^IH>;$N(ZgtnG|Wuz)vwsd>jhr!Tk*&Oxb}g&CPDE~)qVhoelV{tvx{L%LP>5*;dNhEBK?PRb&pwvx zGsFb<5fT8+9ty6&!tWCkzYigqfb4R51Rgut5EKDTgH1~TyQCZ=61Q08!5FTmy11EW z(lAK|Q&QO)RwuK08;y~3%1AR(3(_pp#}JIl>KIh;Dri#tB%X-i>oBN{?&OfKkOD7I zr{sx=`+@}?SY1uyBe=%0nNic6q6&&`>2fMY*LXGveD~H>Z5uk;HiE&bE!n+$hcS-6 z-XLD0*i|ZP?V8oSnfGY~=<0`6;n z3m}-NLv1Vp(;GTiM%ELw@eLuw8bs2lIjY_>M*jxh${3P><26Ih%x$smDPg79uoLyS zLqo|e#!f4f%?&5}*7YZs_6_Iba*bv)$m%^L=K*KQo>-rUukTnsc4V5IVEc{+};P+J;$RpE*RRpgwv9MJ%kFnr`ACh9eUeUx`Ku(1ePTPb)2cUaHCZ$=mXHFWgk=IR7oSMZ2A_4h@rwnO zZ>9J}MMtJbs%Xc81`O3Fk+dsc`%I63)>#}1v;0Jmm`(A#*P9)S+X6x~tbznVIbr+% zU!a0SkXx4(DGGj3l`YLQRWp+Utc;thcB~UY>Kh+n%?l2gdU&c7wXbUuP+Sx*BK$?U zTu)JPs$b;R?zWH}qDBh`*bo2~pe^qu;pA{_J}2YMo;vjD2qy<4n9=Z4K~UCj1wcwQ zEHH}}iegi_)J}G{3w+dwo~)<>SP>S40c0^*$idne9e}~^qDH83MLD{Z;UrT-*H4Ah zW9-L$P#qVQ6F9{^l^S~<5b|QuBCc6I0b1GO5%LWy0ghdd>R6Mi2>haGm!e9audI#L z<%GEaqB4TFnifz(Ks~}U(8g&!Mnsgy1)ZhE1Zze{YEewU#lwL{+S59Bd~kFaY#8f- z_5yU%=7gB8m{YJ9%fTL{emOo~EQja4{!D17;;>$9du2c~iK-&NY>tw86@ge3LFFQ* z4K1Y$bKH$3K=Z1h?5^5&F5}yh9h3=LH#js@iSLEXWlFFnH}1#*$`Ld*+^j9u)Crfd zu4!I4M+m4@<$ePNAh0qhj!6rAicy9Z_rdIVJ^~jM8W=*pD#pFwz@i5!C1ZS+4A88E zWjvKcIPN88H@Gig8nbw`8=+>?iXzmQQEGd$o~R`OYE1K_k&qY)&vPyk+g@O0y|aku z=(bgTEG(s`G(kzH`HUbM+FU1it4%D-X#~`^e#^$bwly)qo@;M>qbLr9`~I37S&+K4 zzA4!i7EJ|UBU#_~2V38$4OCP$0_;LGwTvQKrmUw>9yr%kdhhghBF*QmF}D|2yPx7N zP<2GGXve_fRjllOPA+O9#MCL&jcbV?fG!n@%B->~mC&6)(5b(qhawd$8RSZb319;^ z>CvQ`P^w7TtRh7U+*9W60bFM~y)ufd|+( zYK-7ZJvzOLhz9mp4nldplB`f~*(*W_97s}8%UHePYM?`p#j9v5i@{Rnc+|LOV@Gd) zjXP%{n@g9i)T(Ee!4lznBHK|WnFClN^i3FTNYiSS%z;VDI&y>(OOa=6G&hD2OAY<5 zqp;DareqrEA0!)!g<^#m_26ts=4>gI${Lr}Dm=)dW|4&Q0wOY%4OWmp$Y=!PXRyl1 zYu1toqriHtWU7L>gZeCoRGpB}kU1U|d?0ERNFtd0LFq5@DI*OISZR)f`};SZ*3r9V zGwSTNCEM)+_?qMUWucATMkQ*u1Jb3_9t}e4^#Dml?T8}7@$!>Y2`o_|m8sXU^!SyS z66954itA`r6jcs1-QlqjHvjj7!RizyE*iZ?2?&XD+K^RAPlI$(Q|35pf1ClK!sd2% z^o3G3YuBvru9U2C(mECD<_Pj$p(;lM4R>hxlQBJEA(V6l$SxdhRh3B4ykQ|(2~7_! 
zQS!Ff5=I-3()0=@1&J@#nlR#vQa8XALn>P0Nkwwy>JZY4+E^h(5OEN?MXam(QWXoka69 zl|@rqE}+{y3jW3Fqep4cOzC_kElX&L!|`*KZNe>EFNq!1SU_dIZDU3GtT)pada<&T z3f&^{I%zFWRt3S%umW0B(-+)`AaV$)I|U8)`^3aG!2+uANQLqdEMFvx(wu1VnM@j7 zT5+O+t4OGLrp|PR#i^TPO(1P@Pg$It z*5gNDlbQu3v@&Dmq{5Y+mJF@Q!7>zU}1!f0D&liWHKLU#R|NxYn&DVrC?eH7%4zt z%F7aHsp_Mirv8jp2DPhftp;L)W(n257U(!aBO#(gvkK~VyE3|MFdB~LtQb{ zBLM0KAfK-jiJs+P0w+NXVY;FQjt{9(0VGU^tiuxUY3%O-`9TV6tZh7XPEv-M>N z8&*KbM-e0FOU~pi$*@wAYI10AFvsCTJvJ_3QfT9MELBD{SfRn(0U5C~kSYPY9&oFq z#ypW&Q8Z@+F@2rA|@68DQ>$fL>O?pYNwnRdfRm zDWK5DB?1~JsE?Xug{B0BjwJ8&wqS*-ONt;y>7Oj6;Q%YJO!yRM0E0WXMRXIuIa`V} z>P*0I=`Lmy`ZmZf8Vjlf`X;O`p>!!4K~jxbKrxHH98!v! z%qY$%*a!iGlw|`66-!DTEHNvg!LPA0b?eP4G>WS)bsB8$0%xnB1G=9!73;K79AZFf zXk{uAxOrmY(aOq{$*7Kaj}jr&t0^Y>MR95dt~)N_HcsB(@)j^iLBhun#3!ZD@hf zjt32QeA7ax3JL2c*D{cWg75G-T%hi0HOK}lcCaRiSOcqvGjWQZwoEN0WaN~g)Jfxv zI)P45pNuVoBWYsaZtplS-~=2+Tdw`c11I6SlqH^;V7@B}ni6W{T1HOg$z<$w@rVW+ z(@3@oxFJ6!%;K)$lyEIYP7|u2AJgpEaR3uHQ^4RviJ*Q(H6>>ZSx%>{R5}%BSjq#V zpGDli1)XXbYKd~Pl?5;w&K^ysW`kG;()$7$#FYct6DGF)q%v&k6^`vy!TJwIfB@d^ z2>LITF%?rt%dpZV^m&WVv7OE0*pnRu32q-L#9A?NE3XN;1Pd+Vr_{{N>$3tuKFh0L zc7+g3K^D=!r+UFut&D0JqHdz+v6@z5%&=A=m~~>978Ihn_Gw3FW%U>Y?3FpENzIh` zX-%4+v&lXMv!tLZ6r1rtkr+or{b@o#E<{}p3vN(kG;W=kxHVYxfQAP^Ap*;$f{JZa zxuE1Mol;aSM$wxM>|$#HrsYd;y*19sI4xxO@{ryGO36%mk<-;d2F2Gw2nWyq zT2PEaDuE2v1bh!f+di#Ux&%N z@C4s4NxTpnLP_R@O$$oBut{N(FFdY*hG`*XR=2t5Lx3GLusd-PqCc9+sX8wiN=CE{ zQ=Q|CDE%G1-R*_-GI8kDwVjiV8B05*@*o66fp-OItv9_Ih#Z8sDIpQyXa0&vo`wtr z)mE=cAabD$Y$!mIV$0~g74@pfFIQCX>Z5)(LWy@;MNN{Du`;}BiWzl|+rORTxg6S- zK}oygo<)69S?<*BFwwhdT`%e-%I824;H|7{sre87yj&^y zWj-yJM#w~lot4jy=S-_*Wy|WEkugS$6+E|GT+X$Dzg^Q>CQfeYv+|?ak-XJEcpjz& zr&Un%0WAv%gpb>b*e>NuLSP$cTCFN*Bx0GtdJ;-%1skIDNU+qx)WC8OL25bh6$`3q zfqq+8&AARys-@JHq8Y}ANA2LP({ZIpSa~O~i}!5c5Fia(jQoz#A!B4L4mmDqaP06O z#U;mRjD{X1zUfi)3av{y)>NIt4Tq9r)cH~Lsj`Jk+Lsvff$@(Z#~MoWOmt69nL5ig z&v~!5xZ^||CUSuWE;W$>2PTKMf7TS&gdki&K?z{2eySF6*svm6Qxh3Bip0S<2qmK^ zYa=*tu&&iO5e-eFRI4IfWy%%@0ox5J$EXWP%pHgM8Zi_|=sQ$T5&>JK1ePN@c=8Rzi-IMaxJwjI#>zei|D`^0nc`E#mXDz+2XV^-2CdL79bB4+<@XGqk?oI zHCHQdTsnL}daax>HlDMROIIzc%;Ae;@|ZY`K=i;)VG_)=^sJWr;24w`brSCDnoRUe zg_Z!8;gajJ>G2`UHzw6#3=JjMp@=Ri6}@lkMmp^(%Sy9Sn~k~gkt*tRq|Wj(FX3Uc zgU;1vfJ>rTW$Ne3pgvI;C{p|m>4BFU?+ zO!&>#2h>OKVuaM5O#w{<6n*% z3VVeTtixf=?V85A(9v9#aL1vflChRgOnfrfM+VIVnuSD&M|1Fz6|g0ym5dJQoa22Q zV>8KV}V=@qKARpJiHLo5+iY25UMiXEFgRI=fPMN*RSK7tr}JZ)L=j2 z#i&jkH1jPh&u<|Z$v}D>aRvN@Ei1v6h=L%0uM`c>!bYul0!WkuoHZg)W$|S<;LK(P zMVu+`93tW@v$h~1MOT7virCfe%A(6J$8W%^&aNAPP_~;NmzO`g_py)s@kNzC)yp5R!V0Y{e<~<)Rn51~N0U(@6E5FV zQ)3q`+u$qW&O)h83Ew_s?}cF50|t_4jH{PD&{80VI@aRy13hK_~ zvbmO(Bj5o!x#e_HK7AH+P#!w%{FZEPyD>6&9$OLz2BEiFV6W&1GNHf^e=?uO8Ph53 zAGU@E^43T`n**J~@}ezF$&K3jZ$@>f147Q{i80CTzUah}3uDNIpM*7FEc`S@5#-@) z+8Uw|^81r3mY<03imR8OxFXrUzAJfFOXaI2&l<;XVApGouqZB7!ZwymlJcHQw(ulBSU8HiiiUKYwfK16Khw(Fu(e{_ z(P9=+_!1zN$j2h{O4939#ynID@}J>k5}lkq5{ct+;^L?jO)HW{^2SSrk(CUW#HFBp zJ(8$Uvq%w3K`2So;28l5c~`I`>gY*?;>qeIF{P;n&{UbUlCsjW5oakv;gejSdbzKF z8LA^^oMSOkr~~qj15W8`Qmmwo3tXb1H9bBlF7kCxPLgsxAb&ToOcyR0)vEiJ>FO%1 z35Zr0b&55jM3rJsk<_}|1vYZ2%wmeWt6-K#Vo~0QZv3l#7NMRs672{g4?{soOp#%NYG`K>6@e3#H0=1Vd5qDnqlq;}lMS7Y{ zgc3b1tF-o%=xNOJNImWx)p;S+G(y27^uaM~AT4y1H%H)El~R(f7?viYjfkntaW!4w zrxv=(E0pD$E=Mq&E#{89kYy&WouUANbyTwN^<2$OK5+0nD~Wk!U1d~|N0Wd5p!GBv z<0!2m5vVBVc%IP?ON4~_h`A0%4kTj3in;73=ZaWrkb|J7)FLksYS%rsXO)UN>m ztGt9N$gm2YU>-2Zca^Jim<*Naz^DPA6~#Fg(7n(=6~C2Mm5em!oMUUqGRKm`*_{L} zF#w$~wp&XP*6M{@d~%YPmz}sAM1sjoE<229*NBS|F?M5)uc$6d8hI#Z9Dj0hcyMHD zTK_1D6Uf*un68#eMLQp`RQ5u+YrWTnP5(NiKZ4C4hncC~R 
z7F0;(S$fd@P{;j-G&2Po(CW9OP=TU)Z<%IzF9tt5*Gf2CZF^FISOsnJtI`0B_@nrPZ@)bK2;?`pt4BVAq{(vNh3b(>R)VqRGF9D4)y^Ukb>GgKf z`-E1hKAK+a6ogjl(c|jjvR(vHBCXEW7JN8-ct=e`CllnQU>8{2+dc$m zXRG~q6gt{%Q~gv$eJ&s>4n#DW^bHLL|9clkK|)^LQ~6jl1bC0tzFe17Z#kg&L^%!8DIvT{^RBz1#&d0vq8( zlcv>*T@%etb#RV^ltT%rbY#5O1S)1V`84u<#jP1!T;#dLY2Y&R&%GbEXz4bsY{ssh2Jya5=Tt_0s8P zJzlyB_&AWbA=eUgN@txbE(}!a9M81Z;8w!B@)Tq%*60MjDUwG_DUwB>fSF&k%ts7)qW=la zHzH~b(wYyJLYUQnu}4wqmTXyyXsXns&5W}XI6)na!2-N_`BtPeZEqKJ9$nwn?e^q8 zu;yxDO1BFveQ<1%6=ZSoWKAf3fZRYmG5Sc5KJr-7R0Lg*mNbB-uvkNCKlEavRQN}O z#SZx~jaMU7K?6db;xgz<0}2DQzR>W*$Alf7ePUODU#oZFBU7{_apZu{YSxTfM|)9` z*ZnP20g2u^6i2EzLFIKwag@Q332_XxMbY{Jmb%785JzxXi)f$*77;Adpu>(&96iNV zbx}~cZg6CL%tE?n3bI`k2y-yo)$#AScrGH)3#QmxO^t>7M{~xW^7$odmW#jYNd^wM z$XS54s~G+j5uwC#UZ&6j=*y5|2-C8}8U-$-$QZ?pAY-(^K6RZ!?DG1A926Z+A1bc| zd~^;KZb^RsfXpM1D{h@vUz6OTmWVC*awYMK@# z))a&)fqnXkHDIZ-5LUU8fq?AIe+P;+&k3?DDG^|w#1@j0GNNH9oFzy^G!T=mTU2K> zTdFgrg}=aCq2W2LlR5?OfD-@?eT+H9ZBSC!p-UAI;iyimUR@Vi({?$>$w_jVmjW<$ zK!qL1{sbbv1>_i!+K?eVUf)(lP81}zgtQ7F_|)miBl%FFiYw@oAWt^2NcPFva3y_e z8i7#Vy{=lPYk2g!tbmY#mbg%g9fcUA*vryqGB7AF0x*KErCb#G&q$JqR2CYdYv$)gc{%XjWRJ6M#O+l2@-}?o?DJ;2{S5!;I)ZKO^58JLVx*}Szt6-K#JTWbl zV8Z9UwW3-kq14g+MiZh~+Z+e@mWf`8vWgn$IQushKbpiU1=wjEs(tC3^=M0X%`Y3jAfyjYknmIff8131$3Ysz5nKp`ZeTfYO!TpfOPc zhSRDvtvFeOJw=wZyImlG@yqEI!E&mLI;TXTxm7e(wo;OmGO7QYr5;qDezcl}a?0n& zz*w0cH;Mm|Wo-ySIq5J@i>NB)MKy}qYg@Q8pg(~NKu(gfmmaZ{DOWkG#j3=pTHNka9#gS-?rc_@M-QBq=V|n!Jpx ziYKycPc3pqd2)qIu*5R(gFd(;+UGHL0a1e295xz7r5P}LCn{nXVfLapNJAk40p| zRO6-YwQJUQ7tu<)ZNR%j^g`?!%$qxm+;*#|^OAcyKbC^GA%wkgM{Ybnc7n|mHeI*k z>Hl;fIVwQw>?GD?v4XAr3)(tOb!GJVm}JefcI)iRJWv`j6fHnHD=$X^HA`0f+f_(7 zKCGEwH=8M5iO^3D(XJvBo=q!Sg?ttvihQ{XlWXf^nr8umYs<`-P%bx-Thjt`O^`j0 z{!UQ)Sfr~bE5h=F_U4w(nHB&bv4W^)!2&c>^qz{^wy~ok`VnQVwxmYMH`kuuQxIVj zs49{CcujfK0@tI5&7<)+^2H(JVIqBqiHS=CWfM$+B%cv*AZf|q(*Vt&*2s_qMXw8A z8#ghdHi8qGUTYyFy)HYJ1<~g;VIoDrX}IWPx*rH8GX8iMDslsqCUw--IDhUSYP&{} ztT1ULujyW@F5w6iDN9)Uu3~f}(_zucybi{ru!s=QW|KTxA>qgk`bC@pZ03k1!_8#adX18HN7U{rSA6>>V& zm3Vc}WLM&`?C79rXJI=xtlx~JNE>xFM43SumvWYwp}YiA8+eQUW+-6@!|6Q;QthNs z1zLjY>%vJRJ|Lv=r-3mlNdz1;lG;N`n+J`pm{O?=1UJV!KoF9NR2}P>HvK`W&ay$O zPN6M130w|}JN2M!q2M;hhev~s=r)k2a>G@2s~MkYPqHwmRe9DH^!h0|A&V0^q=0my zj_lZuq}#+iA9RNsvtwZLZ@1aYl0AFUxh&bNWmYPJKnDV}>n!*Mtbc$V4_*&RMt>WE zWO>aCU@fVRA&ez2QkJb`O%wD~odP3~f1}v7mC*)d;aKQ%Y-EN)dG6O{9z#V|(*rV|aF9PI|KFWti;BjSnZO zD#A4K;O)W-qSucBb|P|WSw|qU7hWf@6C`-?w3RVNLiawer7?nQg)&3nc~|a1#gn8G zAsf0|RUf5BpY*gMXL!j1`6)bu_&o1oYZ}*KKrqkV84ZEWRFqm3+E#b;^KE6l0};rB z0F-t4&X2VZQnNO3g61oau$03F#cC;!=%0i5LP(*MRIsH6k!ah3se*t!3T;M4N~3C6 z5=||QD0e2#Cam>LzQ;ItSV($R4$YZuUd!p}1sQx+hf603YnRnhMCw;nmM&*1wpJ$i zR*p!YNA&IFqz``OfZ6ha5erHZD>95(+`6BcjNn}O9+0vodktj?uo>8~MQLJtVzH_0 zlPE@%+G{NPc9;Sf3@}wJPN5XAjJgTP`W->$G7v3@iRfYkat_JjGomF~XgY7994*Ga zHnZ7Ww%Ixgq&D{I3B=CDvL{mPGKy|+;0dT+L`9iTrz}xR zA^i+W$QVn6J#nQvj5c-k40LW5qye2D=tuL0WM_NFS|F~v*KSPq_jY%8RZc|@nb`r< z6jQcr7`>P$Gtyp+Rbc3-Y|A$A>G+_K8Lj(tNE~2{csBcyYC<4<(9E~2Jildh$QT(& zkLR#2JJPaJLE$eVoBn!HQIJ0krxL`kFACP98x5(FI(~4Y8Ka>??{|Uj+0hA_+ zGH{2XR3ysziP$-y#!y?%ljou6dIa2~z@^fYRda)^n6Oat3O|Tb5ivZFw?^{W+`vw1OW*)OSTR%bU92OUs*9^;fH8x-`-t^Q zXzvM35g>sTrBTuZl|xSuKMe@<9vTD?*eQ$>L^V4CFr@;CB0Y~pvWZ0^=|`KPBL7n$ zs&2BB_IXfQsN+MBB`h2n2rd%Kp9gA10KL$uijO3gi2taHkx84G6rYOIaxHwC>ytTW z)i~%eDl?TYi`+di#K}v8zG!QnO;elmN!gx-y~-gTN|12#kY5GGz`i|`Kvk}wBQm5j zjCA8n5WG_9MjcIQDWW~GNV-YYmlPt}v)sL|Y(s!OqRs2TRE2I?j)j1b5Z4BZwgAsu zM2Tbd7H?W1?u`sOl<}~5r5K|{vpKYDSM_fh97z-Qp|d>>_yeq_va#IA@CTNOiJKcX zkaPn*O?ky%Ze{9W9!_X$89NH=lENMRWku-}gyrbmfD#@tf;(y|uvd6fSCDi`qXr>H zb9fe@p$kCJa7LX{|FJPEw_*+C(lk&JRA?6$eWHL;VPA;!RBbB&iwq>K3YCzGnAHou 
zDQi!1eKtq%M&lf-ZG04vDRdHZaZ|_h2ZCB;6}waiSX}KeMJqd9d{=>ejF4ePKR{M~ z?@7;~)W06%!Kg@pRUg)@24@#YU?(Pi7A)RCH|mNUAtIat)2y69`9I=XfN0AHfZX()_^vkn+g1aP;o5GM96Mp;_6_@ z)>IMajzA&J@srq?fzuIh4%E6(Uy6OCThWnA3}s;xp@@YFMTQ3iikKvQ&5l^hY9fxZq=Vv^g?l}!5@6kW6h~1)sS(lC zhEkO>1&k*qZV90&(E?BrqlhD55sIlP8d!1_HC7z)EYOr_ZQDJw4R!(br~bNewd0$1 ztUpiMuwqgOm2-+BQ^#trK`Yj6e?D+C@nNG-qTQM3nTn;rOlx8yki`+|aAYyV%bAoU zSm}&vDsg6JXMvc;TngM(wiLL1d@Nfj21T{C2P=SUR{@O75E5_2+;`JqdZ1oEXjubA z9MFW+#=*O+@d~mDAv4|ZQ$49Lknxu0x!JU$MA!k;1M3KfhxQ2^Bjn;>BLs9=uva1k z;$-wev{GDJ0M{w_vExi|&w|yhrar-1@(IN9m4yeyJ5qa>K;$(zyejg@Ua16=ZaumL zx+DWE8P+4PtBo!S6h@V@0IC|HfWcBlF=J{OPy>sGtQzq}#b5v*mLVyL<$WdKDn z>w_o*3&M<6;U{f4$lB~KG-I(P(1C`CObC;m)=++MJOn2mP@Y9)vIOEvp3vWetV4w~ zaX_JcS+ul27=!@SPX3$Ov1kHmEyGt_C82y^>>BVSPKNjCrx``N3Lp%0i)6JJCXGGJ zE_F$1+-(4s_RWQZfRWQqg)&k&}y4iAqZRzM+T~AUMd%Sc?IE&Y(L_`j#d5wzs{k;6 z9Rjq_08s%!a!Ao6?wCLzr64aC6*L7!1cpX1Dxu~+x?x35k* zys@wOx;{xjA8L*G-Ml^}#Iy7WlN_fz*MMSZ>CUBWN<1q z{#9hjnBzp2lkuHe>rH`b;*np9g=4n%u^z1Q_)qm`ky6EaRGT27K}{kQeJH^MOo{~a zrpmz{Qk#gp!dQ*~ah4PVlrol(mQtdj88OmG&E{Z_&BSOkOcgnt$&Stfw1(#+Mri&S z3k##a=UMA05HzKr`8P8YBt-q|57>xX3MHrz(#ryc6qS)^b0SA@I~7UQWHTj#{mIb4 zG7z8J>3M=@QHx%$kggd)b;4*IfmyU$E+3hV%=`o~t+`C%fkei{*R+*qyG#b)rl2@s zB@l^ml$52Y8aP}84o_%i9euv&r`F~T!wlurBmqRmWi>%XZDM!TVLDW5r@i*tI1cbH6~~rIIWD= zN$I+fupbiHTIEryFk`O^He*#?C1S|xWl`Z#L~Fv1)wnb#>IZAx$IExHM3&a|Lj~7560>}B(1k83-85{~yb>c0JQZr}pmWk9Rz}oOK(c9yom+2>- zUN6&Co6{Z&dJQvDD55MvOnrqE87=1koEWLb@^*8ORh}`<@Hc10AAxt0zw0)K$l&0z_9cfKn5zr_O+!!&2b!LM}x@qQ1 z$U(b8cM)3+`J`~xjOYO|2_7Vdz6B)E1NCUE!d4YHvIjx|2xO=;kOUpGE$3#r4U2I)jEOtKqZt@roe*Np_yAn ze5Z1g8iACl4u29Q2-Qnz20|)i3`;jbY{MHd24QD|GVR$o({wD*rMQXLkSe7; zdZ0XL@9WUc1vL>Tg{=Z31~Y_x0>JD{M$uqWRU(v{ScavffoBJn&Z^#LGNHbl)rRi(v{86{mf;cr#SE7PA5gM= zeOGci;RiEO?WKUzjS#e6Rf*1M^=Qdfp(;NgtSV7}NtJumOC|Cp9K3T29VHAwFb|eV z`$E^f*w~n1>@w$YkW8~&)&Y!N(Roc$5M1$!DyUoQF-%*9s0LJgY0Gs^K}SVNjUcL= zfPPNEa-;=KHl#Rx%~@sD#*dIyd7_f#%hQV-PAbzThL zxLBZOQi1f9wk)bDi1?~kn`ty~&mcx@SfZt8gcykn4$5MoQ;BY$4zDv5a2eO~fYKu~ zAdR53l98_SSBQ0qoaIr6h0s*UFf5qwh$HWs%2q5BM^%zQc3HhR8d4f{-3zLqpb{s}z>l?B-t&JU!GF=`*XrKh{EJ6TSIOYnMVF3B2nSL{A? z0MXkS$gBvefckx3Zxoe`^{8XfLO~CmSP_U&Y+~XM!8#V*hlpUPdS?{W37Vj0gtV5S z_C<3%&q>|6R}E!%C1LZLRz9EHY~%)w)R0wKBFn2*1O4S}@bFglIzgn zx1x%De{#k06F2V2jpxUfpSWUKSzI4sJu5a4LrBn`u)1%lu=6Q|@YIwdpwE(lYMU40`HdGOF@=TrSAe09-f3QPJ2tH|4BkN^c)w;wsg!NM4 zp;o4QXDU(zSp>ZG>W$)zDI#r(g3lD65_t8Xhq}AQ$x~NfGPKc#*9_W7($MHxmwnMj z6cesP1`}SAB!$9V;R>>O74D=ig|cAL<``tX5dugl5e%GWM%Gg@3bV}^`M^B3>YTM; z8Jt1k+B$mq*eHc+A3zdyQQ=kD7n)nA0*MYLx`Y)Zlsxii^QEKoGn56Z!GskeXiMEl z872Z%DK(Qxt6Gc>rN=6rMHC4Ro`LEwdb?&&8AEVHCHn-gv89a|g#s>y*PzxejzzHLT zlg6LS(;3q#zesC6Tdy@1O+|+(E41}j@-0DwN@)_}Cox9S1q=%2yOKOGc9I4H4rfA1 zBBl1GELUQDzbYg0_9_n)Tf-KDMYlrCkV#pnV%^Tzb%1u;4bwhezFW`T$*%bVRoS zN^2W-8#FcCtSt!Tm7K5ubIY#TkwpvAjk)oW5tP^D123B$Glr7wJFT3t9SuzPq;uIk z@U#a~auX*+IJu#(tGmyI9VgrSJE1751PucX3`@8oY8d)i#(%zW3qb<`e#jM0;%TC3 zptV+dAhl=A0z?_ns8ZB0n^uGxCZE3qHB7`U?mibbjAh=7ld z7_Xm&fd48lU*hhS+pLsoV_J z!YWSxL*MF{By0w!78bq4)33Q2SbBtcPV855pBRO(fzb-`ht{xb^uYHjg5n}QK4iHTcafjb zIW#_I?nu&I_3z0;Z57S3E9OY}X+oc6TKG`8@;10iED%w$%hON4k7Dj1RnAipxj%`i z49=vGnq2j2a!{j|l;6>$AWdGN+$-sE33W(9STmY>WFTM=fXgB?8c-J$K1FCJWu!&R zPz=e+s7i`c4?>q~U0GoFc@r4Na0SBe1ELt7fl53mA+$6kq1bJi<6~G$Cu?VU9VnJ? 
zb~a5bc5|_ZNuq0syW6hJVcRvQ3#Pcc3TAmMY_x?0sY_V48PLFxXjZ1BV6i+!kvUzD zAes{OJ-0HVjw-yAsL%2K#gmZ90$aOs5qxx73W3_8zIKF>N53dVkSO@$5k^5`Et|qP zy|olZWPwFp2pw2QD;fynn*sytu(`qatwqRYNqi<{Wn{rpHIDA&*mnh)P=EZSi@5@l9FpM`-R#u711QbuZ6NU^8p|--TWGb z7YDQB*qnwSG#Kd-2Zj=>90mjZD6B1kc(Wz3C03#VsPLsU#tK(&zo1j^hFh1F0`qx!F_ zuea^BLmwsavPQspUqwyMc{ri0rJ~k=Zl)U!6~;oR3<_iYC@72t5WB`>DT|<|L0>JW zR1u{NA{-EPju$f&*vj!~>S^IKk@3`2aJszK)sU%)7O4WERFbhnQ7OyxXk2K~ z%E}fp4Jq-rzJXM(L!k!di^0G{3AKW}HH5nHu*Ulj#eZitK0UuNLH{houEZM_Z~f8X z711sSg`Egc$G~m|I1p?HbI16|IR&*>Y?8Bgd+L0^tiG!8VFd;~5H4w0@@Duc#7ifO2( z5}!3)1+K$06?ZkGEOGri21iFNWPMl*0=p?e2dem~MFFh?1(7k&y|a{5@!lY;cctdA zAk(g_i`k;Z(n=-bIRlLdWEnic{<@f_+R6GeuyISF)G#PD5{bmq^Bay>mzcMp`IH;? zUU0~QH!R({|HcbGaQMQOo@-9I>B0}au36ae^!}SZ@R4JZ9TzpX-~7Rg-?+4I-`0n2 z{?JF?#HAj)ru~)=f9yCd_rlW;-SUx7y!Dg|7cAZGmr?)b&Sw|(^L6=y$u(W1`JUHWNp$B*}I`}XHPcAYx<++&~a{QSp1^N!sw zzwnE1fBq9UynFAwBi`J#@3NaZFM7kWGalLZ$y-)m*0Sc>uG=ra?X+uz!Dk-1{ff_T zxUu8nH+SE02Ex3!nPpnGfV%eC9h}xcW;2`!77=xK(#v zbC3Dh<;%|e?wy~$Z~K!sueom37q9)wIZxj^_{;Bp@w%^OpMCh^<5u5w{nzp@{P^}W zAHC}{U*Gl8bB|xQ`b(ev=J~I@{NgVk{n8Ep_z#Igj$GVx_l*yIV8I(tJnON$Z~FE} z79M}<^*#68{GE?Bi|_sC$L_i1(N82hFIl|i-di8PVrl;!XB~lWHqm(0A~|u`!bQ!8 z=j__&#zfQ8^P672?|Axu>)n?f-x>sVz06%U>elh`;{(>jpN1gNjvon6_8}@OZ{hU3%-yRZ&I`?siGycfg|8TSYLVc4xKH=Z$ECk}>>+XHJp0A%&h|#< z{8u^S51h~Uh;u%{*}vDB|6ON%z#02bvtL~2e2!jc{uiD1FMYfH;)B1jZ{+O1vA4C8 z?eX_L_5r#7ZQt1cZLqgDzQx}Dj&q*Gx9u0le%C(1mz>Z4W9L3!=X}4b{??wi_}nJ; z;>r{3^WEUgyWaWypL4c5d3*j%_K>)0WI208I?y>ZwEL&FkBLEFPK* zzP*0og1hJ4kDUu%I`3Qik7-_T&QG3rzigWzg4R2%HFB$gxFFE_4 zcE%ml)Es~N1pCUI^Q=9~_9ha&&Up6e_Jy>6w0G=wuKR+sm^h-5xY60)<9v|h(f0f& z9a$Q6*1E@>`+wy~`~3$Fv&ZCOd%W8@PIJC5N7NIewx}eQIx_W=BO^PUc?o-IOuVwl z-m%}AcjMvCdDq(G?s@hH^dkH~Z&3kl+^W&3yaC z#JuLck0xH(i#O@7gZ^3Qe0zy`$0zn4o47KuFmc?68xk)yB-q!eHS{L-zU|3|`H8)! zoO&n((Ej%4eE~-%?#9lw>q^)e=hL@Ke|e`e)&>^FEwNu%&uROdZ3%gKE@lF}?HXraO;reR%;BQ}0)$E%5HE zAg1n9w9C1T<<2NOqw}jJoTogo+d05;9^;1$d*TP2-)fWd*tm~xr}J1GewTgVKRCyq zoV1^#zjGdw_dEN~cgCah_IWoQW?vcK4EuAWCE;8?aq@}w{@*&PdxbOaSB_XLbG9XC zJi_@Pw>sl9&N$+X+noC!aJGN$9QQ%zzV3JSZ+G@PQk?jK^F`g}?Eiab9CpTMo#WnZ z+B;ko{YU5hPq7e5BC%nwz2kXj!fHpnE_CKS?aVu#J&%dR8P0Xbo%8?7SuQ^A%sxjlJSe8kM>}G=*%$(;pO2h+zbkElcOR3} zrpBhVjq{j2PD|9f~uVh`)CJ8YdjVQyyIquY0|@yEv+Y`pHct!#Yd zKhtd7e(2e3-22cTHcmXXlZ|h`_5ZPZY5Ze58+V+!m5skU^945k{7xcaSZ{--WC*!Yg`LbKrSd)I!Q zjkn4g8&`hCw)g*X;TzfZ(C1HLPjw z^>45FJlpup7OQus_p|f-^-ISsr1L&}&kvam@^9a7 z9Hs4tesCUBl7D`9n2kMm|A^i9*N%OWDVyg{9AZi(_2va^{Lp7I>^#ptaMmI^{@%+r zzLCas<1OsCXI^@$nYM3P*2?DdAAFk4AGoxI-FMS_|B2aVZ+f4|=I{TbZL2+hs{MKI z%$&>acj*iJ*t~CLZegGA``?nD2cZNshl2mAYY@S-Ew`Ty|Ezp{D%wl>4gyZpXQ?0a4Ho^R2|NbI`o zhwQv(4c@@M&lAr-mMO2xo_&Ok#>Jc2_)kw{+5DfLeH7c?^`7I|c+ove+4$9u|8M(v z<-Kg)q1!HGW7GNXX7B&H{UP@J9R5f9ypLaJvG?_Bb2w{o)#Sofot3W7k=@>>jrNxaU5@w!iQ% z_IY3Z>Na+spZsiqjc+~uv+Vj0K4+Von{vP+d=i6-icN?~|@n3iSl>MIH z__Mt}uG?XUZ`NfVV(-7lfIh?DwPR1R?O*-m5jGzGpZ4c_-%k7fE_&$GZ2zt%`#MYd zu43DpK4HgQt~lvow!P%{5<8!A$`$Oo^B=gLja!cCW9!+^4%x@H&-;fTvUTChYdhHX z(ysqz-`gEO_#~6Noj>kp&&#T#?{1|Nf98-KY<)QPny)RP?c;_v>ondoaWV5fA9&)g zi)ef4zxSU)o#o*?REU_dd(U?_c}x?7mJr+LrepzG*&tPCoc0 z_%Zmq>sR)3cJ@X0vCp$x`Wf5*j)(u1&3o{WJK1@Tc>U>Yy#DM@&|!(DCC9P%f3xQp zHXif0_ptkX^{CZs`})7P_0s*@o@Jlo{dYgl-hch;Z28*$3tNwV=*<6S`)}WO5u5k9 z)PJ$@(0880uJehn?PTMf&kV8g1CRd~8~d)`%*OX@GT3;+%fDvlx$oUSWMlT3{cQh7 ze*Ox(pU-CZuou8j=>-{^O8*Gxob@yJx07hT}~;WZA`y%QqwCZwLZ6mS<~KMM)Uo(7QKGnqUHt1 zFXhjD<<(o>gqMF@vgyRv&%5?Q7V~BFKRB@ESPr4Sblg> zGyPkzbHQKPLX3~yykK?DNhXO!!P^T_M6SqEhRYa$UyB(5x0uODl8IKkoSatCl1wa^ zzThq2I*kby&i2L5bJ|PJc8V{UvkKtXqGtb7xa?=acu=YEZr{f4m3?+McwYFn^Dccg zZeXd*+pa%b@V4t>0`uYeu~U!Nq?dX~Em>Q|-LHxaoIG8ZSGHwB(}2 
zO;5HpUew^cesx9TO6TR3{N&5~g~^wDrK!pLmBzE2$xmpFyPcQMuWa1tyu9N*jh}L_ z^1Fu{8t->Eu6?Ot>YWQOTGIIN@y;Pn|71zyALlz8&;DjfFXE2ll?f2 zM=hDxe26u&b4g+?g~!b=B8iY-rD$giPoo=FP@Cw&`)ch_=$$5 zCmLFxY-_!=ZShvmQtVxvan?VuvtEBkYvbQ1TEEOqoz;Fo?DKAJ+T8l;iq>DRSgc^_ zPBeY^E3J)9O|Rmhw^Hwo@F=r$epc?LZbB%ed=WF`L|E-<%Xs&H?%&XwO*zz z-dd!Did7O$_a}C`KjC!i6Ri)vW9oE;(qo_X{Y~$0eST%@H&!m*N~(+LFP!1h`{{Rj z^(Iz7tyScu(s!@5F=YD!S%L?Wg(d!&Pp^RhparfUDra zyz%jdf*UOmT>BCaq?;Z{w_>eqeIbqCs8mZfdsHdT{~*r)=B7LEV!zSdf5Q1)Yo}-d zurFX6J6L~PANXDaK834bnbnfes&nqptTg(G5BZ~gIq zH5>#a9t0!;fN8S{6;gm5EVU}n?spK7m^oN|K{XBKDyem)+c@eEj?)q6k7SVs8~|Rc^hPUFC&86-`RTUD@%NnzfC{GnVOq!ysx!!Wuo=7e<`@(60gX< zV>^ip@3r-kSLu#jAAahU=CcoX{0DZOXK)=Dln?G}o%)f>hqvGuw?6z=x{MnOn2J66 zN4T0k`~dyR*T4b#yF}B=uPlMv*1CXRAlklYar05_eSI6Yth8P0<_BK4g8sPn?)LWf z)A4`$JFUHaS9|C7_ETvW#`g9OwjZ0=Zf{>hTiE4}*jR@>U8m!)_MvvXUbVdoJF>KW z5=~pT3x{=XYR9<@+V0wp?Jkb4xSKA$W;;&TVbb;*+TK97(wW6S?HT%q?#}slckg*C zR_tcm+t|N!D?8Y&b1QbPSixSgH|&FOSMBXReLK^uSLOS9Ht*TCwQn>fNZ8q>t>|5` zg3aon4@rOBovUbAecnkcR_OG~{H{T}T5qz8fhb+4NIe(?L#=kK7m z==iPpEaK_yYkSo)*Y7mTU%w;2ZHwHU-g+v2o1vX4_H%ZeQ~CNGo9O*cy8ddN+bZ_$ z*|uf<&{kpSoc2?@@cV1*`C<<_-TUclOd9au;){YD?6RMK&-cO3w^=%4%X%KqNx?mJ zonHBVy4ol2r)NEv5>MT6hOl{)wY6iYy_cSc3eVRD_CEVT%waw6eE;Xr_f`J;Z{I}k zvx5phzli5+Ye)W!E%qndLeI}`_S$`(J33Bxf6w;zetLfwJ!&00&EfQ^ee@-*>B7zm z_uIqX$NJLVe#X{Q_u#YIpA*};fa{I0>z#v{9Yf~GsXNVc*6%!HHC?}J8!lSm^LDMF z_dDpLtlqo_-_stvqNjKhEoNQY+k5GE#orody<@-sJmSbCQZ(nAYK z2Pskoq)AsmQHp%(J`XTDV!H?V{y!Z# zkH+7!^NEaZwE3Bg{&u(X8GY}>%e9~5_ieU(j?p8I`3s|uy}rIq`>%TVNk;Fy@-;@s zU$D%i{h3P_>`C+FKcM~L z~C(M?PD+8=r*D^ z$-6Q-{jBbXXnRcdcYodm^!i^Myprg|hj;#n=-a2YF#6)3?))chYfbNeOmxRnPW&%X z=g`F)?h5)$>*&phezthqEr_<=F|Jhs-Tti|=Mc4=CPr_6e{V(?jvYOh_7CYkhtbUs zesTeAH+Fw`AW`$z`?e9i){;`NNcJhn#uYJ1-I4`J#(oCi6(^p@=DR&!#Y%ZurH=bleNyaXuls=Svs~}Nhu)A($bkPB~kEiX&7XOXWBb%Bh(Dt#33nmi1_TleNBD&3=uAWTvi9;TpLUjB~ zzh-poTZw1x!}hO6OOFgeHop5>vtIa??w+Yde@-i&ZXZ!+wqq3h|XQ~pYw@MJlej9 z=mFmzd@<2aKiq}UYnA7B{3W(;|NgHSoxc42ooV}5w=ZlYy1aeLw~1~v?@2}*hfUdq zw%>U1az=N*?d@G@d&R!fhZF6bvDu_|K+l*moYBo17cu(iwv#5){(Jv4nbGHhBN=^r z^!fYK{$HC%9YFNLv&T&*dUoQb8AP{u_j{fty4Sh$pC|gG<#T^dbol$<{Kg3fuNd;^ z8S-aDpZ&pl>zxSt*l%}VN_79zE?P#k{iE+6N%VIU79UM?@c-?39MPut_d0>-ksD@C zCffGDkER~yt{C#mu}2U6KIpCsUf7=KHCy~CeI@Ak8+PaZ(iXl%1C%Z(yF=khv=K*?u{qcRXWvZ{N9M$hW1=B}CsH({&Wli@x{X(M0#V@0H7l z-uBDI%ZZN8Jov;8pvkGzeoA!H(?5Nd=+^%_{&}MJe)PtM|6H!rBwMN3tpKmF|n_l*So}yPohV zsQGjCWTL4*-+t#tC$AVXa@S+-BKp|Ahu%%J;nbskM0D^*gMYXo{%vCd7yWOm!3~hS zhAkKrL1^fp2t*AH4I58ySbxya{jNIpxUUWR#>fRfx$3wRzP?rCl#BPf`uLMJ9jYvP z^(R-KaLSg6@uvNiFm^u!J4 zu9$pI#eB0&sXaDf>jo*3dw&f2l`O9ygxoq<7S3Gp&Cm;Rsji+wE@{wa2 zzP8D*DL=gG(Gv!3vs>HKKfLGP z4W2W2&NL7bpSQ-TyE2`=EXhsX*`~nB=9-kgoeJ$-63|Xegl`W1X5lp0h)sj7`QZg4 zhvs(`oDo-fb%XWRJLhW+Pp`M$;LY-n^0gp+IDe88*WdQU^@n%iK^6#}kM4hP!!$M* z@lYEL9yE9myE)8IHc9F3=&JZJy55FCCexYWSDd%)yy3%eJ%f#?;ba@LlTGOK^B<#5 zC!DjaYZWJ)HF#F^)waU>v^6I*CEs&YL$`fRp?wDa(KJuDT}{?aO4bs~R*|5_qf^M&EoTF zL-y-XPWav6YxaK2tvYk5rz8Bp`(LtrBz+G)Ic#G(+ zzJ_z=vK_0snXlouq4dA_r8mvyIy26~pl|vdz3tJ%ccbeU9EGbDYtNsi>5>f%ulT|E6`byQ3#YH1vpyUD*nOPtb0?>_evSV=^gHb5h8Jh9$Nt#w8NPk>)0{pv z?i}{VWsmdigP-8^#2!v3zQ)hv{D2=ndk0Q88OisjEx!HAFn-)}W6?eug`zwU7Q!URq~I+WAdf9A*Ebt&JzD9h;sw{f~j z7pGUv=k(a^hp<1c{{`PZ@JN3A6C(&H4Gnu8#rGe70;fyz?`LuTeqLC@_qXStZ^&}K zJ!kUf><{M*zCGe1PKWZ>-muf%e0x&euedurU?J*B?`oW=JV}IQJ zXTE*-^PD#4&v*Uuo7j$fPUZXeI*^~|)UW@5?KtH(eE+fcaJuxn{P#ni{+RsvTljrFnc~}bKg7qKbup*wU(V@_H#psl!=YjN z96tWH`S)=7)kmu|;^Ck2agXfDf8YAoe0$PMoPPgAPTx**`p!5`A2^)T4G-dU=@?G6 zkNJ6eX7KGRHstsF&Yw6v`eeTU_5An#bQ<5D-p1+A+Bv;*4yWH5#h?F&`E7T9zW+Cu zaXO}%U+&JMP-^X$J`EX$efBr#PK5n_n&l7y@!^N~``5^xL?3MiY 
zS}$Z64|9l~GXXZ`V3f_Y#ZD*O`mDr0lr$Kws(}^<9Q4K$nTY>kPEvRhXZdIt>G^8p}90IX!u7PKevg zVIAz)Mv@sc+>CAv!3jN^8U+W^CK1ae9z)O~XeVY*pJGv0dRcuZ7xRk*2knGk7ZWt= z=x8k9?rwxkH$r~f%VJrf9SoHTF#GIy48h3;Sz&{l9@FJ9gq79!5oT9#T1RtvpGBI?0SzE;h^S$^yP&nkrFY z^6C6`X>#8*(o$u^IEyDcQg#k#tG~x(nnHS9a^A~H^HK}yyn~YpHaGjYtY>y7Xbo$J z1dRmf2TV=D6$z@@?ifNk?T!o^$OBLsM%he;!BS%)E@$*mag&A6%ckj|*3fpyn00jV z9b{In*f3pvshy35$%hz2(17DHgsrXdB0+d5L@kn(&T0?=B^gnHC^$$0>@ zXqznEa^Ga~8W?uyIV$U@kSCBfiC8Z27y^Y&*mMj50bvIs^QHPRRm0uY2$^oA{I>hb zAy()B18&D-2;suVV+c`?A%u%Rk0Csy#7E=y&-Bq$)%NA4cot7Rvl_-Vgaq_4Tqghx z3PA==9z%#^3_+CU#7vNN2uLB%*C8w{*Q%vjMRu93k}15&;OM-b4CeY3Ax=mbHFWb- zPb8>jyJHBwwhqAw88bBtf(_G+5Es>e3lu}(3%pFTP#Sm=vp!=8Q~{#7JUH8O#|rU0 zTO_D^)K4V+Awf0U?O3%KpTi0f=2QaA6`B9(L~zy?8FM{!j+H#~aIq4m%PJOu$Bb%J zojca81ZS~z!e|UjMf|R`UqtvDDjz0Dpun*5K+eyO}ks-=w8773cM z#j&c;IU_+eEk$;IxI$I7be60aGR&zsV8a9$I?Vr=fYAwuv~t$TMp(|lv0=KbVjXv^ z8gSgPZrQPt=bizV6XZB!ZG9-f2?^@!EXIozT#=xf?RKm^$XH=*fYnBra}16R(~Xd0 z)qvxUwIz-CTJ0z%kv@>3JB&a5LtoY?Qn|a=^#a!%= zVa_o)HcYqc9V>sYj48&N8j+GC$uwH2C;B`uMpYPdI~1C_Dp%CO1?uT{kD1B)BgyBezW!ctkOpbb3qH(7om?8BoW+Qw<#%1PafLkFU_ zwTy<^y1Br$*U+*B?=w`Be7Y=>q=K8=T#)Lyx61U0XY~Mv`4k7UU;@lAnEPpGkk!>f zSTTd!2-9U1>wJ(^H7M>?w-O3)?K7Io(}e;w1y`CxHADBRJ$Ej5uXt5=V3mIl{KT~9F7UG7j6n_Vz<5Fef%uGussi!0TSX->z-En|S&iw_^bQbJjy!19BM5gJo zi4{?KlT=IA1Zd-x-DHzy9GntSO-qrT?U$--X-$CS1r75O4&X2W1`l&UCS-KNIv?d| zAynLCBlL2RXqj1uJc!WHbY&k*msPCee3c;=h_Ui@tU-E>(HjTpGgOm&h8G7&i#_+6 zt{$PRUdS-7!aZxGII9L6cdXku6p$9(r<|02IT$a!%}9}n+L@RUl{ZOo&RRj6K-E@M zPiXc)hC5cn!D~9A?L3)aZ5CoMKr^X8j`i>OTV%{{kx`=oDc>i+stT}rQn!qN&Z^q{ z7MWDabxnZfpb^T~El;3rB$>B`n-M(#)&)G98rg;tTyeBX#B#}HK4Ot=)k|bNaAzLK zVMJvD3}rSSHw~vcVV(bLb)dn{09nq#p<}wNVjT}Ogn?UUk8W|XK!(6p(Cd+-Tf#uz zaH>gXPJay@enTCjZoh`JPAp@rdY5@zyW?0)zSs2-6h{|~wJ0TVJ-EdMKSO0q;X2nr z)3wabv{_X%Oi!rQ1R}Y7c^0L4z46E^&4UN`J+xB55;q zC3Ue_v5h1%XSm5j2aQ!lDM94b(o!7>9k}Dp9F_yzOn{-xd|OAHww(d8Vg@%srpqeU zai{HJudK`gaO0r^*n?P@Ij)cKuhtsnX1P`cz|ku#t8!7PRFs8^vaVEcuoZ7xw5Wv` z>JI~X!?c=*4&>$)rLG;2N^E_8dYVq5EoiG7P0k53Zc@gnth?Ve%(g9KtZ{A}jd#yR zJ#?V9cOa6>S9Cd%T)T-8rK+|gtn0LhYKBd6(;{Ci8`ao>J2IQ%GmNfvgooPMh`Ao7 zK&q>iFkM!$j`L8>qyjls{_0s<#|kg8w41z8_h@yj8S}}cCW=Be!*;Bxyu0~YrnAyEV2BI8(wJoFP%ou&J(;%VhW}TL<_ooRsqXASP>*h~-izxUMJi zo8nB26LPX!FTy#dXi1n`7=07L_HdBnY_}U8mxY79feLo+ftiCFHansq-k0eb>gYz6 zm`aF`Jw0(WP(({y$w=s;^T^BrK|^JNz^5vpeb#QF5yk5 z(wy%)pl9Kw6#5`0Ym^0GsH8`ibTR)h_fh=_a)lIsu%CyijO7|=#Xd}s& zF$mTz4-we~k@f(dQ(gUV8dcL-|dZM4DP^lx*Jit>t(T< z0*jGXJXnCof{9*FAl*lur8-J~eFpOMSCZmlX;7U)5Ygy;7H z*Y!kxQ=ExC@8?_8!rG!14sx9Bb_3fD2YXI;+f8>papZir#eA2wI48W@c-^j)%kuNb zk`BwR9m#$#5MEE@IpK|HbT=I2IN|LE!W$0uoba}r@Ot9N32%!DFKh8YgqJU7=&cBE zevB{P-IjdU3wTajGXJXnCof{9*FAl#SFbgb)|HQ zFJ{=5T-OtMPIe2AB}t|yM1@3xrlvK9|ScrWgb#m-GmPac~S z;w8B$!+T57g2qNuURjL`^kk~|5$&$FIiHFzm6b|KC$KJ8U@?KP@5JosQ^GL3QVjiv z{%AJFle}maBuB`H-qtc2l8epqy0U<8nAQ+dVe*N~OY+xVWrmBPCQ2_}xMy;9lI28J za2if#NY9DVxFn8?;_V{({}6ezFU`)n<`XMqq;XMF6=+`S>t3m*T2SZn<*1-~P(6BW zFsZ-geZf>xZ7A}Zk)$ZPzhFYHE>{#4Wd^X;`t&wy{VR%GDp!{mmTT2gtsi4U`kYxH)+JgXH5 zTF~Hw9Mx>MXY~Mv`4k6em;eKZxgXPrrxPw$OX@ZdXXZi0O#>diY#VNKun@CO*ZWn9 zTOhB>U#sriXVTs1RYmt5_SC63*e<=zEZ5|HOy*1FO;W8UtX3pw#$DuAJ&-Z`Bg>G- z1QfEO~%sgOEZuweoW9p-;b!03cS zC9GCtSkA$*VY;khZon}4FpnFiCKbrB^7XSVW2{L-P?pZA+8kp|C*1=9j(esoc5EZb z3>t3M>STUtvS=GM%O&R-sYR4FqZ*5~+z*1Dqq1qT>I$SyB9==ojgaXLEYrH0~2Cb>{uH_$!~jDEGu*X;5xse08`Vj;FHVs1zi>WjlHY2 zMj5?E38Zz;M;0o|x>B*28*BA2hwLz*kH~_8Q1Lg^(I5I9gt0r<6F2~9s7#Q7Q&l85 zZ1J6D)NTS%T^^iuCQEoBL6eBekZfwu#tMd6QJDgR@p7XvV?k zsZ`Ta$u!^sIaamEz;I>`t|Ok?4n~Asp-tbL`H&8y%*h;Tf=@j?uSa!`#~vuvhOgziT(uHkq;O zq}6d^Kz`2u66N@d#{8 
zkTQAqQ)lQz+JW@Y+syP#Gj5aBQF)UT4<+E;A0`X+fQAjs4&X2W1`i`{CS-KNJeCk= zt)VXSIVa4_Pft_NuRhvlQVz&*-`bLuUgCJF8+~g|>b*6zc2H2KZablL9pQ3E*msDKU2_<3*=k*VlQXj zYF(nOL2^)2Q##|4*L-YcN=|+f+N*Z$9w{AT| z>(FaxFd(==Q_Zl$2vRb`eJhNMF2h{se5(gE%&&0YIvS6ZZ>7j_-@3hhtNRNasb_I8 zUwWHGV@>qV^Z=X6o1|K}w7_AQqZtPu=BTEn$YDSJN@m=l;l4Ghr*Lb?ao@VVeQS>` zE$AiR>JAO}tHvkY9zm>^YPQ?6N?y<~x8eW}6JW7$ z=7CIMaGh{?`?zfB2g^CQoiJTiF}LMXOr9*XBC>8fv^KMkOG7S@Z{?)5$4IN>ekKfc zEDqL7Z!_C9c_5PkQ+bnA3!h{4jB8DMAj7G&!G`IwFh|GZtQv5E94kM2+sas_ z5mB0xa^u$5M^pD`G-nAbB$I{%C`^Z(QO&R)VI@ULbbW+nIe&!K-?G=-Mv@WP2%qWV z44+Mnf+J~@h~-iz&UEoao_Q^Y=?W8II5P)lnycu9Ly6-$qsVd&4j$8G73;X?4hOjo zpXw4eem!!kOE}mYjy&nmu`XeE07D(4Z^MS8P%LAu@!*n(oa^F@prJC9Is2hOs$wh7 zbzznhi16|dEhT)~!7l17DHAY~0HFsSNm#B`OSOvJq7bbM5;PuQNa$CZ|XE;M3mA2 zBMA_C&>v+<^J?>(z}fgJ%+E}nhF9LI1vxi4J$YM+9YDR#3KoDt^aS%bGsNJ^S;5sW4aOY+@3beN=<5PF;ynOEHo04 z1UoBah208%Jdz-5;<)(AZalPh>kmlNLjXxzdY9qp1foXH&N zYQXRI{t%I2$bu~}YM7Vr(ziWMqMKAm|Kxo`5;RGG|)%!D@?knlp!;1S;F5MDd@ zEX0(K)5bB^9Z8!+ESEUpZ4u$MLy_?py9l8>cubcQ-gXe)@bKe=w;Lk7Vt5^Z6JGft z1#H-U-Ie3a6;n`|#R5w1U*UiqsfbwX8>@ zx3!Fh+8p6UtGLJJV0pdY-N2ec6iy0v-wRB41`w)XX?uE`wY_NzfhoXFZl* zX@*IYA*Ir$m|DA9fiP;&Ii{xI0tD5x6xsQaC{-!eR#t0OrP_oCXs|eXs45qgN=3n& zukl6$b-gx~S6fkiDYZK=jEOlw!vq*OjI^0>(FupKRaa|gD=;i)QjZ|eFkM!$jz6nR ztGET)a>h?wq*F=Fm?d!}CC$N+2`O=06mJ*N|A&Oa%yA)3J-)HWkR^dAMA-SmYdjYo zP?L8sUV59EuF3tF?3c=$q*~-xU7(;D2Zw@cT8bR9U-g8Bc@+n6m;i%^IUo}AAm7Tj^ICi>jK^XMF;?tVefU;QA>Dc3s_i{=*mOo6i-Yyj z+st-N9>`?CRNf@jBG;-ltQ85GvC(=p1s5c!rlrUsyH+n`m{)PYh6ylqm;*8aqZ1Cp zvuX`(MTX@Z92=&~D%Nq&ssR_sv3@NUi_wXhpvDAl(=L#$c)E>YEToNhf@p%M4F#Zm z)!N4JVPQllSDQ*5f&#)&R=84Is}E_CK>oDv@n&tcTvR7kSCmS5ZRKcE&Pz*UrKORg zn3qPt;7DY)CGh$KBW{OAa59{V%{d@X>(ly1clJfFrrmFQ<;7ind#{c4?daRFbJy-Y z*X+G^-*wl&>ea8=zyCEiycYf4i2e@Uxcesjh5QeFzw5@M=%at&=D`EE96a>8U-?zx z*W$na`dbr2!-sFX{m97ZQBg{!(wXd7Efzf8fb~_SEk@{mch{_e1~u!@u{D-~Z?zJo~Yaf8vv$ z`WOH5xle!QUw!ruKliVn|NIyJ%@_ae3;*uLKl;*_zw*al{o2?6kJ$b{S%&?t>mO3f{p>Ds zefV1P`P@$R_ci2mzgoUQRWCO{u0OASe(OD|+!N&UyT3&~2j8oHs@M0czn^;txh|;J zpHr{L-c7DQspc0`KZn(FzFqzOlKS~e^;1;KxmPXss#>4tNOG~e)%zFK`sLL0zOKr{ zQ2)eE_=ku67#a5Wpke>wUJB*lUNtTB@ts|Ld#=4(*!7LOZ~A6o|1SJ*@1w#!PrZKX zZs+YaKt~4Mp-B8YzA5n<2{m69#AGt2^{59Zd2W{FQXLAjk$3*|cG_)o%4%Ft$ z_0{6Yat*Z|{#v~K_C4Bnko#aC{KKt2Oj`Zbm~0IXU!!Uc?jd@hbVuKC?B_fCcH9)Z z?wQMAUi0(H{4@LecAh_oyloH|Xkc*kp($ay!{bN#YacMD`6tl(5($W$%uqNJr zthTZuS4-!bG7w{UFmM(PcLxTqZM?N2s6(G7eB)}pRuZKVWo@JgqIJ14A{`jK0bPYx z-Hog@Mik(qBa$xt*GyG`EtX4#`tlk|{=ndl)A#A_?>=5Tsle4&!T(QG%F9=pQ`M!K zPGEmBD`qlMax5!KnPh$}2lXix)2Z~LlFpXorQ%XaU!UDea;2fbu#G=;>0FO1Q?IRo zw=NwRH2lGMP&dBwKQpdu>@yUl*fvqbgL8(H{TTKGj_S?Hid8kWxI~4cj zb1!|laP_4(qT(J>ZzpPlFFf*^J*YN=fBM%yZ~pnKaBc2VZ$I_k2mbhLzd3_$-}l1O z@4xe7{|##osJCCaGx=XW^Ovtfx3jbFIKS_u=k&Lsr)9InlsqOTm*9OqVj7THT1uCS zOX*w|pvfkSXPo8`{eeO12GG6X*nvX>KfJa7 z$vtS)f$u*$+W*Bq_4|iY{ZG^HpUw7vh<<-I-@mGgy!?j#b@l6q?q-^g9qWIydiklz z{)g4CPe0iI)7|uYqyH~q~V`+tth4UYMho9?+@hTgoffF@%ra(@EI$E~r!U1*|J zdE$mccR(9}$G!g69fNxp*NP7*&8g$J#-_Xg)IrO* z3+a9a(j6T5C8P@geeW1c*J6w$i^uy09`8#$b}aFoi9;93h^!9Gkk~hHA;10JgZ;@^ z;_(wKF>Jgf0dTHg>`Oc~nfUhcLl+H0ws8Q5)KXBWy%~iYv*^@VVWI zryGgq-+Czf?OnqUY?U?$wIu;IkP?apakSc6wFHC%A8HprJQ#BNQ0>Q_LC`<_+Xwrf z-4RRt^~d^H13UXn(imSD8u-Fc0*0-`bB{nd2mKhX!-Qu;HvmvW+kyM}G9LbfmBY=1_w{n;;c*}6D) zlIu5a8~Da;2^feI&;KRLm9nbAy_9MV3=G>`gYgr z;lA?rgN!s8s1`8?5}g3{oOt#hZUL*KQwYtg#q0wV17E_~Jogkf^XK0QxzI?nO(Xya zdE#&1anL>*Fz{D6sUJX6x!Az--@gS2Ut%x%0x`G)hX$`N%+8*^FizO};NM(#6#YpX zlZ8UzEc}Q5?kf~F3dfcU_o7SiStv~4>+lnP7YbA87hIyB@Z%I*Io<%Z!dnaQ{p7}R zxHyS^pFqD)FN4~#TmjO$g49l+--S!)cd7tc-@6RSP0XVbOrqa2CdQhFD2(8y$5qgkLJ)tQfTz$>{Jnb 
zKtUiKsMiL$MH@lQj%0n$y_I5WVtuhXQ<+azvX_a_rK3wnkDfq@po-$Z$rGfg_{S8w zKY?WLIU!xXcy9(>J$okQTK-e${zd~NC)P_R@?vr2#Dka5=T|QmCK@O`+z2>Pl-lHR z{QFe>cBmix#*lz2s@lZrV(Q9l>B4#G^8Bm>>4@@${Mvl-{DX_h%tOVEV>QS03X9Mw z^y$@TdS@3;o>*I4sjOWV?b0k1vbcUAUMN7rZH9v@$4!5`^q{bikctwQ}{cH zuAZ!{E0dsmaY8DcJU>~wc<+@GTwAQnR4yg+YnSgqU1y?p6m*^Oo4ADTN+W>11BKn3}zM>B3CqVru3Jv{U2q z1zeZH=`)asi+$aP?r)%?O{^DJO83s9N}N0n7hTGG0^i5=U%UV~pUgi5-CzooT*^C< zM)z@Joh)4~o}4+pn4GPk>FKhk)drZH^dV*+j{bjlS6#L=$DhSW!NbPEO-#C12fOO%KU7K|RGL9{r%lwGhc&ob0| z-la8&5f?`|twpSKbXSmoj~o^qE z>hObAl@Rx3cZoAXyxHSsx$1~}ZsK6M@Klx1gH=3k!HP{97< z>y&PcXgKU$e=o!Zo1fQdf_Ug~S{{8~Ci6UE=h1_n3V39W$?*$_u92~g?;uX|4LW=m(Z@b1{XXKUXC1b_MjW@& zy}=tqKIy^Bw}_ME{s$k_LtGLd;?~zXymll~{2V$WTExb$eiAXiZ@h?_oCFcC_DK|x z-k2m}cDLyh*W1pt3@tzToq!3$xR$#SZEDR76>!8dzfJ<$^|~7_phPtzQNTM-W(QihIi>B z0+u^iUc_^^qeYCWyZni8e3;|IifM>%7q)i-*O`{;=`}S1R^FfBC*Z`NUbYf&>(*>m zz^Ccv$pS9DH!M@Y@>8mh5U>Whc&dP1ZJRD`j<_wz^!P{u&zmN{D&W%zBm8m@y*)3@ z6Y$vm6^jL|lyaf+D8x%YO-T~avy1okTtwqX_s$B~VWP6}G{n8zJ+D<&%(R?dw%W;B zh^;qYJ|iI0|Jq#vJ1h!rSsQWYw&^Pbw0Oi>)j>Rc!M~e;{byfVQyLn+v$Phopso!3Qda2v{cIcDR7M-*#mMe6w?1oPgOKCw3EX1eH8cz)P{M z#t9f|;n~XKN}gq4D$`lO$6Xq80=mtck|JRJwKaMPc()TdNI*yHc_RhfyDVh9fVY=8 z-$|-)RKUS!pBD<)^vX+T})l6GTrltI7hFjj1N$=tVO`Tpw;% zML6!NS2*>~tUSx}6JjNI5jXCuU01+ihuYf-*l_k>5o^SccMv|i&r9|Ya7g-HZvpfD z{Cx#{QEi}~fN_Dn8tFatKI+p>L1!UOY}|ElAH*(q4{XRr z+}8ExPkj;JPg@=%U~by>L;=mt?iR6~A)O|C&dWV9P(b$z9kL@9=2^B0tlLk(iG{X< z1YBq5B%*aPF+})Wxct&|0aun&&JeJ3bd6a8O6f(j1)Tk=;D(5c%~v#9oM*Z4>?sl5 zDjBa4KHu_sD`H~fP7j37=X*@5>yAhgm6~`U=HIE10ElZ+>x^%V*ro62U7m>M<;<6q z#h;EM;S&C5Y_5mQt7cn+Vi&hED0WIcJ-rGo^hz06HJ;Wxv$TPUL)$IWGW(P_wr)DU z@${^|6|Kmq!Y$ML^sgfE8eru)BYU8!Lr8wxtuy)#Mmctm_sr`zw1!%Hqi}0p|KW8S zXBkxUnmJ%(y{0)1?Y7MvILbbFM$-vivj&Z+-*#Elsco|ck82RIZ9p~e*+VACqK@Xb z+dg~fBvs<&-4nd$44cv@{n3q6+vg0=h2EtqRBw_$V!HPL6UX)i`6KiEMp*k#Y%(`z zR`c;>^yz}Rqvixn^BP#)XWrc(y_9^E{%YjN)8UV5bsYc^dnZEGI`Q>Tu*mrUQ@-?(10$xWBe z*fH2j_S4zjOY?ROmv|4V>A!5|o>2~=^Ey^Qwd$Fdsix2?XI#yyOiFYdXRc>aSjOT} zef+VzY-^`i)9N$5q5jM<;YQ}W_3)Xjq0LShZR~iO1vwPpR-sg-R)-b;s~51s>GM6*5Li5K=TEYjf+8 zCR9!|A>&`4Xl4BCreI47%jlQht*y^Br~{Trq9%=FLd>wOTSJfnN2PKFoy7BBSeq687XIPND9YDg_>m43SOgDlO#`w z9?D5&8YM4h6e_ipWEt^1wNk34)D$o0c!g9d=TXqld+4tpVrBg6N9e@{a*ajW#JzdGwlJ9F0=p zah)bR{N^P>#H+#z(=D{Ho!c{KW0NG=(1bs05Ss$*&-we^jweIiijB${FG+&JXY<{?`i+8f z7j88(KhXu=9#ocDnUx63CmppvH!vPtpR;xo%O-$T<(tEQQ6xe0w8`!-jwZqFGjmqH zwA4bs$<)=Vfobrd_EoDy9^13|;G0mb792aQ4z{|M2nJOD(Wdj`q1VmoEe|}*fX1cl z8ZSe=*)lIJdK^!H=ndMOyZ-UOta|i*(u*WG6)`c$Gbsfc)zHiuKSB%6ZO!L&s}>8P zK~wy9IwgYh>*?oTtCFBz?&w@2yCfLjmUOE1(HY>@0A*TSNuecm0&OoC~OYAUH^ zGCT}WOkeJs1i35Ji|yL=0ZmM0^S*;);l_Y|6?R6Y!fM;YWQ{lR@V@Z%<|VZfA*Oq~ zJ3~@?faTz$w?gN|L+>SKO;6s5fw|31www%0f}S8ozb}hP0PkJRhYoy| z2xc*FCr@pe0#~M-k{g~(gyZA;XWqV;1huau-*>@&%DUWpdxIMpFuY|&izsaO#%0S| z4EITbHPj`OTMX*!r%Cxccg4enqiaqEC=y}R{n9fx&5wn~n;y<~i^ktKp9e!d5@AiN zsS8uuCPL=o_u5PQv~YIMq%f<@1c+KOsF57?oj<1gK1pN(cs=d3bf9-KD9dITTMkVG zMWn&Tb>)-b;)pk0WbL#t^LoWLBM2?*Nepe&1@&_$$J%$qo_~j#f_4Io)vRXUO5~t^6n33=Q8l%qJ8CQ7b3yAtJlf$ zZ8#WxedJH$f2Ltxz{rd^eHJY1?5}FebI|ns{^!~U45X&@+&Mgzg^Jnb{Zbcjz^_$h zy&K3vrK+u01pmZ9zOmDUYfB=)A+@s6oY_3wKH4={u84%FH9i|^`tne2%&MWj6+45Z zVJ=>y7D!d0toEK){*YJba@DlQUkIgU7QvM3S?;@bK)FyUe0Vvki8T{*Z=u!^}K z$wO!GIMC=43oiDRN3Ph*f#={Z#>WyEnAq(0rOx>@y#Kj+x-Ah2Eyq}1Zf%hQ<9yo9 zeuM2>yDsre&^!hPB+Ogh;35s7UA^Nv(F|-KefNe9wtL|?i%u>K4G*uE+tb=M9$X%b zoOxm^4dD+bXDoV&cF?)}=6m%b;C-c@`jyx4u(OHv?fEBYcuGtg@z$|3w41d|+Ua&U zwC>ZoXKkMtNM8^YzUB!95$hVPdF#YNf!nMG&mM+DO#Y6kJ_a;Mx|Hhnx+4dB>Z*S+ zsu2asJI3^xhzQ^x-VV7wA{T5f4Vi!D)+pG%@?rSTFE>N*^|6B$%cjBUnKjex$*Hh) 
zNBc_(el$#Y*CMHUm(j4sx#NY9k)uIEJ)i!%{3w`7q)pB~kOSd^-!zX#e>*+Gy_v@; z!9Sdr`mc%ys%ygDt9N7Iz?ONbjjzT*LVp({>gQOfGU{N?h>Lw7BTqJP-m5s^v`gJ= z21Y}t{$|Zy+9tq@t|PAzn+8DI@Y(HbEc(Izy}vy3?3D>grTdy$ZSD^5US;NIKgxvk zpygfp1KCi%T4{^5r@O?Rvr4t>=5JH|_^+{!ObI z*3W?&wyUA5sy9sUGGg0}cLO1Ei(lmFd0CJ+>%A)xlL@moMuneWKLC#R>+~QkJsT>X z@tXW>NnaS!@7?5)_WfW9nf}wJ%UQ6l@U{KuoJ^Q-uuI?c(gUD*c+~tF>$2h4onO@R zE@Zh{)voH=3fXYrxgA~mZZ-^SsCYWWC=&|w5(h3`(;JfSSu{5o z+#hbu3duZRl?5*>ZY0X2y&Frna-eAec9{!}H4r z2Em&L)5f)%(hJ5~wVA4p90;~uUWNp<>;uoXBvYzxS+Hbm>4Ef{0dT7PyiU!#_l2AO zPlkH8&4#k^LG|3WWP|HZKTp1+-v>s|SbBJgX>S;3u<6Q{T7zN5+`H;CbQZ*&$?0*b zQ8ol@yi$9(TOX)u-1S)RK3On4_vFo%rLrM%&a*JH!vjF8Ox7P`o&{qbtK=z1hd@Et zmd@t}^o52lM#?bbYv27zLBtX@be+ChtZr$cu1hSn=``mJo&4@R9| z<$me@FgTEE{9JLnJ1jg9J;F7A5cJQibidTbOz^E)t5!PE8#Y$o>b0?Xf0*x~Y_WTC zCOqFY&gqrMAV@j!g30me3xN$++F7HWwK1P3%WpOS_RhVRAH6Rd4q5G8?7k)w9?Cc9 z?OU7${vmGpiD_9NFQ2!r)S)a`7V>=cx{$trL~m7D{P}GaU{LJJRtCkvP)G6pe-Pv^ z8U!&cD*UUksah%}Iw3xkV-n)o9jm-s)B+MlsMCKrR8|4O@~ao*_Zs7BJByf4VdRUndF!JJ&4Y|*QVA_+R`fe8f(04-H%IvJBu&^MO z?PYBPaic2wJ=o9!=vudG&#l)0TIboA-0!LYBhSsH3-X#n-2tQC9k*%;1s1y=Jh36+ z_>mjlVX9z=HY`}UYhdl&7ZhN>*3xdvbWdnG>+*>zU3_718T!Sk3w7be zkheMMvB5y8$y+PsBn)}k-J@z}6>MHRVY$XR2%0>pn9x-_>Ae zFr;ras<}O11~R>ez7C5BC{+-?X{E0(%zx0W4dCuC- zY8YGE(#Y(D8ji_=9h_p_;g^05&WEI{;MEfP(ptg|Ds@_1e*7eNINZfJX2?5#$hr^^ zXWQHX{0@1xtu#vsOHS#-%Q-U0dwQbkR0Acn)(p{`o9+f%_SdHm-fs#$3$;X#tM#D9 zE`u}F+%QP1w0!%_NF`LYEjVFyT?MQ3wNp2>YYsCP)IT%Q)gMk*)$4xO+8!R?+)?#b zq8tvz)Ga%!ZZNbBtaJLgw;axoi|d!`5D1fRO7BumX&m$JJ|gw<%~3 zt?`+*O9?$PCYW?8^apKLi{lULg~5nMbFw83mEhfT(v*&7iDv8L(kJoEcLVa-<3D?2u58=%ZP9;@mYxcbV|?hp~Iwy_xM$A+wBl~aPE!WzzM zy|7Je>-E>tHijp*b+a3HATp)>Mt=1<^WvaZEjl3`M-CHlc%jMX!rKjFiyOvvIh8Tk zSbatUi!TW#j2P6wp8)eUrNy9h#A05T9ey>zY@^?>wo5HwtYSs_b!#&Scri9~X?0U5 zTz~SFvacCLdMw=DqLu}mdv-K%!O2=+)&C~Zd$t+03$y9i(%b?}hi5Nwb~S_0diT3F z3NZ({{xr?ctF1sDGWKCuxDDJLc4V&K6Eiq2yXA8+%oI%8tL&c~GKFR<)+V(zFo*F4 zo%i0;Sb^Dgo2FOY%;8LF_3~=jCNTBl<2lQhT7s|l&b+W~rtq-FeY?34<}gaODXsJ+ zONj9dS#vh38pvAJ_Pgt30WF>m@9FAn4(4Oe#cW<}2FCj9Jez6pxdux&b!p=SKi6FP z%Kz9v7;t8#kJZqcPa_=lFh;YxbJB$%?jG+o%C4t*c^lI^4;gBo5A5i@3=*CO`wn3|LEF%rf`o|ZO+?d z3JSX({dbnLgv<;3?o^34gI&>{trBA_flt%7uJDr?TsgkT^o$f;6urdYunZTuly}QS|S;E>QXDtV-ETChgYVd3PePokC zJx=AD!Otl+^5e(NVMm1~N|!Pn0Ny^>ba#1G*y(ZXj4Gdi%L6BlezM&XHcmXUhd5^j zvqoF;!`E3t*-Bf|#`~DTy(qJc2F*-h@#cyPUX8ZIk%7RgQ;sCVb(2!f?A4|)m(aT8 zIGID^?wxmr?Q($b?w*}WZL^05tFq5}T{MLjCXscFW|%_0X`R6fR+&Lag*uBb>@|Z5 z5mm;}b!$SEl#!FFHueI$7gqxI~V7pgA*|6>1 z;Z<&d?A%ies9!_5%L^=^N~0;MC+pRQ-5stw>?_v_!e%r|S?kpjYRH_>q;uxxwk9FKk}>9PO+XQwlcfNG6eN{;5R^IP&Q!^;FKEt)Ui3w!in$y$ysu`49wCIQG>z2AiAAvFf`dWcI^tS435^`R(?zcU+683oq5u^E%+^RUDe=WAXs*6;8beA zA9(prdmdCO5GpFCPVv>a!7TsA*|ArHVFqPNVQ< zsNU01D19||PJEqEXzVgvk67Ls-aR+ZuW_k8?6`OM*7kSJ!C*EQ*{x9kjLa(4Y^rZ7 z*l@To{;pvlbgMmJs{O)1DD(Q4TI89Q(9U!A$h;>lpi>GnZ1&5RQ2ypiwYEbG&@-&- zeDq>4@!+hzh(DeSXn;H|NetOD3{mjf)#J<8kE>2W)ABuhO{jwyFt)fOI(3318at_J5tki zRN&ufLD;t0Er3{~I`P(026MfgqH8~B4&4ga%uCj7!D!36m0o7SFmQDGi6>8jVA493 z>-*QOppDW=yIie=ptD&6xt1*;d($ubui^QZ>z%LK+b0ODeNRr^;MN>$vitQKzk}4k3*?&s|~}1PkZPxH|($Ni(-01FmQTz$|F$>qHz;BHgpbJy@lNzFque z<*nO8jIwZoWu!k`O&{6Z%%Cas3TyJ79q0$u&$bWps@Dd#s|@yS=;i_Pin8^?8g+zT z*Ph==*c<|ZZ*D&fVmpInPGVO-OybJz@Ae;h76Im_d3r0_dw@^v?ZYwyZIi!LVW_qsmo`tD2aanOj&|RjXd3X06(FBz3Lp+1T3IJ2*PkcXn}Y zASFndT%lB{H4PiNxqCokPcQE#KE8fU{hKupXb~6`+%hCItX1naZQHf)(6LkJ@Cb@# zSdNd3ivB4kHZDFPF-e=8lA6{fJ)>*4?mc?;>YbU@C%bRI{sRUM8a!m^u;C*{=8PIW zX6(4}6DCfYJY{O`wCOYQX3m;DCx7m|`3n{ z1v_@`+P!D*zWoOd9y)yF=&_%VpEy}~>hu|b$$TIjHIQ)lq;pKFP)U83a!iw0Oq1AM zt|3y6-|9`I9KSt`kN7p^=y(0`YKPg8km&q?irKY^bXY|WV){JVwsQ~*N4DwR3c^7_&OCnMx-vMJYSMhUr#2EYBIY=}dB}ll 
z&fTR%Sr%eu?$#en@enuuT4gU|4*GW+Iy;?UVZ$!dyOUqCFy&S0;j0RGcsjq4X9xXA zIGAA;czi_!JfATCz~oE@f?ZGE$;pa>K6~l2YiC4)JAe9I{{;;A7;mjkSC4|@D>sdP zW}F168V4$=^+b`S~+Tnt9Q%{K)JxkLPQlqCuYd22U1z zeFw};+dxB`h)nZ+5(Zk|o43H^2n&y^C#+h$IU1ba<-ID{$iw5q2Uf?pghP+QeK!lb zv0&V0^YS|+3spv5EggV^n+@kRT(mKJB&6y$kj0MXpx=WgwW?laVd^^jfcOIp>{>EF zS^hW+Rxx{?jA@C3$(7?z=Q~8exotU)b^CCzKctp=KnM-K7qbJ>hNIk9_Gd0KPXo)z z@4T9NQ!uK**mie(BA`r$)qvyu@xCXs?P?C>f!bnm^-X^Uk|w_7d+lT(!YDqmU;qow zou1hDKhD6uM;q=9%wpl#mZoC@aIksPvZyU7H|)Q*S#htr-c_DW_5M`Z92=(ZjiItkIr# zn8)}gbI|QV9(vSXS!Zzv8s40)bl~-|CE%`<_?bUupnm7dtCZm! zTdgm+QnPfko90Z*^ZT(tVd2+()Qu;3?;fOOYMbq?cTf9af7t5$4o z3+=YpcHjA+Gb|2sJa%t>1k9>i-N%;d4C#|pfrgJeLxcla+NE*_7&0!W@YTr9P*qhW zEiI`dG-SuO2oH?_&*PP+(eHZnHnp33@FT{0)dxu`G=shGAkVQEJ8)d+$ zT}H}z#+lHzjUvzQcnWNOo>SL)L>4TlaJyjLi*(T2u}QMCAPuskR-e8e*BkN;{9a#M z(FaZ_U3OVIr@?S%W}VNJK5%y1j_Nyybp^|+bBCW#O^04@9oOuilLakL$8L{U)*b5F z`No&I+7k|LsWRQJd@tBt#cf!2t8S2geSvFz_jEXxbh5FNOBQhbH=WI^+yf>}8alCM z-JX!~V08a$`+LE%hGz#HTi*?s6((2qn)U+cJ)S2#cXxxcN$Y!dZqfrL?9FaZxZ=xH z?ULvit|#2lq$I{xpADC?8a$iTvIiWSeg9<6jvlboNSe9Zv@6)N8}3%gN{4=XvR`e& zLDxp7%Bs%gXTU-0F=tvYNrT9VkE*GT_JH663p-xTPlv6>p7;8ujes@%k138G%Yf?z zH-CBFBomaD#vbb$;NbD;f`$1PvcRTwsd6*g^n_lo9fq1rONVMrTQnI|kOg;A;=5Ky zebim1l*+79Veh%bTRXWvup-K--GIAY;i$KZUhc{aSpMWqLC1wz5SozRc(HqLsQ$+P z;p=YQ!R^9fqlL|TLT*p*yLM*X;aI|iWo??Kfysl8FVFPPhUyT#vRkvRFl1}QpcQ?4 zLRzUlv9kiY!-<@hRjf931-{?%jJo)8;M|Dv(+%tNhI3OIA6$JI^*Q#=*4V&)koGLI zUa9^#I9@Kh;;P7AU~RYSUbU@V!D^WC@*`H6Fl_6%9*fI%gS2szPG5MH0)|6p>b>aF z4YdZ!{{H{X=EV|@VZeU0jgl`(O zbM2AdF#L(r(zS*u&@!x!vHVOQ=wAQ$;`OHKFuByiOy`_5cu}FDvPuoK%ey}}itx(- zLl>7C*{%D)+SQ##xk`ILwZaj3&I8h6!+!k<$M0vtj_T)}f=~8>kac4>`EeQ04}ckY zrz>nIrRg%KYd3hdwBnJgmAbC09L+k1YCaumD2l|ivX{*O|Q>PR_)W2yM~==jKQ zGmhV?O=KLuJuEVgB{j^K;2_5;-5|%SRWQkCqZ3$|cuh&Swu3X5&ox?F*9K^-a@VfB zvxB2l)w1;!4sg561{=pF^`Tkcz1CYz?BUUihWkCj9l(is713mp9XRVpy{W2ofx5MVw=i5LpmuZ9-dl_Ff(2X$MPEV;5c8U<0N2 zO?`6J5uo3_#Tf5u_ORr^jJPtdt>IpmliQwDu!X73*VP|!u^x1&AJKYMcUx#^dsptJ za)gR2TamlxT0{B1wT>O!s)ESe(JH5<_As&D+Gv}4cCgX#b-7KUw!n`lbD_dpM+m>y zCGf~tTiD>yqDtA3_E2rX&R*K5*6?utj+-@a*}$CZ1M65u*g~T%hckQ!*g?ykZh`5i zY#_Je%*bUj3AnyF(sE;M8yN4DR_;UvN6=S|I9!18wO=lcwQ#Y8)%{Wj**0?mGhb8N zd094KUwT&KS|gpn`m&4DrbSM$$+6|6bMIq8F~WR8r3wxPg1qWw! 
zYmY(~Sg`VJ#qfAL$hW@yb5fc;tm$FgP5-eY)EZXx?fCQd5RkBML%6>!TyB)UrR94Y zFz8vf$Gm~DaBSzRgWU>k!F0f@GLld`xG~{Lk0E|`U>urru0{hln4$Km`Dk(@;GA}y z={4OQI@?z&b>T&0m{fc4j3(1P;Flh6Hy?9$gPu*(=J1Eq@HF$d*5ZO1pwHwhH3kDr zv+HMx~Sqa^4zdbUxI9Yvc$UcU@btw6z3+PFB2gkhh2DkE>@m)FWWU?aBqGDFVXo z)HrO0S&3s*yDROb)Z0k1>v9%?5z!h{Mpr|h0& z3!^-mrDyM}3)R}qJwAG!3tYM2Ic7|@9b9;^Gl&hx=kI;y)nkSWG;iC-ck)m>Xxt{S z|HfGK!`+2@Osl&>weG>?-ZmoO?QJ_Rc^Lxss&|o{`nrJA;-_AN_BVi>aSJxnj6Ixf zG|WVAumc>5UX)#`O9N;XP`au%%^Ap=DT@xo5MZd8rQKui1Pu(fuAceY1=>~|PnMqP z3~qz_K2%?IfsQlG8+RR0A6y1gWnU$@LifcZhuq#rfYB?n7fDB)A%5!fbyFxQklS;s zmM0vc;V%s*U&wTTm-Q}WR>n6DNAq?(e*Mx0o}6lRB9F3$xLmT0va$;t?^`+Xxq~(I zXw9n5y>x(awmUYid|(SMSxvKRNNquRY3p`vs55lkJz@7NcSo4>)9m=!5mKmxg2pPHrTH-N)a4osSv?+lSw4|f}damXxmad(r&F3|mA zss16e379{9N{oL87ijU2Oz_($h5N^<+&|+{AAC&bUbdTN4PFh-uj?>G2BwdE2S_{| zKt z=?FQw?76Zr4sdJqs2%0kIl|Ap_rZwz4Zg)Rio>IoL2*O>VN8SgI^jFl;bV?0NwRXoM2NrhCYlGDO5Lc!50Ql^o~ z305vA#q)52zEaMrH8LqdYor=hIFICcm6W6@ysZ-N%?QURMnfuO*a|g^M>K?ROrxOX zJjatVHNnt?N}Lo-D%HZcl}bUXXc?;z+lTXj!4BnBO6)(; zt|*mU#^7QQv|X8!P>Scta41x&Vr2LXg5%Vp?s;BD%LtB_votE75al2h3XMW0S850u z&&U+wc`A~|^W->l9c>t!DCm@vQlyHbNjWK3s0o?azvx0}+ANyBl98c_i07#^N*S(8 zK|_<#Xjr;yS-iUpC%&U^l7y~*@dTxmQE3zk7Uj^%rY4jcmQ%{mF44NgXUNnXnt>b@ z#?$x@v3=;MDq8RxshVSyy8b{bQ&BXpW@xpNlFM~^Rw%HoXz)D8QH)xAF2(YcoR(>L zsglFV{G$F8Y8uhAwGjAIF_fCEX{Em4JuGL4_E7`RceyLWjN@XMaNiF3NPc7tV*t+8Bu>E zCFPYGHAxd{8LL$2Y>ku?oKlKrf!C9SC>x_7Wx|R9Dal~4;B>l2D^bfeDprQ-K{FJd z%c$kJDnVGVpk~kqMA^_xI2B7%3UnDY8me#}D`!aTa0Q{_*wH)GN=~8RIq|vZ(GP10gP+>J19PX#2azZ>0mutwRQbNw+Ni?kz+sCO@QeH-& zm8p4|f)jmOMJjk1b_M#4f}wT#)8K^+p(Y8G15I7jvz*5M$N0lYSyDlhq8tPolT@iu zkf<6Jp|ex963qz>gJpz>sTQA$;fv>KDbLb^Vl?9Y&~0$(S5RZ`* zqpn8EvPJQQQsFf!3?JC@8l9h0EGl1vd6DAOyi#XtQVrUpj8tMEljDLI@wr0BFf57I zhPFi)=@~^p_r)MckH*Ge@Cl}t@>v{He2Sf@W3b~n-m>?R63R3=_4RSS88p*V(@k~|~!FJ>$Z zFj91DDTQ_-wiy`&$Fd4S#wj!yLv_BTVrYiv6g=h(ra2P?wYY8bY zh>FIdkzkY6jv5v&|(EI7W<7-$)p&*1w)gfUPOPBV(wL-_hLThF`bL&u~JTo zzJwOT5R8fuxsRMB} zDaGUEEXF6Do)v2JUnB#_Kal+Bbbx7E#c34iLp&~H-xg zrc|+{s)*OAS)>#~Tod?!C>w!spJCBao@oQ4+r2(4YA#ONu< z(4*vZwnhqJ7oC*VAf3cSC)j}oBf3&p7>a8~RWz;3DVRM-R502;>X9t64IV=5(Sm$ zA5u&V7&HlN2mVgy=NN7njHNWvX_8||@jRIVYX?@&UKVq8L_#s!Sx{YVx`7)8lc!kS}VcP%Rf7zQf~2D7EE zK0>8Ql~@I#wPF&|-H*~BLBRFPc&$Kf#QRAx6U#Z|K0@3;E+RZb$V*BL+~|rt8l2c> zg+|3_G+0BaRV0bAMmUdWNg>#&keVPl)^R;_J0YR4tVT++N}ZoeF=|LzWKGybSnG*; z!F8G%8j~;D4T0Jb+rp9>xeDz=hTIct1o1p2g-)nI70?Pf<^i$KX{=*7jPydigkC4= zfWw%K)SbZKf`M08zp-d$B!T{{LcOYV+)>IgY9w(=8OPy+b<>J8sek~q|um2#QO>1TSg&&!`vdrh%Q`Y*U~~q%COMFOpDqT+o#0B2<=LS`4huFC+dn;N-|K)HSJLDV<+3xVBoQVi`G02_>Rve;7lt;z3`>5d`dZ z@qR*nVmT@D5G8VQU92OpT%v{b_ec{NjatVeIEv(0F*YN|((x-adh|CuiN*#qa`71q z)~ys81ybv%vQP09y z0(!m-twTjItf&K0h$FllhgGm>Lw$=n6=JT6k|8;!(NRQup>doA87dx^qHpT#1xX`~ zwTm2sjG7~K&%j`VYQX@Ay2JPmX#QWSy~uf673Yz1diAshh@+pPVSmc3f zWIbqb7}_aO_ZY=d(?S5msDKPecm|RyYzt-w4W=Ge7sIedM2d{XA*sL~(T(R|Uou$0 z%5lpIOc%O1iK$T-p2IqlQlgEB_d|Atp&u zj%g8`N<%4iF%~_X#+ZYWNfipYuAWiIr7E;K9`bO=KoRw^vkTnb&x!SMhbB*5V-S}xFfQO`(Wap(+39#Jm2O2?&6GTn&;}98*$ZEsbGbDz*;?9ty1uzoxIG9T#n-#A`289-jR)dX1U5f3)ID)|xM`clF>{ihZSS5j#GnybJ z=#deRVXcazL0E-iZlKYX#Wv&pCy#N26AkqkcV|+OIWI5%$1?%;z%>EtC?sOQc(?dEh!9- z=*Ac|m>4idV5iA(5nZ&&#v0aQ4QrRxS=im;yZQ+ykUm*nR`?rW zzC^bd%x3I8LrKw}P@e!F?=XqehXcMLK`kWKxM@!!hbse9wVd^bU$trK6XNsXldye~ z#KT<@pU}m@QQ{(rj>p|R(x_O8y;2gDkgB!+aHeNqlMny4PDx0NW~?Rtff5J%C@wZS zHLd|VCPpikcR|`mxX3Z_xZn`2#)Za+jEs_@lnSPSHr?Lw!(E=B2)dRV;V~BX7z(GO z;~hRWK_Q8Yj>q*s_RbQaox1-gr{I1i5_?QGQWxC9M@ocAr47ZO_V$hv&(NY%y@Of> zhB=@lcr00B9hQ)SLbae$m?$pU+Q?8;kdYz26?2Uf4jLJXdJ!)c^kFZs1{Z>KA=F`n zs-c`{NGRIdOHc{AQ`}0NLMlp}Vr1wS64WYK;?+(F 
zy$SK_cYCJ$->5I|C@MCVi;w*4ZSPYxhzesUd^AgG!-c)#GQx#^lVF@LQlbl;kdTbo z(S^a78s&n+hE!^7%J&%D`@bvEVZaYfh)wv*P3@!ZgvWi43OxQs8w%D13yDt}_Tyig z?0@5CDcV#{#}YngOLhMfWAgqRHR#C-suO7A-^d$&Uk3PG0-%bU{fR|>O2S(FjcW8N z3K#+ji~4a@`PpPz6zsyqVE2hWf5mEl`4HUxYzogLU_4G{IEUoaIFZRaOC2Q?l6hY* ziI1@5h_yt=<9Rw<+=Apw99|&k>V{JMk$#QCs~Zw>@syGjz666eeEC&TiK1#H`Hz$< z#UhIGq1~iNe37G~2PPK_i5-#>&KDj0x}*g0y^@LnOv_Qp3Gwdst>R;_42-ugA$|$z zM|^Ocul4gE5%uAgKNTJw&vNNPA7Byc=;p4_xrR{2km5f-CO)C65f3?VLaYpr zW|5~#;!|T|B?(%|$Aj_Mm9e;JN@8zY#3O8V*L}F`;~7Ohmt4d!cx^P&k=P7ddq+pZ zNZe>MQ9@%OS)xzY{m=D@hdUt`)&A|yBXwOS7NNS1{O|SN$1#Bty|?lId+*DxM<_q~ zp6XY=7m^xB$0E-XocN=s{iW{r*$!O$KiGll@AN)G_Obtd$k_SxyOQ~ToXf*FCMqBy z`9m5{!O!v0SRI7N;?b|?vm!e3J^4&1>~Zfme1*xe#iJx8^0=f#*g5g{2;QP-thI=v ziMjmG688&|ZTNqG{34Yp>5o5ZJU#za%Pf8y@P{boC;VFXG9ZLY&WL9|I3`kU;p5j$ z^!piR2@@^hd!v4Q3snejQ@&OvX&OrZD2e`24euXQ!zxhu-`6=M?5||-H~a^X&Og}# zv1HKM-M32yLi3Td{HV6-_92A)`7cdBjJSM!dn$~%;7e6&(popx^6{1F*CVB(C`Nqo zjp@fQBJ!J}w>jb)MBPYWae?e^pAb(cP+ArT0bIIpvG@fsoJ%65P04Ucle;QPR@1rx z1F?vACrfxhi3n`?U(%oKpQb;v&!m+(|P@MndP5~f-*+5MMXU0N*WhuI~6mYhW|#C2ir&kL2( z68b9{Dj$C^m#?|{A1RSPB${GgUOcoPnUD|}%L%1yJjbLI4+QIuh8G?Ev$R=4>XKGt z_;Fs+*TnuqWk~Vdr(&_i;r`+|I=TeQP*}y|J8(8!>}Kv86^ZQo^Q@4Hb+M_0C6ukfFrPdd{yx6b5 zIVkjdg#L_C;~c!04|L@(m}H1^5o0)EQdewBbYg5Yr*)QmWX>XO{ylGfNqHE70wXad zzg~ZrJf%b~`06CH?;5uCa3^)cw!aa~1LrS(90PYIOTJVWJ{A9WMil!37@%R^W#P8H`3872oh;v^TQ#WZIk4>Ob{$c@deh~2UOp*AHdNMjbK^vZigb-&!{?=uR3e9hl#6RAZ&SZ(S z=S=>kG2&MNL7aN}M`Ve(oGu=U z`_5E5@+hLD@gbpzyz|#XYJc|}ijN7bg!}*K)`%LH ztG-VBN_5Kacn`S#f6yBLz(CHgjrK>GeDL)u^*_qbU*zq`NJ{XmFH#Uim=H<6@gC_{ zMw=K*rSJ*bICp!DE82u;w&XOllGN)*f5|I|{*9Wvgy`QFME}U7iyzf)f8=@-JLdai zZHdyLIGtd<`J+#h{PWH8$Dbzoqrda~^ZVp?Oq2X4Otkx7>_AfbT|+i*?j`5B{-{C! zpFpy|QSu2%yTrWu?=1O#c)^K=@^1-F{1L%_=rsR(g#LV*{|EGIah?9985rar>enB0 z_NXAsx9W`#;+E*ql7+=TV-grjD*sk_@hwUJ84H;NNq=8x`ggqP6XgEJG(W-I$&X_~ z-qT-K`l=ZA;t zwoS%;nsD#a4+nJ{14|r4lJDgc*2c#t@ZJrinbPRC(-SUCH(GRr&0Vy zO8j!FuV}RBguhIqA^$cT#ZCP2OF6%y(V`Q+u2J8xwywAi`;Xe`zfP<8ov5hRuV}UC zgs*GWC#-3!&?aGjRI5S%He1E-wZSh4`HEJHPWZZ3z1q9`{!x{-{#U6~A$7&?fWU9` z`;tmOp70HoM#l@QRR5C@@UPP>e)l4Nwc1xSTXe!-uGxQAa7B~E@7YWXgs*E=ieFLr zA86I?LbCzCs3lq zl0+p56aE=3agw+Y5}yG4U9`j}u!&egMT=LYhR5M9!s0&@6STs1xJ5gMTl?ZJyAs^i zUmMBcp2kdcY;=n7)f-|}Wn}oJqAE-n$W=5XWv=*TRz<6!OO#Z?ONJ7^HZsH_%E-{c zIvT%|CR7p_6ebC56%Zf^3XxdrTK36t$D;GC`J(n^e+Z5v{(?&X{4cP%pZ)^x_pj|arM=_8k3Hw~clVHWiu6-m|1f5Riuv=+ z=xVK_GtA3zpgl+lr&FF9FIiew{!ME+M`1N+5%Ojusx>&&fn4uu+aQxTG zhL-Su?OhLS6W1BPbN*o3bPc6nc=j60NF=qKS2=s@Pg}`@VN~ z{)5lX)LBl3d`j$l?|bk2-uu1Z`@VN)`(44Oe+v6DrvGAH)md=;i{vvK;K9P#$tR2R zM!uVFd7J>!20&V}11XD(KxVsOQ-HJ}<+8g&MZEAp7QlzXFEsGmSO$3FxEiz4#CZb* zu1FW{#VC*jenAPCm<{mo^X!~J5>n(dCCZ$P{rA$oYVTu$&7>MO*}OULcW?S$rfg(jtqC zK+Xpwgymd75^*uuxLUG)@JlWcfpS))T$W&yB3^h#7UVvH8TP?28BFT8`UD6l331*4 z!OsY9+P=X(^hJDYS7?9-I9u%olTFnnn-k(i;YkWW66i53F-?Yg{E9;6y^b{T|Jwve zYka}_R+tZw7N&1-1GFHVw7?I*1%ptZWs;_d^T)}>eRCH!w*gEn28%`YD_iiBv7|U} znB0svcO@Y0xq+0$MIdu5$_bDrWMr1e0gP0{MIh$|5-4X?{oIC3ZIdi60+|Cya|3K7 z3O=&lsOlpvsp2A#^8#tl4J3*e#n)OF@#O|#Bsf8WROSklFj5g0895&#!3pLB62vBO zp+z9)15yA+B2rlxsfdd}&I=?`PW7AaWNHhtc;SI8z*QURu~{(KO!ZlrMa20G8h9&h zQTu<`wgeo!y)FwTDdL4E$)X3+4nSsE+98m#xCmrUw*^RngUJGh|mp2ieXVV+*HBDj@&EmGB zdDwOoHpjSFN^8&2l$q9)l$Mn*(JsB^){5Icb9?2oJC@(cuq!y-%DeP;fA(`#pRfAD z7w=icf9c-)zFd8OjiJ_Pf_1LNT2~KOb`1|axO&an#y;V$0UaZBx@*c0hp{ zZvbAu>J5DD>)-gMH?Z?tk372T+q<`XXU}){es9Y@5IweK>-V<}{JbJ` zPaHhNROqZ!d8Ljk+ddJD?O-VOVk!I1Z_4nWKYX%`r;2ogeQQ&GDp$B&ZYhc>WsHn* z1DWL1MRw=+~m8$R#4AxxLP$jhHO&Scy!(R~##>b+g@!+PR znV`giGc^D1qmO6f8$1ir038P+p{YeO2<+Z5m(lmY`&0B5R7$xYbX>7 z>bOc@z0Vr**4r&sv%k({GNu$~nxZ2k@X+&3g3!TN>XUsgkE!Lih5q 
zSTtZX)dVMM{7^LJjntTQ+%n8HLH0zv2KF2quQ8?Q*KAy`LL! zK?P}lEIKhBj0JRDS~8`WcF9M>&%|O|*8-rg{`wySumAIvwThV5aWsYhm~pXGf>>ec zPN;C?J>%b(zPg!In1iBZgwFoz{lOsDF67)ir;_q zr(dMY-?gkccIkz?QU3CM-B11X-;p$VCG2Tuv<0$sX<9zt zCor?h`C%(_R5`y^&veN}-nBC`^69D7$(pl`%tLba5eG9ZpI#ke4k~ueZD$(f?29{T z<_(RU_~RVSyj&_LPMt|f?0Y9AvG<&Yc~Q}I?NKdreyLpNcL%i0q*DKomN`LM4bJT2 zJJ(fsW3lMeK-@nV9Pu(dxVTjmSBiVAMfNSzKL`kbo2po);k3R9*o%C;t$7u-L9b*k zf!&^Q-lpc21V<-R(^_;&P0>IwHM4vqI6fFnt5+6{h10cHM#l$Wdi#R|aqq}jB$!sb zG!~5{RNev&yj|Ire^U@1Cl!cG%`OW<|6+=1ET6T-hy({D?Ym3Z-6i}727X_S-ksszxr`V9s^^I6U5B;IHi|!DPO4UzBx$}*v)5^U zpTIwB(z}yEfK!hXp!XEf`!?wHQ2a}ENxjHMN#{Wgdr-q4w(`dJy^s9H??o{2ObMs-$@%X_7|d)N201127^tMk_8Oq=DW8 z$8oUt{+Nu&-x`y8GiIM;VlU0^rTN)L{_VATw-oH;m{3TKEPK$==u@TgrA?gl4tz|kXe%F|uxkX-o-8k1J&DX2h>(%_#A%0&o z&|L|lTq3gCRPw;eMS{R(umW^{B$jNn# z|M{bIvTfP@i=^ptC40G&2M^DmjX^V&(4AOy?quiadsVW*=!q zW|fr&r&X38#B63Zm_03ZS2BD4a*Ek(4py=`TgA>+@emyO)928pvU=gP%94iI%*+Ox zuS;G13F+!@U(+Pnkm|N2JEsEdsQ?dgm)~=R#`$)`>W9l%B7uRcQm4QGc)*t)(TKQegcMU-s?bEa`D$6$EcfmRjGb0$Q}hV z9J~Kn;)X8$D1{g*mu9C9Y56w~YSk{!*wduF7eJAXVn6(32`mzLEuP$o%UP~Mbh$S6 zHApKy?k~62;$fO<5k;{Z{_$8Zic?}!SZv2Eco#(n$%iDQMX?Q2kfppsq62c8<4{f9 zF2cEUsu?m{Fx`sjjbW(PWD`NxAXaO|wAhbnvk0c!!=TR5gDq&mbSGLMmc;;`f;Kp? zjbglOrnYvPG!>2F;c#?=A^C`+e5xO_XME<~9sNQN=o_}x*4i*rYBZj7xB2l68UlKt zUsIAu0YNq=nO@f(@tYl!zR}J|j}WQzNQL@qL$$T7s02Goj+R!bsU*>catD@O*J|>( z+pU<}-DS?%{tlE+#i68QGSF%_`bS!aJ-zlZkLZY_J^}(!irQM5NxCCe4gG^O84{?9 zU2}~2%!95#U$4p2<1&FBqqomK(IfN@`vmL5{;8&DR(+xmNRg^9s_5(XIa?=uBasP@ zQB9{P){*`}aZv=|$>9*E@!jD>s=*14Jsv)Mfi_uO?!idExtFxhwGPJ2?HYqNZuazb zPWnb%Gh)*)_K%=0;SCkLJ4kwP5|kqR$Wu6+nCpzd!!n_++hGbgds_nT_Q3$;`68W> ze!)KBS%=hdL~EgL_VIJ{qntqP?A_rXvF||rfOj8~zd0;+A#dPlljjHF*E-C>qq~Eo z;mnDRbA<7o97_M8z9@Rs`rA2!{U!J%_|)B_@EsUS#(4lfaMJA@H;!3eS#~3=EMz~L6XDXdLk##dHSpZ%>UfwJBXYN z9q?n^*Vj7X!NG;W2fQ|lqD>y+u;hebFyT2Z4C?fL$(*2{o!(O4+xT#0N z7xeYW%5LNjPFmU3;aLOLqS76hqwaZ46aniK^aRYU}(sn`oxu-CKq zE}p&je)ir){hnmO(-Y5o-}Ae>|GnNV{x&m{$;@w_Hj`wwOB+AZOwVkq06$)48G2?~ zwY0Qe?G2v4XK9gzN$}KTdl8RM|L|cSVxK;P>`D<=-&x~x0P%3m^kF|EPA=;bbP%z1 z{aRf)G&F0`i^KU#ggpB9aqc#6d6C75yRS#iN8FVti7Y`B#ctuTzuBQ!?)XKna~y}0 z-517lII2~*z`XaU+4CV?%>o@OPDiH_1ph8uM{;U1-S;Q+M$t*RZ#CPG7mTQVEvaFmNwMHyS zKHOf0n4ats1&FEV4JNxH5C6yNu3(~mv1hzXdI-SzXEZt&yGzi5gpei-dTlcc|2!<)$~TIOsmvRHS24Ubisu`=#B^1x;vjNFg-tZ&2F9Lje(ZQ-z5&Pm4uh&BU9bmXw(l9By5JT)MVK8OhS zqXRi?IHy%rk6`V3f=jnPDPX4F3;}3@-cx-n| zdWJjRJT;HUiEW50+_7@Zn%H+$i!I!K%G|A0t=K|!sL^!}>nyGEgu@L-s|4sF+CP!U zawtjg9L8a{L3J-yLG;MFXJCl8pt#i8o9Xf=nvM#*dW^dr4D zaTpsKRLWtsT9P9iHg&d>qkHn#0KIHk8M_i9lRc ze}hFNVvyY6p%mLWJDa)biTFO`&BZ`Ovkfy&DiIYuKO7v0c;i*=J&A}5$K`IxM)bOS ze$5m_(}7;gN)V~Tz2~n%l*WxonAEMvq8;=IpNAL{v5H-dm~9%}cL!oE2g{%%i0xOn z`d>oq6FSrLKH{(%HQ_Dd2g6W_QTHMXot#M00&%hJ{k$PLsQ1$Y@-h$ucC=5MfT$Rq zH?$bBahiL~3dC%YB5EsQ^Tmxq4AZO05ciCEpHSM+s67kTb>B~xs1<>1DB0}+pQk64w6=yS$@Q8uFO``e4AAU506 zuDAs8b@wxq)^NvT60>(9HW^=(b__8sxPf&)cTDyBB=3teZWjI3%&^*g~e&y1I9g+q|*UTE;)Q zb!NxBaWf!E$EdFR{G3^C8MPdG?3|xlGVxQUp={f zZqL#gTN1T&jq4dAQ(9(vR$?s!Q!7io*wAp+Ov^k&&%DqUKekr|wp#TpD3$qFB~c^qBF&+6|_REmeW#^Sg4ecB*CSz1S?d?X^})q3Y8QssflT!5rgDmp(=qQ zPC0;$6?AaNIxNjBKOJnY(t6!$eo@lW^tTtm`ZDyib@pm&?bX(f*D?Dabo2$u});|@vWIwWMix27ld|Up&|WZJ4C1>zt*topOv!I=g+RRWd`AFY`;jA zhQjUI=G(PxyKwFLN)6bsncDdAGBf0Y_w z*3nVveq>)iGCHI}Z%^x(pVqP6%IobPsuX){V`lT|P^jf$F{~b5E;c%j{n`lVahaC- zd|6G)wA1ypbbb{H)5zFh)C?sX93u~p3S-NMV7j*XbZuK1Hw2$of$b2O;fz;@=e-)% z$y)P>-l~0C$nvuzO{-9=KJug&9T`?Gp{umbS83a};+pZ2G)X`HtOox}LGoMX$OopSyK{=hrjfRu?_v&A0k_I_5 z-M2R1odgY&s6w9yv*2;yh@SJW&4K5$kHimelLBq)y{aaiJQ^Mkr{8aNOog_yO1F%5 z%7DlflAbGLN5jLg;5j|Er@^5aRhtdY&VrXdKM!!NH3oY1&wBVEZUO{vJ~G84DHSq1 
z6g@9HodE&+@68zxfa{a69YP5p@o%^qR0Mvt=PfOl{l1Vcp1e2M3o(eA8jox~BWLzZ(lvt~V0jDoKL+(H-Mo2r_|QW|J#8 znF`0dJZ^ZxE(MEA(qQIHU(@HeMuU4N;{n&hQsCX* z#F__El3`O$=-y+Ub79{yH^tn``LNgc=KGV`X|P~;^QZdeX)vYcjnn;#(%@mqn~(`p zlOS!<&(`N&i~?T+@6?(hM z_QYO+aa+&vGvKfhPn!YCCit}Gw#e<>3N2|i?Re3J#oG9S&) z&Ci5gedsa${8E6pTzuznhb)Nu;D0EIFT!K`4+Ykgu2w5ng_(&DU9pfe_GuB!8R2wSk~rkh;> z^znXj_n0~l8djN+cVkUH_&7B0?RYdFUh1v0)eoKkGk3S}d9iv7SUzuRFI~F;bfe?I zepUwb$(R^vr~f-Hcu)#HE8vgPHd{rQ>1AsP@Zl_>X}BQMam($$bO9_op{VVuJreZx3eU?{PK4K4&l?eeV<7ZmtHR9<(xH7&$)sEQ z6G12a*_-Q-1=49Y<6d7&hgRq0*2X#$;MLJXj`!b;0oRzSlkas)hfb^8PLro6!SlB- zdbyP)!!M6&>}ujL2E6Rx=Z*4o7 z!twF%n%Vcv06|&lhSLp3LDo;!eYSMSho2lw?#;WE1kH?yGLQL5kP`eMrSSYPc=}V# z!!Oz=LAQE~A0*!$2EF&sQOYMK!u<5)Q)N^ttg99&+xcV!c*H49>`YRj^OTVJ&e91G zaw|F0@JJ>^+peLF9%ewT5e*hEQBQyX)0pLJ?=FEoBT|3a^3xQUc~<(arS2@q{b|mm z>uy)UretK^zLIQs_w$LF2_0v`rLfLZPe~FX>v*$k^EM5K4zqR^8kU`es}SJRICVal zn7;INb=(hoyZT0sB&R^g%f`E}ho^vf{V1nb)n-6!jbm@?oksbOsk(LL+wrjaopF7Y zT{<-I^IqiAemrCj^eCHgI1@6ePOmfQb_Trac)m))v=?-GQ8eh-gLJ5^ zJ5KDLJ09HM_8#J}C<~msw11F2BnMPk3q~#Pm;;T9+s()ojt2SlYU8hWn*zfHKXnXM zWCr=MvqHiw9;x-O&r$j#g5JLUldeM}!Bv&Ehr0;<5~n2>pe*K}Z{yyS(r ziVpm8=Z?7gLv2W3-~H#bC~$>!B~xP8`;=Q8~%{r1Go zek9xIQM@m#?<3j5SFP_YOL-!DDvQpYWcpmz?U7nH?0Rk3G%vi-WTOYNCX*#IEIgmf ztoD1=ei!>nR#sSLVsn#cvK|dyK77*ivFuuJtp#1izL3qZJKbm8il?$Crb}0bXFrkY zb=`XEUf@I7p~a(F;?Q&1w7RozM@@SsBd>Lwuw(Qi*_de3PqpT$Y}(rDX|>lsm7ULG zTD8P}M}-F--Y9-53m}hPIs4O7+43Z(*;M#l+3bihO&`mj$ku5kojz}NU$#_8!`r&g zWc?P*TDLX+xvXB$hG{EKJ(h(m$&4--_E6R-A-Bbr)AwbLc{F$_p31h3aFed?{#1s6 z%6UEfd>N`3R*15tVZ}u?AkO;mBw^4QSG2!B=?8KH{N#P`!}&8L5y zmw!n&Q|S<3h<896KNTUc07s>Krfn0z#>9q3gtX-+-Zp}W$XJ0oGA=?T!!kJI5wOwG zkVtp&fCDG5|;j1rS0A_XdWtXv_FVFl4F2COl$7!4bk)iJBXOG%FIr&He? zU3=M|tJ<<+q2Q~{$JX=nH|;uFs$U~^eZDsuCX@8clk{xwanlINU(+HSh9oQec$py| z+zP_2+hd~=A4W)Fm8wDR%xdkxJX}Ae^)NqJ_+;Iku(2Xo zylLa(LmN860;X$T&8{th&>yGRWvPP1%Oef*TepV2Yx)jMOZJ7+1-o-@*a(50MozL6 zcL1xqJ<1wx1-RDzz5n?UF0i@Z>Wi`Atzo^*Qloi$F-$6Qx{%2R?^{FXftO3Qp1H%6QPxHibzEVci%IqK zh3%mJz6et1cyp+lC+OO~l!W97?S{{7=ntJIt*E-bJq-&@x?j0Bt~0#eaZUL`t2eZz zTs!z4kb?7N7Xxy$ADkO+GRxD90If5`ill2IXgNG`(zBS3uxZt%I~yBxfd}uNwW^vX zh7JSTI6jtkhGlO|?CU*s0iR79XJ%}40Z+0ltySmZwL;L@r7pgQg`J2}h+ZcNO* zG^ErM40F2FGc;=ho_i&ZckYRR?i#7%w%G-|mpR4Qn6`xT&)*gFZrc*x#`mb{Ugi!V zeH-m#joN{9{D!rGy+~;EGCFqZd?Dz?)l8UY!axwNTjlK%!L!IN_AQ!vLY8ymq+EY{ zuxhco7t@l02ma5WKRxFTTayL{)(Mw@+tLJ4_XC~biQ0DPt49HFF}D3FukO3-{#Ut zNDZ%BFky8RRO{os&8dA9tQ_C0Z=;>DuxI|+%u~}Gf+%^asU~ zvylfRec@JF&e#j3-tc;Y;=E_)Nbt<-biQ|j8Xk(9p78$Q1NLK�EBrgt6A_=#x96$AG+0*iX=S3P=?(*7*g30$W0nKqriJ~?&9i-B`A;?~YDIsrw;DD1?!vwx z-{;n0dw*}pwX$e-CVv2AzcBncSnLUx{3LH5YPmvnHfj60C9cpft$82O@D8xM%9f)C zqg+9@tJMSj#T`NG__QtSWdZO@>FLdP+O`MFK>f#!7WzRC9jilUy#_*{QL+sUECj8% zaN9MsE8O$$*`Y(7FnFO#^y}O%0+#F;wRc7zA1KNbx6ii@2UwFcM83B#oW3|in&IFE zcAE>u`+N6;uBtNwURip=CU@E5!CO3G>8<*03_i32W723r;8}m@K08}KDZ>XoocnOT zxOZomvZJ=wu098#r8)j^d2^3u3F`)ctKq%gd-DR}?(3)1-t_Aa zwS7*`Idq1F!XwZ7O?%e`ZpE5x@x2lOMpvKj7VTET?2x3_2TQuZihVDfCvE5t!F2l4 z!ZS+P@vg|c?)@&XVWr~bbhl7=={*1Dyf|N|yP)Zw>+`~)YW z71OALvgee7>mmkldOvNWtP|GZDc^<+b#mbQ+rp1wg9LnGs= z)vDJpF|Apvb{#Ww3rnlI*7fQ)Xeel8)3`}fTRVG)X3ZU)oLdkiB@~GzG$VCs*{Zb+ zT-&sDbN6WH*}g-^PF|h8eY*Ji`3D3Bb?w%@N6%ip`vmuuE0ii$9nvqf|A4UYh{&jc z(J`@cg9gVZ3>i9X_=v=$5`HGdRR{ykS?Yi|FHg4LyW$U)>J9h5cy~nH~Sh5O@jK?+f zC|pT%uPkgTu5Vsk-*&bYI(y4Xm@$vHuOyYfq4BxLUcEj+w`_Gb?Cap=HfePinCd$B zVVxJ9VZ*(iC-e4phOTkz#PcTmfn@)O^br?)Vcv7w+0uem;PJ>VLB7BR0!j-mcIin& zVZp_V`bz_0&dymo?wWUk-FEGAYrpb`vqPhI=S*@3qigR5+K04+Cyh+sllOe#xq(1r zG(rZZ=LdERDCz{ute@sw?GOYvR_O;#KOF#IX-dT|385?F51@eHNb<1;K%8=8N3k`-0i(I}Y}5J43H! 
z_k=^`zEH(&q1NHHz7RksD!SYVgw5IAk}qU?!+_OY(_EA;;5D@Nxv`sk;K}yteJ{`P zht>=B%o=##9}4$Bd@fqz11l`XbZgSM3+TmDFZ&+(sm)$MR*yaa~XT;<-(e;MiOzW3)X?HkqVoSPVau0a) zHl&~Gj4zP4E@X^$_Jw5yyWWhm2n5sqwcfW5?gCazY9y^)*cFoA7p`g!hA z#tW7{&e}fBt{VhQfa`~D^oEUtB8FKX?E<4GnsiHh%W6ZS*NC;^`o5|+D7?<=V`araruFexWV~Z zq{tsK+nsOOXpk?|%kE-F-U|Xls?cK?%Kz4ePHTo?`Q0Zxoo6iV0@M0u%OaoS8fwYT zvukR~VPL<@LSOwrh)uLks?{bM+TAR&Io>x7oZcm5rs=0bs^htPzm$#y?UUPHXEYuO zXLYm=nWSgJ+#YVdTet;7%l4O_-ab?eqf_e~pY=;uupa0@C}I}?kz8kX$lSgl2{c+X zJKrB7GbdHwejCZp#h*s&t0N z4NLDgaT@~5O5fDHI#3Rq7H7SDdDahJ4Yh08dSe_Utfu{ko$3j8M!G{czxIN8Ez+mi ztRD!yT#5YpO_fmk%D}_DeIN|Zvg&m7iW^u?ZX+Jk5+m+$7VqRE`ohFh;?CDmZjRaS zyQH`?6dXyt*3H)RhxxFd`B(q8x9`?$yg4lh`p?02soELk3VY^J9Gdf%}Ny|=n9Xx1c}f=-BGC$^ys zs)o9)s%8gJl7G3>E5!vyI!!X(W-QQSQziR*nEPgE1Z9; zWuDc&DNqmAWTubV54rOK=2ZEm2=?{rKc(*ZJ@Dpf&x;dc$3n|z?K=>@?%?gXkPdt_YJV{FNw3}g&O9GDc|3Zu#imd& zIx@d+SK}bKxpplpvl$4@e=?bTX0rmSZ8kj8>!>e$IILRsf{BJkes}g;-;D>ULG6C= zAJ9G;?wGjkKps4vBG_nPRt%X@OXn6~c`K`Mfxsvlk{bu5%i1X+Ms&1&mAzdcp z{x9~}*66W4;#p-Jx5V*npZz5_oX`F;wGy8_d{<^l3nm>d4?JBzaOVg}vduVG-(fhk z+?r77TfR>!M%;BdIJZeMAe znenhUazwKI@)Ve9V03or`vlNCab`~4<%2;weC*s6=aa#>XiiAl+hK5X=@_j9NCCYJ zx78+A$#BH3>h{rd^Wk*odL#3@41>}^&-2WdCd2(SY2>jgcz+|@UU<3daJYWDW9J@` zDKL7-1iK4q!@;f5t_2z4iLhc|yS(S;l3-@*15>Pi8VVDedCLrU4hNh1?^c?4CPRB! z(AL>+hQpH=wa;EDMZR{G<+Sw~0R=yeu)df&1V%OM)OSU60*ra*-80fO6(+TCZw(W%0l?zcjj5k0H=^?dhycmvkTl zW_#Rp90sdanVZhKo&v5C-Igit@vv=9Vq@j9creYdZFRY2Dp-D4Tk^DJ4osulTz)it zI2gvh5EkSPhwV4a^E$2^1~YwDnQlBi2wE-p(CM%o?@xA%Kl!lb2xz{mj@H1PL!o5(pPURhsHyqS4yP!vXO9q5Y=|$kwH-J z)P*)<&kcsuQt|i;!&6{xglgWME5q>t%h_!Rb~prOS!+cG4Fgp{C!o!hI zhOF$h$*Y2sVBPyxxn1^VfrU#>arWJzU|B06Cua5#ID4gf%X-ffpm4y#mNjjLLLYzC z;bpgm!oXujS8q*C0mtHlVGs3EA>FmKor6^(JdSBKDfd)1SctAQw#56qik=hScZy1c zjfx(<6f4KTXN>??f%~I9 z2H8+3PB4G{AsMJ8ou4i7-85=<0}egW=Ar*z2l|gCViw(@DcRr9hh7Q-|@*lEAIWmO!KP z`7pOzqb6CFDe&T4*v-h@$k!?cH@?+*wz{(1ON z7<*{a<7&rZVUqVC2XfI+*zerM;OK@d7(L2A+xd7h^gdpl-C~=F?WOkS4a$by!6PQm z$V>$jrH++CmI7}_E!%uHIUA1Lp6u~KG8(S#Jh?s=@4M0&fyRyQWW)W8l*z|-#lVf2 zdvSR~;-If@q~-XqL6G7qoxH(w2-FE!ygF|L%Axfy(n4%&ekE{gKW+>-k=i4t7^c9*8OM9MElY$D(_E*s zI^)3X)yn%1F5`0pN2kx2rb>kMn~KREp4o8P{Jqf{PrUzsX+YxC@Fd8Q2B)1(9SNqP z;$w4HB!StTMO#lT$cNa%wSor1RCra>ZOZc5$xy|4)XuX5#zCR=p_;CGBcY+rhHirV zR5)Wjq;Hie1t8-MXBxC=|GZHllLQls^4?7oje(<{*Y^CpI0d?On9zMrR1(a#be}k^ zXBG@@l67+3+GN<~vbmetXrC5)tJh2dZcI0tU%II|1cvsc)cYEMX`DK+?};W* zGpOJ8CI=kh;g;KX)pQdSMV)Oq`6N)MY8Gwtv4e}VqIXS%y>Vfu=9r?1)&aNH|KN59w^ z`kL#aRVLA-|Gd24B1~T!@@nu0C%E`zT9y2bj!^YR^wa01jUm60U5CChN0{DR7~ZKn z0qgJCUf-@m!p#*CCTYD07?~bEZ&XKTIC*!jvq;}QgfJj>vh1MOgqSab9it^b6a?` z=R$j%*9{>w{$=BP3k5*A93QeexH;@38~oxNWeY1Jl&N<%H-n{#RHHUtPS9hb!KSqa zw$Ol0x;5Jud9TvKU__KXWXEN!Ng?e)E9gbLxO8V&Gx&5BU2`F9Ol`f(-naz_yC1Jn zuW@r|mA&ZN_`!CtvP<*C!=_E3Q-2?|$qQ%5SuVf-q=_^1at?J=HEj-!3#xZ*SKI=2 zyeSzlX@nz~_Ovjr+uR;*-Z@ejcC7`p@@(rgVSQ65n%=A1nJ`DNyYP#N{x39WGn?D? zJZ%SU{PN6pXS+i7M7KQK1I}=wtnHaOU7Ns?s=n4e8xv6G*8Cz{t1F~Wc##;v+JLpK zpEpc*fVr`2BP|o`;6UI##i9ev;N0P`6U&T6V7;LBb^9$suq#tH4*Iz<^ou@l_95PJ z5jiYqd@qTB{8c06Vfjwr`ux%@9lUX{JNL}(8M%(I>4;zCK5qtUK6o*^Wp5`)E={y~ zb(RGAmP{?SXA8LbTI#UHw16F9P$f`ZhlJ(rxnAcUw1A?)566r7hS zUR($#;kf?6rC}}vEKnHX%!+_lQ^OB#9!kOPQ*JY7csql2%ZY{c3n}RSRI${eIsq%% zn%QNc{+#pNqp!1zgdQK7d-v+&3|CD-dRyBOy7z8$}>U(bewerKca%};Fs!|qKG?jeORse|>EeM1RY+%@rmM_*^~A_T>)oSdP_(#$%I zhqi#c_YRhD+!c&0wjb(>)6R~=cMm%K)ETndDg^V?B&=h{g&ua7fPT1b|KKtbj!dt! 
z@Z4n*&NkaSHPDEFU=v@DCiMu|Rc-Y{pAi%s6Fqu3zoigb7Pze|J}QG*PZAd;j%@=i zC;7$fzS|bo3RuI&)7rq`$bJJSymW=0wn2g32im~8h-dN7rnG~@1)d`&I=2A_a&J!D zK@UhCW@J{c4*|vFdepzCShuR(7PQk2kiXD7Luu>Zw7a50kunx zy^TIifF$x!?btA9&|iGUY0x?Xo_M9-TCtmegCWzMt;GbqR>ZvD5bq2PBMn@821;Pb z;h!7&%qC#j*cyiOEGW3C?-SHveGAY_a^HSgLBi{OIxYc6Nw}ijNV5D9ma9Ib#seE=^3xB|@4|$fc}U%qVCsO+r$v zil9ifl%Ujr4SJ+8oMQCNrvRpScR0LM5Kb2Qc{J4;<-{OB}$Q+B^64B zrU@Zm9}S}Mf~#H>;-r6hbgf>f#qhE_>gl|qGYb8u}IBGE!p zN)vLiQbaI3FLFvOrIkWT$Vy4IT+Z`CGDx68BoQgZN;RS3PsGql?6XR!lF(utX|6s7 ziBZc{YKBqcSP6MKPz%*+k(v@pS%OiiG}lm|1W=@kVMsY%T*Il5q(w@CphPMbtE2d_ zq)B90#1J&ek`fie^P-}WVk8)al$A&nBA$B^g_Bar385PAVQKotC|HysLV}~KRHIOH zsp%U_Q%Z$MNHYxH zSCcA8t{jTCL?umLl zxlkykB^sSkD;O!y9)<5rDEMoj(u-9nRA`J8DslO{7$Qn8RH1sX1VIQze0@@sACg9% zSd?dcOM`0*%7I#`l!|c#)U=f4`>T*+kC1K27EyV#qQn~sl z78RXAvqUK6Dnia*LoQY*aJU&-BB8OJd^v0%zGcFSX>=AiqI@|5wU?pQ3N&wGu}Z@M zf>0q93016=rLdpeH8{o-aO`P~ZD$myXA;hsN<=DjCOlU{ z5ksLxP)aFQE@b%rs#TO!MJSM126bJj@jXg3x@xo#A{-D}q&XKYoRa2*2o0}F!nZ|? z`az*AsA!VJ@;uKX5%x>Wh*g|*mS|*%Wl2iGNtFoAG|g)4l?dfo!YPqhgf@%62A0Pm zQcFcRFoZ&5C#3{~hMq+Wi&h!^Bi9e4g{9DJ3q_Prj`oS~8@dyP6s@!f4KX&GOGEQ2 zM4=EN2|^XE*7zBb9L=plOmXtTs(J3wpK?}%Qqod}k!oza9QmLa^wuOgDeN47T?ML# z3JthifqKL6=W-e5hhP#WFBwy;}fK}Z>_mb(TDkVr^j zY(V2UQRVzmQi5bD^hvZxs8sP|Nuq7UU_pe4|0;Q%7NY4SrE-NDLn7=D&x=Y)NTn(j zIs%pX+BL!aUw*hB`TF1jXpm{7-*q)LEkIlyt78eX*H{+Ndg-~vRJQ%7Y4OfDnzG` zCCXT?4GAaclp)ssMm(KTBEa!%Y#jc4JNWMmAaMZ*!nh27h z2(^^w0M%PAMMI7umy#r;{Fp1{7#&E|5}Ht><79Xar0C>v)D#lbX$cxuuD=RCL?qB} z(<-G@6Nh0$L*eOabdC(i16K}53^jm2TZ+ShLc*UbCUK-$^s_XM0?q3$rD8DHP>2~6 zKTY2quCD)d?SKZc`<^6O%tqei!YK?Ea_kbL{p5~&moGdgZoDpYFH(6nMii2*k254 zg@g!^w?p_9Jp@{5R)`rIyQab@4Mzv%S&Blb88%4G1S4@QP|X!CN36}=*`iXvJ5%~R12I1tw)V7!!UxRROqP`Ds-J1J1#-1frY$NAdN4j! z$L>2Ax5jJu`lQt#3;NJPF7Hb`EqK8kgL^7bgCkZ3FN%oi4g)Qm4F!Kj2*OVmP@7^ef=H87^2C=!jFP=zx#&Ad!RqiZAx5?!H+q$CIh8@F<81_rFlQwgwDmZ_67xEU9*4on3)*r*P0=d8Z|HM`Xid?zVkn0un7b}A zpp;5+Dymcp(bV!WEBX$MY0#L62r23VZ#zkhL-8$jF(oF^nrr3|s2n(Nh#OVA)Fg;a^g8f6?O3_Mq;02EG0kUF#m zXwSI*V!VZOON?NosNFaj=h9GDNeq23s8lG0$PSl=^97oqx%dDBez8V&(Y>Rr;3aSt zj82F@S4Cq0fHspvw~6sGFO%qRXlx;d1{4MpY7Gah3TKHp;>f31t(lYHA&d1X#V9c% z38C>lYHmh|FB%glX(&bf7@^^#NEVH$gjAuC;M2r7`BRX1d5qqL<@!q~acW6P#bQ#0 zE=*%PF_OYqL8(AdzzGr0vyzpfgHh2m`ewCAGuM@(vBH%B2}!d!AJUAk5T~VTIm(t4 zo5_eYb56BFuA!o;CW^^M}n3e?YRhj7|#nTpo+rDKaEQWlJYoFj#`Gk2<3w_ z#~N8@bxFlw=qGsRMk zUMj^>B}VtS8iLU~PNz7oR2a+|m0T~KiIP?UNp{^87r z=MUW_8c%f6O5D2vHI}PS&4^f`9G7U67#X57;nT#}PgD_%T5&3`k$YUiL9xPE9M?-o zv}9a4N-DysDGn&N!h2yQ~Hv9VYgEtF&6$ttnsG_RLv`NTN) z!Pp)rIY=T`AFl7<AF`GtC)k<1INO9`Vx0zxITxvm5(4S*e!=(|(u>_r?R3*Z7AByK5rztqk zRihtc6c`HdZB{WXPSqI#*BEf(DB*RBMO`3pVF1HRWLjf4lz3sZQzD#(p>}HY4VM70 zE$GxyjV0)Pxa;yu8B(lMjxLT?@NJRci~z+H<8*}z)s5!`cmI&1874S;j3FP-JX#Or^2|6+hO7>ua=pj?ti9+mZrZw8QX&UOZ<4%?2o@{RG*41?$66?t2Y03lZUpJmcNIWhg(2(V}XF%TE>z4V(-?*-vi&H z@$eQ%@YOL|5Wsy66*Ln#kpd4d%nKEGc)L3Y+V~4X`7`2z!*O2{{%cfZbZl^_s=9GI zKcCJ5n|8QeiNLL&JUWCGw391C!$M;dY^ob$<$fO3jk!w-+Ij?Z^}q|*QRoYtNXm&4 z1rTBvl5!D>oy9cO!%on~+x_!w$*0-XjWNHvvAs>Gji7_Sz}qK4;2r4YCGhbR*l1ez zS-L}cc^h?kd$74!z*}H15q!)m?}jGbjMqwr6Ne5czBN-%sRe45Kt&TjR3P2Mfv_u;6=mUts`&?iG71a+VO}*9j@l{%(Yz zmvuP%Yu^S1hlfTu3J~P+9LNWSIPx9-4?+tW_u~6tdk2A&AT$EEaKbI@1a_oYfHmWL zXfYM$<;zHbYfDAP>}~j_d>%t^!3CcokFPufoBMeN);Ids-ZlHHdtnnxeBPdatr$t{ z{@L-ENaBK48Sep9u3DdDS4iNGz_!vEf{5mf_ScF0HT&-;@-NXgju5dkDWtz5S-)m~ zH$pf=`LFF8T9SW^L&COH4BG!;hxAD={vL<)cc=*c4-zA6%l8+fzrX;{E)>ZbHwjZ| zMEY5FMF;=1+r%}VzrwhE*7z^yai3*>zrLX){N?fJC(`$Y`u~I1&?LO@Nu2Q!fXY0p zW`N4GzuN%)8y6#aB1!UhiLA)}LFDhOSZIjkL{9oui6qOj|9&F>68|6;yATp*imG%8 zhsG(_$L#M$2(JJClc$>8Jd^9RP(oBLB%fq|H%|VYW(%1@yM&KwewE0I>>ot_uG!*< 
z5cy3aIgzXMW|W4=^6Vc(ev3#UCy`$#vLgHM zC-Sd~NQ4-X2!Dr=^6c+M2)7*f51vTjOi9W)QVCV*9ZTlZ?C-|OzcC`w5Gg?-D-Cxw zM3!g&Ao6$27EdI4%HJcB&&InoKZyJ$k(|g?dS8o|NUA*h2a(?*5^* z-%sRU;7Bz+mN?UrO4m9*%l>YJ{6+V*K5P7!OTV9Gf4{!rvi3hXNMKwDp$kel!&IsO z(vR7=82jBg`8Ng$8X_e~WTng28Y0WH|9&F>0!yOlG3|_Rj#ajf^I7(HBjhi#B%d|@ zOH1-u_V?@CUvq=(v&R2reZ$4wzeFa5w2P2-W<-@PiAga}lZ_(#e{b9>8vj@J?Vr6U z&u{}OqMS)_r7L&*c=OrcbG-kZ-i;%YMk2pr%s?q>;$V zw5^=T^6Vc({?0lHPbAu<-y^aj`v;NVB$5-kubVq@dG-$?f1gOoh2SKzvL#SXwSUe2 zLF6}yFNKOO(2Z#4c#yaUZEl7ITo3 zPqM!oC;!ffgky?EA}d>OEhn-(`v;M~Yqs#+Ewuk%C-T?qA4GnWNKWJ`Te#*V5{WF& z{`-miOYA?_W5yYGJ^PBS`!)N!5%Sk8T$hjLU){IAWZ@d8%oO25NS$$Wx5~!tv+VCT zZvW=K{i7FmQI>JMNy=F&s_cntjW|?ff4A}e7se#zMAAs)*A<5>&;CK=*IlWGNQ#rl z%5FDkB(fs=?5Hy-`k>@W(O%_d)$#qASh4t^XVGw9oV_8haYb0;1lG5n=s%` z4DN_+yYrhv;GR@&0?p1;Dn~&x{#?yYR7AW;io0v$mS?#A8t$blVsKA22`y5na2t8t zN)Wf;#65f^xFx9?Hzmfs75IKO2zzrSbF#>m=z_wl< z4t$?AndMt={UbWeZ}eqs6Ppml;`S4`$Ch0T8y04V&Bdm3UmgFAeN@CHly?!o8C&Z7 zD%Jm%-ts%QRldE7KqXJG0EI2AURLP?9f6-+$3am0VGCDX6 zw+^(o`3(V@Er1+)+Tm6{ajaeMe>fq$`O`S2R+Vi>Q!|l=} z`IMTN(0HDjm>=A5+)$!V$>HP%O|FI-ZJLfIl^pb|c=!di@xm67hUEl-tR=+ueaS;n$#JE7q-D&>h3}a6}pTI8MqQWO4VEi$> z{UniFaA;|q|LdgvPf6Gxndjpe@xkx^!m#{iApg^0sru`%;93yB4)pnqeLHVKd z|5~Nzp6&e;O8-r#BKadq|3kk1>GJiTE^%mr7O^uy{kiD=n~~%nlCOWMQ~B=)1s}`# z6H4Fvn~@~*L+iQQ#Q(P!HgN-l|7uQ;S}*-`3!8c0oIm~{0sEo$Tr7!?+Yut}$#sf* z`TtL-{SRONKX5yPla~KX>wlFm?s@Y+Az$CrdfZa_58N;NS86@JY4P8ViuBJ->wh@= zKln`8|CqD?sP_0P>3>S?zbRmUOznTnA^+KP$p7Ag;jgX#DFOTOAjThkto8r43F90j ze*XA_7#~fTgxfy+Ped+19>l;OR>!-q6_2I=x#wek9Tw@I3n@Hi zkExk4{sw(LNxc18?edpg{@!1F#WxEpUV!?3Kd@(cQLp@crpoV#|9J4M{4xIjF?&aR zuLsZY7lm_lwd1v_2$Pcfy4qXT$VgLcV??H7t29Z;VH)Hc&QYxEYg3S=-f=r#HZ!N6 zd$ancwtI}my9fft?rFBYEz@AJ;jZT?yKL&5$qHI|Axe5-?5UQ5VPp50T#46eTre}L zy<^>RdyAv@Ihtqhnfd^ug7Z(n}Ato(Rw zId&;8-vw_!aQ~ruA7(a;ckWsm=Wf6`_ilz26<8W-3iAJE=xOWi)z-3ZLA3B89*pKR0W$D5*?*<3W=;%@iih+#X9wm!CqRLS$^5eYJBE6S+U5teZT+yl-M&_fRZSf$OG9bgn1o5)iY)MJ zlkr7q$CzRZGc7A~EiFFVR6#4mYB?wV=MD`P35(tq9auV>BPo4DY0m@Jj{v2 z`ngmRljCBXSbS5@iTtE~lMb4Tw}}pkcvLKA+sNO zsmAvuIx;SnjaFG%es=gtF~j9aPP3`-yE0_=RmbyH5~jU?PqHg0vux;;~P^x8ir}F`|o{RWWi-0Yue4$)m78hitNRG z$|AdLh~~0um+58FKaKzF?7+~TlzLwSF1wvXv^h9r`={Ba_~g7u%BV#OTByRe+Hsss z)oO)YMd6F>LW)!fRq9WOas9Kgt(B$gpN##xam}gdJ8xa^AM9L(;rXPs6^e;+@TtO< zm#AlcM_~4oK5CSfRXy{)_GS~b%fFv(ZdOq7{gJcT)Qa!7NVBN&B9ldC(dAz^(q@Gf zC8oQW^({|c)XFTr{OkKZW*aN6b1=lLK&L!CZ=|+aieCA_i=s~sE^hziVAYjR57z3K zov)~D_BsQzV$<>pE}u3qJ6+L`MduC7hHx2ZJj?2|uO>%H9UL5^?8k=7&1^BXsTS%E zzAh1#RkL0zo<_53O?4~{HO8ZZds8j1dfe$;Vil#vczl|s%SH_P^w^MN*bNoo(?o-C zHnv}+>eD3s$mo#IYd4LI4MrQIWP@Yy&5bbjv*LQukzpUJG{&}=sN$mKv7wO>{;V=G zLKUNFp+2vP71!a)S%pT%qs@y#L*V|~Gq`+f%=d_Am2q5Mm6oWBCAx&kBO=fMR9e6S z3!oW|`|<)DX;nXVEA+D-X`CU(q$ zDIrX`;?Jlo)?kuvLx;FO#>oTwuglFUwQWj_Yo z*(WojY9>rTEeNPHzdK!;#I6TzD03&7NH-nIO$R&w4qNSUXIt{7+LBy>yCTee1l)ZR zd%WGoZP|||MNL*}GOKN8iy?Pbq+Qia!od3UA2RpGlGEjt*vTC>me{`L{CQcKm)YEV zY~@{)KFw+R$t(@Hnt=P{52q_Gv2z8t>&1EjxOVf`HDgMRvB&$^S342^RytQ3IX>7% z29|lSsLFM)ApVy=vXwRu6BGN`2IbfW_EV89?RRgRd12EtrhWCPl?4>LCKUVfj?;BZ zme}lPvh9HNPc>8BlikY6Zic+VRt~}!RyCnEN(|-brZ!TL-T~=r%7eSY8;`zg;_P%% zHnSpuvFo49ptY(LwNbhdZL1Wtj|J`AvY=hO(nQ8CswQYQM;^)h*(>t zU_JhTjJ6+!^+ab7l5iQ*_{SXYv1%B? z2=$}X-&4f2-XI6o2u+JppXaC_;GiBT0!~Yj1`+DVi9s;)0=)tajdbMYp=pojs8E$i zscBjQMD;dK>#yv2>)69T+J_lJnDK9W#N42nM9Vx*i_q^D?>{g}_W-p=0mG}$Zs*NCS? 
z%8GYHtQZ$d553Cuc#M#L*S9C^C40oASjd|VB&Twjcqto*kHAl5JtJ)4dxikgVSnwW z@hYjNRq|tnfg~9b3q#mJ->4WD_8S?jb1ZZaVbG6B9rV?<(ztL|tT2_2#lwa_St2_A z{$7l9;M*<6h3oW8k01NL^XCV}xL7*G#}OkFoz!V@vH0-U9?paHB`%tW5Ec{7S`SSV zP9PDyB0{7d7wG2>rALSTY3~Oq)4#w=)xp-r4{h4|QAcm%@R8MuB*(cd&R+-xKb}y5 zVn3Qr;RqB{Fl71T}oV;DYWhHv9GO~SeRGxFWs6A=sle~YQ8x_2 z_;swLA0A4%e=yjS%1`1!)aC?cbBjE6N7S@i2;52+de3M;%^nc|4! zPZR2*Ghyh6={Rx3_$+1CzTD?+fsNurHlGX*J3%P@_L2)Q#-i?S396H`M}|&5G>jdR zcT%<8Fxu*8LrmV67*XB6G)ci?H&0LA{i3H|hkxh|wlI(PUfpIiQGt?a#_O94>b#mm<=KDIX?%>cw!vFvP literal 0 HcmV?d00001 diff --git a/test_helpers/Cargo.toml b/test_helpers/Cargo.toml index 0568202f111..1b0882ef708 100644 --- a/test_helpers/Cargo.toml +++ b/test_helpers/Cargo.toml @@ -5,16 +5,19 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # In alphabetical order dotenvy = "0.15.7" parking_lot = "0.12" -tempfile = "3.8.0" -tracing-log = "0.1" +tempfile = "3.9.0" +tracing-log = "0.2" tracing-subscriber = { version = "0.3", features = ["env-filter"] } observability_deps = { path = "../observability_deps" } workspace-hack = { version = "0.1", path = "../workspace-hack" } -async-trait = { version = "0.1.73", optional = true } -tokio = { version = "1.32.0", optional = true, default_features = false, features = ["time"] } +async-trait = { version = "0.1.77", optional = true } +tokio = { version = "1.35.1", optional = true, default_features = false, features = ["time"] } [features] default = [] diff --git a/test_helpers_end_to_end/Cargo.toml b/test_helpers_end_to_end/Cargo.toml index e7f6104779d..64ad443179f 100644 --- a/test_helpers_end_to_end/Cargo.toml +++ b/test_helpers_end_to_end/Cargo.toml @@ -5,36 +5,43 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # In alphabetical order -arrow = { workspace = true, features = ["prettyprint"] } +arrow = { workspace = true } arrow-flight = { workspace = true } arrow_util = { path = "../arrow_util" } -assert_cmd = "2.0.12" +assert_cmd = "2.0.13" +assert_matches = "1.5.0" bytes = "1.5" data_types = { path = "../data_types" } dml = { path = "../dml" } futures = "0.3" generated_types = { path = "../generated_types" } -http = "0.2.9" +http = "0.2.11" hyper = "0.14" influxdb_iox_client = { path = "../influxdb_iox_client", features = ["flight", "format"] } ingester_query_grpc = { path = "../ingester_query_grpc" } +insta = { version = "1.34.0", features = ["yaml"] } iox_catalog = { path = "../iox_catalog" } +iox_query_params = { path = "../iox_query_params" } mutable_batch_lp = { path = "../mutable_batch_lp" } mutable_batch_pb = { path = "../mutable_batch_pb" } nix = { version = "0.27", default-features = false, features = ["signal"] } observability_deps = { path = "../observability_deps" } -once_cell = { version = "1.18", features = ["parking_lot"] } +once_cell = { version = "1.19", features = ["parking_lot"] } parking_lot = "0.12" -prost = "0.11" +prost = { workspace = true } rand = "0.8.3" -regex = "1.9" -reqwest = { version = "0.11", default-features = false, features = ["json", "rustls-tls"] } -snafu = "0.7" -sqlx = { version = "0.7.1", features = [ "runtime-tokio-rustls" , "postgres", "uuid" ] } -tempfile = "3.8.0" +regex = "1.10" +reqwest = { version = "0.11", default-features = false, features = ["json", "rustls-tls-native-roots"] } +serde_json = "1.0.111" +snafu = "0.8" +sqlx = { version = "0.7.3", features = [ "runtime-tokio-rustls" , "postgres", "uuid" ] } +tempfile = "3.9.0" 
 test_helpers = { path = "../test_helpers", features = ["future_timeout"] }
-tokio = { version = "1.32", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] }
+tokio = { version = "1.35", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] }
 tokio-util = "0.7"
 tonic = { workspace = true }
 workspace-hack = { version = "0.1", path = "../workspace-hack" }
diff --git a/test_helpers_end_to_end/src/addrs.rs b/test_helpers_end_to_end/src/addrs.rs
index 2fc6249da80..69a0ff5479f 100644
--- a/test_helpers_end_to_end/src/addrs.rs
+++ b/test_helpers_end_to_end/src/addrs.rs
@@ -11,7 +11,14 @@ use std::{
 // running locally.
 static NEXT_PORT: AtomicU16 = AtomicU16::new(8090);
 
-// represents port on localhost to bind / connect to
+/// Socket type
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum SocketType {
+    Tcp,
+    Udp,
+}
+
+/// Represents port on localhost to bind / connect to
 #[derive(Debug, Clone)]
 pub struct Address {
     /// the actual address, on which to bind. Example `127.0.0.1:8089`
@@ -21,23 +28,33 @@ pub struct Address {
 }
 
 impl Address {
-    fn new() -> Self {
-        let bind_addr = Self::get_free_port().to_string();
-        let client_base = format!("http://{bind_addr}");
+    fn new(t: SocketType) -> Self {
+        let bind_addr = Self::get_free_port(t).to_string();
+        let protocol = match t {
+            SocketType::Tcp => "http",
+            SocketType::Udp => "udp",
+        };
+        let client_base = format!("{protocol}://{bind_addr}");
         Self {
             bind_addr: bind_addr.into(),
             client_base: client_base.into(),
         }
     }
-    fn get_free_port() -> SocketAddrV4 {
+
+    fn get_free_port(t: SocketType) -> SocketAddrV4 {
         let ip = std::net::Ipv4Addr::new(127, 0, 0, 1);
         loop {
             let port = NEXT_PORT.fetch_add(1, Ordering::SeqCst);
             let addr = SocketAddrV4::new(ip, port);
-            if std::net::TcpListener::bind(addr).is_ok() {
+            let is_working = match t {
+                SocketType::Tcp => std::net::TcpListener::bind(addr).is_ok(),
+                SocketType::Udp => std::net::UdpSocket::bind(addr).is_ok(),
+            };
+
+            if is_working {
                 return addr;
             }
         }
     }
@@ -57,26 +74,94 @@ impl Address {
 pub struct BindAddresses {
     router_http_api: std::sync::Mutex<Option<Address>>,
     router_grpc_api: std::sync::Mutex<Option<Address>>,
+    router_gossip_api: std::sync::Mutex<Option<Address>>,
+    querier_http_api: std::sync::Mutex<Option<Address>>,
     querier_grpc_api: std::sync::Mutex<Option<Address>>,
+    querier_gossip_api: std::sync::Mutex<Option<Address>>,
+    ingester_http_api: std::sync::Mutex<Option<Address>>,
     ingester_grpc_api: std::sync::Mutex<Option<Address>>,
+    ingester_gossip_api: std::sync::Mutex<Option<Address>>,
+    compactor_http_api: std::sync::Mutex<Option<Address>>,
     compactor_grpc_api: std::sync::Mutex<Option<Address>>,
+    compactor_gossip_api: std::sync::Mutex<Option<Address>>,
+    catalog_http_api: std::sync::Mutex<Option<Address>>,
+    catalog_grpc_api: std::sync::Mutex<Option<Address>>,
+    catalog_gossip_api: std::sync::Mutex<Option<Address>>,
+    parquet_cache_http_api: std::sync::Mutex<Option<Address>>,
 }
 
 impl BindAddresses {
     pub fn router_http_api(&self) -> Address {
-        get_or_allocate(&self.router_http_api)
+        get_or_allocate(&self.router_http_api, SocketType::Tcp)
     }
+
     pub fn router_grpc_api(&self) -> Address {
-        get_or_allocate(&self.router_grpc_api)
+        get_or_allocate(&self.router_grpc_api, SocketType::Tcp)
+    }
+
+    pub fn router_gossip_api(&self) -> Address {
+        get_or_allocate(&self.router_gossip_api, SocketType::Udp)
+    }
+
+    pub fn querier_http_api(&self) -> Address {
+        get_or_allocate(&self.querier_http_api, SocketType::Tcp)
     }
+
     pub fn querier_grpc_api(&self) -> Address {
-        get_or_allocate(&self.querier_grpc_api)
+        get_or_allocate(&self.querier_grpc_api, SocketType::Tcp)
+    }
+
+    pub fn querier_gossip_api(&self) -> Address {
+        get_or_allocate(&self.querier_gossip_api, SocketType::Udp)
+    }
+
+    pub fn ingester_http_api(&self) -> Address {
+        get_or_allocate(&self.ingester_http_api, SocketType::Tcp)
     }
+
     pub fn ingester_grpc_api(&self) -> Address {
-        get_or_allocate(&self.ingester_grpc_api)
+        get_or_allocate(&self.ingester_grpc_api, SocketType::Tcp)
+    }
+
+    pub fn ingester_gossip_api(&self) -> Address {
+        get_or_allocate(&self.ingester_gossip_api, SocketType::Udp)
     }
+
+    pub fn compactor_http_api(&self) -> Address {
+        get_or_allocate(&self.compactor_http_api, SocketType::Tcp)
+    }
+
     pub fn compactor_grpc_api(&self) -> Address {
-        get_or_allocate(&self.compactor_grpc_api)
+        get_or_allocate(&self.compactor_grpc_api, SocketType::Tcp)
+    }
+
+    pub fn compactor_gossip_api(&self) -> Address {
+        get_or_allocate(&self.compactor_gossip_api, SocketType::Udp)
+    }
+
+    pub fn catalog_http_api(&self) -> Address {
+        get_or_allocate(&self.catalog_http_api, SocketType::Tcp)
+    }
+
+    pub fn catalog_grpc_api(&self) -> Address {
+        get_or_allocate(&self.catalog_grpc_api, SocketType::Tcp)
+    }
+
+    pub fn catalog_gossip_api(&self) -> Address {
+        get_or_allocate(&self.catalog_gossip_api, SocketType::Udp)
+    }
+
+    pub fn all_gossip_apis(&self) -> Vec<Address>
{ + vec![ + self.router_gossip_api(), + self.ingester_gossip_api(), + self.compactor_gossip_api(), + self.querier_gossip_api(), + ] + } + + pub fn parquet_cache_http_api(&self) -> Address { + get_or_allocate(&self.parquet_cache_http_api, SocketType::Tcp) } } @@ -97,13 +182,16 @@ impl Display for BindAddresses { if let Some(addr) = self.compactor_grpc_api.lock().unwrap().as_ref() { write!(f, "compactor_grpc: {} ", addr.bind_addr)? } + if let Some(addr) = self.catalog_grpc_api.lock().unwrap().as_ref() { + write!(f, "catalog_grpc: {} ", addr.bind_addr)? + } Ok(()) } } -fn get_or_allocate(locked_addr: &std::sync::Mutex>) -> Address { +fn get_or_allocate(locked_addr: &std::sync::Mutex>, t: SocketType) -> Address { let mut locked_addr = locked_addr.lock().unwrap(); - let addr = locked_addr.take().unwrap_or_else(Address::new); + let addr = locked_addr.take().unwrap_or_else(|| Address::new(t)); *locked_addr = Some(addr.clone()); addr } diff --git a/test_helpers_end_to_end/src/client.rs b/test_helpers_end_to_end/src/client.rs index 53ce5f0b2ba..74e8ada9612 100644 --- a/test_helpers_end_to_end/src/client.rs +++ b/test_helpers_end_to_end/src/client.rs @@ -9,6 +9,7 @@ use influxdb_iox_client::{ connection::Connection, ingester::generated_types::{write_service_client::WriteServiceClient, WriteRequest}, }; +use iox_query_params::StatementParam; use mutable_batch_lp::lines_to_batches; use mutable_batch_pb::encode::encode_write; use std::fmt::Display; @@ -16,10 +17,10 @@ use tonic::IntoRequest; /// Writes the line protocol to the write_base/api/v2/write endpoint (typically on the router) pub async fn write_to_router( - line_protocol: impl Into, - org: impl AsRef, - bucket: impl AsRef, - write_base: impl AsRef, + line_protocol: impl Into + Send, + org: impl AsRef + Send, + bucket: impl AsRef + Send, + write_base: impl AsRef + Send, authorization: Option<&str>, ) -> Response { let client = Client::new(); @@ -46,7 +47,7 @@ pub async fn write_to_router( /// Writes the line protocol to the WriteService endpoint (typically on the ingester) pub async fn write_to_ingester( - line_protocol: impl Into, + line_protocol: impl Into + Send, namespace_id: NamespaceId, table_id: TableId, ingester_connection: Connection, @@ -80,8 +81,28 @@ pub async fn write_to_ingester( /// Runs a SQL query using the flight API on the specified connection. pub async fn try_run_sql( - sql_query: impl Into, - namespace: impl Into, + sql_query: impl Into + Send, + namespace: impl Into + Send, + querier_connection: Connection, + authorization: Option<&str>, + with_debug: bool, +) -> Result<(Vec, SchemaRef), influxdb_iox_client::flight::Error> { + try_run_sql_with_params( + sql_query, + namespace, + [], + querier_connection, + authorization, + with_debug, + ) + .await +} + +/// Runs a SQL query using the flight API on the specified connection. 
+pub async fn try_run_sql_with_params( + sql_query: impl Into + Send, + namespace: impl Into + Send, + params: impl IntoIterator + Send, querier_connection: Connection, authorization: Option<&str>, with_debug: bool, @@ -98,7 +119,12 @@ pub async fn try_run_sql( // Normally this would be done one per connection, not per query client.handshake().await?; - let mut stream = client.sql(namespace.into(), sql_query.into()).await?; + let mut stream = client + .query(namespace) + .sql(sql_query.into()) + .with_params(params) + .run() + .await?; let batches = (&mut stream).try_collect().await?; @@ -114,8 +140,25 @@ pub async fn try_run_sql( /// Runs a InfluxQL query using the flight API on the specified connection. pub async fn try_run_influxql( - influxql_query: impl Into, - namespace: impl Into, + influxql_query: impl Into + Send, + namespace: impl Into + Send, + querier_connection: Connection, + authorization: Option<&str>, +) -> Result<(Vec, SchemaRef), influxdb_iox_client::flight::Error> { + try_run_influxql_with_params( + influxql_query, + namespace, + [], + querier_connection, + authorization, + ) + .await +} + +pub async fn try_run_influxql_with_params( + influxql_query: impl Into + Send, + namespace: impl Into + Send, + params: impl IntoIterator + Send, querier_connection: Connection, authorization: Option<&str>, ) -> Result<(Vec, SchemaRef), influxdb_iox_client::flight::Error> { @@ -129,7 +172,10 @@ pub async fn try_run_influxql( client.handshake().await?; let mut stream = client - .influxql(namespace.into(), influxql_query.into()) + .query(namespace) + .influxql(influxql_query.into()) + .with_params(params) + .run() .await?; let batches = (&mut stream).try_collect().await?; @@ -148,8 +194,8 @@ pub async fn try_run_influxql( /// /// Use [`try_run_sql`] if you want to check the error manually. pub async fn run_sql( - sql: impl Into, - namespace: impl Into, + sql: impl Into + Send, + namespace: impl Into + Send, querier_connection: Connection, authorization: Option<&str>, with_debug: bool, @@ -165,12 +211,35 @@ pub async fn run_sql( .expect("Error executing sql query") } +/// Runs a SQL query using the flight API on the specified connection. +/// +/// Use [`try_run_sql`] if you want to check the error manually. +pub async fn run_sql_with_params( + sql: impl Into + Send, + namespace: impl Into + Send, + params: impl IntoIterator + Send, + querier_connection: Connection, + authorization: Option<&str>, + with_debug: bool, +) -> (Vec, SchemaRef) { + try_run_sql_with_params( + sql, + namespace, + params, + querier_connection, + authorization, + with_debug, + ) + .await + .expect("Error executing sql query") +} + /// Runs an InfluxQL query using the flight API on the specified connection. /// /// Use [`try_run_influxql`] if you want to check the error manually. pub async fn run_influxql( - influxql: impl Into + Clone + Display, - namespace: impl Into, + influxql: impl Into + Clone + Display + Send, + namespace: impl Into + Send, querier_connection: Connection, authorization: Option<&str>, ) -> (Vec, SchemaRef) { @@ -183,3 +252,24 @@ pub async fn run_influxql( .await .unwrap_or_else(|_| panic!("Error executing InfluxQL query: {influxql}")) } + +/// Runs an InfluxQL query using the flight API on the specified connection. +/// +/// Use [`try_run_influxql`] if you want to check the error manually. 
+pub async fn run_influxql_with_params( + influxql: impl Into + Clone + Display + Send, + namespace: impl Into + Send, + params: impl IntoIterator + Send, + querier_connection: Connection, + authorization: Option<&str>, +) -> (Vec, SchemaRef) { + try_run_influxql_with_params( + influxql.clone(), + namespace, + params, + querier_connection, + authorization, + ) + .await + .unwrap_or_else(|_| panic!("Error executing InfluxQL query: {influxql}")) +} diff --git a/test_helpers_end_to_end/src/config.rs b/test_helpers_end_to_end/src/config.rs index 91bc9fdcb37..7c45a5e276d 100644 --- a/test_helpers_end_to_end/src/config.rs +++ b/test_helpers_end_to_end/src/config.rs @@ -34,6 +34,9 @@ pub struct TestConfig { /// Which ports this server should use addrs: Arc, + + /// Wait for server to be ready during creation. + wait_for_ready: bool, } impl TestConfig { @@ -58,6 +61,7 @@ impl TestConfig { wal_dir: None, catalog_dir, addrs: Arc::new(BindAddresses::default()), + wait_for_ready: true, } } @@ -73,6 +77,45 @@ impl TestConfig { .with_catalog_dir(other.catalog_dir.as_ref().map(Arc::clone)) } + /// Create new catalog node w/o peers + fn new_catalog(dsn: Option, catalog_schema_name: String) -> Self { + Self::new(ServerType::Catalog, dsn, catalog_schema_name) + .with_env("INFLUXDB_IOX_CATALOG_CACHE_WARMUP_DELAY", "100ms") + } + + /// Create a triplet of catalog cache nodes. + pub fn catalog_nodes(dsn: impl Into) -> [Self; 3] { + let dsn = Some(dsn.into()); + let catalog_schema_name = random_catalog_schema_name(); + + let n0 = Self::new_catalog(dsn.clone(), catalog_schema_name.clone()); + let n1 = Self::new_catalog(dsn.clone(), catalog_schema_name.clone()); + let n2 = Self::new_catalog(dsn.clone(), catalog_schema_name.clone()); + + let n0 = n0.with_catalog_peers([ + n1.addrs().catalog_http_api().client_base(), + n2.addrs().catalog_http_api().client_base(), + ]); + let n1 = n1.with_catalog_peers([ + n0.addrs().catalog_http_api().client_base(), + n2.addrs().catalog_http_api().client_base(), + ]); + let n2 = n2.with_catalog_peers([ + n0.addrs().catalog_http_api().client_base(), + n1.addrs().catalog_http_api().client_base(), + ]); + + [n0, n1, n2] + } + + /// Create a minimal router configuration that doesn't connect to an ingester. If you need a + /// router that connects to an ingester, call `new_ingester` first and then pass the resulting + /// `TestConfig` to `new_router`. + pub fn router_only(dsn: impl Into) -> Self { + let dsn = Some(dsn.into()); + Self::new(ServerType::Router, dsn, random_catalog_schema_name()).with_new_object_store() + } + /// Create a minimal router2 configuration sharing configuration with the ingester2 config pub fn new_router(ingester_config: &TestConfig) -> Self { assert_eq!(ingester_config.server_type(), ServerType::Ingester); @@ -117,6 +160,7 @@ impl TestConfig { wal_dir: None, catalog_dir: ingester_config.catalog_dir.as_ref().map(Arc::clone), addrs: Arc::new(BindAddresses::default()), + wait_for_ready: ingester_config.wait_for_ready, } .with_existing_object_store(ingester_config) .with_new_wal() @@ -224,6 +268,11 @@ impl TestConfig { .with_env("INFLUXDB_IOX_SINGLE_TENANCY", "true") } + /// Enable partial writes. + pub fn with_partial_writes(self) -> Self { + self.with_env("INFLUXDB_IOX_PARTIAL_WRITES_ENABLED", "true") + } + // Get the catalog DSN URL if set. pub fn dsn(&self) -> &Option { &self.dsn @@ -323,6 +372,46 @@ impl TestConfig { .with_env("INFLUXDB_IOX_COMPACTION_SHARD_ID", shard_id.to_string()) } + /// Limit the number of concurrent queries. 
+ pub fn with_max_concurrent_queries(self, n: usize) -> Self { + self.with_env("INFLUXDB_IOX_MAX_CONCURRENT_QUERIES", n.to_string()) + } + + /// Set up a metadata signing key for bulk ingest. + pub fn with_bulk_ingest_metadata_signing_key(self, metadata_signing_key_file: &str) -> Self { + self.with_env( + "INFLUXDB_IOX_BULK_INGEST_METADATA_SIGNING_KEY_FILE", + metadata_signing_key_file, + ) + } + + /// Use a mock presigned URL generator rather than whatever object store may have been + /// configured. Allows for testing bulk ingest without needing S3. + pub fn with_mock_presigned_url_signer(self) -> Self { + self.with_env( + "INFLUXDB_IOX_BULK_INGEST_USE_MOCK_PRESIGNED_URL_SIGNER", + "true", + ) + } + + /// Register catalog peers. + pub fn with_catalog_peers(self, peers: I) -> Self + where + I: IntoIterator, + S: std::fmt::Display, + { + let peers = peers.into_iter().map(|s| s.to_string()).collect::>(); + self.with_env("INFLUXDB_IOX_CATALOG_CACHE_PEERS", peers.join(",")) + } + + /// Set [`wait_for_ready`](Self::wait_for_ready). + pub fn with_wait_for_ready(self, wait_for_ready: bool) -> Self { + Self { + wait_for_ready, + ..self + } + } + /// Get the test config's server type. #[must_use] pub fn server_type(&self) -> ServerType { @@ -351,6 +440,28 @@ impl TestConfig { pub fn ingester_base(&self) -> Arc { self.addrs().ingester_grpc_api().client_base() } + + /// Return a HTTP base that is usable for health and metrics. + /// + /// This depends on the [server type](Self::server_type). + #[must_use] + pub fn http_base(&self) -> Arc { + let addr = match self.server_type { + ServerType::AllInOne => self.addrs.router_http_api(), + ServerType::Ingester => self.addrs.ingester_http_api(), + ServerType::Router => self.addrs.router_http_api(), + ServerType::Querier => self.addrs.querier_http_api(), + ServerType::Compactor => self.addrs.compactor_http_api(), + ServerType::Catalog => self.addrs.catalog_http_api(), + ServerType::ParquetCache => self.addrs.parquet_cache_http_api(), + }; + addr.client_base() + } + + /// Wait for server to be ready during creation. 
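These new config knobs all reduce to `with_env` calls, so they chain like the rest of the `TestConfig` builder; the readiness getter they pair with follows below. A small sketch of combining them, assuming the usual `TestConfig` re-export from the crate root:

```rust
use test_helpers_end_to_end::TestConfig;

// Sketch only: a router config that tolerates partial writes and does not
// block fixture creation on the readiness probe.
fn example_router_config(dsn: &str) -> TestConfig {
    TestConfig::router_only(dsn)
        .with_partial_writes()
        .with_wait_for_ready(false)
}
```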
+ pub fn wait_for_ready(&self) -> bool { + self.wait_for_ready + } } fn random_catalog_schema_name() -> String { diff --git a/test_helpers_end_to_end/src/data_generator.rs b/test_helpers_end_to_end/src/data_generator.rs index d167fd86873..c070abd247f 100644 --- a/test_helpers_end_to_end/src/data_generator.rs +++ b/test_helpers_end_to_end/src/data_generator.rs @@ -1,6 +1,7 @@ use std::time::SystemTime; /// Manages a dataset for writing / reading +#[derive(Debug)] pub struct DataGenerator { ns_since_epoch: i64, line_protocol: String, diff --git a/test_helpers_end_to_end/src/database.rs b/test_helpers_end_to_end/src/database.rs index 4aa16baaa4b..8e242284e9a 100644 --- a/test_helpers_end_to_end/src/database.rs +++ b/test_helpers_end_to_end/src/database.rs @@ -11,7 +11,7 @@ use tokio::sync::Mutex; static DB_INITIALIZED: Lazy>> = Lazy::new(|| Mutex::new(BTreeSet::new())); /// Performs once-per-process database initialization, if necessary -pub async fn initialize_db(dsn: &str, schema_name: &str) { +pub(crate) async fn initialize_db(dsn: &str, schema_name: &str) { let mut init = DB_INITIALIZED.lock().await; // already done diff --git a/test_helpers_end_to_end/src/grpc.rs b/test_helpers_end_to_end/src/grpc.rs index 3cd14dae1e1..0ad3da3fe87 100644 --- a/test_helpers_end_to_end/src/grpc.rs +++ b/test_helpers_end_to_end/src/grpc.rs @@ -491,7 +491,7 @@ impl GrpcRequestBuilder { } } -pub fn field_ref_node(field_name: impl Into) -> Node { +pub(crate) fn field_ref_node(field_name: impl Into) -> Node { Node { node_type: NodeType::FieldRef.into(), children: vec![], @@ -499,7 +499,7 @@ pub fn field_ref_node(field_name: impl Into) -> Node { } } -pub fn tag_ref_node(tag_name: impl Into>) -> Node { +pub(crate) fn tag_ref_node(tag_name: impl Into>) -> Node { Node { node_type: NodeType::TagRef as i32, children: vec![], @@ -507,7 +507,7 @@ pub fn tag_ref_node(tag_name: impl Into>) -> Node { } } -pub fn string_value_node(value: impl Into) -> Node { +pub(crate) fn string_value_node(value: impl Into) -> Node { Node { node_type: NodeType::Literal as i32, children: vec![], @@ -515,7 +515,7 @@ pub fn string_value_node(value: impl Into) -> Node { } } -pub fn comparison_expression_node(lhs: Node, comparison: Comparison, rhs: Node) -> Node { +pub(crate) fn comparison_expression_node(lhs: Node, comparison: Comparison, rhs: Node) -> Node { Node { node_type: NodeType::ComparisonExpression as i32, children: vec![lhs, rhs], diff --git a/test_helpers_end_to_end/src/http_reverse_proxy.rs b/test_helpers_end_to_end/src/http_reverse_proxy.rs new file mode 100644 index 00000000000..5b58a93a7af --- /dev/null +++ b/test_helpers_end_to_end/src/http_reverse_proxy.rs @@ -0,0 +1,160 @@ +//! Poor-mans simulation of an HTTP/2 service that randomizes incoming requests to a number of backend services. + +use std::{ + net::{SocketAddr, TcpListener}, + sync::{Arc, Weak}, + thread::JoinHandle, +}; + +use http::{Request, Response}; +use hyper::{ + client::HttpConnector, + service::{make_service_fn, service_fn}, + Body, Client, Server, +}; +use rand::seq::SliceRandom; +use tokio_util::sync::CancellationToken; + +use crate::service_link::{LinkableService, LinkableServiceImpl}; + +/// A basic HTTP reverse proxy for use by end-to-end tests +/// +/// Intended to approximate a Kubernetes Service. +/// +/// # Implementation +/// This runs in a dedicated thread in its own tokio runtime. 
The reason is that we potentially share a single proxy +/// between multiple tests, but every test sets up its own tokio runtime and moving IO tasks between runtimes can cause blocking. +#[derive(Debug)] +pub struct HttpReverseProxy { + addr: SocketAddr, + shutdown: CancellationToken, + task: Option>, + links: LinkableServiceImpl, +} + +impl HttpReverseProxy { + pub fn new(backends: I) -> Self + where + I: IntoIterator, + S: ToString, + { + let client = Client::builder().http2_only(true).build_http(); + let inner = Arc::new(Inner { + backends: backends.into_iter().map(|s| s.to_string()).collect(), + client, + }); + assert!(!inner.backends.is_empty(), "need at least 1 backend"); + + let addr = SocketAddr::from(([127, 0, 0, 1], 0)); + + let make_service = make_service_fn(move |_conn| { + let inner = Arc::clone(&inner); + + async move { + Ok::<_, hyper::Error>(service_fn(move |req| { + let inner = Arc::clone(&inner); + + async move { inner.handle(req).await } + })) + } + }); + + let listener = TcpListener::bind(addr).unwrap(); + let addr = listener.local_addr().unwrap(); + + let shutdown = CancellationToken::new(); + let shutdown_captured = shutdown.clone(); + let task = std::thread::spawn(move || { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + + rt.block_on(async { + let server = Server::from_tcp(listener) + .unwrap() + .http2_only(true) + .serve(make_service); + + tokio::select! { + _ = shutdown_captured.cancelled() => {} + res = server => { + if let Err(e) = res { + eprintln!("server error: {}", e); + } + } + } + }) + }); + + Self { + addr, + shutdown, + task: Some(task), + links: Default::default(), + } + } + + pub fn addr(&self) -> SocketAddr { + self.addr + } +} + +impl Drop for HttpReverseProxy { + fn drop(&mut self) { + self.shutdown.cancel(); + + if self.task.take().expect("not joined yet").join().is_err() { + eprintln!("server task error, check logs"); + } + } +} + +impl LinkableService for HttpReverseProxy { + fn add_link_client(&self, client: Weak) { + self.links.add_link_client(client) + } + + fn remove_link_clients(&self) -> Vec> { + self.links.remove_link_clients() + } + + fn add_link_server(&self, server: Arc) { + self.links.add_link_server(server) + } + + fn remove_link_server(&self, server: Arc) { + self.links.remove_link_server(server) + } +} + +#[derive(Debug)] +struct Inner { + backends: Vec, + client: Client, +} + +impl Inner { + async fn handle(&self, req: Request) -> Result, hyper::Error> { + let uri = self.pick_backend(); + + let (mut parts, body) = req.into_parts(); + + // build URI + let mut uri = uri.to_owned(); + uri.push_str(parts.uri.path()); + if let Some(q) = parts.uri.query() { + uri.push('?'); + uri.push_str(q); + } + parts.uri = uri.parse().unwrap(); + + let req = Request::from_parts(parts, body); + self.client.request(req).await + } + + fn pick_backend(&self) -> &str { + let mut rng = rand::thread_rng(); + self.backends.choose(&mut rng).expect("not empty") + } +} diff --git a/test_helpers_end_to_end/src/lib.rs b/test_helpers_end_to_end/src/lib.rs index e6d1db77a07..8fa331c9ebb 100644 --- a/test_helpers_end_to_end/src/lib.rs +++ b/test_helpers_end_to_end/src/lib.rs @@ -16,9 +16,11 @@ mod data_generator; mod database; mod error; mod grpc; +mod http_reverse_proxy; mod mini_cluster; mod server_fixture; mod server_type; +mod service_link; pub mod snapshot_comparison; mod steps; mod udp_listener; @@ -30,6 +32,7 @@ pub use config::TestConfig; pub use data_generator::DataGenerator; pub use 
error::{check_flight_error, check_tonic_status}; pub use grpc::GrpcRequestBuilder; +pub use http_reverse_proxy::HttpReverseProxy; pub use mini_cluster::MiniCluster; pub use server_fixture::{ServerFixture, TestServer}; pub use server_type::{AddAddrEnv, ServerType}; diff --git a/test_helpers_end_to_end/src/mini_cluster.rs b/test_helpers_end_to_end/src/mini_cluster.rs index 5ab9833bc2c..49115f056e1 100644 --- a/test_helpers_end_to_end/src/mini_cluster.rs +++ b/test_helpers_end_to_end/src/mini_cluster.rs @@ -1,6 +1,8 @@ use crate::{ - dump_log_to_stdout, log_command, rand_id, server_type::AddAddrEnv, write_to_ingester, - write_to_router, ServerFixture, TestConfig, TestServer, + dump_log_to_stdout, log_command, rand_id, + server_type::AddAddrEnv, + service_link::{link_services, LinkableService}, + write_to_ingester, write_to_router, HttpReverseProxy, ServerFixture, TestConfig, TestServer, }; use arrow::{datatypes::SchemaRef, record_batch::RecordBatch}; use arrow_flight::{ @@ -50,6 +52,12 @@ pub struct MiniCluster { /// Standard optional compactor configuration, to be used on-demand compactor_config: Option, + /// Catalog reverse proxy. + catalog_reverse_proxy: Option>, + + /// Catalog cache servers. + catalog: Vec, + // Potentially helpful data org_id: String, bucket_id: String, @@ -97,6 +105,8 @@ impl MiniCluster { ingesters: Vec, querier: Option, compactor_config: Option, + catalog: Vec, + catalog_reverse_proxy: Option>, ) -> Self { let org_id = rand_id(); let bucket_id = rand_id(); @@ -107,6 +117,8 @@ impl MiniCluster { ingesters, querier, compactor_config, + catalog, + catalog_reverse_proxy, org_id, bucket_id, @@ -202,13 +214,24 @@ impl MiniCluster { /// querier. Save config for a compactor, but the compactor service should be run on-demand in /// tests using `compactor run-once` rather than using `run compactor`. pub async fn create_non_shared(database_url: String) -> Self { - let ingester_config = TestConfig::new_ingester(&database_url); + let catalog_configs = TestConfig::catalog_nodes(&database_url); + let catalog_reverse_proxy = Arc::new(HttpReverseProxy::new( + catalog_configs + .iter() + .map(|cfg| cfg.addrs().catalog_grpc_api().client_base()), + )); + + let ingester_config = + TestConfig::new_ingester(format!("http://{}", catalog_reverse_proxy.addr())); let router_config = TestConfig::new_router(&ingester_config); let querier_config = TestConfig::new_querier(&ingester_config); let compactor_config = TestConfig::new_compactor(&ingester_config); // Set up the cluster ==================================== Self::new() + .with_catalog(catalog_configs) + .await + .with_catalog_reverse_proxy(catalog_reverse_proxy) .with_ingester(ingester_config) .await .with_router(router_config) @@ -223,13 +246,26 @@ impl MiniCluster { /// compactor service should be run on-demand in tests using `compactor run-once` rather than /// using `run compactor`. 
pub async fn create_non_shared_never_persist(database_url: String) -> Self { - let ingester_config = TestConfig::new_ingester_never_persist(&database_url); + let catalog_configs = TestConfig::catalog_nodes(&database_url); + let catalog_reverse_proxy = Arc::new(HttpReverseProxy::new( + catalog_configs + .iter() + .map(|cfg| cfg.addrs().catalog_grpc_api().client_base()), + )); + + let ingester_config = TestConfig::new_ingester_never_persist(format!( + "http://{}", + catalog_reverse_proxy.addr() + )); let router_config = TestConfig::new_router(&ingester_config); let querier_config = TestConfig::new_querier(&ingester_config); let compactor_config = TestConfig::new_compactor(&ingester_config); // Set up the cluster ==================================== Self::new() + .with_catalog(catalog_configs) + .await + .with_catalog_reverse_proxy(catalog_reverse_proxy) .with_ingester(ingester_config) .await .with_router(router_config) @@ -247,9 +283,17 @@ impl MiniCluster { /// than using `run compactor`. pub async fn create_non_shared_with_authz( database_url: String, - authz_addr: impl Into + Clone, + authz_addr: impl Into + Clone + Send, ) -> Self { - let ingester_config = TestConfig::new_ingester(&database_url); + let catalog_configs = TestConfig::catalog_nodes(&database_url); + let catalog_reverse_proxy = Arc::new(HttpReverseProxy::new( + catalog_configs + .iter() + .map(|cfg| cfg.addrs().catalog_grpc_api().client_base()), + )); + + let ingester_config = + TestConfig::new_ingester(format!("http://{}", catalog_reverse_proxy.addr())); let router_config = TestConfig::new_router(&ingester_config).with_single_tenancy(authz_addr.clone()); let querier_config = @@ -258,6 +302,9 @@ impl MiniCluster { // Set up the cluster ==================================== Self::new_based_on_tenancy(true) + .with_catalog(catalog_configs) + .await + .with_catalog_reverse_proxy(catalog_reverse_proxy) .with_ingester(ingester_config) .await .with_router(router_config) @@ -280,20 +327,36 @@ impl MiniCluster { /// create a router with the specified configuration pub async fn with_router(mut self, router_config: TestConfig) -> Self { - self.router = Some(ServerFixture::create(router_config).await); + assert!(self.router.is_none()); + let fixture = ServerFixture::create(router_config).await; + self.add_catalog_reverse_proxy_client(fixture.strong()); + self.add_ingester_client(fixture.strong()); + self.router = Some(fixture); self } /// create an ingester with the specified configuration; pub async fn with_ingester(mut self, ingester_config: TestConfig) -> Self { - self.ingesters - .push(ServerFixture::create(ingester_config).await); + let fixture = ServerFixture::create(ingester_config).await; + self.add_catalog_reverse_proxy_client(fixture.strong()); + self.ingesters.push(fixture); self } + fn add_ingester_client(&self, client: Arc) { + for ingester in &self.ingesters { + let ingester = ingester.strong(); + link_services(ingester, Arc::clone(&client)); + } + } + /// create a querier with the specified configuration; pub async fn with_querier(mut self, querier_config: TestConfig) -> Self { - self.querier = Some(ServerFixture::create(querier_config).await); + assert!(self.querier.is_none()); + let fixture = ServerFixture::create(querier_config).await; + self.add_catalog_reverse_proxy_client(fixture.strong()); + self.add_ingester_client(fixture.strong()); + self.querier = Some(fixture); self } @@ -302,6 +365,34 @@ impl MiniCluster { self } + /// create an catalog with the specified configuration; + pub async fn with_catalog(mut self, 
catalog_configs: [TestConfig; 3]) -> Self { + assert!(self.catalog.is_empty()); + self.catalog = ServerFixture::create_multiple(catalog_configs).await; + self + } + + fn add_catalog_client(&self, client: Arc) { + for catalog in &self.catalog { + let catalog = catalog.strong(); + link_services(catalog, Arc::clone(&client)); + } + } + + /// Register catalog reverse proxy. + pub fn with_catalog_reverse_proxy(mut self, proxy: Arc) -> Self { + assert!(self.catalog_reverse_proxy.is_none()); + self.add_catalog_client(Arc::clone(&proxy) as _); + self.catalog_reverse_proxy = Some(proxy); + self + } + + fn add_catalog_reverse_proxy_client(&self, client: Arc) { + if let Some(proxy) = &self.catalog_reverse_proxy { + link_services(Arc::clone(proxy) as _, client); + } + } + /// Retrieve the underlying router server, if set pub fn router(&self) -> &ServerFixture { self.router.as_ref().expect("router not initialized") @@ -344,8 +435,10 @@ impl MiniCluster { /// /// [`GRACEFUL_SERVER_STOP_TIMEOUT`]: /// crate::server_fixture::GRACEFUL_SERVER_STOP_TIMEOUT - pub fn gracefully_stop_ingesters(&mut self) { - self.ingesters = vec![]; + pub async fn gracefully_stop_ingesters(&mut self) { + for ingester in self.ingesters.drain(..) { + ingester.shutdown().await; + } } /// Restart querier. @@ -485,7 +578,7 @@ impl MiniCluster { /// org/bucket pub async fn write_to_router( &self, - line_protocol: impl Into, + line_protocol: impl Into + Send, authorization: Option<&str>, ) -> Response { write_to_router( @@ -499,7 +592,11 @@ impl MiniCluster { } /// Write to the ingester using the gRPC interface directly, rather than through a router. - pub async fn write_to_ingester(&self, line_protocol: impl Into, table_name: &str) { + pub async fn write_to_ingester( + &self, + line_protocol: impl Into + Send, + table_name: &str, + ) { write_to_ingester( line_protocol, self.namespace_id().await, @@ -675,6 +772,8 @@ struct SharedServers { ingesters: Vec>, querier: Option>, compactor_config: Option, + catalog: Vec>, + catalog_reverse_proxy: Option>, } /// Deferred creation of a mini cluster @@ -683,6 +782,8 @@ struct CreatableMiniCluster { ingesters: Vec>, querier: Option>, compactor_config: Option, + catalog: Vec>, + catalog_reverse_proxy: Option>, } async fn create_if_needed(server: Option>) -> Option { @@ -693,6 +794,17 @@ async fn create_if_needed(server: Option>) -> Option> + Send, +) -> Vec { + servers + .into_iter() + .map(|server| async move { ServerFixture::create_from_existing(server).await }) + .collect::>() + .collect::>() + .await +} + impl CreatableMiniCluster { async fn create(self) -> MiniCluster { let Self { @@ -700,37 +812,36 @@ impl CreatableMiniCluster { ingesters, querier, compactor_config, + catalog, + catalog_reverse_proxy, } = self; let router_fixture = create_if_needed(router).await; - let ingester_fixtures = ingesters - .into_iter() - .map(|ingester| create_if_needed(Some(ingester))) - .collect::>() - .collect::>() - .await - .into_iter() - .flatten() - .collect(); + let ingester_fixtures = create_if_needed_many(ingesters).await; let querier_fixture = create_if_needed(querier).await; + let catalog_fixtures = create_if_needed_many(catalog).await; MiniCluster::new_from_fixtures( router_fixture, ingester_fixtures, querier_fixture, compactor_config, + catalog_fixtures, + catalog_reverse_proxy, ) } } impl SharedServers { /// Save the server processes in this shared servers as weak references - pub fn new(cluster: &MiniCluster) -> Self { + pub(crate) fn new(cluster: &MiniCluster) -> Self { Self { router: 
cluster.router.as_ref().map(|c| c.weak()), ingesters: cluster.ingesters.iter().map(|c| c.weak()).collect(), querier: cluster.querier.as_ref().map(|c| c.weak()), compactor_config: cluster.compactor_config.clone(), + catalog: cluster.catalog.iter().map(|c| c.weak()).collect(), + catalog_reverse_proxy: cluster.catalog_reverse_proxy.as_ref().map(Arc::downgrade), } } @@ -742,13 +853,11 @@ impl SharedServers { // aren't present so that the cluster is recreated correctly Some(CreatableMiniCluster { router: server_from_weak(self.router.as_ref())?, - ingesters: self - .ingesters - .iter() - .flat_map(|ingester| server_from_weak(Some(ingester)).unwrap()) - .collect(), + ingesters: servers_from_weak(&self.ingesters)?, querier: server_from_weak(self.querier.as_ref())?, compactor_config: self.compactor_config.clone(), + catalog: servers_from_weak(&self.catalog)?, + catalog_reverse_proxy: server_from_weak(self.catalog_reverse_proxy.as_ref())?, }) } } @@ -756,7 +865,7 @@ impl SharedServers { /// Returns None if there was a weak server but we couldn't upgrade. /// Returns Some(None) if there was no weak server /// Returns Some(Some(fixture)) if there was a weak server that we can upgrade and make a fixture from -fn server_from_weak(server: Option<&Weak>) -> Option>> { +fn server_from_weak(server: Option<&Weak>) -> Option>> { if let Some(server) = server.as_ref() { // return None if can't upgrade let server = server.upgrade()?; @@ -767,6 +876,20 @@ fn server_from_weak(server: Option<&Weak>) -> Option(servers: impl IntoIterator>) -> Option>> +where + T: 'a, +{ + let mut out = vec![]; + + for server in servers { + out.push(server.upgrade()?); + } + + Some(out) +} + static GLOBAL_SHARED_SERVERS: Lazy>> = Lazy::new(|| Mutex::new(None)); static GLOBAL_SHARED_SERVERS_NEVER_PERSIST: Lazy>> = Lazy::new(|| Mutex::new(None)); diff --git a/test_helpers_end_to_end/src/server_fixture.rs b/test_helpers_end_to_end/src/server_fixture.rs index 1e667885cee..03ce374de7e 100644 --- a/test_helpers_end_to_end/src/server_fixture.rs +++ b/test_helpers_end_to_end/src/server_fixture.rs @@ -16,13 +16,18 @@ use tempfile::NamedTempFile; use test_helpers::timeout::FutureTimeout; use tokio::sync::Mutex; -use crate::{database::initialize_db, dump_log_to_stdout, log_command, server_type::AddAddrEnv}; +use crate::{ + database::initialize_db, + dump_log_to_stdout, log_command, + server_type::AddAddrEnv, + service_link::{link_services, unlink_services, LinkableService, LinkableServiceImpl}, +}; use super::{addrs::BindAddresses, ServerType, TestConfig}; /// The duration of time a [`TestServer`] is given to gracefully shutdown after /// receiving a SIGTERM, before a SIGKILL is sent to kill it. -pub const GRACEFUL_SERVER_STOP_TIMEOUT: Duration = Duration::from_secs(5); +pub(crate) const GRACEFUL_SERVER_STOP_TIMEOUT: Duration = Duration::from_secs(5); /// Represents a server that has been started and is available for /// testing. @@ -45,6 +50,20 @@ impl ServerFixture { Self::create_from_existing(Arc::new(server)).await } + /// Create multiple, potentially interdependent sever fixtures concurrently because [`create](Self::create) only + /// returns when health is OK. 
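`with_catalog` relies on `ServerFixture::create_multiple`, documented above and defined next, because the three catalog nodes list each other as peers and only report healthy once started together. A sketch of the same call in isolation, assuming the `ServerFixture` and `TestConfig` re-exports:

```rust
use test_helpers_end_to_end::{ServerFixture, TestConfig};

// Sketch: start the three catalog cache nodes concurrently. Starting them
// one at a time can stall on the readiness check, because each node waits
// for its peers to come up.
async fn example(dsn: String) {
    let configs = TestConfig::catalog_nodes(dsn);
    let fixtures: Vec<ServerFixture> = ServerFixture::create_multiple(configs).await;
    assert_eq!(fixtures.len(), 3);
}
```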
+ pub async fn create_multiple( + test_configs: impl IntoIterator + Send, + ) -> Vec { + let test_configs = test_configs.into_iter().collect::>(); + let n_configs = test_configs.len(); + futures::stream::iter(test_configs) + .map(|cfg| async move { Self::create(cfg).await }) + .buffered(n_configs) + .collect::>() + .await + } + /// Create a new server fixture that shares the same TestServer, /// but has its own connections pub(crate) async fn create_from_existing(server: Arc) -> Self { @@ -62,21 +81,50 @@ impl ServerFixture { /// /// This will break all currently connected clients! pub async fn restart_server(self) -> Self { + // unlink clients because we are going to drop the server + let clients = unlink_services(Arc::clone(&self.server) as _); + // get the underlying server, if possible let mut server = match Arc::try_unwrap(self.server) { Ok(s) => s, Err(_) => panic!("Can not restart server as it is shared"), }; + // disconnect so server doesn't wait for our client + drop(self.connections); + server.restart().await; let connections = server.wait_until_ready().await; + let server = Arc::new(server); + + // relink clients + for client in clients { + link_services(Arc::clone(&server) as _, client); + } Self { - server: Arc::new(server), + server, connections, } } + /// Shutdown server in a clean way and wait for process to exit. + pub async fn shutdown(self) { + // unlink clients because we are going to drop the server + unlink_services(Arc::clone(&self.server) as _); + + // get the underlying server, if possible + let mut server = match Arc::try_unwrap(self.server) { + Ok(s) => s, + Err(_) => panic!("Can not restart server as it is shared"), + }; + + // disconnect so server doesn't wait for our client + drop(self.connections); + + server.stop().await; + } + pub fn connections(&self) -> &Connections { &self.connections } @@ -118,9 +166,24 @@ impl ServerFixture { self.server.addrs().querier_grpc_api().client_base() } + /// Return the http base URL for the catalog HTTP API + pub fn catalog_http_base(&self) -> Arc { + self.server.addrs().catalog_http_api().client_base() + } + + /// Return the grpc base URL for the catalog gRPC API + pub fn catalog_grpc_base(&self) -> Arc { + self.server.addrs().catalog_grpc_api().client_base() + } + /// Return log path for server process. 
- pub async fn log_path(&self) -> Box { - self.server.server_process.lock().await.log_path.clone() + pub fn log_path(&self) -> Box { + self.server.log_path.clone() + } + + /// Get a strong reference to the underlying `TestServer` + pub(crate) fn strong(&self) -> Arc { + Arc::clone(&self.server) } /// Get a weak reference to the underlying `TestServer` @@ -136,6 +199,7 @@ enum ServerState { Starting, Ready, Error, + Stopped, } /// Mananges some number of gRPC connections @@ -149,6 +213,9 @@ pub struct Connections { /// connection to querier gRPC, if available querier_grpc_connection: Option, + + /// connection to catalog gRPC, if available + catalog_grpc_connection: Option, } impl Connections { @@ -183,6 +250,14 @@ impl Connections { .clone() } + /// Return a channel connected to the gRPC API, panic'ing if not the correct type of server + pub fn catalog_grpc_connection(&self) -> Connection { + self.catalog_grpc_connection + .as_ref() + .expect("Server type does not have router") + .clone() + } + /// (re)establish channels to all gRPC services that were started with the specified test config async fn reconnect(&mut self, test_config: &TestConfig) -> Result<(), String> { let server_type = test_config.server_type(); @@ -223,6 +298,18 @@ impl Connections { _ => None, }; + self.catalog_grpc_connection = match server_type { + ServerType::Catalog => { + let client_base = test_config.addrs().catalog_grpc_api().client_base(); + Some( + grpc_channel(test_config, client_base.as_ref()) + .await + .map_err(|e| format!("Cannot connect to catalog at {client_base}: {e}"))?, + ) + } + _ => None, + }; + Ok(()) } } @@ -250,31 +337,38 @@ pub struct TestServer { /// Is the server ready to accept connections? ready: Mutex, + /// Path to log file. + log_path: Box, + /// Handle to the server process being controlled - server_process: Arc>, + server_process: Arc>>, /// Configuration values for starting the test server test_config: TestConfig, -} -#[derive(Debug)] -struct Process { - child: Child, - log_path: Box, + /// Service links. + links: LinkableServiceImpl, } impl TestServer { async fn new(test_config: TestConfig) -> Self { let ready = Mutex::new(ServerState::Started); - let server_process = Arc::new(Mutex::new( - Self::create_server_process(&test_config, None).await, - )); + let (_log_file, log_path) = NamedTempFile::new() + .expect("opening log file") + .keep() + .expect("expected to keep"); + + let server_process = Arc::new(Mutex::new(Some( + Self::create_server_process(&test_config, &log_path).await, + ))); Self { ready, + log_path: log_path.into_boxed_path(), server_process, test_config, + links: Default::default(), } } @@ -283,39 +377,57 @@ impl TestServer { self.test_config.addrs() } + /// Stop server. 
+ async fn stop(&mut self) { + let mut ready_guard = self.ready.lock().await; + let mut server_lock = self.server_process.lock().await; + + Self::stop_inner( + &mut ready_guard, + &mut server_lock, + self.test_config.server_type(), + ) + .await; + } + + async fn stop_inner( + ready: &mut ServerState, + server_process: &mut Option, + t: ServerType, + ) { + let server_process = server_process.take().expect("server process exists"); + tokio::task::spawn_blocking(move || { + kill_politely(server_process, Duration::from_secs(5), t); + }) + .await + .expect("kill politely worked"); + + *ready = ServerState::Stopped; + } + /// Restarts the tests server process, but does not reconnect clients async fn restart(&mut self) { let mut ready_guard = self.ready.lock().await; - let mut server_process = self.server_process.lock().await; - kill_politely(&mut server_process.child, Duration::from_secs(5)); - *server_process = - Self::create_server_process(&self.test_config, Some(server_process.log_path.clone())) - .await; + let mut server_lock = self.server_process.lock().await; + + Self::stop_inner( + &mut ready_guard, + &mut server_lock, + self.test_config.server_type(), + ) + .await; + + *server_lock = Some(Self::create_server_process(&self.test_config, &self.log_path).await); *ready_guard = ServerState::Started; } - async fn create_server_process( - test_config: &TestConfig, - log_path: Option>, - ) -> Process { + async fn create_server_process(test_config: &TestConfig, log_path: &Path) -> Child { // Create a new file each time and keep it around to aid debugging - let (log_file, log_path) = match log_path { - Some(log_path) => ( - OpenOptions::new() - .read(true) - .append(true) - .open(&log_path) - .expect("log file should still be there"), - log_path, - ), - None => { - let (log_file, log_path) = NamedTempFile::new() - .expect("opening log file") - .keep() - .expect("expected to keep"); - (log_file, log_path.into_boxed_path()) - } - }; + let log_file = OpenOptions::new() + .read(true) + .append(true) + .open(log_path) + .expect("log file should still be there"); let stdout_log_file = log_file .try_clone() @@ -362,9 +474,7 @@ impl TestServer { log_command(command); - let child = command.spawn().unwrap(); - - Process { child, log_path } + command.spawn().unwrap() } /// Polls the various services to ensure the server is @@ -387,6 +497,9 @@ impl TestServer { ServerState::Error => { panic!("Server was previously found to be in Error, aborting"); } + ServerState::Stopped => { + panic!("Server was stopped"); + } }; } @@ -416,14 +529,16 @@ impl TestServer { let server_process = Arc::clone(&self.server_process); let try_http_connect = async { let client = reqwest::Client::new(); - let url = format!("{}/health", self.addrs().router_http_api().client_base()); - let mut interval = tokio::time::interval(Duration::from_millis(1000)); + let url = format!("{}/health", self.test_config.http_base()); + let mut interval = tokio::time::interval(Duration::from_millis(100)); loop { if server_dead(server_process.as_ref()).await { break; } match client.get(&url).send().await { - Ok(resp) => { + Ok(resp) + if resp.status().is_success() || !self.test_config.wait_for_ready() => + { info!( "Successfully got a response from {:?} HTTP: {:?}", self.test_config.server_type(), @@ -431,6 +546,14 @@ impl TestServer { ); return; } + Ok(resp) => { + info!( + "Waiting for {:?} HTTP server to be up: {:?}", + self.test_config.server_type(), + resp + ); + return; + } Err(e) => { info!( "Waiting for {:?} HTTP server to be up: {}", @@ -471,7 
+594,7 @@ impl TestServer { pub async fn wait_for_grpc(&self, connections: &Connections) { let server_process = Arc::clone(&self.server_process); - let mut interval = tokio::time::interval(Duration::from_millis(1000)); + let mut interval = tokio::time::interval(Duration::from_millis(100)); let server_type = self.test_config.server_type(); loop { @@ -486,6 +609,20 @@ impl TestServer { `influxdb_iox compactor run-once` instead" ); } + ServerType::Catalog => { + if check_catalog_v2_service_health( + server_type, + connections.catalog_grpc_connection(), + self.test_config.wait_for_ready(), + ) + .await + { + return; + } + } + ServerType::ParquetCache => { + unimplemented!("ParquetCache server should not use grpc, only http"); + } ServerType::Router => { if check_catalog_service_health( server_type, @@ -544,6 +681,24 @@ impl TestServer { } } +impl LinkableService for TestServer { + fn add_link_client(&self, client: Weak) { + self.links.add_link_client(client) + } + + fn remove_link_clients(&self) -> Vec> { + self.links.remove_link_clients() + } + + fn add_link_server(&self, server: Arc) { + self.links.add_link_server(server) + } + + fn remove_link_server(&self, server: Arc) { + self.links.remove_link_server(server) + } +} + /// checks catalog service health, as a proxy for all gRPC /// services. Returns false if the service should be checked again async fn check_catalog_service_health(server_type: ServerType, connection: Connection) -> bool { @@ -568,6 +723,35 @@ async fn check_catalog_service_health(server_type: ServerType, connection: Conne } } +/// checks catalog service V2 health, as a proxy for all gRPC +/// services. Returns false if the service should be checked again +async fn check_catalog_v2_service_health( + server_type: ServerType, + connection: Connection, + wait_for_ready: bool, +) -> bool { + let mut health = influxdb_iox_client::health::Client::new(connection); + + match health + .check("influxdata.iox.catalog.v2.CatalogService") + .await + { + Ok(ready) => { + if ready || !wait_for_ready { + info!("CatalogService service {:?} is running", server_type); + true + } else { + info!("CatalogService {:?} is not running", server_type); + false + } + } + Err(e) => { + info!("CatalogService {:?} not yet healthy: {:?}", server_type, e); + false + } + } +} + /// checks the arrow service service health, returning false if the service should be checked again async fn check_arrow_service_health(server_type: ServerType, connection: Connection) -> bool { let mut health = influxdb_iox_client::health::Client::new(connection); @@ -606,24 +790,74 @@ impl Drop for TestServer { .try_lock() .expect("should be able to get a server process lock"); - server_dead_inner(server_lock.deref_mut()); - kill_politely(&mut server_lock.child, GRACEFUL_SERVER_STOP_TIMEOUT); + if let Some(server_process) = server_lock.take() { + let test_config = self.test_config.clone(); + let log_path = self.log_path.clone(); + let links = self.links.clone(); - dump_log_to_stdout( - &format!("{:?}", self.test_config.server_type()), - &server_lock.log_path, - ); + let kill_and_dump = move || { + kill_politely( + server_process, + GRACEFUL_SERVER_STOP_TIMEOUT, + test_config.server_type(), + ); + + dump_log_to_stdout(&format!("{:?}", test_config.server_type()), &log_path); + + // keep links til server is actually gone + drop(links); + + // keep test config til the very last because it contains the WAL dir + drop(test_config); + }; + + // if there's still a tokio runtime around, use that to help the shut down process, because our 
client + // connections need to interact with the HTTP/2 shutdown and we shall not block the runtime during that + match tokio::runtime::Handle::try_current() { + Ok(handle) => { + // tokio might decide to not schedule our future, in which case we still want to kill the child, so + // we wrap the kill method into a helper that is either executed within a tokio context or is + // executed when tokio drops it. + let mut kill_and_dump = ExecOnDrop(Some(Box::new(kill_and_dump))); + handle.spawn_blocking(move || { + kill_and_dump.maybe_exec(); + }); + } + Err(_) => { + kill_and_dump(); + } + } + } + } +} + +struct ExecOnDrop(Option>); + +impl ExecOnDrop { + fn maybe_exec(&mut self) { + if let Some(f) = self.0.take() { + f(); + } + } +} + +impl Drop for ExecOnDrop { + fn drop(&mut self) { + self.maybe_exec(); } } /// returns true if the server process has exited (for any reason), and /// prints what happened to stdout -async fn server_dead(server_process: &Mutex) -> bool { - server_dead_inner(server_process.lock().await.deref_mut()) +async fn server_dead(server_process: &Mutex>) -> bool { + match server_process.lock().await.deref_mut() { + Some(server_process) => server_dead_inner(server_process), + None => true, + } } -fn server_dead_inner(server_process: &mut Process) -> bool { - match server_process.child.try_wait() { +fn server_dead_inner(server_process: &mut Child) -> bool { + match server_process.try_wait() { Ok(None) => false, Ok(Some(status)) => { warn!("Server process exited: {}", status); @@ -637,7 +871,16 @@ fn server_dead_inner(server_process: &mut Process) -> bool { } /// Attempt to kill a child process politely. -fn kill_politely(child: &mut Child, wait: Duration) { +fn kill_politely(mut child: Child, wait: Duration, t: ServerType) { + if server_dead_inner(&mut child) { + // fast path + return; + } + + kill_politely_inner(&mut child, wait, t); +} + +fn kill_politely_inner(child: &mut Child, wait: Duration, t: ServerType) { use nix::{ sys::{ signal::{self, Signal}, @@ -652,23 +895,23 @@ fn kill_politely(child: &mut Child, wait: Duration) { let wait_errored = match signal::kill(pid, Signal::SIGTERM) { Ok(()) => wait_timeout(pid, wait).is_err(), Err(e) => { - info!("Error sending SIGTERM to child: {e}"); + info!("Error sending SIGTERM to child ({t:?}): {e}"); true } }; if wait_errored { // timeout => kill it - info!("Cannot terminate child politely, using SIGKILL..."); + warn!("Cannot terminate child ({t:?}) politely, using SIGKILL..."); if let Err(e) = signal::kill(pid, Signal::SIGKILL) { - info!("Error sending SIGKILL to child: {e}"); + info!("Error sending SIGKILL to child ({t:?}): {e}"); } if let Err(e) = waitpid(pid, None) { - info!("Cannot wait for child: {e}"); + info!("Cannot wait for child ({t:?}): {e}"); } } else { - info!("Killed child politely"); + info!("Killed child ({t:?}) politely"); } } diff --git a/test_helpers_end_to_end/src/server_type.rs b/test_helpers_end_to_end/src/server_type.rs index 3cd4a346031..ab23f8217b9 100644 --- a/test_helpers_end_to_end/src/server_type.rs +++ b/test_helpers_end_to_end/src/server_type.rs @@ -7,6 +7,8 @@ pub enum ServerType { Router, Querier, Compactor, + Catalog, + ParquetCache, } impl ServerType { @@ -18,6 +20,8 @@ impl ServerType { Self::Router => "router", Self::Querier => "querier", Self::Compactor => "compactor", + Self::Catalog => "catalog", + Self::ParquetCache => "parquet-cache", } } } @@ -66,12 +70,25 @@ fn addr_envs(server_type: ServerType, addrs: &BindAddresses) -> Vec<(&'static st ServerType::Ingester => vec![ ( 
"INFLUXDB_IOX_BIND_ADDR", - addrs.router_http_api().bind_addr().to_string(), + addrs.ingester_http_api().bind_addr().to_string(), ), ( "INFLUXDB_IOX_GRPC_BIND_ADDR", addrs.ingester_grpc_api().bind_addr().to_string(), ), + ( + "INFLUXDB_IOX_GOSSIP_BIND_ADDR", + addrs.ingester_gossip_api().bind_addr().to_string(), + ), + ( + "INFLUXDB_IOX_GOSSIP_SEED_LIST", + addrs + .all_gossip_apis() + .into_iter() + .map(|a| a.bind_addr().to_string()) + .collect::>() + .join(","), + ), ], ServerType::Router => vec![ ( @@ -86,26 +103,79 @@ fn addr_envs(server_type: ServerType, addrs: &BindAddresses) -> Vec<(&'static st "INFLUXDB_IOX_INGESTER_ADDRESSES", addrs.ingester_grpc_api().bind_addr().to_string(), ), + ( + "INFLUXDB_IOX_GOSSIP_BIND_ADDR", + addrs.router_gossip_api().bind_addr().to_string(), + ), + ( + "INFLUXDB_IOX_GOSSIP_SEED_LIST", + addrs + .all_gossip_apis() + .into_iter() + .map(|a| a.bind_addr().to_string()) + .collect::>() + .join(","), + ), ], ServerType::Querier => vec![ ( "INFLUXDB_IOX_BIND_ADDR", - addrs.router_http_api().bind_addr().to_string(), + addrs.querier_http_api().bind_addr().to_string(), ), ( "INFLUXDB_IOX_GRPC_BIND_ADDR", addrs.querier_grpc_api().bind_addr().to_string(), ), + ( + "INFLUXDB_IOX_GOSSIP_BIND_ADDR", + addrs.querier_gossip_api().bind_addr().to_string(), + ), + ( + "INFLUXDB_IOX_GOSSIP_SEED_LIST", + addrs + .all_gossip_apis() + .into_iter() + .map(|a| a.bind_addr().to_string()) + .collect::>() + .join(","), + ), ], ServerType::Compactor => vec![ ( "INFLUXDB_IOX_BIND_ADDR", - addrs.router_http_api().bind_addr().to_string(), + addrs.compactor_http_api().bind_addr().to_string(), ), ( "INFLUXDB_IOX_GRPC_BIND_ADDR", addrs.compactor_grpc_api().bind_addr().to_string(), ), + ( + "INFLUXDB_IOX_GOSSIP_BIND_ADDR", + addrs.compactor_gossip_api().bind_addr().to_string(), + ), + ( + "INFLUXDB_IOX_GOSSIP_SEED_LIST", + addrs + .all_gossip_apis() + .into_iter() + .map(|a| a.bind_addr().to_string()) + .collect::>() + .join(","), + ), + ], + ServerType::Catalog => vec![ + ( + "INFLUXDB_IOX_BIND_ADDR", + addrs.catalog_http_api().bind_addr().to_string(), + ), + ( + "INFLUXDB_IOX_GRPC_BIND_ADDR", + addrs.catalog_grpc_api().bind_addr().to_string(), + ), ], + ServerType::ParquetCache => vec![( + "INFLUXDB_IOX_BIND_ADDR", + addrs.parquet_cache_http_api().bind_addr().to_string(), + )], } } diff --git a/test_helpers_end_to_end/src/service_link.rs b/test_helpers_end_to_end/src/service_link.rs new file mode 100644 index 00000000000..e649f340b19 --- /dev/null +++ b/test_helpers_end_to_end/src/service_link.rs @@ -0,0 +1,99 @@ +//! Helpers to ensure service links are respected during shutdown. +//! +//! This does NOT affect correctness of the tests but often speeds them up because clients (like the ingester +//! communicating with the catalog) no longer get stuck on retries during the shutdown phase (and would be killed after +//! a timeout). +use std::sync::{Arc, Weak}; + +use parking_lot::Mutex; + +/// An abstract service that can be linked in a client-server relationship +pub(crate) trait LinkableService: std::fmt::Debug + Send + Sync { + /// Add new known client. + /// + /// **NOTE: This does NOT perform the opposite operation ([`add_link_server`](Self::add_link_server)) for the + /// client. Use [`link_services`] instead.** + fn add_link_client(&self, client: Weak); + + /// Unlink all clients from this service. + /// + /// **NOTE: This does NOT perform the opposite operation ([`remove_link_server`](Self::remove_link_server)) for the + /// returned clients. 
Use [`unlink_services`] instead.** + fn remove_link_clients(&self) -> Vec>; + + /// Add new known server that should be kept alive until the client is gone. + /// + /// **NOTE: This does NOT perform the opposite operation ([`add_link_client`](Self::add_link_client)) for the + /// client. Use [`link_services`] instead.** + fn add_link_server(&self, server: Arc); + + /// Remove given server. + /// + /// The server will no longer kept alive. This is a no-op if the server is unknown. + /// + /// **NOTE: This does NOT perform the opposite operation ([`remove_link_clients`](Self::remove_link_clients)) for the + /// server. Use [`unlink_services`] instead.** + fn remove_link_server(&self, server: Arc); +} + +/// Simple implementation of [`LinkableService`] that can be used as a struct member. +/// +/// Using this as a struct member and NOT directly is important so that the tracked [`Arc`]s use the actual service +/// struct, not this helper. +#[derive(Debug, Default)] +pub(crate) struct LinkableServiceImpl { + clients: Mutex>>, + servers: Mutex>>, +} + +impl LinkableService for LinkableServiceImpl { + fn add_link_client(&self, client: Weak) { + self.clients.lock().push(client); + } + + fn remove_link_clients(&self) -> Vec> { + let mut guard = self.clients.lock(); + guard + .drain(..) + .filter_map(|client| client.upgrade()) + .collect() + } + + fn add_link_server(&self, server: Arc) { + self.servers.lock().push(server); + } + + fn remove_link_server(&self, server: Arc) { + self.servers + .lock() + .retain(|server2| !Arc::ptr_eq(&server, server2)); + } +} + +impl Clone for LinkableServiceImpl { + fn clone(&self) -> Self { + let clients = self.clients.lock(); + let server = self.servers.lock(); + Self { + clients: Mutex::new(clients.clone()), + servers: Mutex::new(server.clone()), + } + } +} + +/// Cross-link server and client. +pub(crate) fn link_services(server: Arc, client: Arc) { + server.add_link_client(Arc::downgrade(&client)); + client.add_link_server(server); +} + +/// Unlink clients from a given server so it is no longer kept alive. +/// +/// The known clients are returned so they can potentially be re-linked. 
+pub(crate) fn unlink_services(server: Arc) -> Vec> { + let clients = server.remove_link_clients(); + for client in &clients { + client.remove_link_server(Arc::clone(&server)); + } + clients +} diff --git a/test_helpers_end_to_end/src/snapshot_comparison.rs b/test_helpers_end_to_end/src/snapshot_comparison.rs index 6803a35beb0..1aeced7322b 100644 --- a/test_helpers_end_to_end/src/snapshot_comparison.rs +++ b/test_helpers_end_to_end/src/snapshot_comparison.rs @@ -9,12 +9,11 @@ use arrow_util::test_util::{sort_record_batch, Normalizer, REGEX_UUID}; use influxdb_iox_client::format::influxql::{write_columnar, Options, TableBorders}; use once_cell::sync::Lazy; use regex::{Captures, Regex}; -use snafu::{OptionExt, ResultExt, Snafu}; +use snafu::{OptionExt, Snafu}; use sqlx::types::Uuid; use std::collections::HashMap; use std::{ fmt::{Display, Formatter}, - fs, path::{Path, PathBuf}, }; use tonic::Code; @@ -31,28 +30,6 @@ pub enum Error { #[snafu(context(false))] MakingOutputPath { source: OutputPathError }, - - #[snafu(display("Could not write to output file '{:?}': {}", output_path, source))] - WritingToOutputFile { - output_path: PathBuf, - source: std::io::Error, - }, - - #[snafu(display("Could not read expected file '{:?}': {}", path, source))] - ReadingExpectedFile { - path: PathBuf, - source: std::io::Error, - }, - - #[snafu(display( - "Contents of output '{:?}' does not match contents of expected '{:?}'", - output_path, - expected_path, - ))] - OutputMismatch { - output_path: PathBuf, - expected_path: PathBuf, - }, } pub type Result = std::result::Result; @@ -146,96 +123,51 @@ impl Display for Language { pub async fn run( cluster: &mut MiniCluster, - input_path: PathBuf, + input_file_path: PathBuf, setup_name: String, contents: String, language: Language, ) -> Result<()> { // create output and expected output - let output_path = make_output_path(&input_path)?; - let expected_path = { - let mut p = input_path.clone(); - let ext = p - .extension() - .expect("input path missing extension") - .to_str() - .expect("input path extension is not valid UTF-8"); - p.set_extension(format!("{ext}.expected")); - p - }; + let test_name = input_file_path + .file_name() + .expect("input path missing file path") + .to_str() + .expect("input path file path is not valid UTF-8"); + + let output_path = input_file_path.parent().context(NoParentSnafu { + path: &input_file_path, + })?; + let output_path = make_absolute(output_path); - println!("Running case in {input_path:?}"); - println!(" writing output to {output_path:?}"); - println!(" expected output in {expected_path:?}"); + println!("Running case in {input_file_path:?}"); + println!("Producing output in {output_path:?}"); println!("Processing contents:\n{contents}"); let queries = TestQueries::from_lines(contents.lines(), language); + //Build up the test output line by line let mut output = vec![]; output.push(format!("-- Test Setup: {setup_name}")); for q in queries.iter() { + q.add_comments(&mut output); output.push(format!("-- {}: {}", language, q.text())); q.add_description(&mut output); let results = run_query(cluster, q).await?; output.extend(results); } - fs::write(&output_path, output.join("\n")).context(WritingToOutputFileSnafu { - output_path: &output_path, - })?; - - // Now, compare to expected results - let expected_data = fs::read_to_string(&expected_path).context(ReadingExpectedFileSnafu { - path: &expected_path, - })?; - let expected_contents: Vec<_> = expected_data.lines().map(|s| s.to_string()).collect(); - - if expected_contents != output 
{ - let expected_path = make_absolute(&expected_path); - let output_path = make_absolute(&output_path); + // Configure insta to send the results to query_tests/out/.sql.snap + let mut settings = insta::Settings::clone_current(); + settings.set_snapshot_path(output_path); + settings.set_prepend_module_to_snapshot(false); + settings.bind(|| { + let test_output = output.join("\n"); + insta::assert_snapshot!(test_name, test_output); // panic on failure + }); - if std::env::var("CI") - .map(|value| value == "true") - .unwrap_or(false) - { - // In CI, print out the contents because it's inconvenient to access the files and - // you're not going to update the files there. - println!("Expected output does not match actual output"); - println!( - "Diff: \n\n{}", - String::from_utf8( - std::process::Command::new("diff") - .arg("-du") - .arg(&expected_path) - .arg(&output_path) - .output() - .unwrap() - .stdout - ) - .unwrap() - ); - } else { - // When you're not in CI, print out instructions for analyzing the content or updating - // the snapshot. - println!("Expected output does not match actual output"); - println!(" expected output in {expected_path:?}"); - println!(" actual output in {output_path:?}"); - println!("Possibly helpful commands:"); - println!(" # See diff"); - println!(" diff -du {expected_path:?} {output_path:?}"); - println!(" # Update expected"); - println!(" cp -f {output_path:?} {expected_path:?}"); - } - - OutputMismatchSnafu { - output_path, - expected_path, - } - .fail() - } else { - Ok(()) - } + Ok(()) } #[derive(Debug, Snafu)] @@ -250,41 +182,6 @@ pub enum OutputPathError { NoParent { path: PathBuf }, } -/// Return output path for input path. -/// -/// This converts `some/prefix/in/foo.sql` (or other file extensions) to `some/prefix/out/foo.sql.out`. 
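With the hand-rolled `.expected` comparison removed above, snapshot checking now follows the `insta` pattern introduced earlier in this hunk. A condensed sketch of that pattern, using only the `insta` APIs that appear in the replacement code:

```rust
// Sketch: redirect insta snapshots next to the input file instead of the
// crate's default snapshot directory, mirroring the replacement above.
fn assert_case_snapshot(case_dir: &std::path::Path, test_name: &str, output: &str) {
    let mut settings = insta::Settings::clone_current();
    settings.set_snapshot_path(case_dir);
    settings.set_prepend_module_to_snapshot(false);
    settings.bind(|| {
        // Panics (failing the test) if `output` differs from the stored
        // `<test_name>.snap`; `cargo insta review` accepts intended changes.
        insta::assert_snapshot!(test_name, output);
    });
}
```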
-fn make_output_path(input: &Path) -> Result { - let stem = input.file_stem().context(NoFileStemSnafu { path: input })?; - let ext = input - .extension() - .context(MissingFileExtSnafu { path: input })?; - - // go two levels up (from file to dir, from dir to parent dir) - let parent = input.parent().context(NoParentSnafu { path: input })?; - let parent = parent.parent().context(NoParentSnafu { path: parent })?; - let mut out = parent.to_path_buf(); - - // go one level down (from parent dir to out-dir) - out.push("out"); - - // make best effort attempt to create output directory if it - // doesn't exist (it does not on a fresh checkout) - if !out.exists() { - if let Err(e) = std::fs::create_dir(&out) { - panic!("Could not create output directory {out:?}: {e}"); - } - } - - // set file name and ext - out.push(stem); - out.set_extension(format!( - "{}.out", - ext.to_str().expect("extension is not valid UTF-8") - )); - - Ok(out) -} - /// Return the absolute path to `path`, regardless of if it exists on the local filesystem fn make_absolute(path: &Path) -> PathBuf { let mut absolute = std::env::current_dir().expect("cannot get current working directory"); diff --git a/test_helpers_end_to_end/src/snapshot_comparison/queries.rs b/test_helpers_end_to_end/src/snapshot_comparison/queries.rs index 259f88ec721..1008af4a8df 100644 --- a/test_helpers_end_to_end/src/snapshot_comparison/queries.rs +++ b/test_helpers_end_to_end/src/snapshot_comparison/queries.rs @@ -4,13 +4,16 @@ use arrow_util::test_util::Normalizer; /// A query to run with optional annotations #[derive(Debug, PartialEq, Eq, Default)] -pub struct Query { +pub(crate) struct Query { /// Describes how query text should be normalized normalizer: Normalizer, /// Specifies the query language of `text`. language: Language, + /// Comments that precede the query + comments: Vec, + /// The query string text: String, } @@ -22,58 +25,75 @@ impl Query { Self { normalizer: Normalizer::new(), language: Language::Sql, + comments: vec![], text, } } - pub fn text(&self) -> &str { + pub(crate) fn text(&self) -> &str { &self.text } - pub fn language(&self) -> Language { + pub(crate) fn language(&self) -> Language { self.language } - pub fn with_sorted_compare(mut self) -> Self { + /// Add a comment to the query + #[cfg(test)] + pub(crate) fn with_comment(mut self, comment: impl Into) -> Self { + self.comments.push(comment.into()); + self + } + + pub(crate) fn with_sorted_compare(mut self) -> Self { self.normalizer.sorted_compare = true; self } - pub fn with_normalized_uuids(mut self) -> Self { + pub(crate) fn with_normalized_uuids(mut self) -> Self { self.normalizer.normalized_uuids = true; self } - pub fn with_normalize_metrics(mut self) -> Self { + pub(crate) fn with_normalize_metrics(mut self) -> Self { self.normalizer.normalized_metrics = true; self } - pub fn with_normalize_filters(mut self) -> Self { + pub(crate) fn with_normalize_filters(mut self) -> Self { self.normalizer.normalized_filters = true; self } - pub fn with_no_table_borders(mut self) -> Self { + pub(crate) fn with_no_table_borders(mut self) -> Self { self.normalizer.no_table_borders = true; self } /// Take the output of running the query and apply the specified normalizations to them - pub fn normalize_results(&self, results: Vec, language: Language) -> Vec { + pub(crate) fn normalize_results( + &self, + results: Vec, + language: Language, + ) -> Vec { language.normalize_results(&self.normalizer, results) } - /// Adds information on what normalizations were applied to the input - pub fn 
add_description(&self, output: &mut Vec) { + /// Adds any comments from the input to the output + pub(crate) fn add_comments(&self, output: &mut Vec) { + output.extend_from_slice(&self.comments); + } + + /// Adds information to the output about what normalizations were applied + pub(crate) fn add_description(&self, output: &mut Vec) { self.normalizer.add_description(output) } } #[derive(Debug, Default)] struct QueryBuilder { - pub language: Language, - pub query: Query, + pub(crate) language: Language, + pub(crate) query: Query, } impl QueryBuilder { @@ -83,6 +103,9 @@ impl QueryBuilder { ..Default::default() } } + fn push_comment(&mut self, s: &str) { + self.query.comments.push(s.to_string()) + } fn push_str(&mut self, s: &str) { self.query.text.push_str(s) @@ -108,13 +131,13 @@ impl QueryBuilder { /// Poor man's parser to find all the SQL queries in an input file #[derive(Debug, PartialEq, Eq)] -pub struct TestQueries { +pub(crate) struct TestQueries { queries: Vec, } impl TestQueries { /// find all queries (more or less a fancy split on `;` - pub fn from_lines(lines: I, language: Language) -> Self + pub(crate) fn from_lines(lines: I, language: Language) -> Self where I: IntoIterator, S: AsRef, @@ -150,6 +173,10 @@ impl TestQueries { _ => {} } } + } else if line.starts_with("-- IOX_SETUP: ") { + // ignore setup lines + } else if line.starts_with("--") { + builder.push_comment(line); } if line.starts_with("--") { @@ -183,7 +210,7 @@ impl TestQueries { } // Get an iterator over the queries - pub fn iter(&self) -> impl Iterator { + pub(crate) fn iter(&self) -> impl Iterator { self.queries.iter() } } @@ -208,8 +235,8 @@ select * from bar; queries, TestQueries { queries: vec![ - Query::new("select * from foo;"), - Query::new("select * from bar;"), + Query::new("select * from foo;").with_comment("-- This is a test"), + Query::new("select * from bar;").with_comment("-- another comment"), ] } ) @@ -228,7 +255,7 @@ select * from bar TestQueries { queries: vec![ Query::new("select * from foo;"), - Query::new("select * from bar") + Query::new("select * from bar").with_comment("-- no ending semi colon"), ] } ) @@ -290,8 +317,14 @@ select * from waz; TestQueries { queries: vec![ Query::new("select * from foo;"), - Query::new("select * from bar;").with_sorted_compare(), - Query::new("select * from baz;"), + Query::new("select * from bar;") + .with_comment( + "-- The second query should be compared to expected after sorting" + ) + .with_sorted_compare(), + Query::new("select * from baz;").with_comment( + "-- Since this query is not annotated, it should not use exected sorted" + ), Query::new("select * from baz2;"), Query::new("select * from waz;").with_sorted_compare(), ] @@ -324,7 +357,10 @@ select * from foo; assert_eq!( queries, TestQueries { - queries: vec![Query::new("select * from foo;")] + queries: vec![ + // Note the --IOX_COMPARE is not treated as a comment + Query::new("select * from foo;") + ] } ) } diff --git a/test_helpers_end_to_end/src/steps.rs b/test_helpers_end_to_end/src/steps.rs index 8315870ffad..c727de40125 100644 --- a/test_helpers_end_to_end/src/steps.rs +++ b/test_helpers_end_to_end/src/steps.rs @@ -1,19 +1,23 @@ use crate::snapshot_comparison::Language; use crate::{ - check_flight_error, run_influxql, run_sql, snapshot_comparison, try_run_influxql, try_run_sql, - MiniCluster, + check_flight_error, run_influxql, run_influxql_with_params, run_sql, run_sql_with_params, + snapshot_comparison, try_run_influxql, try_run_influxql_with_params, try_run_sql, + try_run_sql_with_params, 
MiniCluster, }; use arrow::record_batch::RecordBatch; use arrow_util::assert_batches_sorted_eq; use futures::future::BoxFuture; use http::StatusCode; +use iox_query_params::StatementParam; use observability_deps::tracing::info; +use std::collections::HashMap; use std::{path::PathBuf, time::Duration}; use test_helpers::assert_contains; const MAX_QUERY_RETRY_TIME_SEC: u64 = 20; /// Test harness for end to end tests that are comprised of several steps +#[allow(missing_debug_implementations)] pub struct StepTest<'a, S> { cluster: &'a mut MiniCluster, @@ -22,6 +26,7 @@ pub struct StepTest<'a, S> { } /// The test state that is passed to custom steps +#[derive(Debug)] pub struct StepTestState<'a> { /// The mini cluster cluster: &'a mut MiniCluster, @@ -154,12 +159,14 @@ impl<'a> StepTestState<'a> { /// }.boxed() /// }); /// ``` -pub type FCustom = Box Fn(&'b mut StepTestState) -> BoxFuture<'b, ()> + Send + Sync>; +pub type FCustom = + Box Fn(&'b mut StepTestState<'_>) -> BoxFuture<'b, ()> + Send + Sync>; /// Function to do custom validation on metrics. Expected to panic on validation failure. -pub type MetricsValidationFn = Box; +pub(crate) type MetricsValidationFn = Box, String) + Send + Sync>; /// Possible test steps that a test can perform +#[allow(missing_debug_implementations)] pub enum Step { /// Writes the specified line protocol to the `/api/v2/write` /// endpoint, assert the data was written successfully @@ -170,6 +177,8 @@ pub enum Step { WriteLineProtocolExpectingError { line_protocol: String, expected_error_code: StatusCode, + expected_error_message: String, + expected_line_number: Option, }, /// Writes the specified line protocol to the `/api/v2/write` endpoint @@ -217,6 +226,16 @@ pub enum Step { expected: Vec<&'static str>, }, + /// Run SQL query using the FlightSQL interface, replacing `$placeholder` variables + /// with the supplied parameters. Then verify that the + /// results match the expected results using the + /// `assert_batches_eq!` macro + QueryWithParams { + sql: String, + params: HashMap, + expected: Vec<&'static str>, + }, + /// Read the SQL queries in the specified file and verify that the results match the expected /// results in the corresponding expected file QueryAndCompare { @@ -233,6 +252,16 @@ pub enum Step { expected_message: String, }, + /// Run SQL query using the FlightSQL interface, replacing `$placeholder` variables + /// with the supplied parameters. Then verify that the + /// request returns the expected error code and message + QueryWithParamsExpectingError { + sql: String, + params: HashMap, + expected_error_code: tonic::Code, + expected_message: String, + }, + /// Run a SQL query using the FlightSQL interface authorized by the /// authorization header. Verify that the /// results match the expected results using the `assert_batches_eq!` @@ -271,6 +300,15 @@ pub enum Step { expected: Vec<&'static str>, }, + /// Run an InfluxQL query using the FlightSQL interface, replacing `$placeholder` variables + /// in the query text with values provided by the params HashMap. 
Then verify that the + /// results match the expected results using the `assert_batches_eq!` macro + InfluxQLQueryWithParams { + query: String, + params: HashMap, + expected: Vec<&'static str>, + }, + /// Read the InfluxQL queries in the specified file and verify that the results match the /// expected results in the corresponding expected file InfluxQLQueryAndCompare { @@ -287,6 +325,16 @@ pub enum Step { expected_message: String, }, + /// Run InfluxQL query using the FlightSQL interface, replacing `$placeholder` variables + /// with the supplied parameters. Then verify that the + /// request returns the expected error code and message + InfluxQLWithParamsExpectingError { + query: String, + params: HashMap, + expected_error_code: tonic::Code, + expected_message: String, + }, + /// Run an InfluxQL query using the FlightSQL interface including an /// authorization header. Verify that the results match the expected /// results using the `assert_batches_eq!` macro. @@ -332,7 +380,7 @@ impl AsRef for Step { impl<'a, S> StepTest<'a, S> where - S: AsRef, + S: AsRef + Send, { /// Create a new test that runs each `step`, in sequence, against /// `cluster` panic'ing if any step fails @@ -382,6 +430,8 @@ where Step::WriteLineProtocolExpectingError { line_protocol, expected_error_code, + expected_error_message, + expected_line_number, } => { info!( "====Begin writing line protocol expecting error to v2 HTTP API:\n{}", @@ -389,6 +439,40 @@ where ); let response = state.cluster.write_to_router(line_protocol, None).await; assert_eq!(response.status(), *expected_error_code); + + let body: serde_json::Value = serde_json::from_slice( + &hyper::body::to_bytes(response.into_body()) + .await + .expect("should be able to read response body"), + ) + .expect("response body should be valid json"); + + assert_matches::assert_matches!( + body["message"], + serde_json::Value::String(ref s) if s.contains(expected_error_message), + "error message did not match: expected '{}' to contain '{}'", + body["message"], + expected_error_message + ); + + match expected_line_number { + Some(line) => { + assert_matches::assert_matches!( + body["line"], + serde_json::Value::Number(ref n) if n == &serde_json::Number::from(*line), + "error line did not match: expected '{}' to be '{}'", + body["line"], + line + ); + } + None => { + assert!( + !body.as_object().unwrap().contains_key("line"), + "error line should not be present" + ); + } + }; + info!("====Done writing line protocol expecting error"); } Step::WriteLineProtocolWithAuthorization { @@ -466,6 +550,27 @@ where assert_batches_sorted_eq!(expected, &batches); info!("====Done running"); } + Step::QueryWithParams { + sql, + params, + expected, + } => { + info!("====Begin running SQL query: {}", sql); + info!("params: {:?}", params); + // run query + let (mut batches, schema) = run_sql_with_params( + sql, + state.cluster.namespace(), + params.clone(), + state.cluster.querier().querier_grpc_connection(), + None, + false, + ) + .await; + batches.push(RecordBatch::new_empty(schema)); + assert_batches_sorted_eq!(expected, &batches); + info!("====Done running"); + } Step::QueryAndCompare { input_path, setup_name, @@ -507,6 +612,29 @@ where info!("====Done running"); } + Step::QueryWithParamsExpectingError { + sql, + params, + expected_error_code, + expected_message, + } => { + info!("====Begin running SQL query expected to error: {}", sql); + + let err = try_run_sql_with_params( + sql, + state.cluster().namespace(), + params.clone(), + 
state.cluster().querier().querier_grpc_connection(), + None, + false, + ) + .await + .unwrap_err(); + + check_flight_error(err, *expected_error_code, Some(expected_message)); + + info!("====Done running"); + } Step::QueryWithAuthorization { sql, authorization, @@ -569,6 +697,26 @@ where assert_batches_sorted_eq!(expected, &batches); info!("====Done running"); } + Step::InfluxQLQueryWithParams { + query, + expected, + params, + } => { + info!("====Begin running InfluxQL query: {}", query); + info!("params: {:?}", params); + // run query + let (mut batches, schema) = run_influxql_with_params( + query, + state.cluster.namespace(), + params.clone(), + state.cluster.querier().querier_grpc_connection(), + None, + ) + .await; + batches.push(RecordBatch::new_empty(schema)); + assert_batches_sorted_eq!(expected, &batches); + info!("====Done running"); + } Step::InfluxQLQueryAndCompare { input_path, setup_name, @@ -612,6 +760,31 @@ where info!("====Done running"); } + Step::InfluxQLWithParamsExpectingError { + query, + params, + expected_error_code, + expected_message, + } => { + info!( + "====Begin running InfluxQL query expected to error: {}", + query + ); + info!("params: {:?}", params); + let err = try_run_influxql_with_params( + query, + state.cluster().namespace(), + params.clone(), + state.cluster().querier().querier_grpc_connection(), + None, + ) + .await + .unwrap_err(); + + check_flight_error(err, *expected_error_code, Some(expected_message)); + + info!("====Done running"); + } Step::InfluxQLQueryWithAuthorization { query, authorization, @@ -650,7 +823,7 @@ where Step::GracefulStopIngesters => { info!("====Gracefully stop all ingesters"); - state.cluster_mut().gracefully_stop_ingesters(); + state.cluster_mut().gracefully_stop_ingesters().await; } Step::VerifiedMetrics(verify) => { info!("====Begin validating metrics"); diff --git a/test_helpers_end_to_end/src/udp_listener.rs b/test_helpers_end_to_end/src/udp_listener.rs index 7a47c0fdac2..02da4149b83 100644 --- a/test_helpers_end_to_end/src/udp_listener.rs +++ b/test_helpers_end_to_end/src/udp_listener.rs @@ -32,6 +32,7 @@ impl ToString for Message { } } +#[derive(Debug)] pub struct UdpCapture { socket_addr: std::net::SocketAddr, join_handle: tokio::task::JoinHandle<()>, @@ -117,7 +118,7 @@ impl UdpCapture { // wait for a message to appear that passes `pred` or the timeout expires pub async fn wait_for
<P>
(&self, pred: P) where - P: FnMut(&Message) -> bool + Copy, + P: FnMut(&Message) -> bool + Copy + Send, { let end = Instant::now() + Duration::from_secs(MAX_WAIT_TIME_SEC); diff --git a/tokio_metrics_bridge/Cargo.toml b/tokio_metrics_bridge/Cargo.toml index 7b2faeeff59..9a43a1a3a18 100644 --- a/tokio_metrics_bridge/Cargo.toml +++ b/tokio_metrics_bridge/Cargo.toml @@ -5,10 +5,13 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] metric = { path = "../metric" } parking_lot = "0.12.1" -tokio = { version = "1.32", features = ["macros", "net", "parking_lot", "rt-multi-thread", "sync", "time"] } +tokio = { version = "1.35", features = ["macros", "net", "parking_lot", "rt-multi-thread", "sync", "time"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] diff --git a/tokio_watchdog/Cargo.toml b/tokio_watchdog/Cargo.toml new file mode 100644 index 00000000000..c050a9295b9 --- /dev/null +++ b/tokio_watchdog/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "tokio_watchdog" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] +metric = { path = "../metric" } +observability_deps = { path = "../observability_deps" } +tokio = { version = "1.35", features = ["macros", "net", "parking_lot", "rt-multi-thread", "sync", "time"] } +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] +test_helpers = { path = "../test_helpers" } diff --git a/tokio_watchdog/src/lib.rs b/tokio_watchdog/src/lib.rs new file mode 100644 index 00000000000..e7e2d759e2e --- /dev/null +++ b/tokio_watchdog/src/lib.rs @@ -0,0 +1,231 @@ +//! Monitors if the tokio runtime still looks healthy. +#![deny(rustdoc::broken_intra_doc_links, rust_2018_idioms)] +#![warn( + missing_copy_implementations, + missing_docs, + clippy::explicit_iter_loop, + // See https://github.com/influxdata/influxdb_iox/pull/1671 + clippy::future_not_send, + clippy::use_self, + clippy::clone_on_ref_ptr, + clippy::todo, + clippy::dbg_macro, + unused_crate_dependencies +)] + +use observability_deps::tracing::warn; + +// Workaround for "unused crate" lint false positives. +use workspace_hack as _; + +use std::time::{Duration, Instant}; + +use metric::{DurationHistogram, Registry, U64Counter}; +use tokio::{ + runtime::Handle, + sync::mpsc::{channel, error::TryRecvError}, +}; + +/// Tokio watchdog config. +#[allow(missing_debug_implementations)] +pub struct WatchdogConfig<'a> { + handle: &'a Handle, + metric_registry: &'a Registry, + runtime_name: &'static str, + tick_duration: Duration, + warn_threshold: Duration, + new_thread_hook: Option>, +} + +impl<'a> WatchdogConfig<'a> { + /// Create new config for given runtime handle and metric registry. + #[must_use] + pub fn new(handle: &'a Handle, metric_registry: &'a Registry) -> Self { + Self { + handle, + metric_registry, + runtime_name: "tokio", + tick_duration: Duration::from_millis(100), + warn_threshold: Duration::from_millis(100), + new_thread_hook: None, + } + } + + /// Set runtime name. + #[must_use] + pub fn with_runtime_name(self, name: &'static str) -> Self { + Self { + runtime_name: name, + ..self + } + } + + /// Set tick duration. + /// + /// The tick duration determines how often the alive check will be performed. + #[must_use] + pub fn with_tick_duration(self, d: Duration) -> Self { + Self { + tick_duration: d, + ..self + } + } + + /// Set warn duration. 
+ /// + /// Determines how long the watchdog waits after each check before it detects a hang. + #[must_use] + pub fn with_warn_duration(self, d: Duration) -> Self { + Self { + warn_threshold: d, + ..self + } + } + + /// Sets a hook that is called when the watchdog thread is created. + /// + /// The hook is called from the new thread. + #[must_use] + pub fn with_new_thread_hook(self, f: F) -> Self + where + F: FnOnce() + Send + 'static, + { + Self { + new_thread_hook: Some(Box::new(f)), + ..self + } + } + + /// Install watchdog. + /// + /// # Panic + /// Panics if the sum of [tick duration](Self::with_tick_duration) and [warn duration](Self::with_warn_duration) is zero. + pub fn install(self) { + let Self { + handle, + metric_registry, + runtime_name, + tick_duration, + warn_threshold, + new_thread_hook, + } = self; + + assert!( + !(tick_duration + warn_threshold).is_zero(), + "sum of tick and warn duration must be non-zero" + ); + + let (tx_request, mut rx_request) = channel::(1); + let (tx_response, mut rx_response) = channel::(1); + + let metric_latency = metric_registry + .register_metric::( + "tokio_watchdog_response_time", + "Response time of the tokio watchdog task", + ) + .recorder(&[("runtime", runtime_name)]); + let metric_hang = metric_registry + .register_metric::( + "tokio_watchdog_hangs", + "Number of hangs detected by the tokio watchdog", + ) + .recorder(&[("runtime", runtime_name)]); + + handle.spawn(async move { + loop { + let Some(start) = rx_request.recv().await else { + return; + }; + + if tx_response.try_send(start.elapsed()).is_err() { + return; + } + } + }); + + std::thread::Builder::new() + .name(format!("tokio watchdog {runtime_name}")) + .spawn(move || { + if let Some(hook) = new_thread_hook { + hook(); + } + + loop { + std::thread::sleep(tick_duration); + + if tx_request.try_send(Instant::now()).is_err() { + return; + } + + std::thread::sleep(warn_threshold); + + let d = match rx_response.try_recv() { + Ok(d) => d, + Err(TryRecvError::Empty) => { + warn!(runtime = runtime_name, "tokio starts hanging",); + metric_hang.inc(1); + + let Some(d) = rx_response.blocking_recv() else { + return; + }; + warn!( + runtime = runtime_name, + hang_secs = d.as_secs_f64(), + "tokio stops hanging", + ); + d + } + Err(TryRecvError::Disconnected) => { + return; + } + }; + + metric_latency.record(d); + } + }) + .expect("start watchdog thread"); + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use test_helpers::tracing::TracingCapture; + + use super::*; + + #[tokio::test] + #[should_panic(expected = "sum of tick and warn duration must be non-zero")] + async fn test_panic_zero_duration() { + let registry = Registry::default(); + WatchdogConfig::new(&Handle::current(), ®istry) + .with_tick_duration(Duration::ZERO) + .with_warn_duration(Duration::ZERO) + .install(); + } + + #[tokio::test] + async fn test() { + let capture = Arc::new(TracingCapture::new()); + let registry = Registry::default(); + let tick_duration = Duration::from_millis(100); + let warn_threshold = Duration::from_millis(200); + + let capture2 = Arc::clone(&capture); + WatchdogConfig::new(&Handle::current(), ®istry) + .with_tick_duration(tick_duration) + .with_warn_duration(warn_threshold) + .with_new_thread_hook(move || { + capture2.register_in_current_thread(); + }) + .install(); + + std::thread::sleep(warn_threshold * 2); + tokio::time::sleep(tick_duration * 2).await; + + let logs = capture.to_string(); + assert!(logs.contains("tokio starts hanging")); + assert!(logs.contains("tokio stops hanging")); + } +} 
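The new tokio_watchdog crate introduced above is wired in by the caller. A sketch of typical usage, using only the API shown in the patch; the runtime name and thresholds are placeholders, and this must run inside a tokio runtime so that `Handle::current()` works:

    use std::{sync::Arc, time::Duration};

    use metric::Registry;
    use tokio::runtime::Handle;
    use tokio_watchdog::WatchdogConfig;

    fn install_watchdog(registry: &Arc<Registry>) {
        // Ping the current runtime every 100ms; if a reply does not arrive
        // within the extra 1s warn threshold, log a warning and bump the
        // `tokio_watchdog_hangs` counter.
        WatchdogConfig::new(&Handle::current(), registry)
            .with_runtime_name("io")
            .with_tick_duration(Duration::from_millis(100))
            .with_warn_duration(Duration::from_secs(1))
            .install();
    }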
diff --git a/tower_trailer/Cargo.toml b/tower_trailer/Cargo.toml new file mode 100644 index 00000000000..e2da6380911 --- /dev/null +++ b/tower_trailer/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "tower_trailer" +description = "Allow to send HTTP/2 trailer using a tower layer" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] +futures = "0.3" +http = "0.2" +http-body = "0.4" +parking_lot = "0.12" +pin-project = "1.1" +tower = "0.4" +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] diff --git a/tower_trailer/src/lib.rs b/tower_trailer/src/lib.rs new file mode 100644 index 00000000000..153d0c1fec0 --- /dev/null +++ b/tower_trailer/src/lib.rs @@ -0,0 +1,194 @@ +#![deny(rustdoc::broken_intra_doc_links, rustdoc::bare_urls, rust_2018_idioms)] +#![warn( + missing_debug_implementations, + clippy::explicit_iter_loop, + clippy::use_self, + clippy::clone_on_ref_ptr, + // See https://github.com/influxdata/influxdb_iox/pull/1671 + clippy::future_not_send, + clippy::todo, + clippy::dbg_macro, + unused_crate_dependencies +)] + +use std::future::Future; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use futures::ready; +use http::{Request, Response}; +use http_body::SizeHint; +use parking_lot::Mutex; +use pin_project::pin_project; +use tower::{Layer, Service}; + +// Workaround for "unused crate" lint false positives. +use workspace_hack as _; + +// re-export public types +pub use http::HeaderMap; + +/// Layer that installs [`Trailers`] as a [request extension](Request::extensions). +#[derive(Debug, Clone, Default)] +#[allow(missing_copy_implementations)] +pub struct TrailerLayer; + +impl Layer for TrailerLayer { + type Service = TrailerService; + + fn layer(&self, service: S) -> Self::Service { + TrailerService { service } + } +} + +#[derive(Debug, Clone)] +pub struct TrailerService { + service: S, +} + +impl Service> for TrailerService +where + S: Service, Response = Response>, + ResBody: http_body::Body, +{ + type Response = Response>; + type Error = S::Error; + type Future = WrappedFuture; + + fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll> { + self.service.poll_ready(cx) + } + + fn call(&mut self, mut request: Request) -> Self::Future { + let trailers = Trailers::new(); + let callbacks = trailers.callbacks.clone(); + let existing = request.extensions_mut().insert(trailers); + assert!( + existing.is_none(), + "trailer layer/service installed multiple times" + ); + + WrappedFuture { + callbacks, + inner: self.service.call(request), + } + } +} + +#[pin_project] +#[derive(Debug)] +pub struct WrappedFuture { + callbacks: SharedCallbacks, + #[pin] + inner: F, +} + +impl Future for WrappedFuture +where + F: Future, Error>>, + ResBody: http_body::Body, +{ + type Output = Result>, Error>; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let result: Result, Error> = + ready!(self.as_mut().project().inner.poll(cx)); + + match result { + Ok(response) => Poll::Ready(Ok(response.map(|body| WrappedBody { + callbacks: self.callbacks.clone(), + inner: body, + }))), + Err(e) => Poll::Ready(Err(e)), + } + } +} + +#[pin_project] +#[derive(Debug)] +pub struct WrappedBody { + callbacks: SharedCallbacks, + #[pin] + inner: B, +} + +impl http_body::Body for WrappedBody { + type Data = B::Data; + type Error = B::Error; + + fn poll_data( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll>> { + 
self.as_mut().project().inner.poll_data(cx) + } + + fn poll_trailers( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll, Self::Error>> { + let result: Result, Self::Error> = + ready!(self.as_mut().project().inner.poll_trailers(cx)); + + let res = match result { + Ok(trailers) => { + let mut trailers = trailers.unwrap_or_default(); + + for callback in self.callbacks.0.lock().iter() { + callback(&mut trailers); + } + + Ok((!trailers.is_empty()).then_some(trailers)) + } + Err(e) => Err(e), + }; + Poll::Ready(res) + } + + fn is_end_stream(&self) -> bool { + self.inner.is_end_stream() + } + + fn size_hint(&self) -> SizeHint { + self.inner.size_hint() + } +} + +type TrailerCallback = Box Fn(&'a mut HeaderMap) + Send>; + +#[derive(Clone, Default)] +struct SharedCallbacks(Arc>>); + +impl std::fmt::Debug for SharedCallbacks { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("SharedCallbacks").field(&"...").finish() + } +} + +/// Handle to manage trailers of a HTTP response. +#[derive(Clone, Debug)] +pub struct Trailers { + callbacks: SharedCallbacks, +} + +impl Trailers { + /// Private constructor. + /// + /// It is pointless / a potential bug to construct this type outside this crate, because it will NOT be hooked up + /// into the layer. + fn new() -> Self { + Self { + callbacks: Default::default(), + } + } + + /// Register callback that is called when the trailers are sent. + pub fn add_callback(&self, f: F) + where + for<'a> F: Fn(&'a mut HeaderMap) + Send + 'static, + { + let mut guard = self.callbacks.0.lock(); + guard.push(Box::new(f)); + } +} diff --git a/trace/Cargo.toml b/trace/Cargo.toml index 1e431dabf56..1f09b8a3c07 100644 --- a/trace/Cargo.toml +++ b/trace/Cargo.toml @@ -6,6 +6,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] chrono = { version = "0.4", default-features = false } observability_deps = { path = "../observability_deps" } diff --git a/trace/src/lib.rs b/trace/src/lib.rs index 3a12d9be4ce..6e352ff209e 100644 --- a/trace/src/lib.rs +++ b/trace/src/lib.rs @@ -35,7 +35,7 @@ pub trait TraceCollector: std::fmt::Debug + Send + Sync { } /// A basic trace collector that prints to stdout -#[derive(Debug)] +#[derive(Debug, Copy, Clone)] pub struct LogTraceCollector {} impl LogTraceCollector { diff --git a/trace/src/span.rs b/trace/src/span.rs index 1ed1d549faf..d7a0de1670d 100644 --- a/trace/src/span.rs +++ b/trace/src/span.rs @@ -58,27 +58,26 @@ impl Span { } /// Record an event on this `Span` - pub fn event(&mut self, meta: impl Into>) { - let event = SpanEvent { - time: Utc::now(), - msg: meta.into(), - }; - self.events.push(event) + pub fn event(&mut self, event: SpanEvent) { + self.events.push(event); } /// Record success on this `Span` setting the status if it isn't already set - pub fn ok(&mut self, meta: impl Into>) { - self.event(meta); - if self.status == SpanStatus::Unknown { - self.status = SpanStatus::Ok; - } + pub fn ok(&mut self, msg: impl Into>) { + self.event(SpanEvent::new(msg)); + self.status(SpanStatus::Ok); } /// Record an error on this `Span` setting the status if it isn't already set - pub fn error(&mut self, meta: impl Into>) { - self.event(meta); + pub fn error(&mut self, msg: impl Into>) { + self.event(SpanEvent::new(msg)); + self.status(SpanStatus::Err); + } + + /// Set status of `Span` + pub fn status(&mut self, status: SpanStatus) { if self.status == SpanStatus::Unknown { - self.status = SpanStatus::Err; + self.status = status; } 
} @@ -110,6 +109,25 @@ pub struct SpanEvent { pub time: DateTime, pub msg: Cow<'static, str>, + + pub metadata: HashMap, MetaValue>, +} + +impl SpanEvent { + /// Create new event. + pub fn new(msg: impl Into>) -> Self { + Self { + time: Utc::now(), + msg: msg.into(), + // assume no metadata by default + metadata: HashMap::with_capacity(0), + } + } + + /// Set meta data. + pub fn set_metadata(&mut self, key: impl Into>, value: impl Into) { + self.metadata.insert(key.into(), value.into()); + } } /// Values that can be stored in a Span's metadata and events @@ -183,9 +201,9 @@ impl SpanRecorder { } /// Record an event on the contained `Span` if any - pub fn event(&mut self, meta: impl Into>) { + pub fn event(&mut self, event: SpanEvent) { if let Some(span) = self.span.as_mut() { - span.event(meta) + span.event(event); } } @@ -203,6 +221,13 @@ impl SpanRecorder { } } + /// Set status of contained `Span` if any + pub fn status(&mut self, status: SpanStatus) { + if let Some(span) = self.span.as_mut() { + span.status(status); + } + } + /// Take the contents of this recorder returning a new recorder /// /// From this point on `self` will behave as if it were created with no span diff --git a/trace_exporters/Cargo.toml b/trace_exporters/Cargo.toml index 53a6c0ec318..177ad961fc7 100644 --- a/trace_exporters/Cargo.toml +++ b/trace_exporters/Cargo.toml @@ -6,17 +6,20 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] async-trait = "0.1" clap = { version = "4", features = ["derive", "env"] } futures = "0.3" iox_time = { path = "../iox_time" } observability_deps = { path = "../observability_deps" } -snafu = "0.7" +snafu = "0.8" thrift = { version = "0.17.0" } -tokio = { version = "1.32", features = ["macros", "parking_lot", "rt", "sync"] } +tokio = { version = "1.35", features = ["macros", "parking_lot", "rt", "sync"] } trace = { path = "../trace" } workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] -chrono = { version = "0.4", default-features = false, features = ["clock"] } \ No newline at end of file +chrono = { version = "0.4", default-features = false, features = ["clock"] } diff --git a/trace_exporters/src/jaeger.rs b/trace_exporters/src/jaeger.rs index 1e6e4fbdb5e..d02c68a82e8 100644 --- a/trace_exporters/src/jaeger.rs +++ b/trace_exporters/src/jaeger.rs @@ -33,6 +33,11 @@ impl JaegerTag { value: value.into(), } } + + /// Key. + pub fn key(&self) -> &str { + &self.key + } } impl From for jaeger::Tag { @@ -169,7 +174,83 @@ impl AsyncExport for JaegerAgentExporter { self.rate_limiter.send().await; if let Err(e) = self.client.emit_batch(batch) { - error!(%e, "error writing batch to jaeger agent") + let e = NiceThriftError::from(e); + + // not a user-visible error but only a monitoring outage, print on info level + // Ref: https://github.com/influxdata/influxdb_iox/issues/9726 + info!(%e, "error writing batch to jaeger agent") + } + } +} + +/// Thrift error formatting is messy, try better. +/// +/// See . 
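With the Span and SpanEvent changes a few hunks above, callers now construct the event (optionally attaching metadata) before recording it, and set the status explicitly. A small sketch, assuming a SpanRecorder is already in scope; the event name and metadata key are illustrative:

    use trace::span::{SpanEvent, SpanRecorder, SpanStatus};

    fn record_get(recorder: &mut SpanRecorder, size: i64) {
        // Events now carry their own metadata map instead of just a message.
        let mut evt = SpanEvent::new("object store get");
        evt.set_metadata("size", size);
        recorder.event(evt);
        // Status is set via the new helper; it still only overrides Unknown.
        recorder.status(SpanStatus::Ok);
    }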
+#[derive(Debug)] +struct NiceThriftError(thrift::Error); + +impl From for NiceThriftError { + fn from(e: thrift::Error) -> Self { + Self(e) + } +} + +impl std::fmt::Display for NiceThriftError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match &self.0 { + thrift::Error::Transport(e) => { + let kind = match e.kind { + thrift::TransportErrorKind::Unknown => "unknown", + thrift::TransportErrorKind::NotOpen => "not open", + thrift::TransportErrorKind::AlreadyOpen => "already open", + thrift::TransportErrorKind::TimedOut => "timed out", + thrift::TransportErrorKind::EndOfFile => "end of file", + thrift::TransportErrorKind::NegativeSize => "negative size message", + thrift::TransportErrorKind::SizeLimit => "message too long", + _ => "unknown variant", + }; + + write!(f, "transport: {}: {}", kind, e.message) + } + thrift::Error::Protocol(e) => { + let kind = match e.kind { + thrift::ProtocolErrorKind::Unknown => "unknown", + thrift::ProtocolErrorKind::InvalidData => "bad data", + thrift::ProtocolErrorKind::NegativeSize => "negative message size", + thrift::ProtocolErrorKind::SizeLimit => "message too long", + thrift::ProtocolErrorKind::BadVersion => "invalid thrift version", + thrift::ProtocolErrorKind::NotImplemented => "not implemented", + thrift::ProtocolErrorKind::DepthLimit => "maximum skip depth reached", + _ => "unknown variant", + }; + + write!(f, "protocol: {}: {}", kind, e.message) + } + thrift::Error::Application(e) => { + let kind = match e.kind { + thrift::ApplicationErrorKind::Unknown => "unknown", + thrift::ApplicationErrorKind::UnknownMethod => "unknown service method", + thrift::ApplicationErrorKind::InvalidMessageType => { + "wrong message type received" + } + thrift::ApplicationErrorKind::WrongMethodName => { + "unknown method reply received" + } + thrift::ApplicationErrorKind::BadSequenceId => "out of order sequence id", + thrift::ApplicationErrorKind::MissingResult => "missing method result", + thrift::ApplicationErrorKind::InternalError => "remote service threw exception", + thrift::ApplicationErrorKind::ProtocolError => "protocol error", + thrift::ApplicationErrorKind::InvalidTransform => "invalid transform", + thrift::ApplicationErrorKind::InvalidProtocol => "invalid protocol requested", + thrift::ApplicationErrorKind::UnsupportedClientType => { + "unsupported protocol client" + } + _ => "unknown variant", + }; + + write!(f, "application: {}: {}", kind, e.message) + } + thrift::Error::User(e) => write!(f, "user: {e}"), } } } @@ -243,11 +324,13 @@ mod tests { use crate::thrift::agent::{AgentSyncHandler, AgentSyncProcessor}; use chrono::{TimeZone, Utc}; use iox_time::SystemProvider; + use std::borrow::Cow; + use std::collections::HashMap; use std::sync::{Arc, Mutex}; use thrift::server::TProcessor; use thrift::transport::TBufferChannel; use trace::ctx::{SpanContext, SpanId, TraceId}; - use trace::span::{SpanEvent, SpanStatus}; + use trace::span::{MetaValue, SpanEvent, SpanStatus}; struct TestHandler { batches: Arc>>, @@ -382,9 +465,11 @@ mod tests { span.events = vec![SpanEvent { time: Utc.timestamp_nanos(200000), msg: "hello".into(), + metadata: HashMap::from([(Cow::from("evt_md"), MetaValue::Int(42))]), }]; span.start = Some(Utc.timestamp_nanos(100000)); span.end = Some(Utc.timestamp_nanos(300000)); + span.metadata = HashMap::from([(Cow::from("span_md"), MetaValue::Int(1337))]); exporter.export(vec![span.clone(), span.clone()]).await; exporter.export(vec![span.clone()]).await; @@ -452,14 +537,18 @@ mod tests { let logs = 
b1_s0.logs.as_ref().unwrap(); assert_eq!(logs.len(), 1); assert_eq!(logs[0].timestamp, 200); - assert_eq!(logs[0].fields.len(), 1); + assert_eq!(logs[0].fields.len(), 2); assert_eq!(logs[0].fields[0].key.as_str(), "event"); assert_eq!(logs[0].fields[0].v_str.as_ref().unwrap().as_str(), "hello"); + assert_eq!(logs[0].fields[1].key.as_str(), "evt_md"); + assert_eq!(logs[0].fields[1].v_long.unwrap(), 42); let tags = b1_s0.tags.as_ref().unwrap(); - assert_eq!(tags.len(), 1); + assert_eq!(tags.len(), 2); assert_eq!(tags[0].key.as_str(), "ok"); assert!(tags[0].v_bool.unwrap()); + assert_eq!(tags[1].key.as_str(), "span_md"); + assert_eq!(tags[1].v_long.unwrap(), 1337); } #[test] diff --git a/trace_exporters/src/jaeger/span.rs b/trace_exporters/src/jaeger/span.rs index f6234f5e3cc..d4aa44f4544 100644 --- a/trace_exporters/src/jaeger/span.rs +++ b/trace_exporters/src/jaeger/span.rs @@ -56,12 +56,15 @@ impl TryFrom for jaeger::Span { let tags = match s.metadata.is_empty() { true => None, - false => Some( - s.metadata - .into_iter() - .map(|(name, value)| tag_from_meta(name.to_string(), value)) - .collect(), - ), + false => { + let mut md = s.metadata.into_iter().collect::>(); + md.sort_by(|(k1, _v1), (k2, _v2)| k1.cmp(k2)); + Some( + md.into_iter() + .map(|(name, value)| tag_from_meta(name.to_string(), value)) + .collect(), + ) + } }; let logs = match s.events.is_empty() { @@ -115,11 +118,14 @@ impl TryFrom for jaeger::Log { type Error = String; fn try_from(event: SpanEvent) -> Result { + let mut md = event.metadata.into_iter().collect::>(); + md.sort_by(|(k1, _v1), (k2, _v2)| k1.cmp(k2)); + Ok(Self { timestamp: event.time.timestamp_nanos_opt().ok_or_else(|| { format!("timestamp cannot be represented as nanos: {}", event.time) })? / 1000, - fields: vec![jaeger::Tag { + fields: std::iter::once(jaeger::Tag { key: "event".to_string(), v_type: jaeger::TagType::String, v_str: Some(event.msg.to_string()), @@ -127,7 +133,9 @@ impl TryFrom for jaeger::Log { v_bool: None, v_long: None, v_binary: None, - }], + }) + .chain(md.into_iter().map(|(k, v)| tag_from_meta(k.to_string(), v))) + .collect(), }) } } diff --git a/trace_exporters/src/lib.rs b/trace_exporters/src/lib.rs index a2f07a2db83..b1a5337b95e 100644 --- a/trace_exporters/src/lib.rs +++ b/trace_exporters/src/lib.rs @@ -10,6 +10,7 @@ clippy::dbg_macro, unused_crate_dependencies )] +#![allow(unreachable_pub)] // Workaround for "unused crate" lint false positives. use workspace_hack as _; @@ -205,8 +206,23 @@ fn jaeger_exporter(config: &TracingConfig) -> Result> { )?; // Use any specified static span tags. 
- if let Some(tags) = &config.traces_jaeger_tags { - jaeger = jaeger.with_tags(tags); + let mut tags = config + .traces_jaeger_tags + .as_ref() + .cloned() + .unwrap_or_default(); + + // add hostname + const TAG_HOSTNAME: &str = "hostname"; + if !tags.iter().any(|t| t.key() == TAG_HOSTNAME) { + if let Ok(hostname) = std::env::var("HOSTNAME") { + tags.push(JaegerTag::new(TAG_HOSTNAME, hostname)); + } + } + + // commit tags + if !tags.is_empty() { + jaeger = jaeger.with_tags(&tags); } Ok(Arc::new(AsyncExporter::new(jaeger))) diff --git a/trace_http/Cargo.toml b/trace_http/Cargo.toml index 89c90c53e95..691a8aecdc5 100644 --- a/trace_http/Cargo.toml +++ b/trace_http/Cargo.toml @@ -6,18 +6,22 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] +bytes = "1.5" trace = { path = "../trace" } futures = "0.3" hashbrown = { workspace = true } http = "0.2" http-body = "0.4" -itertools = "0.11" +itertools = "0.12" metric = { path = "../metric" } observability_deps = { path = "../observability_deps" } parking_lot = "0.12" pin-project = "1.1" -snafu = "0.7" +snafu = "0.8" tower = "0.4" workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/trace_http/src/classify.rs b/trace_http/src/classify.rs index eb53df97c3f..4b00bdc5757 100644 --- a/trace_http/src/classify.rs +++ b/trace_http/src/classify.rs @@ -6,39 +6,59 @@ use std::borrow::Cow; /// e.g. a request that encounters both a ClientErr and a ServerErr will /// be recorded as a ServerErr #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd)] -pub enum Classification { +pub(crate) enum Classification { /// Successful request Ok, - /// The request was to an unrecognised path + + /// The request was to an unrecognized path /// /// This is used by the metrics collection to avoid generating a new set of metrics /// for a request path that doesn't correspond to a valid route PathNotFound, - /// The request was unsuccessful but it was not the fault of the service + + /// Method was not allowed. + MethodNotAllowed, + + /// The request was unsuccessful (4XX) but it was not the fault of the service ClientErr, - /// The request was unsuccessful and it was the fault of the service + + /// The request was unsuccessful (5XX) and it was the fault of the service ServerErr, + + /// The request produced a response that is not 2XX Ok, 4XX ClientErr or 5XX + /// ServerErr. 
This is unexpected and likely shouldn't happen + UnexpectedResponse, } -pub fn classify_response(response: &http::Response) -> (Cow<'static, str>, Classification) { +pub(crate) fn classify_response( + response: &http::Response, +) -> (Cow<'static, str>, Classification) { let status = response.status(); - match status { - http::StatusCode::OK | http::StatusCode::CREATED | http::StatusCode::NO_CONTENT => { - classify_headers(Some(response.headers())) - } - http::StatusCode::BAD_REQUEST => ("bad request".into(), Classification::ClientErr), - // This is potentially over-zealous but errs on the side of caution - http::StatusCode::NOT_FOUND => ("not found".into(), Classification::PathNotFound), - http::StatusCode::TOO_MANY_REQUESTS => { - ("too many requests".into(), Classification::ClientErr) - } - http::StatusCode::INTERNAL_SERVER_ERROR => { - ("internal server error".into(), Classification::ServerErr) + + if status.is_success() { + classify_headers(Some(response.headers())) + } else if status.is_client_error() { + match status { + http::StatusCode::NOT_FOUND => ("not found".into(), Classification::PathNotFound), + http::StatusCode::METHOD_NOT_ALLOWED => ( + "method not allowed".into(), + Classification::MethodNotAllowed, + ), + _ => ( + format!("unexpected 4XX status code: {status}").into(), + Classification::ClientErr, + ), } - _ => ( - format!("unexpected status code: {status}").into(), + } else if status.is_server_error() { + ( + format!("unexpected 5XX status code: {status}").into(), Classification::ServerErr, - ), + ) + } else { + ( + format!("unexpected non-error status code: {status}").into(), + Classification::UnexpectedResponse, + ) } } @@ -47,7 +67,7 @@ pub fn classify_response(response: &http::Response) -> (Cow<'static, str>, /// /// [1]: https://grpc.github.io/grpc/core/md_doc_statuscodes.html /// [2]: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Trailer -pub fn classify_headers( +pub(crate) fn classify_headers( headers: Option<&http::header::HeaderMap>, ) -> (Cow<'static, str>, Classification) { match headers.and_then(|headers| headers.get("grpc-status")) { diff --git a/trace_http/src/lib.rs b/trace_http/src/lib.rs index 2aeb0398138..06c26bba040 100644 --- a/trace_http/src/lib.rs +++ b/trace_http/src/lib.rs @@ -16,5 +16,5 @@ use workspace_hack as _; mod classify; pub mod ctx; -mod metrics; +pub mod metrics; pub mod tower; diff --git a/trace_http/src/metrics.rs b/trace_http/src/metrics.rs index 32cbbd3eb6a..e32035a05d9 100644 --- a/trace_http/src/metrics.rs +++ b/trace_http/src/metrics.rs @@ -1,60 +1,128 @@ use crate::classify::Classification; use hashbrown::HashMap; +use http::Method; use metric::{Attributes, DurationHistogram, Metric, ResultMetric, U64Counter}; use parking_lot::{MappedMutexGuard, Mutex, MutexGuard}; use std::sync::Arc; use std::time::Instant; -/// `MetricsCollection` is used to retrieve `MetricsRecorder` for instrumenting http requests +/// The family of [`RequestMetrics`] to publish +#[derive(Debug, Copy, Clone)] +pub enum MetricFamily { + HttpServer, + GrpcServer, + HttpClient, + GrpcClient, +} + +#[derive(Debug, PartialEq, Eq, Hash)] +struct MetricsKey { + /// request path or None for 404 responses + path: Option, + + /// method or None for invalid methods + method: Option, +} + +/// Metrics collected for HTTP/gRPC requests #[derive(Debug)] -pub struct MetricsCollection { - /// Whether this `MetricCollection` should publish to grpc_request* or http_request* - is_grpc: bool, +pub struct RequestMetrics { + /// Whether this `MetricCollection` + 
family: MetricFamily, /// Metric registry for registering new metrics metric_registry: Arc, - /// Metrics keyed by request path or None for 404 responses - metrics: Mutex, Metrics>>, + /// Metrics. + metrics: Mutex>, + + /// Maximum path segments. + max_path_segments: Option, } -impl MetricsCollection { - pub fn new(metric_registry: Arc, is_grpc: bool) -> Self { +impl RequestMetrics { + pub fn new(metric_registry: Arc, family: MetricFamily) -> Self { Self { - is_grpc, + family, metric_registry, metrics: Default::default(), + max_path_segments: None, } } + /// Restrict metric paths to `segments` + pub fn with_max_path_segments(mut self, segments: usize) -> Self { + self.max_path_segments = Some(segments); + self + } + /// Gets the `MetricsRecorder` for a given http request - pub fn recorder(self: &Arc, request: &http::Request) -> MetricsRecorder { + pub(crate) fn recorder(self: &Arc, request: &http::Request) -> MetricsRecorder { MetricsRecorder { metrics: Arc::clone(self), start_instant: Instant::now(), path: Some(request.uri().path().to_string()), + method: Some(request.method().clone()), classification: None, } } - fn request_metrics(&self, path: Option) -> MappedMutexGuard<'_, Metrics> { + fn request_metrics( + &self, + path: Option, + method: Option, + ) -> MappedMutexGuard<'_, Metrics> { + // method is only important for HTTP / non-gRPC + let method = match self.family { + MetricFamily::HttpServer | MetricFamily::HttpClient => method, + MetricFamily::GrpcServer | MetricFamily::GrpcClient => None, + }; + MutexGuard::map(self.metrics.lock(), |metrics| { + let key = MetricsKey { path, method }; let (_, request_metrics) = - metrics.raw_entry_mut().from_key(&path).or_insert_with(|| { - let attributes = match path.as_ref() { - Some(path) => Attributes::from([("path", path.clone().into())]), - None => Attributes::from([]), - }; + metrics.raw_entry_mut().from_key(&key).or_insert_with(|| { + let mut attributes = Attributes::from([]); + if let Some(path) = &key.path { + attributes.insert("path", truncate_path(path, self.max_path_segments)); + } + if let Some(method) = &key.method { + attributes.insert("method", method.to_string()); + } + if let (Some(path), Some(method)) = (&key.path, &key.method) { + // help Grafana because you can only repeat a single variable, not a cross-product of the two + attributes.insert( + "method_path", + format!("{} {}", method, truncate_path(path, self.max_path_segments)), + ); + } let metrics = - Metrics::new(self.metric_registry.as_ref(), attributes, self.is_grpc); - (path, metrics) + Metrics::new(self.metric_registry.as_ref(), attributes, self.family); + + (key, metrics) }); request_metrics }) } } +fn truncate_path(path: &str, segments: Option) -> String { + let search = || { + let s = segments?; + let mut indices = path.match_indices('/'); + for _ in 0..s { + indices.next(); + } + let end = indices.next()?.0; + if end + 1 == path.len() { + return None; + } + Some(format!("{}/*", &path[..end])) + }; + search().unwrap_or_else(|| path.to_string()) +} + /// The request metrics for a specific set of attributes (e.g. 
path) #[derive(Debug)] struct Metrics { @@ -69,10 +137,16 @@ struct Metrics { } impl Metrics { - fn new(registry: &metric::Registry, attributes: impl Into, is_grpc: bool) -> Self { - let (counter, duration) = match is_grpc { - true => ("grpc_requests", "grpc_request_duration"), - false => ("http_requests", "http_request_duration"), + fn new( + registry: &metric::Registry, + attributes: impl Into, + family: MetricFamily, + ) -> Self { + let (counter, duration) = match family { + MetricFamily::GrpcServer => ("grpc_requests", "grpc_request_duration"), + MetricFamily::HttpServer => ("http_requests", "http_request_duration"), + MetricFamily::GrpcClient => ("grpc_client_requests", "grpc_client_request_duration"), + MetricFamily::HttpClient => ("http_client_requests", "http_client_request_duration"), }; let counter: Metric = @@ -98,20 +172,25 @@ impl Metrics { /// A `MetricsRecorder` is used to record metrics for a given http request #[derive(Debug)] -pub struct MetricsRecorder { - metrics: Arc, +pub(crate) struct MetricsRecorder { + metrics: Arc, start_instant: Instant, path: Option, + method: Option, classification: Option, } impl MetricsRecorder { /// Sets the classification of this request if not already set - pub fn set_classification(&mut self, classification: Classification) { + pub(crate) fn set_classification(&mut self, classification: Classification) { if matches!(classification, Classification::PathNotFound) { // Don't want to pollute metrics with invalid paths self.path = None } + if matches!(classification, Classification::MethodNotAllowed) { + // Don't want to pollute metrics with invalid methods + self.method = None + } self.classification = Some(match self.classification { Some(existing) => existing.max(classification), @@ -122,7 +201,9 @@ impl MetricsRecorder { impl Drop for MetricsRecorder { fn drop(&mut self) { - let metrics = self.metrics.request_metrics(self.path.take()); + let metrics = self + .metrics + .request_metrics(self.path.take(), self.method.take()); let duration = self.start_instant.elapsed(); match self.classification { @@ -130,7 +211,9 @@ impl Drop for MetricsRecorder { metrics.request_count.ok.inc(1); metrics.request_duration.ok.record(duration); } - Some(Classification::ClientErr) | Some(Classification::PathNotFound) => { + Some(Classification::ClientErr) + | Some(Classification::PathNotFound) + | Some(Classification::MethodNotAllowed) => { metrics.request_count.client_error.inc(1); metrics.request_duration.client_error.record(duration); } @@ -138,7 +221,30 @@ impl Drop for MetricsRecorder { metrics.request_count.server_error.inc(1); metrics.request_duration.server_error.record(duration); } + Some(Classification::UnexpectedResponse) => { + metrics.request_count.unexpected_response.inc(1); + metrics + .request_duration + .unexpected_response + .record(duration); + } None => metrics.aborted_count.inc(1), } } } + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_truncate() { + assert_eq!(truncate_path("/health", Some(1)), "/health"); + assert_eq!(truncate_path("/api/v2/write", Some(3)), "/api/v2/write"); + assert_eq!(truncate_path("/api/v2/write/", Some(3)), "/api/v2/write/"); + assert_eq!(truncate_path("/api/v2/write", Some(2)), "/api/v2/*"); + assert_eq!(truncate_path("/v1/p/000000000000053e", Some(2)), "/v1/p/*"); + assert_eq!(truncate_path("/a/b/c/d/e/f", None), "/a/b/c/d/e/f"); + assert_eq!(truncate_path("/a/b/c/d/e/f/", None), "/a/b/c/d/e/f/"); + assert_eq!(truncate_path("/v1/p/", Some(2)), "/v1/p/"); + } +} diff --git a/trace_http/src/tower.rs 
b/trace_http/src/tower.rs index 120f416cc9c..bfba5e1d9cd 100644 --- a/trace_http/src/tower.rs +++ b/trace_http/src/tower.rs @@ -18,6 +18,7 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::task::{Context, Poll}; +use bytes::Buf; use futures::ready; use http::{HeaderValue, Request, Response}; use http_body::SizeHint; @@ -25,11 +26,12 @@ use pin_project::{pin_project, pinned_drop}; use tower::{Layer, Service}; use observability_deps::tracing::{error, warn}; +use trace::span::{SpanEvent, SpanStatus}; use trace::{span::SpanRecorder, TraceCollector}; use crate::classify::{classify_headers, classify_response, Classification}; use crate::ctx::{RequestLogContext, RequestLogContextExt, TraceHeaderParser}; -use crate::metrics::{MetricsCollection, MetricsRecorder}; +use crate::metrics::{MetricsRecorder, RequestMetrics}; /// `TraceLayer` implements `tower::Layer` and can be used to decorate a /// `tower::Service` to collect information about requests flowing through it @@ -43,7 +45,7 @@ use crate::metrics::{MetricsCollection, MetricsRecorder}; #[derive(Debug, Clone)] pub struct TraceLayer { trace_header_parser: TraceHeaderParser, - metrics: Arc, + metrics: Arc, collector: Option>, name: Arc, } @@ -52,14 +54,13 @@ impl TraceLayer { /// Create a new tower [`Layer`] for tracing pub fn new( trace_header_parser: TraceHeaderParser, - metric_registry: Arc, + metrics: Arc, collector: Option>, - is_grpc: bool, name: &str, ) -> Self { Self { trace_header_parser, - metrics: Arc::new(MetricsCollection::new(metric_registry, is_grpc)), + metrics, collector, name: name.into(), } @@ -74,7 +75,7 @@ impl Layer for TraceLayer { service, collector: self.collector.clone(), metrics: Arc::clone(&self.metrics), - trace_header_parser: self.trace_header_parser.clone(), + trace_header_parser: Some(self.trace_header_parser.clone()), name: Arc::clone(&self.name), } } @@ -84,12 +85,30 @@ impl Layer for TraceLayer { #[derive(Debug, Clone)] pub struct TraceService { service: S, - trace_header_parser: TraceHeaderParser, + trace_header_parser: Option, collector: Option>, - metrics: Arc, + metrics: Arc, name: Arc, } +impl TraceService { + /// Create a new [`TraceService`] for instrumenting a client + pub fn new_client( + service: S, + metrics: Arc, + collector: Option>, + name: &str, + ) -> Self { + Self { + service, + trace_header_parser: None, + metrics, + collector, + name: name.into(), + } + } +} + impl Service> for TraceService where S: Service, Response = Response>, @@ -106,23 +125,22 @@ where fn call(&mut self, mut request: Request) -> Self::Future { let metrics_recorder = Some(self.metrics.recorder(&request)); - let request_ctx = match self - .trace_header_parser - .parse(self.collector.as_ref(), request.headers()) - { - Ok(Some(ctx)) => { - let ctx = RequestLogContext::new(ctx); + let request_ctx = self.trace_header_parser.as_ref().and_then(|parser| { + match parser.parse(self.collector.as_ref(), request.headers()) { + Ok(Some(ctx)) => { + let ctx = RequestLogContext::new(ctx); - request.extensions_mut().insert(ctx.clone()); + request.extensions_mut().insert(ctx.clone()); - Some(ctx) - } - Ok(None) => None, - Err(e) => { - error!(%e, "error extracting trace context from request"); - None + Some(ctx) + } + Ok(None) => None, + Err(e) => { + error!(%e, "error extracting trace context from request"); + None + } } - }; + }); let span = request_ctx.as_ref().and_then(|ctx| { let ctx = ctx.ctx(); @@ -196,7 +214,7 @@ where metrics_recorder.set_classification(Classification::Ok); span_recorder.ok("request 
processed with empty response") } - false => span_recorder.event("request processed"), + false => span_recorder.event(SpanEvent::new("request processed")), }, (error, c) => { metrics_recorder.set_classification(c); @@ -292,16 +310,29 @@ impl http_body::Body for TracedBody { let projected = self.as_mut().project(); let span_recorder = projected.span_recorder; let metrics_recorder = projected.metrics_recorder; + match &result { - Ok(_) => match projected.inner.is_end_stream() { - true => { - metrics_recorder.set_classification(Classification::Ok); - span_recorder.ok("returned body data and no trailers"); - projected.was_done_data.store(true, Ordering::SeqCst); - projected.was_ready_trailers.store(true, Ordering::SeqCst); + Ok(body) => { + let size = body.remaining() as i64; + match projected.inner.is_end_stream() { + true => { + metrics_recorder.set_classification(Classification::Ok); + + let mut evt = SpanEvent::new("returned body data and no trailers"); + evt.set_metadata("size", size); + span_recorder.event(evt); + span_recorder.status(SpanStatus::Ok); + + projected.was_done_data.store(true, Ordering::SeqCst); + projected.was_ready_trailers.store(true, Ordering::SeqCst); + } + false => { + let mut evt = SpanEvent::new("returned body data"); + evt.set_metadata("size", size); + span_recorder.event(evt); + } } - false => span_recorder.event("returned body data"), - }, + } Err(_) => { metrics_recorder.set_classification(Classification::ServerErr); span_recorder.error("error getting body"); @@ -309,6 +340,7 @@ impl http_body::Body for TracedBody { projected.was_ready_trailers.store(true, Ordering::SeqCst); } } + Poll::Ready(Some(result)) } diff --git a/tracker/Cargo.toml b/tracker/Cargo.toml index 3143c12e35d..d058226cbaa 100644 --- a/tracker/Cargo.toml +++ b/tracker/Cargo.toml @@ -6,23 +6,26 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] futures = "0.3" hashbrown = { workspace = true } -lock_api = "0.4.10" +lock_api = "0.4.11" metric = { path = "../metric" } observability_deps = { path = "../observability_deps" } parking_lot = "0.12" pin-project = "1.1" iox_time = { path = "../iox_time" } -tokio = { version = "1.32", features = ["macros", "parking_lot", "sync", "time"] } -tokio-util = { version = "0.7.9" } +tokio = { version = "1.35", features = ["macros", "parking_lot", "sync", "time"] } +tokio-util = { version = "0.7.10" } trace = { path = "../trace"} workspace-hack = { version = "0.1", path = "../workspace-hack" } -sysinfo = "0.29.10" +sysinfo = "0.30.5" [dev-dependencies] -tempfile = "3.8.0" +tempfile = "3.9.0" # Need the multi-threaded executor for testing -tokio = { version = "1.32", features = ["macros", "parking_lot", "rt-multi-thread", "time"] } +tokio = { version = "1.35", features = ["macros", "parking_lot", "rt-multi-thread", "time"] } test_helpers = { path = "../test_helpers", features = ["future_timeout"] } diff --git a/tracker/src/async_semaphore.rs b/tracker/src/async_semaphore.rs index 6377c95bf73..3b8ce7b37a7 100644 --- a/tracker/src/async_semaphore.rs +++ b/tracker/src/async_semaphore.rs @@ -1,5 +1,12 @@ //! Tooling to track/instrument [`tokio::sync::Semaphore`]s. 
-use std::{future::Future, marker::PhantomData, sync::Arc, task::Poll, time::Instant}; +use std::{ + future::Future, + marker::PhantomData, + ops::Deref, + sync::Arc, + task::Poll, + time::{Duration, Instant}, +}; use futures::{future::BoxFuture, FutureExt}; use metric::{Attributes, DurationHistogram, MakeMetricObserver, U64Counter, U64Gauge}; @@ -284,8 +291,8 @@ impl<'a> Future for InstrumentedAsyncSemaphoreAcquire<'a> { this.metrics.permits_acquired.inc(*this.n as u64); this.metrics.holders_acquired.inc(1); - let elapsed = this.t_start.elapsed(); - this.metrics.acquire_duration.record(elapsed); + let acquire_duration = this.t_start.elapsed(); + this.metrics.acquire_duration.record(acquire_duration); // reset "pending" metrics if we've reported any if *this.reported_pending { @@ -308,6 +315,7 @@ impl<'a> Future for InstrumentedAsyncSemaphoreAcquire<'a> { inner: permit, n: *this.n, metrics: Arc::clone(this.metrics), + acquire_duration, span_recorder, })) } @@ -380,6 +388,9 @@ pub struct InstrumentedAsyncOwnedSemaphorePermit { /// Metrics. metrics: Arc, + /// The time it took to acquire this permit. + acquire_duration: Duration, + /// Span recorder for the entire semaphore interaction. /// /// No direct interaction, will be exported during drop (aka the end of the span will be set). @@ -387,6 +398,13 @@ pub struct InstrumentedAsyncOwnedSemaphorePermit { span_recorder: SpanRecorder, } +impl InstrumentedAsyncOwnedSemaphorePermit { + /// The time it took to acquire this permit. + pub fn acquire_duration(&self) -> Duration { + self.acquire_duration + } +} + impl Drop for InstrumentedAsyncOwnedSemaphorePermit { fn drop(&mut self) { self.metrics.holders_acquired.dec(1); @@ -406,6 +424,14 @@ pub struct InstrumentedAsyncSemaphorePermit<'a> { phantom: PhantomData<&'a ()>, } +impl<'a> Deref for InstrumentedAsyncSemaphorePermit<'a> { + type Target = InstrumentedAsyncOwnedSemaphorePermit; + + fn deref(&self) -> &Self::Target { + &self.owned_permit + } +} + #[cfg(test)] mod tests { use std::time::Duration; @@ -611,6 +637,7 @@ mod tests { ); let p1 = semaphore.acquire_many(5, None).await.unwrap(); + let p1_duration = p1.acquire_duration(); let fut = semaphore.acquire_many(6, None); pin!(fut); @@ -619,9 +646,12 @@ mod tests { tokio::time::sleep(Duration::from_millis(10)).await; drop(p1); - fut.await.unwrap(); + let p2 = fut.await.unwrap(); + let acquire_duration_method = p1_duration + p2.acquire_duration(); + let acquire_duration_metric = metrics.acquire_duration.fetch().total; - assert!(metrics.acquire_duration.fetch().total >= Duration::from_millis(10)); + assert_eq!(acquire_duration_method, acquire_duration_metric); + assert!(acquire_duration_method >= Duration::from_millis(10)); } #[tokio::test] diff --git a/tracker/src/disk_metric.rs b/tracker/src/disk_metric.rs index 4267a169dae..261b4d664b3 100644 --- a/tracker/src/disk_metric.rs +++ b/tracker/src/disk_metric.rs @@ -3,7 +3,7 @@ use std::path::PathBuf; use std::time::Duration; use metric::{Attributes, U64Gauge}; -use sysinfo::{DiskExt, RefreshKind, System, SystemExt}; +use sysinfo::Disks; use tokio::sync::watch; /// The interval at which disk metrics are updated. @@ -53,10 +53,10 @@ pub struct DiskSpaceMetrics { available_disk_space: U64Gauge, total_disk_space: U64Gauge, - /// The [`System`] containing the disk list at construction time. - system: System, + /// The [`Disks`] containing the disk list at construction time. 
+ disks: Disks, - /// The index into [`System::disks()`] for the disk containing the observed + /// The index into [`Disks::list()`] for the disk containing the observed /// directory. disk_idx: usize, @@ -92,13 +92,14 @@ impl DiskSpaceMetrics { .recorder(attributes); // Load the disk stats once, and refresh them later. - let system = System::new_with_specifics(RefreshKind::new().with_disks_list()); + let mut disks = Disks::new(); + disks.refresh_list(); // Resolve the mount point once. // The directory path may be `/path/to/dir` and the mount point is `/`. let (disk_idx, initial_disk) = loop { - if let Some((idx, disk)) = system - .disks() + if let Some((idx, disk)) = disks + .list() .iter() .enumerate() .find(|(_idx, disk)| disk.mount_point() == directory) @@ -120,7 +121,7 @@ impl DiskSpaceMetrics { Self { available_disk_space, total_disk_space, - system, + disks, disk_idx, snapshot_tx, }, @@ -135,8 +136,8 @@ impl DiskSpaceMetrics { interval.tick().await; let disk = self - .system - .disks_mut() + .disks + .list_mut() .get_mut(self.disk_idx) .expect("disk list never refreshed so should not change"); diff --git a/tracker/src/lock.rs b/tracker/src/lock.rs index 0f067dfdc22..98b37619d07 100644 --- a/tracker/src/lock.rs +++ b/tracker/src/lock.rs @@ -2,7 +2,8 @@ use std::sync::Arc; use metric::{Attributes, DurationCounter, Metric, U64Counter}; -type RawRwLock = InstrumentedRawRwLock; +type RawRwLock = InstrumentedRawLock; +type RawMutex = InstrumentedRawLock; /// An instrumented Read-Write Lock pub type RwLock = lock_api::RwLock; @@ -12,6 +13,11 @@ pub type MappedRwLockReadGuard<'a, T> = lock_api::MappedRwLockReadGuard<'a, RawR pub type MappedRwLockWriteGuard<'a, T> = lock_api::MappedRwLockWriteGuard<'a, RawRwLock, T>; pub type RwLockUpgradableReadGuard<'a, T> = lock_api::RwLockUpgradableReadGuard<'a, RawRwLock, T>; +/// An instrumented mutex +pub type Mutex = lock_api::Mutex; +pub type MutexGuard<'a, T> = lock_api::MutexGuard<'a, RawMutex, T>; +pub type MappedMutexGuard<'a, T> = lock_api::MappedMutexGuard<'a, RawMutex, T>; + #[derive(Debug)] pub struct LockMetrics { exclusive_count: U64Counter, @@ -86,9 +92,26 @@ impl LockMetrics { pub fn new_lock_raw( self: &Arc, t: T, - ) -> lock_api::RwLock, T> { + ) -> lock_api::RwLock, T> { lock_api::RwLock::const_new( - InstrumentedRawRwLock { + InstrumentedRawLock { + inner: R::INIT, + metrics: Some(Arc::clone(self)), + }, + t, + ) + } + + pub fn new_mutex(self: &Arc, t: T) -> Mutex { + self.new_mutex_raw(t) + } + + pub fn new_mutex_raw( + self: &Arc, + t: T, + ) -> lock_api::Mutex, T> { + lock_api::Mutex::const_new( + InstrumentedRawLock { inner: R::INIT, metrics: Some(Arc::clone(self)), }, @@ -102,7 +125,7 @@ impl LockMetrics { /// /// This is a raw lock implementation that wraps another and instruments it #[derive(Debug)] -pub struct InstrumentedRawRwLock { +pub struct InstrumentedRawLock { inner: R, /// Stores the tracking data if any @@ -126,7 +149,7 @@ pub struct InstrumentedRawRwLock { /// exists. 
/// /// This is done by delegating to the wrapped RawRwLock implementation -unsafe impl lock_api::RawRwLock for InstrumentedRawRwLock { +unsafe impl lock_api::RawRwLock for InstrumentedRawLock { const INIT: Self = Self { inner: R::INIT, metrics: None, @@ -229,7 +252,7 @@ unsafe impl lock_api::RawRwLock for Instrumented /// /// This is done by delegating to the wrapped RawRwLock implementation unsafe impl lock_api::RawRwLockUpgrade - for InstrumentedRawRwLock + for InstrumentedRawLock { fn lock_upgradable(&self) { match &self.metrics { @@ -292,6 +315,54 @@ unsafe impl lock_api::RawRwLockUpgrade } } +/// # Safety +/// +/// Implementations of this trait must ensure that the `Mutex` is actually +/// exclusive: an exclusive lock can't be acquired while another exclusive +/// lock exists. +/// +/// This is done by delegating to the wrapped RawMutex implementation +unsafe impl lock_api::RawMutex for InstrumentedRawLock { + const INIT: Self = Self { + inner: R::INIT, + metrics: None, + }; + + type GuardMarker = R::GuardMarker; + + fn lock(&self) { + match &self.metrics { + Some(shared) => { + // Early return if possible - Instant::now is not necessarily cheap + if self.try_lock() { + return; + } + + let now = std::time::Instant::now(); + self.inner.lock(); + let elapsed = now.elapsed(); + shared.exclusive_count.inc(1); + shared.exclusive_wait.inc(elapsed); + } + None => self.inner.lock(), + } + } + + fn try_lock(&self) -> bool { + let ret = self.inner.try_lock(); + if let Some(shared) = &self.metrics { + if ret { + shared.exclusive_count.inc(1); + } + } + ret + } + + unsafe fn unlock(&self) { + self.inner.unlock() + } +} + #[cfg(test)] mod tests { // Clippy isn't recognizing the explicit drops; none of these locks are actually being held @@ -302,7 +373,7 @@ mod tests { use std::time::Duration; #[test] - fn test_counts() { + fn test_rwlock_counts() { let metrics = Arc::new(LockMetrics::new_unregistered()); let lock = metrics.new_lock(32); @@ -319,6 +390,21 @@ mod tests { assert_eq!(metrics.shared_count.fetch(), 2); } + #[test] + fn test_mutex_counts() { + let metrics = Arc::new(LockMetrics::new_unregistered()); + let mutex = metrics.new_mutex(32); + + let g = mutex.lock(); + drop(g); + + let g = mutex.lock(); + drop(g); + + assert_eq!(metrics.exclusive_count.fetch(), 2); + assert_eq!(metrics.shared_count.fetch(), 0); + } + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test_shared_wait_time() { let metrics = Arc::new(LockMetrics::new_unregistered()); @@ -366,6 +452,29 @@ mod tests { assert!(metrics.exclusive_wait.fetch() < Duration::from_millis(200)); } + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_mutex_wait_time() { + let metrics = Arc::new(LockMetrics::new_unregistered()); + let l1 = Arc::new(metrics.new_mutex(32)); + let l2 = Arc::clone(&l1); + + let g = l1.lock(); + let join = tokio::spawn(async move { + let _g = l2.lock(); + }); + + std::thread::sleep(Duration::from_millis(100)); + std::mem::drop(g); + + join.await.unwrap(); + + assert_eq!(metrics.exclusive_count.fetch(), 2); + assert_eq!(metrics.shared_count.fetch(), 0); + assert_eq!(metrics.shared_wait.fetch(), Duration::ZERO); + assert!(metrics.exclusive_wait.fetch() > Duration::from_millis(80)); + assert!(metrics.exclusive_wait.fetch() < Duration::from_millis(200)); + } + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test_multiple() { let metrics = Arc::new(LockMetrics::new_unregistered()); diff --git a/tracker/src/task.rs b/tracker/src/task.rs index 
9fbc0df9b37..1631f936abd 100644 --- a/tracker/src/task.rs +++ b/tracker/src/task.rs @@ -204,7 +204,7 @@ impl TaskResult { } /// The status of the tracked task -#[derive(Debug, Clone, Eq, PartialEq)] +#[derive(Debug, Clone, Eq, PartialEq, Copy)] pub enum TaskStatus { /// More futures can be registered Creating, diff --git a/tracker/src/task/history.rs b/tracker/src/task/history.rs index 42db815f854..3ef8e2e9d91 100644 --- a/tracker/src/task/history.rs +++ b/tracker/src/task/history.rs @@ -86,7 +86,7 @@ struct SizeLimitedHashMap { } impl SizeLimitedHashMap { - pub fn new(capacity: usize) -> Self { + pub(crate) fn new(capacity: usize) -> Self { Self { values: HashMap::with_capacity(capacity), ring: Vec::with_capacity(capacity), @@ -96,14 +96,14 @@ impl SizeLimitedHashMap { } /// Get the value associated with a specific key - pub fn get(&self, key: &K) -> Option<&V> { + pub(crate) fn get(&self, key: &K) -> Option<&V> { self.values.get(key) } /// Returns an iterator to all values stored within the ring buffer /// /// Note: the order is not guaranteed - pub fn values(&self) -> impl Iterator + '_ { + pub(crate) fn values(&self) -> impl Iterator + '_ { self.values.values() } @@ -116,7 +116,7 @@ impl SizeLimitedHashMap { /// from the buffer. /// /// This returns the replaced value (if any). - pub fn push(&mut self, key: K, value: V) -> Option<(K, V)> { + pub(crate) fn push(&mut self, key: K, value: V) -> Option<(K, V)> { if let Entry::Occupied(occupied) = self.values.entry(key) { // If already exists - replace existing value occupied.replace_entry(value); diff --git a/trogging/Cargo.toml b/trogging/Cargo.toml index 3c7c6960815..b4f547c785f 100644 --- a/trogging/Cargo.toml +++ b/trogging/Cargo.toml @@ -6,12 +6,15 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] clap = { version = "4", features = ["derive", "env"], optional = true } logfmt = { path = "../logfmt" } observability_deps = { path = "../observability_deps" } -thiserror = "1.0.48" -tracing-log = "0.1" +thiserror = "1.0.56" +tracing-log = "0.2" tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } [dev-dependencies] diff --git a/wal/Cargo.toml b/wal/Cargo.toml index 789080c5bcf..1a9fc0a0dd7 100644 --- a/wal/Cargo.toml +++ b/wal/Cargo.toml @@ -5,8 +5,11 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # In alphabetical order -byteorder = "1.3.4" +byteorder = "1.5.0" crc32fast = "1.2.0" data_types = { path = "../data_types" } generated_types = { path = "../generated_types" } @@ -15,10 +18,10 @@ mutable_batch = { version = "0.1.0", path = "../mutable_batch" } mutable_batch_pb = { version = "0.1.0", path = "../mutable_batch_pb" } observability_deps = { path = "../observability_deps" } parking_lot = "0.12" -prost = "0.11" -snafu = "0.7" -snap = "1.0.0" -tokio = { version = "1.32", features = ["macros", "fs", "io-util", "parking_lot", "rt-multi-thread", "sync", "time"] } +prost = { workspace = true } +snafu = "0.8" +snap = "1.1.1" +tokio = { version = "1.35", features = ["macros", "fs", "io-util", "parking_lot", "rt-multi-thread", "sync", "time"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] # In alphabetical order diff --git a/wal/src/blocking/reader.rs b/wal/src/blocking/reader.rs index 582fcda42b8..c0e9dcac174 100644 --- a/wal/src/blocking/reader.rs +++ b/wal/src/blocking/reader.rs @@ -11,8 +11,10 @@ use std::{ path::{Path, PathBuf}, }; 
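// Editorial sketch (not part of the upstream patch): the tracker lock.rs hunks
// above add an instrumented `Mutex` alongside the existing instrumented
// `RwLock`, sharing the same `LockMetrics` (exclusive count / wait time). A
// minimal usage example in the style of the new `test_mutex_counts` test; the
// import path and the surrounding function are assumptions for illustration,
// while `LockMetrics::new_unregistered` and `new_mutex` come from the diff.
use std::sync::Arc;
use tracker::lock::LockMetrics;

fn mutex_example() {
    let metrics = Arc::new(LockMetrics::new_unregistered());
    let counter = metrics.new_mutex(0_u64);

    // Each lock() records an exclusive acquisition; wait time is only
    // measured when the initial try_lock fails (i.e. under contention).
    *counter.lock() += 1;
}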
+/// A closed segment file reader over an `R`, tracking the number of compressed +/// bytes read. #[derive(Debug)] -pub struct ClosedSegmentFileReader(R); +pub struct ClosedSegmentFileReader(R, u64); impl ClosedSegmentFileReader> { pub fn from_path(path: impl AsRef) -> Result { @@ -28,7 +30,7 @@ where R: Read, { pub fn new(f: R) -> Self { - Self(f) + Self(f, 0) } fn read_array(&mut self) -> Result<[u8; N]> { @@ -36,6 +38,7 @@ where self.0 .read_exact(&mut data) .context(UnableToReadArraySnafu { length: N })?; + self.1 += N as u64; Ok(data) } @@ -66,6 +69,16 @@ where let (actual_compressed_len, actual_checksum) = decompressing_read.into_inner().checksum(); + // Track the size of the entry header and total amount of compressed + // data successfully read so far by the reader. The header values are + // tracked here to avoid continuously counting bytes read from a + // corrupted segment where no further entries can be read. + // + // This accounting is done before checksum/length mismatch, if the data has still + // been read in successfully. + self.1 += 2 * std::mem::size_of::() as u64; + self.1 += actual_compressed_len; + ensure!( expected_len == actual_compressed_len, LengthMismatchSnafu { @@ -100,6 +113,12 @@ where Ok(None) } + + /// Returns the total amount of bytes successfully read from this reader's + /// underlying file, in bytes. + pub fn bytes_read(&self) -> u64 { + self.1 + } } struct CrcReader { @@ -208,6 +227,7 @@ mod tests { let entry = reader.one_entry().unwrap(); assert!(entry.is_none()); + assert_eq!(reader.bytes_read(), segment_file.size_bytes()); } #[test] @@ -236,11 +256,17 @@ mod tests { let entry = reader.one_entry().unwrap(); assert!(entry.is_none()); + assert_eq!(reader.bytes_read(), segment_file.size_bytes()); } #[test] fn unsuccessful_read_too_short_len() { let mut segment_file = FakeSegmentFile::new(); + + // The bad entry will prevent any entries being read, thus the + // no bytes can be reported as successfully read. + let want_bytes_read = segment_file.size_bytes(); + let bad_entry_input = FakeSegmentEntry::new(b"hello"); let good_length = bad_entry_input.compressed_len(); let bad_entry_input = bad_entry_input.with_compressed_len(good_length - 1); @@ -260,14 +286,22 @@ mod tests { assert_matches!(read_fail, Err(Error::UnableToReadData { source: e }) => { assert_matches!(e.kind(), std::io::ErrorKind::UnexpectedEof); }); + assert_eq!(reader.bytes_read(), want_bytes_read); // Trying to continue reading will fail as well, see: // assert_error!(reader.one_entry(), Error::UnableToReadData { .. }); + // Ensure no magical bean counting occurs when stuck unable to read data. + assert_eq!(reader.bytes_read(), want_bytes_read); } #[test] fn unsuccessful_read_too_long_len() { let mut segment_file = FakeSegmentFile::new(); + + // The bad entry will prevent any entries being read, thus the + // no bytes can be reported as successfully read. + let want_bytes_read = segment_file.size_bytes(); + let bad_entry_input = FakeSegmentEntry::new(b"hello"); let good_length = bad_entry_input.compressed_len(); let bad_entry_input = bad_entry_input.with_compressed_len(good_length + 1); @@ -287,14 +321,18 @@ mod tests { assert_matches!(read_fail, Err(Error::UnableToReadData { source: e }) => { assert_matches!(e.kind(), std::io::ErrorKind::UnexpectedEof); }); + assert_eq!(reader.bytes_read(), want_bytes_read); // Trying to continue reading will fail as well, see: // assert_error!(reader.one_entry(), Error::UnableToReadData { .. }); + // Also no magical bean counting when cannot read more. 
+ assert_eq!(reader.bytes_read(), want_bytes_read); } #[test] fn unsuccessful_read_checksum_mismatch() { let mut segment_file = FakeSegmentFile::new(); + let bad_entry_input = FakeSegmentEntry::new(b"hello"); let good_checksum = bad_entry_input.checksum(); let bad_entry_input = bad_entry_input.with_checksum(good_checksum + 1); @@ -320,6 +358,7 @@ mod tests { let entry = reader.one_entry().unwrap(); assert!(entry.is_none()); + assert_eq!(reader.bytes_read(), segment_file.size_bytes()); } #[derive(Debug)] @@ -356,6 +395,23 @@ mod tests { f } + + fn size_bytes(&self) -> u64 { + std::mem::size_of::() as u64 + + std::mem::size_of::() as u64 + + self + .entries + .iter() + .map(|e| { + // Each entry is sized by the two 4 byte + // header values (checksum and compressed_len) + // as well as the length of the compressed data. + (std::mem::size_of::() + + std::mem::size_of::() + + e.compressed_data().len()) as u64 + }) + .sum::() + } } #[derive(Debug, Clone, PartialEq)] diff --git a/wal/src/lib.rs b/wal/src/lib.rs index 145d1cf0f5f..0801021e16b 100644 --- a/wal/src/lib.rs +++ b/wal/src/lib.rs @@ -24,6 +24,11 @@ use std::{ time::Duration, }; +use hashbrown::HashMap; +use parking_lot::Mutex; +use snafu::prelude::*; +use tokio::{sync::watch, task::JoinHandle}; + use data_types::{sequence_number_set::SequenceNumberSet, NamespaceId, TableId}; use generated_types::{ google::{FieldViolation, OptionalField}, @@ -31,13 +36,9 @@ use generated_types::{ sequenced_wal_op::Op as WalOp, SequencedWalOp as ProtoSequencedWalOp, }, }; -use hashbrown::HashMap; use mutable_batch::MutableBatch; use mutable_batch_pb::decode::decode_database_batch; use observability_deps::tracing::info; -use parking_lot::Mutex; -use snafu::prelude::*; -use tokio::{sync::watch, task::JoinHandle}; use writer_thread::WriterIoThreadHandle; use crate::blocking::{ @@ -235,7 +236,7 @@ impl Wal { /// /// Similarly, editing or deleting files within a `Wal`'s root directory via some other /// mechanism is not supported. - pub async fn new(root: impl Into) -> Result> { + pub async fn new(root: impl Into + Send) -> Result> { let root = root.into(); info!(wal_dir=?root, "Initalizing Write Ahead Log (WAL)"); tokio::fs::create_dir_all(&root) @@ -550,7 +551,7 @@ pub struct ClosedSegmentFileReader { } impl Iterator for ClosedSegmentFileReader { - type Item = Result>; + type Item = Result<(Vec, u64)>; /// Read the next batch of sequenced WAL operations from the file fn next(&mut self) -> Option { @@ -558,6 +559,7 @@ impl Iterator for ClosedSegmentFileReader { .next_batch() .context(UnableToReadNextOpsSnafu) .transpose() + .map(|result| result.map(|batch| (batch, self.bytes_read()))) } } @@ -567,6 +569,12 @@ impl ClosedSegmentFileReader { self.id } + /// Returns the total number of bytes successfully read by the underlying file reader + /// from disk. + pub fn bytes_read(&self) -> u64 { + self.file.bytes_read() + } + /// Open the segment file and read its header, ensuring it is a segment file and reading its id. pub fn from_path(path: impl AsRef) -> Result { let path = path.as_ref(); @@ -629,7 +637,7 @@ impl Iterator for WriteOpEntryDecoder { self.reader .next()? 
.context(FailedToReadWalSnafu) - .map(|batch| { + .map(|(batch, _)| { batch .into_iter() .filter_map(|sequenced_op| match sequenced_op.op { @@ -680,6 +688,7 @@ mod tests { use std::io::Write; use assert_matches::assert_matches; + use data_types::{NamespaceId, SequenceNumber, TableId}; use dml::DmlWrite; use generated_types::influxdata::{ @@ -730,7 +739,7 @@ mod tests { let ops: Vec = wal .reader_for_segment(closed.id) .expect("should be able to open reader for closed WAL segment") - .flat_map(|batch| batch.expect("failed to read WAL op batch")) + .flat_map(|batch| batch.expect("failed to read WAL op batch").0) .collect(); assert_eq!(vec![op1, op2, op3, op4], ops); @@ -863,15 +872,9 @@ mod tests { assert_eq!(wal_entries.len(), 2); let write_op_entries = wal_entries.into_iter().flatten().collect::>(); assert_eq!(write_op_entries.len(), 3); - assert_matches!(write_op_entries.first(), Some(got_op1) => { - assert_op_shape(got_op1, &w1); - }); - assert_matches!(write_op_entries.get(1), Some(got_op2) => { - assert_op_shape(got_op2, &w2); - }); - assert_matches!(write_op_entries.get(2), Some(got_op3) => { - assert_op_shape(got_op3, &w3); - }); + assert_op_shape(&write_op_entries[0], &w1); + assert_op_shape(&write_op_entries[1], &w2); + assert_op_shape(&write_op_entries[2], &w3); } #[tokio::test] @@ -916,7 +919,7 @@ mod tests { // error is thrown assert_matches!(decoder.next(), Some(Ok(batch)) => { assert_eq!(batch.len(), 1); - assert_op_shape(batch.first().unwrap(), &good_write); + assert_op_shape(&batch[0], &good_write); }); assert_matches!( decoder.next(), diff --git a/wal/tests/end_to_end.rs b/wal/tests/end_to_end.rs index 331c9b49bfd..aa53c8c85fa 100644 --- a/wal/tests/end_to_end.rs +++ b/wal/tests/end_to_end.rs @@ -1,3 +1,4 @@ +use assert_matches::assert_matches; use data_types::{NamespaceId, SequenceNumber, TableId}; use dml::DmlWrite; use generated_types::influxdata::{ @@ -61,7 +62,7 @@ async fn crud() { // Can read the written entries from the closed segment, ensuring that the // per-partition sequence numbers are preserved. let mut reader = wal.reader_for_segment(closed_segment_details.id()).unwrap(); - let mut op = reader.next().unwrap().unwrap(); + let (mut op, _) = reader.next().unwrap().unwrap(); let mut got_sequence_numbers = op .remove(0) .table_write_sequence_numbers @@ -69,7 +70,7 @@ async fn crud() { .collect::>(); got_sequence_numbers.sort(); assert_eq!(got_sequence_numbers, Vec::::from([42, 43]),); - let mut op = reader.next().unwrap().unwrap(); + let (mut op, bytes_read) = reader.next().unwrap().unwrap(); let mut got_sequence_numbers = op .remove(0) .table_write_sequence_numbers @@ -78,6 +79,11 @@ async fn crud() { got_sequence_numbers.sort(); assert_eq!(got_sequence_numbers, Vec::::from([44, 45]),); + // Ensure that all entries have been read and the total bytes read reflect + // the segment size. + assert_matches!(reader.next(), None); + assert_eq!(bytes_read, closed_segment_details.size()); + // Can delete a segment, leaving no closed segments again wal.delete(closed_segment_details.id()).await.unwrap(); let closed = wal.closed_segments(); @@ -114,7 +120,7 @@ async fn replay() { // Can read the written entries from the previously closed segment // ensuring the per-partition sequence numbers are preserved. 
let mut reader = wal.reader_for_segment(closed_segment_ids[0]).unwrap(); - let mut op = reader.next().unwrap().unwrap(); + let (mut op, _) = reader.next().unwrap().unwrap(); let mut got_sequence_numbers = op .remove(0) .table_write_sequence_numbers @@ -125,7 +131,7 @@ async fn replay() { // Can read the written entries from the previously open segment let mut reader = wal.reader_for_segment(closed_segment_ids[1]).unwrap(); - let mut op = reader.next().unwrap().unwrap(); + let (mut op, _) = reader.next().unwrap().unwrap(); let mut got_sequence_numbers = op .remove(0) .table_write_sequence_numbers diff --git a/wal_inspect/Cargo.toml b/wal_inspect/Cargo.toml index 26bb9cbe754..4224b2761ba 100644 --- a/wal_inspect/Cargo.toml +++ b/wal_inspect/Cargo.toml @@ -5,13 +5,16 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # In alphabetical order data_types = { version = "0.1.0", path = "../data_types" } hashbrown.workspace = true mutable_batch = { version = "0.1.0", path = "../mutable_batch" } parquet_to_line_protocol = { version = "0.1.0", path = "../parquet_to_line_protocol" } schema = { version = "0.1.0", path = "../schema" } -thiserror = "1.0.48" +thiserror = "1.0.56" workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] # In alphabetical order @@ -20,5 +23,5 @@ generated_types = { version = "0.1.0", path = "../generated_types" } mutable_batch_lp = { path = "../mutable_batch_lp" } mutable_batch_pb = { version = "0.1.0", path = "../mutable_batch_pb" } test_helpers = { path = "../test_helpers" } -tokio = { version = "1.32", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } +tokio = { version = "1.35", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } wal = { version = "0.1.0", path = "../wal" } diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 29c9fdd5194..ce4996cb7a8 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -17,10 +17,9 @@ license.workspace = true ### BEGIN HAKARI SECTION [dependencies] ahash = { version = "0.8" } -arrow = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5", features = ["dyn_cmp_dict", "prettyprint"] } -arrow-array = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5", default-features = false, features = ["chrono-tz"] } -arrow-flight = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5", features = ["flight-sql-experimental"] } -arrow-string = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5", default-features = false, features = ["dyn_cmp_dict"] } +arrow-array = { version = "49", default-features = false, features = ["chrono-tz"] } +arrow-cast = { version = "49", default-features = false, features = ["prettyprint"] } +arrow-ipc = { version = "49", features = ["lz4"] } bitflags = { version = "2", default-features = false, features = ["std"] } byteorder = { version = "1" } bytes = { version = "1" } @@ -29,8 +28,6 @@ clap = { version = "4", features = ["derive", "env", "string"] } clap_builder = { version = "4", default-features = false, features = ["color", "env", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8" } crypto-common = { version = "0.1", default-features = false, features = ["std"] } -datafusion = { git = 
"https://github.com/apache/arrow-datafusion.git", rev = "81f33b0e27f5694348cd953a937203d835b57178" } -datafusion-optimizer = { git = "https://github.com/apache/arrow-datafusion.git", rev = "81f33b0e27f5694348cd953a937203d835b57178", default-features = false, features = ["crypto_expressions", "regex_expressions", "unicode_expressions"] } digest = { version = "0.10", features = ["mac", "std"] } either = { version = "1", features = ["serde"] } fixedbitset = { version = "0.4" } @@ -44,7 +41,9 @@ futures-task = { version = "0.3", default-features = false, features = ["std"] } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } hashbrown = { version = "0.14", features = ["raw"] } +hyper = { version = "0.14", features = ["full"] } indexmap = { version = "2" } +itertools-a6292c17cd707f01 = { package = "itertools", version = "0.11" } libc = { version = "0.2", features = ["extra_traits"] } lock_api = { version = "0.4", features = ["arc_lock"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -52,29 +51,28 @@ md-5 = { version = "0.10" } memchr = { version = "2" } nom = { version = "7" } num-traits = { version = "0.2", features = ["i128", "libm"] } -object_store = { version = "0.7", default-features = false, features = ["aws", "azure", "gcp"] } +object_store = { version = "0.8", default-features = false, features = ["aws", "azure", "gcp"] } once_cell = { version = "1", features = ["parking_lot"] } parking_lot = { version = "0.12", features = ["arc_lock"] } -parquet = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5", features = ["experimental", "object_store"] } petgraph = { version = "0.6" } phf_shared = { version = "0.11" } proptest = { version = "1", default-features = false, features = ["std"] } -prost = { version = "0.11" } -prost-types = { version = "0.11" } +prost-5ef9efb8ec2df382 = { package = "prost", version = "0.12" } +prost-a6292c17cd707f01 = { package = "prost", version = "0.11" } +prost-types-5ef9efb8ec2df382 = { package = "prost-types", version = "0.12" } +prost-types-a6292c17cd707f01 = { package = "prost-types", version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } rand_core = { version = "0.6", default-features = false, features = ["std"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "dfa-search", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } -regex-syntax-c38e5c1d305a1b54 = { package = "regex-syntax", version = "0.8" } -regex-syntax-ca01ad9e24f5d932 = { package = "regex-syntax", version = "0.7" } +regex-syntax = { version = "0.8" } reqwest = { version = "0.11", default-features = false, features = ["json", "rustls-tls", "stream"] } -ring = { version = "0.16", default-features = false, features = ["std"] } -rustls = { version = "0.21", default-features = false, features = ["dangerous_configuration", "logging", "tls12"] } +ring = { version = "0.17", features = ["std"] } +rustls = { version = "0.21", features = ["dangerous_configuration"] } serde = { version = "1", features = ["derive", "rc"] } serde_json = { version = "1", features = ["raw_value"] } sha2 = { version = "0.10" } similar = { version = "2", features = ["inline"] } -sqlparser = { version = "0.37", features = ["visitor"] } sqlx = { version = "0.7", features = ["postgres", "runtime-tokio-rustls", "sqlite", "uuid"] } sqlx-core = { 
version = "0.7", features = ["_rt-tokio", "_tls-rustls", "any", "json", "migrate", "offline", "uuid"] } sqlx-postgres = { version = "0.7", default-features = false, features = ["any", "json", "migrate", "offline", "uuid"] } @@ -84,7 +82,6 @@ thrift = { version = "0.17" } tokio = { version = "1", features = ["full", "tracing"] } tokio-stream = { version = "0.1", features = ["fs", "net"] } tokio-util = { version = "0.7", features = ["codec", "compat", "io"] } -tonic = { version = "0.9", features = ["tls-webpki-roots"] } tower = { version = "0.4", features = ["balance", "buffer", "limit", "timeout", "util"] } tracing = { version = "0.1", features = ["log", "max_level_trace", "release_max_level_trace"] } tracing-core = { version = "0.1" } @@ -116,7 +113,8 @@ getrandom = { version = "0.2", default-features = false, features = ["std"] } hashbrown = { version = "0.14", features = ["raw"] } heck = { version = "0.4", features = ["unicode"] } indexmap = { version = "2" } -itertools = { version = "0.10" } +itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" } +itertools-a6292c17cd707f01 = { package = "itertools", version = "0.11" } lock_api = { version = "0.4", features = ["arc_lock"] } log = { version = "0.4", default-features = false, features = ["std"] } md-5 = { version = "0.10" } @@ -127,14 +125,17 @@ once_cell = { version = "1", features = ["parking_lot"] } parking_lot = { version = "0.12", features = ["arc_lock"] } petgraph = { version = "0.6" } phf_shared = { version = "0.11" } -prost = { version = "0.11" } -prost-types = { version = "0.11" } +prost-5ef9efb8ec2df382 = { package = "prost", version = "0.12" } +prost-a6292c17cd707f01 = { package = "prost", version = "0.11" } +prost-types-5ef9efb8ec2df382 = { package = "prost-types", version = "0.12" } +prost-types-a6292c17cd707f01 = { package = "prost-types", version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } rand_core = { version = "0.6", default-features = false, features = ["std"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "dfa-search", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } -regex-syntax-c38e5c1d305a1b54 = { package = "regex-syntax", version = "0.8" } -rustls = { version = "0.21", default-features = false, features = ["dangerous_configuration", "logging", "tls12"] } +regex-syntax = { version = "0.8" } +ring = { version = "0.17", features = ["std"] } +rustls = { version = "0.21", features = ["dangerous_configuration"] } serde = { version = "1", features = ["derive", "rc"] } serde_json = { version = "1", features = ["raw_value"] } sha2 = { version = "0.10" } @@ -144,7 +145,7 @@ sqlx-macros-core = { version = "0.7", features = ["_rt-tokio", "_tls-rustls", "j sqlx-postgres = { version = "0.7", default-features = false, features = ["any", "json", "migrate", "offline", "uuid"] } sqlx-sqlite = { version = "0.7", default-features = false, features = ["any", "json", "migrate", "offline", "uuid"] } syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full"] } -syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit", "visit-mut"] } +syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit-mut"] } tokio = { version = "1", features = ["full", "tracing"] } tokio-stream = { version = "0.1", features = ["fs", "net"] } tracing = { version = "0.1", features = ["log", "max_level_trace", 
"release_max_level_trace"] } @@ -156,48 +157,40 @@ uuid = { version = "1", features = ["v4"] } [target.x86_64-unknown-linux-gnu.dependencies] nix = { version = "0.27", features = ["fs", "signal", "user"] } -rustls = { version = "0.21" } spin = { version = "0.9" } [target.x86_64-unknown-linux-gnu.build-dependencies] libc = { version = "0.2", features = ["extra_traits"] } nix = { version = "0.27", features = ["fs", "signal", "user"] } -rustls = { version = "0.21" } spin = { version = "0.9" } [target.x86_64-apple-darwin.dependencies] nix = { version = "0.27", features = ["fs", "signal", "user"] } -rustls = { version = "0.21" } spin = { version = "0.9" } [target.x86_64-apple-darwin.build-dependencies] libc = { version = "0.2", features = ["extra_traits"] } nix = { version = "0.27", features = ["fs", "signal", "user"] } -rustls = { version = "0.21" } spin = { version = "0.9" } [target.aarch64-apple-darwin.dependencies] nix = { version = "0.27", features = ["fs", "signal", "user"] } -rustls = { version = "0.21" } spin = { version = "0.9" } [target.aarch64-apple-darwin.build-dependencies] libc = { version = "0.2", features = ["extra_traits"] } nix = { version = "0.27", features = ["fs", "signal", "user"] } -rustls = { version = "0.21" } spin = { version = "0.9" } [target.x86_64-pc-windows-msvc.dependencies] -rustls = { version = "0.21" } spin = { version = "0.9" } -winapi = { version = "0.3", default-features = false, features = ["cfg", "combaseapi", "consoleapi", "errhandlingapi", "evntrace", "fileapi", "handleapi", "heapapi", "ifdef", "in6addr", "inaddr", "ioapiset", "iphlpapi", "lmaccess", "lmapibuf", "lmcons", "memoryapi", "minwinbase", "minwindef", "netioapi", "ntlsa", "ntsecapi", "ntstatus", "objidl", "oleauto", "pdh", "powerbase", "processenv", "psapi", "rpcdce", "sddl", "securitybaseapi", "shellapi", "std", "synchapi", "sysinfoapi", "wbemcli", "winbase", "wincon", "windef", "winerror", "winioctl", "winnt", "winsock2", "wtypesbase"] } -windows-sys-b21d60becc0929df = { package = "windows-sys", version = "0.52", features = ["Win32_Foundation", "Win32_NetworkManagement_IpHelper", "Win32_Networking_WinSock", "Win32_Storage_FileSystem", "Win32_System_Com", "Win32_System_Console", "Win32_System_Diagnostics_Debug", "Win32_System_Threading", "Win32_UI_Input_KeyboardAndMouse", "Win32_UI_Shell"] } +winapi = { version = "0.3", default-features = false, features = ["cfg", "combaseapi", "consoleapi", "errhandlingapi", "evntrace", "fileapi", "handleapi", "heapapi", "ifdef", "in6addr", "inaddr", "ioapiset", "iphlpapi", "lmaccess", "lmapibuf", "lmcons", "memoryapi", "minwinbase", "minwindef", "netioapi", "ntlsa", "ntsecapi", "ntstatus", "objidl", "oleauto", "pdh", "powerbase", "processenv", "psapi", "rpcdce", "sddl", "securitybaseapi", "shellapi", "std", "synchapi", "sysinfoapi", "wbemcli", "winbase", "wincon", "windef", "winerror", "winioctl", "winnt", "winsock2"] } +windows-sys-b21d60becc0929df = { package = "windows-sys", version = "0.52", features = ["Win32_Foundation", "Win32_NetworkManagement_IpHelper", "Win32_Networking_WinSock", "Win32_Security_Authentication_Identity", "Win32_Security_Credentials", "Win32_Security_Cryptography", "Win32_Storage_FileSystem", "Win32_System_Com", "Win32_System_Console", "Win32_System_Diagnostics_Debug", "Win32_System_Memory", "Win32_System_Threading", "Win32_UI_Input_KeyboardAndMouse", "Win32_UI_Shell"] } windows-sys-c8eced492e86ede7 = { package = "windows-sys", version = "0.48", features = ["Win32_Foundation", "Win32_Networking_WinSock", "Win32_Security", 
"Win32_Storage_FileSystem", "Win32_System_Console", "Win32_System_Diagnostics_Debug", "Win32_System_IO", "Win32_System_Pipes", "Win32_System_Registry", "Win32_System_SystemServices", "Win32_System_Threading", "Win32_System_Time", "Win32_System_WindowsProgramming", "Win32_UI_Shell"] } [target.x86_64-pc-windows-msvc.build-dependencies] -rustls = { version = "0.21" } spin = { version = "0.9" } -windows-sys-b21d60becc0929df = { package = "windows-sys", version = "0.52", features = ["Win32_Foundation", "Win32_NetworkManagement_IpHelper", "Win32_Networking_WinSock", "Win32_Storage_FileSystem", "Win32_System_Com", "Win32_System_Console", "Win32_System_Diagnostics_Debug", "Win32_System_Threading", "Win32_UI_Input_KeyboardAndMouse", "Win32_UI_Shell"] } +windows-sys-b21d60becc0929df = { package = "windows-sys", version = "0.52", features = ["Win32_Foundation", "Win32_NetworkManagement_IpHelper", "Win32_Networking_WinSock", "Win32_Security_Authentication_Identity", "Win32_Security_Credentials", "Win32_Security_Cryptography", "Win32_Storage_FileSystem", "Win32_System_Com", "Win32_System_Console", "Win32_System_Diagnostics_Debug", "Win32_System_Memory", "Win32_System_Threading", "Win32_UI_Input_KeyboardAndMouse", "Win32_UI_Shell"] } windows-sys-c8eced492e86ede7 = { package = "windows-sys", version = "0.48", features = ["Win32_Foundation", "Win32_Networking_WinSock", "Win32_Security", "Win32_Storage_FileSystem", "Win32_System_Console", "Win32_System_Diagnostics_Debug", "Win32_System_IO", "Win32_System_Pipes", "Win32_System_Registry", "Win32_System_SystemServices", "Win32_System_Threading", "Win32_System_Time", "Win32_System_WindowsProgramming", "Win32_UI_Shell"] } ### END HAKARI SECTION