From 60247cca08823bc86c3aac176c9228c65d869c9b Mon Sep 17 00:00:00 2001
From: Michael Gattozzi
Date: Thu, 25 Jan 2024 14:31:57 -0500
Subject: [PATCH] chore(deps): Update arrow and datafusion to 49.0.0

This commit copies in our dependency code from influxdb_iox so that we can
upgrade arrow from a forked 46.0.0 to the upstream 49.0.0 release, along with
the corresponding DataFusion upgrade from 31.0.0 to 34.0.0. Most of the
important changes are in how the crates are consumed in
influxdb3(_server/_write); those diffs are particularly worth reviewing. The
rest is a straight copy, since we do not currently modify those crates as part
of influxdb3 edge development.
---
 Cargo.lock | 1828 ++++--
 Cargo.toml | 61 +-
 arrow_util/Cargo.toml | 16 +-
 arrow_util/src/bitset.rs | 338 +-
 arrow_util/src/string.rs | 65 +
 arrow_util/src/test_util.rs | 5 +-
 authz/Cargo.toml | 11 +-
 authz/src/permission.rs | 8 +-
 backoff/Cargo.toml | 7 +-
 backoff/src/lib.rs | 4 +
 cache_system/Cargo.toml | 10 +-
 cache_system/src/addressable_heap.rs | 5 +-
 cache_system/src/backend/mod.rs | 11 +-
 cache_system/src/backend/policy/mod.rs | 136 +-
 cache_system/src/cache/driver.rs | 26 +-
 cache_system/src/cache/metrics.rs | 7 +-
 cache_system/src/lib.rs | 1 +
 cache_system/src/loader/batch.rs | 7 +-
 catalog_cache/Cargo.toml | 23 +
 catalog_cache/src/api/client.rs | 176 +
 catalog_cache/src/api/list.rs | 467 ++
 catalog_cache/src/api/mod.rs | 159 +
 catalog_cache/src/api/quorum.rs | 459 ++
 catalog_cache/src/api/server.rs | 300 ++
 catalog_cache/src/lib.rs | 143 +
 catalog_cache/src/local/limit.rs | 82 +
 catalog_cache/src/local/mod.rs | 355 ++
 clap_blocks/Cargo.toml | 17 +-
 clap_blocks/src/bulk_ingest.rs | 274 ++
 clap_blocks/src/catalog_cache.rs | 154 +
 clap_blocks/src/catalog_dsn.rs | 18 +-
 clap_blocks/src/compactor.rs | 125 +-
 clap_blocks/src/compactor_scheduler.rs | 192 +
 clap_blocks/src/garbage_collector.rs | 87 +-
 clap_blocks/src/gossip.rs | 15 +-
 clap_blocks/src/ingester.rs | 13 +
 clap_blocks/src/ingester_address.rs | 16 +-
 clap_blocks/src/lib.rs | 3 +
 clap_blocks/src/memory_size.rs | 21 +-
 clap_blocks/src/object_store.rs | 240 +-
 clap_blocks/src/parquet_cache.rs | 57 +
 clap_blocks/src/querier.rs | 10 +-
 clap_blocks/src/router.rs | 16 +
 client_util/Cargo.toml | 11 +-
 data_types/Cargo.toml | 23 +-
 data_types/src/columns.rs | 545 ++-
 data_types/src/lib.rs | 361 +-
 data_types/src/namespace_name.rs | 25 +
 data_types/src/partition.rs | 122 +-
 data_types/src/partition_template.rs | 629 ++-
 data_types/src/service_limits.rs | 312 +-
 data_types/src/snapshot/hash.rs | 219 +
 data_types/src/snapshot/list.rs | 192 +
 data_types/src/snapshot/mask.rs | 71 +
 data_types/src/snapshot/mod.rs | 11 +
 data_types/src/snapshot/partition.rs | 246 +
 data_types/src/snapshot/table.rs | 197 +
 datafusion_util/Cargo.toml | 11 +-
 datafusion_util/src/config.rs | 8 +
 datafusion_util/src/lib.rs | 83 +-
 datafusion_util/src/watch.rs | 2 +-
 dml/Cargo.toml | 3 +
 executor/Cargo.toml | 12 +-
 executor/src/lib.rs | 8 +
 flightsql/Cargo.toml | 11 +-
 flightsql/src/error.rs | 2 +-
 flightsql/src/planner.rs | 12 +-
 flightsql/src/sql_info/meta.rs | 87 +
 flightsql/src/sql_info/mod.rs | 11 +-
 garbage_collector/Cargo.toml | 13 +-
 garbage_collector/src/lib.rs | 15 +-
 garbage_collector/src/objectstore/checker.rs | 136 +-
 garbage_collector/src/objectstore/deleter.rs | 74 +-
 garbage_collector/src/objectstore/lister.rs | 8 +-
 garbage_collector/src/parquetfile/deleter.rs | 11 +-
 garbage_collector/src/retention/flagger.rs | 1 -
 generated_types/Cargo.toml | 14 +-
 generated_types/build.rs | 25 +-
 .../iox/bulk_ingest/v1/service.proto | 73 +
.../iox/catalog/v1/parquet_file.proto | 11 +- .../influxdata/iox/catalog/v1/service.proto | 12 +- .../influxdata/iox/catalog/v2/service.proto | 489 ++ .../iox/catalog_cache/v1/value.proto | 158 + .../influxdata/iox/column_type/v1/type.proto | 14 + .../influxdata/iox/compactor/v1/service.proto | 37 +- .../influxdata/iox/gossip/v1/schema.proto | 8 + .../influxdata/iox/gossip/v1/sort_keys.proto | 20 + .../iox/partition_template/v1/template.proto | 12 + .../influxdata/iox/querier/v1/flight.proto | 21 + .../influxdata/iox/querier/v1/query_log.proto | 73 + .../influxdata/iox/schema/v1/service.proto | 40 +- .../v1/skipped_compaction.proto | 29 + .../influxdata/iox/table/v1/service.proto | 16 + .../protos/influxdata/iox/wal/v1/wal.proto | 49 - generated_types/src/lib.rs | 85 +- grpc-binary-logger-proto/Cargo.toml | 9 +- grpc-binary-logger-test-proto/Cargo.toml | 7 +- grpc-binary-logger-test-proto/src/lib.rs | 7 +- grpc-binary-logger/Cargo.toml | 7 +- grpc-binary-logger/src/lib.rs | 2 +- grpc-binary-logger/src/predicate.rs | 2 +- .../tests/end_to_end_cases/server.rs | 2 +- .../tests/end_to_end_cases/test_utils.rs | 14 +- import_export/Cargo.toml | 11 +- import_export/src/file/export.rs | 29 +- import_export/src/file/import.rs | 119 +- influxdb2_client/Cargo.toml | 15 +- .../tests/common/server_fixture.rs | 2 + influxdb3_server/Cargo.toml | 2 + influxdb3_server/src/http.rs | 9 +- influxdb3_server/src/query_executor.rs | 77 +- influxdb3_write/src/catalog.rs | 22 +- influxdb3_write/src/persister.rs | 20 +- influxdb3_write/src/write_buffer.rs | 20 +- influxdb_influxql_parser/Cargo.toml | 7 +- influxdb_influxql_parser/src/explain.rs | 175 +- influxdb_influxql_parser/src/internal.rs | 10 +- influxdb_influxql_parser/src/keywords.rs | 2 +- ...ser__visit__test__explain_statement-2.snap | 13 + ...ser__visit__test__explain_statement-3.snap | 13 + ...ser__visit__test__explain_statement-4.snap | 15 + ...ser__visit__test__explain_statement-5.snap | 13 + ...ser__visit__test__explain_statement-6.snap | 13 + ...ser__visit__test__explain_statement-7.snap | 13 + ...ser__visit__test__explain_statement-8.snap | 31 + ...arser__visit__test__explain_statement.snap | 2 + ..._visit_mut__test__explain_statement-2.snap | 13 + ..._visit_mut__test__explain_statement-3.snap | 13 + ..._visit_mut__test__explain_statement-4.snap | 15 + ..._visit_mut__test__explain_statement-5.snap | 13 + ..._visit_mut__test__explain_statement-6.snap | 13 + ..._visit_mut__test__explain_statement-7.snap | 13 + ..._visit_mut__test__explain_statement-8.snap | 31 + ...r__visit_mut__test__explain_statement.snap | 2 + influxdb_influxql_parser/src/time_range.rs | 12 +- influxdb_influxql_parser/src/visit.rs | 12 +- influxdb_influxql_parser/src/visit_mut.rs | 13 +- influxdb_iox_client/Cargo.toml | 18 +- influxdb_iox_client/src/client.rs | 3 + influxdb_iox_client/src/client/catalog.rs | 8 +- influxdb_iox_client/src/client/compactor.rs | 7 +- influxdb_iox_client/src/client/error.rs | 20 +- influxdb_iox_client/src/client/flight/mod.rs | 61 + .../src/client/flight/query.rs | 154 + influxdb_iox_client/src/client/query_log.rs | 30 + influxdb_iox_client/src/client/table.rs | 18 + influxdb_iox_client/src/client/write.rs | 126 +- influxdb_iox_client/src/format.rs | 2 +- influxdb_iox_client/src/format/influxql.rs | 18 +- influxdb_line_protocol/Cargo.toml | 8 +- influxdb_line_protocol/README.md | 6 +- influxdb_line_protocol/RELEASE.md | 2 +- influxdb_line_protocol/src/lib.rs | 226 +- influxdb_storage_client/Cargo.toml | 5 +- influxrpc_parser/Cargo.toml | 8 +- 
influxrpc_parser/src/predicate.rs | 6 +- ingester_query_grpc/Cargo.toml | 15 +- .../influxdata/iox/ingester/v2/query.proto | 15 +- ingester_query_grpc/src/lib.rs | 29 +- iox_catalog/Cargo.toml | 23 +- ...1180000_set_partition_sort_key_to_null.sql | 1 + .../20231003120000_drop_sort_key.sql | 1 + .../20231004120000_add_empty_sort_key.sql | 4 + ...0231121120000_add_partition_generation.sql | 1 + .../20231121150000_partition_id_trigger.sql | 20 + ...1123120000_partition_id_from_partition.sql | 47 + .../20240111150000_add_table_generation.sql | 1 + ...6_complete_kafkaless_transition_sqlite.sql | 39 + ...1180000_set_partition_sort_key_to_null.sql | 2 + .../20231003120000_drop_sort_key.sql | 1 + .../20231004120000_add_empty_sort_key.sql | 1 + ...0231121120000_add_partition_generation.sql | 1 + ...1123120000_partition_id_from_partition.sql | 8 + .../20240111150000_add_table_generation.sql | 1 + iox_catalog/src/cache.rs | 831 ++++ iox_catalog/src/constants.rs | 19 + iox_catalog/src/grpc/client.rs | 997 ++++ iox_catalog/src/grpc/mod.rs | 143 + iox_catalog/src/grpc/serialization.rs | 712 +++ iox_catalog/src/grpc/server.rs | 1032 ++++ iox_catalog/src/interface.rs | 3168 +------------ iox_catalog/src/interface_tests.rs | 3203 +++++++++++++ iox_catalog/src/lib.rs | 695 +-- iox_catalog/src/mem.rs | 793 ++-- iox_catalog/src/metrics.rs | 53 +- iox_catalog/src/migrate.rs | 72 +- iox_catalog/src/postgres.rs | 1107 ++--- iox_catalog/src/sqlite.rs | 1131 ++--- iox_catalog/src/test_helpers.rs | 92 + iox_catalog/src/util.rs | 897 ++++ iox_data_generator/Cargo.toml | 17 +- iox_data_generator/src/substitution.rs | 26 +- iox_data_generator/src/tag_pair.rs | 2 +- iox_query/Cargo.toml | 16 +- iox_query/src/chunk_statistics.rs | 185 +- iox_query/src/exec.rs | 32 +- iox_query/src/exec/context.rs | 48 +- iox_query/src/exec/field.rs | 2 +- iox_query/src/exec/fieldlist.rs | 42 +- iox_query/src/exec/gapfill/algo.rs | 123 +- .../src/exec/gapfill/algo/interpolate.rs | 43 +- iox_query/src/exec/gapfill/buffered_input.rs | 6 +- iox_query/src/exec/gapfill/exec_tests.rs | 13 +- iox_query/src/exec/gapfill/mod.rs | 37 +- iox_query/src/exec/gapfill/params.rs | 2 +- iox_query/src/exec/gapfill/stream.rs | 13 +- iox_query/src/exec/non_null_checker.rs | 26 +- iox_query/src/exec/query_tracing.rs | 6 +- iox_query/src/exec/schema_pivot.rs | 5 +- iox_query/src/exec/seriesset/converter.rs | 9 +- iox_query/src/exec/sleep.rs | 265 ++ iox_query/src/exec/split.rs | 32 +- iox_query/src/frontend/reorg.rs | 97 +- iox_query/src/frontend/sql.rs | 8 +- iox_query/src/lib.rs | 181 +- .../src/logical_optimizer/extract_sleep.rs | 100 + .../src/logical_optimizer/handle_gapfill.rs | 243 +- .../handle_gapfill/range_predicate.rs | 82 +- .../influx_regex_to_datafusion_regex.rs | 15 +- iox_query/src/logical_optimizer/mod.rs | 5 +- .../physical_optimizer/chunk_extraction.rs | 26 +- .../src/physical_optimizer/combine_chunks.rs | 296 +- .../dedup/dedup_null_columns.rs | 4 +- .../dedup/dedup_sort_order.rs | 4 +- .../dedup/partition_split.rs | 24 +- .../physical_optimizer/dedup/remove_dedup.rs | 16 +- .../physical_optimizer/dedup/time_split.rs | 18 +- iox_query/src/physical_optimizer/mod.rs | 10 +- .../physical_optimizer/predicate_pushdown.rs | 64 +- .../physical_optimizer/projection_pushdown.rs | 80 +- iox_query/src/physical_optimizer/sort/mod.rs | 2 + .../sort/order_union_sorted_inputs.rs | 1487 ++++++ .../sort/parquet_sortness.rs | 92 +- .../sort/push_sort_through_union.rs | 121 +- iox_query/src/physical_optimizer/sort/util.rs | 102 + 
iox_query/src/physical_optimizer/tests.rs | 210 + .../physical_optimizer/union/nested_union.rs | 34 +- .../src/physical_optimizer/union/one_union.rs | 18 +- iox_query/src/provider.rs | 43 +- iox_query/src/provider/adapter.rs | 8 +- iox_query/src/provider/deduplicate.rs | 23 +- iox_query/src/provider/deduplicate/algo.rs | 24 +- iox_query/src/provider/overlap.rs | 21 +- iox_query/src/provider/physical.rs | 266 +- iox_query/src/provider/progressive_eval.rs | 1206 +++++ iox_query/src/provider/record_batch_exec.rs | 54 +- iox_query/src/pruning.rs | 49 +- iox_query/src/query_log.rs | 704 +++ iox_query/src/statistics.rs | 1427 ++++-- iox_query/src/test.rs | 285 +- iox_query/src/util.rs | 286 +- iox_query_influxql/Cargo.toml | 9 +- .../src/aggregate/percentile.rs | 3 +- iox_query_influxql/src/frontend/planner.rs | 27 +- iox_query_influxql/src/plan/ir.rs | 4 +- iox_query_influxql/src/plan/planner.rs | 939 ++-- iox_query_influxql/src/plan/planner/select.rs | 16 +- .../src/plan/planner_rewrite_expression.rs | 14 +- iox_query_influxql/src/plan/rewriter.rs | 29 +- iox_query_influxql/src/plan/udf.rs | 284 +- iox_query_influxql/src/plan/util.rs | 4 +- iox_query_influxql/src/window.rs | 112 +- .../src/window/cumulative_sum.rs | 61 +- iox_query_influxql/src/window/derivative.rs | 85 +- iox_query_influxql/src/window/difference.rs | 60 +- .../src/window/moving_average.rs | 60 +- iox_query_influxql/src/window/non_negative.rs | 57 +- .../src/window/percent_row_number.rs | 61 +- iox_query_influxrpc/Cargo.toml | 9 +- iox_query_influxrpc/src/lib.rs | 7 +- iox_query_influxrpc/src/missing_columns.rs | 4 +- iox_query_influxrpc/src/scan_plan.rs | 23 +- iox_query_params/Cargo.toml | 22 + iox_query_params/src/lib.rs | 21 + iox_query_params/src/params.rs | 675 +++ iox_tests/Cargo.toml | 3 + iox_tests/src/builders.rs | 50 +- iox_tests/src/catalog.rs | 209 +- iox_tests/src/lib.rs | 10 - iox_time/Cargo.toml | 5 +- iox_time/src/lib.rs | 2 +- ioxd_common/Cargo.toml | 21 +- ioxd_common/src/http/error.rs | 32 +- ioxd_common/src/http/mod.rs | 23 +- ioxd_common/src/http/pprof.rs | 2 +- ioxd_common/src/lib.rs | 38 +- ioxd_common/src/rpc.rs | 14 +- ioxd_common/src/server_type.rs | 14 + ioxd_common/src/service.rs | 14 +- ioxd_test/Cargo.toml | 7 +- ioxd_test/src/lib.rs | 2 +- kube_test/Cargo.toml | 22 + kube_test/src/call.rs | 70 + kube_test/src/error.rs | 57 + kube_test/src/handler.rs | 25 + kube_test/src/lib.rs | 31 + kube_test/src/object_map.rs | 178 + kube_test/src/request.rs | 115 + kube_test/src/resource_handler.rs | 267 ++ kube_test/src/service.rs | 54 + kube_test/src/status.rs | 61 + logfmt/Cargo.toml | 5 +- metric/Cargo.toml | 3 + metric/src/counter.rs | 5 +- metric/src/duration.rs | 8 + metric/src/histogram.rs | 91 + metric/src/lib.rs | 3 +- metric/src/metric.rs | 7 +- metric_exporters/Cargo.toml | 3 + mutable_batch/Cargo.toml | 20 +- mutable_batch/src/column.rs | 878 +++- mutable_batch/src/lib.rs | 231 +- mutable_batch/src/payload.rs | 150 +- mutable_batch/src/writer.rs | 12 +- mutable_batch/tests/writer.rs | 109 +- mutable_batch/tests/writer_fuzz.rs | 5 +- mutable_batch_lp/Cargo.toml | 8 +- mutable_batch_lp/fuzz/.gitignore | 4 + mutable_batch_lp/fuzz/Cargo.lock | 4129 +++++++++++++++++ mutable_batch_lp/fuzz/Cargo.toml | 27 + mutable_batch_lp/fuzz/README.md | 46 + .../fuzz/fuzz_targets/lines_converter.rs | 66 + mutable_batch_lp/src/lib.rs | 291 +- mutable_batch_pb/Cargo.toml | 8 +- mutable_batch_pb/src/decode.rs | 14 +- mutable_batch_pb/src/lib.rs | 2 + mutable_batch_pb/tests/encode.rs | 3 +- 
mutable_batch_tests/Cargo.toml | 12 +- mutable_batch_tests/benches/statistics.rs | 184 + object_store_metrics/Cargo.toml | 11 +- object_store_metrics/src/dummy.rs | 30 +- object_store_metrics/src/lib.rs | 1051 +++-- observability_deps/Cargo.toml | 3 + panic_logging/Cargo.toml | 3 + panic_logging/src/lib.rs | 53 +- parquet_cache/Cargo.toml | 60 + parquet_cache/src/client.rs | 16 + parquet_cache/src/client/cache_connector.rs | 37 + parquet_cache/src/client/http.rs | 62 + parquet_cache/src/client/keyspace.rs | 314 ++ parquet_cache/src/client/mock.rs | 153 + parquet_cache/src/client/object_store.rs | 776 ++++ parquet_cache/src/client/request.rs | 46 + parquet_cache/src/client/write_hints.rs | 223 + parquet_cache/src/controller.rs | 53 + parquet_cache/src/controller/error.rs | 29 + parquet_cache/src/controller/kube_util.rs | 93 + parquet_cache/src/controller/parquet_cache.rs | 139 + .../controller/parquet_cache_controller.rs | 1446 ++++++ .../src/controller/parquet_cache_set.rs | 75 + .../parquet_cache_set_controller.rs | 676 +++ parquet_cache/src/controller/state_service.rs | 109 + parquet_cache/src/data_types.rs | 12 + parquet_cache/src/data_types/keyspace.rs | 164 + parquet_cache/src/data_types/objects.rs | 79 + parquet_cache/src/data_types/policy.rs | 17 + parquet_cache/src/data_types/state.rs | 52 + parquet_cache/src/data_types/write_hints.rs | 81 + parquet_cache/src/lib.rs | 51 + parquet_cache/src/server.rs | 482 ++ parquet_cache/src/server/cache.rs | 113 + parquet_cache/src/server/data.rs | 810 ++++ parquet_cache/src/server/data/manager.rs | 836 ++++ parquet_cache/src/server/data/reads.rs | 23 + parquet_cache/src/server/data/store.rs | 510 ++ parquet_cache/src/server/data/writes.rs | 69 + parquet_cache/src/server/error.rs | 55 + parquet_cache/src/server/keyspace.rs | 957 ++++ parquet_cache/src/server/mock.rs | 217 + parquet_cache/src/server/precondition.rs | 57 + parquet_cache/src/server/response.rs | 83 + parquet_file/Cargo.toml | 23 +- parquet_file/src/chunk.rs | 5 +- parquet_file/src/lib.rs | 183 +- parquet_file/src/metadata.rs | 61 +- parquet_file/src/serialize.rs | 4 +- parquet_file/src/storage.rs | 24 +- parquet_file/tests/metadata.rs | 33 +- parquet_to_line_protocol/Cargo.toml | 8 +- parquet_to_line_protocol/src/batch.rs | 2 +- parquet_to_line_protocol/src/lib.rs | 11 +- partition/Cargo.toml | 37 + partition/benches/partitioner.rs | 246 + partition/src/bucket.rs | 49 + partition/src/filter.rs | 145 + partition/src/lib.rs | 1704 +++++++ partition/src/strftime.rs | 415 ++ partition/src/traits.rs | 61 + partition/src/traits/mutable_batch.rs | 60 + partition/src/traits/record_batch.rs | 82 + predicate/Cargo.toml | 11 +- predicate/src/delete_expr.rs | 21 +- predicate/src/lib.rs | 19 +- predicate/src/rpc_predicate/column_rewrite.rs | 5 +- predicate/src/rpc_predicate/field_rewrite.rs | 2 +- query_functions/Cargo.toml | 13 +- query_functions/src/coalesce_struct.rs | 110 +- query_functions/src/gapfill.rs | 175 +- query_functions/src/lib.rs | 22 +- query_functions/src/regex.rs | 5 +- query_functions/src/registry.rs | 6 +- query_functions/src/selectors/internal.rs | 2 +- query_functions/src/sleep.rs | 94 + query_functions/src/to_timestamp.rs | 85 + query_functions/src/window.rs | 28 +- schema/Cargo.toml | 10 +- schema/src/lib.rs | 20 +- schema/src/sort.rs | 43 +- service_common/Cargo.toml | 15 +- service_common/src/error.rs | 8 +- service_common/src/lib.rs | 35 +- service_grpc_flight/Cargo.toml | 18 +- service_grpc_flight/src/keep_alive.rs | 42 +- service_grpc_flight/src/lib.rs | 380 +- 
service_grpc_flight/src/planner.rs | 113 + service_grpc_flight/src/request.rs | 224 +- service_grpc_testing/Cargo.toml | 3 + sharder/Cargo.toml | 3 + sharder/benches/sharder.rs | 2 +- sqlx-hotswap-pool/Cargo.toml | 7 +- test_fixtures/README.md | 26 + test_fixtures/parquet/influxql_log_1.parquet | Bin 0 -> 309561 bytes test_fixtures/parquet/influxql_log_2.parquet | Bin 0 -> 320350 bytes test_fixtures/parquet/influxql_log_3.parquet | Bin 0 -> 449494 bytes test_fixtures/parquet/sql_query_log_1.parquet | Bin 0 -> 227212 bytes test_fixtures/parquet/sql_query_log_2.parquet | Bin 0 -> 72243 bytes test_fixtures/parquet/sql_query_log_3.parquet | Bin 0 -> 60961 bytes test_fixtures/wal/9.dat | Bin 0 -> 467 bytes test_helpers/Cargo.toml | 11 +- test_helpers_end_to_end/Cargo.toml | 29 +- test_helpers_end_to_end/src/addrs.rs | 114 +- test_helpers_end_to_end/src/client.rs | 120 +- test_helpers_end_to_end/src/config.rs | 111 + test_helpers_end_to_end/src/data_generator.rs | 1 + test_helpers_end_to_end/src/database.rs | 2 +- test_helpers_end_to_end/src/grpc.rs | 8 +- .../src/http_reverse_proxy.rs | 160 + test_helpers_end_to_end/src/lib.rs | 3 + test_helpers_end_to_end/src/mini_cluster.rs | 183 +- test_helpers_end_to_end/src/server_fixture.rs | 369 +- test_helpers_end_to_end/src/server_type.rs | 76 +- test_helpers_end_to_end/src/service_link.rs | 99 + .../src/snapshot_comparison.rs | 153 +- .../src/snapshot_comparison/queries.rs | 80 +- test_helpers_end_to_end/src/steps.rs | 185 +- test_helpers_end_to_end/src/udp_listener.rs | 3 +- tokio_metrics_bridge/Cargo.toml | 5 +- tokio_watchdog/Cargo.toml | 18 + tokio_watchdog/src/lib.rs | 231 + tower_trailer/Cargo.toml | 21 + tower_trailer/src/lib.rs | 194 + trace/Cargo.toml | 3 + trace/src/lib.rs | 2 +- trace/src/span.rs | 57 +- trace_exporters/Cargo.toml | 9 +- trace_exporters/src/jaeger.rs | 97 +- trace_exporters/src/jaeger/span.rs | 24 +- trace_exporters/src/lib.rs | 20 +- trace_http/Cargo.toml | 8 +- trace_http/src/classify.rs | 62 +- trace_http/src/lib.rs | 2 +- trace_http/src/metrics.rs | 160 +- trace_http/src/tower.rs | 94 +- tracker/Cargo.toml | 15 +- tracker/src/async_semaphore.rs | 40 +- tracker/src/disk_metric.rs | 21 +- tracker/src/lock.rs | 123 +- tracker/src/task.rs | 2 +- tracker/src/task/history.rs | 8 +- trogging/Cargo.toml | 7 +- wal/Cargo.toml | 13 +- wal/src/blocking/reader.rs | 60 +- wal/src/lib.rs | 39 +- wal/tests/end_to_end.rs | 14 +- wal_inspect/Cargo.toml | 7 +- workspace-hack/Cargo.toml | 59 +- 476 files changed, 52639 insertions(+), 11570 deletions(-) create mode 100644 catalog_cache/Cargo.toml create mode 100644 catalog_cache/src/api/client.rs create mode 100644 catalog_cache/src/api/list.rs create mode 100644 catalog_cache/src/api/mod.rs create mode 100644 catalog_cache/src/api/quorum.rs create mode 100644 catalog_cache/src/api/server.rs create mode 100644 catalog_cache/src/lib.rs create mode 100644 catalog_cache/src/local/limit.rs create mode 100644 catalog_cache/src/local/mod.rs create mode 100644 clap_blocks/src/bulk_ingest.rs create mode 100644 clap_blocks/src/catalog_cache.rs create mode 100644 clap_blocks/src/parquet_cache.rs create mode 100644 data_types/src/snapshot/hash.rs create mode 100644 data_types/src/snapshot/list.rs create mode 100644 data_types/src/snapshot/mask.rs create mode 100644 data_types/src/snapshot/mod.rs create mode 100644 data_types/src/snapshot/partition.rs create mode 100644 data_types/src/snapshot/table.rs create mode 100644 generated_types/protos/influxdata/iox/bulk_ingest/v1/service.proto create mode 
100644 generated_types/protos/influxdata/iox/catalog/v2/service.proto create mode 100644 generated_types/protos/influxdata/iox/catalog_cache/v1/value.proto create mode 100644 generated_types/protos/influxdata/iox/column_type/v1/type.proto create mode 100644 generated_types/protos/influxdata/iox/gossip/v1/sort_keys.proto create mode 100644 generated_types/protos/influxdata/iox/querier/v1/query_log.proto create mode 100644 generated_types/protos/influxdata/iox/skipped_compaction/v1/skipped_compaction.proto create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit__test__explain_statement-2.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit__test__explain_statement-3.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit__test__explain_statement-4.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit__test__explain_statement-5.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit__test__explain_statement-6.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit__test__explain_statement-7.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit__test__explain_statement-8.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit_mut__test__explain_statement-2.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit_mut__test__explain_statement-3.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit_mut__test__explain_statement-4.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit_mut__test__explain_statement-5.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit_mut__test__explain_statement-6.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit_mut__test__explain_statement-7.snap create mode 100644 influxdb_influxql_parser/src/snapshots/influxdb_influxql_parser__visit_mut__test__explain_statement-8.snap create mode 100644 influxdb_iox_client/src/client/flight/query.rs create mode 100644 influxdb_iox_client/src/client/query_log.rs create mode 100644 iox_catalog/migrations/20230921180000_set_partition_sort_key_to_null.sql create mode 100644 iox_catalog/migrations/20231003120000_drop_sort_key.sql create mode 100644 iox_catalog/migrations/20231004120000_add_empty_sort_key.sql create mode 100644 iox_catalog/migrations/20231121120000_add_partition_generation.sql create mode 100644 iox_catalog/migrations/20231121150000_partition_id_trigger.sql create mode 100644 iox_catalog/migrations/20231123120000_partition_id_from_partition.sql create mode 100644 iox_catalog/migrations/20240111150000_add_table_generation.sql create mode 100644 iox_catalog/sqlite/migrations/20230824100746_complete_kafkaless_transition_sqlite.sql create mode 100644 iox_catalog/sqlite/migrations/20230921180000_set_partition_sort_key_to_null.sql create mode 100644 iox_catalog/sqlite/migrations/20231003120000_drop_sort_key.sql create mode 100644 iox_catalog/sqlite/migrations/20231004120000_add_empty_sort_key.sql create mode 100644 iox_catalog/sqlite/migrations/20231121120000_add_partition_generation.sql create mode 100644 iox_catalog/sqlite/migrations/20231123120000_partition_id_from_partition.sql create mode 100644 
iox_catalog/sqlite/migrations/20240111150000_add_table_generation.sql create mode 100644 iox_catalog/src/cache.rs create mode 100644 iox_catalog/src/constants.rs create mode 100644 iox_catalog/src/grpc/client.rs create mode 100644 iox_catalog/src/grpc/mod.rs create mode 100644 iox_catalog/src/grpc/serialization.rs create mode 100644 iox_catalog/src/grpc/server.rs create mode 100644 iox_catalog/src/interface_tests.rs create mode 100644 iox_catalog/src/test_helpers.rs create mode 100644 iox_catalog/src/util.rs create mode 100644 iox_query/src/exec/sleep.rs create mode 100644 iox_query/src/logical_optimizer/extract_sleep.rs create mode 100644 iox_query/src/physical_optimizer/sort/order_union_sorted_inputs.rs create mode 100644 iox_query/src/physical_optimizer/sort/util.rs create mode 100644 iox_query/src/physical_optimizer/tests.rs create mode 100644 iox_query/src/provider/progressive_eval.rs create mode 100644 iox_query/src/query_log.rs create mode 100644 iox_query_params/Cargo.toml create mode 100644 iox_query_params/src/lib.rs create mode 100644 iox_query_params/src/params.rs create mode 100644 kube_test/Cargo.toml create mode 100644 kube_test/src/call.rs create mode 100644 kube_test/src/error.rs create mode 100644 kube_test/src/handler.rs create mode 100644 kube_test/src/lib.rs create mode 100644 kube_test/src/object_map.rs create mode 100644 kube_test/src/request.rs create mode 100644 kube_test/src/resource_handler.rs create mode 100644 kube_test/src/service.rs create mode 100644 kube_test/src/status.rs create mode 100644 mutable_batch_lp/fuzz/.gitignore create mode 100644 mutable_batch_lp/fuzz/Cargo.lock create mode 100644 mutable_batch_lp/fuzz/Cargo.toml create mode 100644 mutable_batch_lp/fuzz/README.md create mode 100644 mutable_batch_lp/fuzz/fuzz_targets/lines_converter.rs create mode 100644 mutable_batch_tests/benches/statistics.rs create mode 100644 parquet_cache/Cargo.toml create mode 100644 parquet_cache/src/client.rs create mode 100644 parquet_cache/src/client/cache_connector.rs create mode 100644 parquet_cache/src/client/http.rs create mode 100644 parquet_cache/src/client/keyspace.rs create mode 100644 parquet_cache/src/client/mock.rs create mode 100644 parquet_cache/src/client/object_store.rs create mode 100644 parquet_cache/src/client/request.rs create mode 100644 parquet_cache/src/client/write_hints.rs create mode 100644 parquet_cache/src/controller.rs create mode 100644 parquet_cache/src/controller/error.rs create mode 100644 parquet_cache/src/controller/kube_util.rs create mode 100644 parquet_cache/src/controller/parquet_cache.rs create mode 100644 parquet_cache/src/controller/parquet_cache_controller.rs create mode 100644 parquet_cache/src/controller/parquet_cache_set.rs create mode 100644 parquet_cache/src/controller/parquet_cache_set_controller.rs create mode 100644 parquet_cache/src/controller/state_service.rs create mode 100644 parquet_cache/src/data_types.rs create mode 100644 parquet_cache/src/data_types/keyspace.rs create mode 100644 parquet_cache/src/data_types/objects.rs create mode 100644 parquet_cache/src/data_types/policy.rs create mode 100644 parquet_cache/src/data_types/state.rs create mode 100644 parquet_cache/src/data_types/write_hints.rs create mode 100644 parquet_cache/src/lib.rs create mode 100644 parquet_cache/src/server.rs create mode 100644 parquet_cache/src/server/cache.rs create mode 100644 parquet_cache/src/server/data.rs create mode 100644 parquet_cache/src/server/data/manager.rs create mode 100644 parquet_cache/src/server/data/reads.rs create 
mode 100644 parquet_cache/src/server/data/store.rs create mode 100644 parquet_cache/src/server/data/writes.rs create mode 100644 parquet_cache/src/server/error.rs create mode 100644 parquet_cache/src/server/keyspace.rs create mode 100644 parquet_cache/src/server/mock.rs create mode 100644 parquet_cache/src/server/precondition.rs create mode 100644 parquet_cache/src/server/response.rs create mode 100644 partition/Cargo.toml create mode 100644 partition/benches/partitioner.rs create mode 100644 partition/src/bucket.rs create mode 100644 partition/src/filter.rs create mode 100644 partition/src/lib.rs create mode 100644 partition/src/strftime.rs create mode 100644 partition/src/traits.rs create mode 100644 partition/src/traits/mutable_batch.rs create mode 100644 partition/src/traits/record_batch.rs create mode 100644 query_functions/src/sleep.rs create mode 100644 query_functions/src/to_timestamp.rs create mode 100644 service_grpc_flight/src/planner.rs create mode 100644 test_fixtures/README.md create mode 100644 test_fixtures/parquet/influxql_log_1.parquet create mode 100644 test_fixtures/parquet/influxql_log_2.parquet create mode 100644 test_fixtures/parquet/influxql_log_3.parquet create mode 100644 test_fixtures/parquet/sql_query_log_1.parquet create mode 100644 test_fixtures/parquet/sql_query_log_2.parquet create mode 100644 test_fixtures/parquet/sql_query_log_3.parquet create mode 100644 test_fixtures/wal/9.dat create mode 100644 test_helpers_end_to_end/src/http_reverse_proxy.rs create mode 100644 test_helpers_end_to_end/src/service_link.rs create mode 100644 tokio_watchdog/Cargo.toml create mode 100644 tokio_watchdog/src/lib.rs create mode 100644 tower_trailer/Cargo.toml create mode 100644 tower_trailer/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index ebc077b1352..71d353372bb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -142,6 +142,12 @@ version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca" +[[package]] +name = "arc-swap" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bddcadddf5e9015d310179a59bb28c4d4b9920ad0f11e8e14dbadf654890c9a6" + [[package]] name = "arrayref" version = "0.3.7" @@ -156,8 +162,9 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" [[package]] name = "arrow" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5bc25126d18a012146a888a0298f2c22e1150327bd2765fc76d710a556b2d614" dependencies = [ "ahash", "arrow-arith", @@ -177,22 +184,24 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34ccd45e217ffa6e53bbb0080990e77113bdd4e91ddb84e97b77649810bcf1a7" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", - "half 2.3.1", + "half", "num", ] [[package]] name = "arrow-array" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bda9acea48b25123c08340f3a8ac361aa0f74469bb36f5ee9acf923fce23e9d" dependencies = [ "ahash", "arrow-buffer", @@ -200,42 +209,46 @@ dependencies = [ "arrow-schema", "chrono", "chrono-tz", - "half 2.3.1", + "half", "hashbrown 0.14.3", "num", ] [[package]] name = "arrow-buffer" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01a0fc21915b00fc6c2667b069c1b64bdd920982f426079bc4a7cab86822886c" dependencies = [ "bytes", - "half 2.3.1", + "half", "num", ] [[package]] name = "arrow-cast" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dc0368ed618d509636c1e3cc20db1281148190a78f43519487b2daf07b63b4a" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", + "base64", "chrono", "comfy-table", - "half 2.3.1", + "half", "lexical-core", "num", ] [[package]] name = "arrow-csv" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e09aa6246a1d6459b3f14baeaa49606cfdbca34435c46320e14054d244987ca" dependencies = [ "arrow-array", "arrow-buffer", @@ -252,19 +265,21 @@ dependencies = [ [[package]] name = "arrow-data" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "907fafe280a3874474678c1858b9ca4cb7fd83fb8034ff5b6d6376205a08c634" dependencies = [ "arrow-buffer", "arrow-schema", - "half 2.3.1", + "half", "num", ] [[package]] name = "arrow-flight" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "624e0dcb6b5a7a06222bfd2be3f7e905ce849a6b714ec989f18cdba330c77d38" dependencies = [ "arrow-arith", "arrow-array", @@ -277,20 +292,21 @@ dependencies = [ "arrow-schema", "arrow-select", "arrow-string", - "base64 0.21.7", + "base64", "bytes", "futures", "once_cell", "paste", - "prost", + "prost 0.12.3", "tokio", - "tonic", + "tonic 0.10.2", ] [[package]] name = "arrow-ipc" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79a43d6808411886b8c7d4f6f7dd477029c1e77ffffffb7923555cc6579639cd" dependencies = [ "arrow-array", "arrow-buffer", @@ -298,12 +314,14 @@ dependencies = [ "arrow-data", "arrow-schema", "flatbuffers", + "lz4_flex", ] [[package]] name = "arrow-json" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d82565c91fd627922ebfe2810ee4e8346841b6f9361b87505a9acea38b614fee" dependencies = [ "arrow-array", "arrow-buffer", @@ -311,7 +329,7 @@ dependencies = [ "arrow-data", "arrow-schema", "chrono", - "half 2.3.1", + "half", "indexmap 2.1.0", "lexical-core", "num", @@ -321,42 +339,47 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b23b0e53c0db57c6749997fd343d4c0354c994be7eca67152dd2bdb9a3e1bb4" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", - "half 2.3.1", + "half", "num", ] [[package]] name = "arrow-row" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "361249898d2d6d4a6eeb7484be6ac74977e48da12a4dd81a708d620cc558117a" dependencies = [ "ahash", "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", - "half 2.3.1", + "half", "hashbrown 0.14.3", ] [[package]] name = "arrow-schema" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09e28a5e781bf1b0f981333684ad13f5901f4cd2f20589eab7cf1797da8fc167" [[package]] name = "arrow-select" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f6208466590960efc1d2a7172bc4ff18a67d6e25c529381d7f96ddaf0dc4036" dependencies = [ + "ahash", "arrow-array", "arrow-buffer", "arrow-data", @@ -366,8 +389,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a48149c63c11c9ff571e50ab8f017d2a7cb71037a882b42f6354ed2da9acc7" dependencies = [ "arrow-array", "arrow-buffer", @@ -376,7 +400,7 @@ dependencies = [ "arrow-select", "num", "regex", - "regex-syntax 0.7.5", + "regex-syntax 0.8.2", ] [[package]] @@ -391,9 +415,10 @@ dependencies = [ "hashbrown 0.14.3", "num-traits", "once_cell", + "proptest", "rand", "regex", - "snafu", + "snafu 0.8.0", "uuid", "workspace-hack", ] @@ -429,6 +454,19 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b34d609dfbaf33d6889b2b7106d3ca345eacad44200913df5ba02bfd31d2ba9" +[[package]] +name = "async-channel" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ca33f4bc4ed1babef42cad36cc1f51fa88be00420404e5b1e80ab1b18f7678c" +dependencies = [ + "concurrent-queue", + "event-listener 4.0.3", + "event-listener-strategy", + "futures-core", + "pin-project-lite", +] + [[package]] name = "async-compression" version = "0.4.6" @@ -443,8 +481,17 @@ dependencies = [ "pin-project-lite", "tokio", "xz2", - "zstd 0.13.0", - 
"zstd-safe 7.0.0", + "zstd", + "zstd-safe", +] + +[[package]] +name = "async-lock" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "287272293e9d8c41773cec55e365490fe034813a2f172f502d6ddcf75b2f582b" +dependencies = [ + "event-listener 2.5.3", ] [[package]] @@ -505,8 +552,8 @@ version = "0.1.0" dependencies = [ "assert_matches", "async-trait", - "backoff", - "base64 0.21.7", + "backoff 0.1.0", + "base64", "generated_types", "http", "iox_time", @@ -514,10 +561,10 @@ dependencies = [ "observability_deps", "parking_lot 0.12.1", "paste", - "snafu", + "snafu 0.8.0", "test_helpers_end_to_end", "tokio", - "tonic", + "tonic 0.10.2", "workspace-hack", ] @@ -578,11 +625,22 @@ version = "0.1.0" dependencies = [ "observability_deps", "rand", - "snafu", + "snafu 0.8.0", "tokio", "workspace-hack", ] +[[package]] +name = "backoff" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b62ddb9cb1ec0a098ad4bbf9344d0713fa193ae1a80af55febcff2627b6a00c1" +dependencies = [ + "getrandom", + "instant", + "rand", +] + [[package]] name = "backtrace" version = "0.3.69" @@ -598,12 +656,6 @@ dependencies = [ "rustc-demangle", ] -[[package]] -name = "base64" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" - [[package]] name = "base64" version = "0.21.7" @@ -690,7 +742,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc" dependencies = [ "memchr", - "regex-automata 0.4.3", + "regex-automata 0.4.5", "serde", ] @@ -700,11 +752,17 @@ version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" +[[package]] +name = "bytecount" +version = "0.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1e5f035d16fc623ae5f74981db80a439803888314e3a555fd6f04acd51a3205" + [[package]] name = "bytemuck" -version = "1.14.0" +version = "1.14.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "374d28ec25809ee0e23827c2ab573d729e293f281dfe393500e7ad618baa61c6" +checksum = "ed2490600f404f2b94c167e31d3ed1d5f3c225a0f3b80230053b3e0b7b962bd9" [[package]] name = "byteorder" @@ -744,7 +802,7 @@ name = "cache_system" version = "0.1.0" dependencies = [ "async-trait", - "backoff", + "backoff 0.1.0", "criterion", "futures", "iox_time", @@ -759,15 +817,63 @@ dependencies = [ "tokio", "tokio-util", "trace", + "tracker", "workspace-hack", ] +[[package]] +name = "camino" +version = "1.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c" +dependencies = [ + "serde", +] + +[[package]] +name = "cargo-platform" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ceed8ef69d8518a5dda55c07425450b58a4e1946f4951eab6d7191ee86c2443d" +dependencies = [ + "serde", +] + +[[package]] +name = "cargo_metadata" +version = "0.14.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4acbb09d9ee8e23699b9634375c72795d095bf268439da88562cf9b501f181fa" +dependencies = [ + "camino", + "cargo-platform", + "semver", + "serde", + "serde_json", +] + [[package]] name = "cast" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum 
= "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" +[[package]] +name = "catalog_cache" +version = "0.1.0" +dependencies = [ + "bytes", + "dashmap", + "futures", + "hyper", + "reqwest", + "snafu 0.8.0", + "tokio", + "tokio-util", + "url", + "workspace-hack", +] + [[package]] name = "cc" version = "1.0.83" @@ -786,9 +892,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.31" +version = "0.4.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38" +checksum = "9f13690e35a5e4ace198e7beea2895d29f3a9cc55015fcebe6336bd2010af9eb" dependencies = [ "android-tzdata", "iana-time-zone", @@ -796,7 +902,7 @@ dependencies = [ "num-traits", "serde", "wasm-bindgen", - "windows-targets 0.48.5", + "windows-targets 0.52.0", ] [[package]] @@ -823,9 +929,9 @@ dependencies = [ [[package]] name = "ciborium" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "effd91f6c78e5a4ace8a5d3c0b6bfaec9e2baaef55f3efc00e45fb2e477ee926" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" dependencies = [ "ciborium-io", "ciborium-ll", @@ -834,18 +940,18 @@ dependencies = [ [[package]] name = "ciborium-io" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cdf919175532b369853f5d5e20b26b43112613fd6fe7aee757e35f7a44642656" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" [[package]] name = "ciborium-ll" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "defaa24ecc093c77630e6c15e17c51f5e187bf35ee514f4e2d67baaa96dae22b" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" dependencies = [ "ciborium-io", - "half 1.8.2", + "half", ] [[package]] @@ -863,19 +969,25 @@ name = "clap_blocks" version = "0.1.0" dependencies = [ "clap", + "ed25519-dalek", "futures", "http", "humantime", "iox_catalog", + "iox_time", + "itertools 0.12.0", "metric", + "non-empty-string", "object_store", "observability_deps", - "snafu", + "parquet_cache", + "snafu 0.8.0", "sysinfo", "tempfile", "test_helpers", "trace_exporters", "trogging", + "url", "uuid", "workspace-hack", ] @@ -919,7 +1031,7 @@ dependencies = [ "reqwest", "thiserror", "tokio", - "tonic", + "tonic 0.10.2", "tower", "workspace-hack", ] @@ -941,6 +1053,15 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "concurrent-queue" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d16048cd947b08fa32c24458a22f5dc5e835264f689f4f5653210c69fd107363" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "console" version = "0.15.8" @@ -959,9 +1080,9 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2895653b4d9f1538a83970077cb01dfc77a4810524e51a110944688e916b18e" dependencies = [ - "prost", - "prost-types", - "tonic", + "prost 0.11.9", + "prost-types 0.11.9", + "tonic 0.9.2", "tracing-core", ] @@ -978,13 +1099,13 @@ dependencies = [ "hdrhistogram", "humantime", "parking_lot 0.12.1", - "prost-types", + "prost-types 0.11.9", "serde", "serde_json", "thread_local", "tokio", "tokio-stream", - "tonic", + "tonic 0.9.2", "tracing", "tracing-core", "tracing-subscriber", @@ -1216,6 +1337,69 @@ dependencies = [ "memchr", ] +[[package]] +name = 
"curve25519-dalek" +version = "4.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89b8c6a2e4b1f45971ad09761aafb85514a84744b67a95e32c3cc1352d1f65c" +dependencies = [ + "cfg-if", + "cpufeatures", + "curve25519-dalek-derive", + "digest", + "fiat-crypto", + "platforms", + "rustc_version", + "subtle", + "zeroize", +] + +[[package]] +name = "curve25519-dalek-derive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] + +[[package]] +name = "darling" +version = "0.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0209d94da627ab5605dcccf08bb18afa5009cfbef48d8a8b7d7bdbc79be25c5e" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "177e3443818124b357d8e76f53be906d60937f0d3a90773a664fa63fa253e621" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.48", +] + +[[package]] +name = "darling_macro" +version = "0.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "836a9bbc7ad63342d6d6e7b815ccab164bc77a2d95d84bc3117a8c0d5c98e2d5" +dependencies = [ + "darling_core", + "quote", + "syn 2.0.48", +] + [[package]] name = "dashmap" version = "5.5.3" @@ -1233,22 +1417,28 @@ dependencies = [ name = "data_types" version = "0.1.0" dependencies = [ + "arrow-buffer", "assert_matches", + "bytes", "chrono", "croaring", "generated_types", "hex", "influxdb-line-protocol", "iox_time", + "murmur3", "observability_deps", "once_cell", - "ordered-float 3.9.2", + "ordered-float 4.2.0", "paste", "percent-encoding", "proptest", + "prost 0.12.3", "schema", - "serde", + "serde_json", "sha2", + "siphasher 1.0.0", + "snafu 0.8.0", "sqlx", "test_helpers", "thiserror", @@ -1258,12 +1448,13 @@ dependencies = [ [[package]] name = "datafusion" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=81f33b0e27f5694348cd953a937203d835b57178#81f33b0e27f5694348cd953a937203d835b57178" +version = "34.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" dependencies = [ "ahash", "arrow", "arrow-array", + "arrow-ipc", "arrow-schema", "async-compression", "async-trait", @@ -1281,16 +1472,15 @@ dependencies = [ "flate2", "futures", "glob", - "half 2.3.1", + "half", "hashbrown 0.14.3", "indexmap 2.1.0", - "itertools 0.11.0", + "itertools 0.12.0", "log", "num_cpus", "object_store", "parking_lot 0.12.1", "parquet", - "percent-encoding", "pin-project-lite", "rand", "sqlparser", @@ -1300,36 +1490,32 @@ dependencies = [ "url", "uuid", "xz2", - "zstd 0.12.4", + "zstd", ] [[package]] name = "datafusion-common" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=81f33b0e27f5694348cd953a937203d835b57178#81f33b0e27f5694348cd953a937203d835b57178" +version = "34.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" dependencies = [ + "ahash", "arrow", "arrow-array", - "async-compression", - "bytes", - "bzip2", + "arrow-buffer", + "arrow-schema", "chrono", - "flate2", - "futures", + "half", + "libc", "num_cpus", "object_store", "parquet", 
"sqlparser", - "tokio", - "tokio-util", - "xz2", - "zstd 0.12.4", ] [[package]] name = "datafusion-execution" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=81f33b0e27f5694348cd953a937203d835b57178#81f33b0e27f5694348cd953a937203d835b57178" +version = "34.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" dependencies = [ "arrow", "chrono", @@ -1348,12 +1534,14 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=81f33b0e27f5694348cd953a937203d835b57178#81f33b0e27f5694348cd953a937203d835b57178" +version = "34.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" dependencies = [ "ahash", "arrow", + "arrow-array", "datafusion-common", + "paste", "sqlparser", "strum", "strum_macros", @@ -1361,8 +1549,8 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=81f33b0e27f5694348cd953a937203d835b57178#81f33b0e27f5694348cd953a937203d835b57178" +version = "34.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" dependencies = [ "arrow", "async-trait", @@ -1371,33 +1559,33 @@ dependencies = [ "datafusion-expr", "datafusion-physical-expr", "hashbrown 0.14.3", - "itertools 0.11.0", + "itertools 0.12.0", "log", - "regex-syntax 0.7.5", + "regex-syntax 0.8.2", ] [[package]] name = "datafusion-physical-expr" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=81f33b0e27f5694348cd953a937203d835b57178#81f33b0e27f5694348cd953a937203d835b57178" +version = "34.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" dependencies = [ "ahash", "arrow", "arrow-array", "arrow-buffer", + "arrow-ord", "arrow-schema", - "base64 0.21.7", + "base64", "blake2", "blake3", "chrono", "datafusion-common", "datafusion-expr", - "half 2.3.1", + "half", "hashbrown 0.14.3", "hex", "indexmap 2.1.0", - "itertools 0.11.0", - "libc", + "itertools 0.12.0", "log", "md-5", "paste", @@ -1411,8 +1599,8 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=81f33b0e27f5694348cd953a937203d835b57178#81f33b0e27f5694348cd953a937203d835b57178" +version = "34.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" dependencies = [ "ahash", "arrow", @@ -1426,26 +1614,23 @@ dependencies = [ "datafusion-expr", "datafusion-physical-expr", "futures", - "half 2.3.1", + "half", "hashbrown 0.14.3", "indexmap 2.1.0", - "itertools 0.11.0", + "itertools 0.12.0", "log", "once_cell", "parking_lot 0.12.1", "pin-project-lite", "rand", - "rstest", - "tempfile", - "termtree", "tokio", "uuid", ] [[package]] name = "datafusion-proto" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=81f33b0e27f5694348cd953a937203d835b57178#81f33b0e27f5694348cd953a937203d835b57178" +version = "34.0.0" +source = 
"git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" dependencies = [ "arrow", "chrono", @@ -1453,13 +1638,13 @@ dependencies = [ "datafusion-common", "datafusion-expr", "object_store", - "prost", + "prost 0.12.3", ] [[package]] name = "datafusion-sql" -version = "31.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=81f33b0e27f5694348cd953a937203d835b57178#81f33b0e27f5694348cd953a937203d835b57178" +version = "34.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7#0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" dependencies = [ "arrow", "arrow-schema", @@ -1495,6 +1680,17 @@ dependencies = [ "uuid", ] +[[package]] +name = "delegate" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "082a24a9967533dc5d743c602157637116fc1b52806d694a5a45e6f32567fcdd" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "der" version = "0.7.8" @@ -1506,6 +1702,17 @@ dependencies = [ "zeroize", ] +[[package]] +name = "derivative" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcc3dd5e9e9c0b295d6e1e4d811fb6f157d5ffd784b8d202fc62eac8035a770b" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "diff" version = "0.1.13" @@ -1556,6 +1763,36 @@ version = "0.15.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" +[[package]] +name = "dyn-clone" +version = "1.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "545b22097d44f8a9581187cdf93de7a71e4722bf51200cfaba810865b49a495d" + +[[package]] +name = "ed25519" +version = "2.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53" +dependencies = [ + "pkcs8", + "signature", +] + +[[package]] +name = "ed25519-dalek" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f628eaec48bfd21b865dc2950cfa014450c01d2fa2b69a86c2fd5844ec523c0" +dependencies = [ + "curve25519-dalek", + "ed25519", + "serde", + "sha2", + "subtle", + "zeroize", +] + [[package]] name = "either" version = "1.9.0" @@ -1596,6 +1833,15 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "error-chain" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d2f06b9cac1506ece98fe3231e3cc9c4410ec3d5b1f24ae1c8946f0742cdefc" +dependencies = [ + "version_check", +] + [[package]] name = "etcetera" version = "0.8.0" @@ -1613,6 +1859,27 @@ version = "2.5.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" +[[package]] +name = "event-listener" +version = "4.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b215c49b2b248c855fb73579eb1f4f26c38ffdc12973e20e07b91d78d5646e" +dependencies = [ + "concurrent-queue", + "parking", + "pin-project-lite", +] + +[[package]] +name = "event-listener-strategy" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "958e4d70b6d5e81971bebec42271ec641e7ff4e170a6fa605f2b8a8b65cb97d3" +dependencies = [ + "event-listener 4.0.3", + "pin-project-lite", +] + [[package]] name = "executor" 
version = "0.1.0" @@ -1624,10 +1891,11 @@ dependencies = [ "once_cell", "parking_lot 0.12.1", "pin-project", - "snafu", + "snafu 0.8.0", "tokio", "tokio-util", "tokio_metrics_bridge", + "tokio_watchdog", "workspace-hack", ] @@ -1637,6 +1905,24 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" +[[package]] +name = "fiat-crypto" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27573eac26f4dd11e2b1916c3fe1baa56407c83c71a773a8ba17ec0bca03b6b7" + +[[package]] +name = "filetime" +version = "0.2.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ee447700ac8aa0b2f2bd7bc4462ad686ba06baa6727ac149a2d6277f0d240fd" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall 0.4.1", + "windows-sys 0.52.0", +] + [[package]] name = "findshlibs" version = "0.10.2" @@ -1693,8 +1979,8 @@ dependencies = [ "iox_query", "observability_deps", "once_cell", - "prost", - "snafu", + "prost 0.12.3", + "snafu 0.8.0", "workspace-hack", ] @@ -1725,8 +2011,17 @@ dependencies = [ ] [[package]] -name = "futures" -version = "0.3.30" +name = "fsevent-sys" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76ee7a02da4d231650c7cea31349b889be2f45ddb3ef3032d2ec8185f6313fd2" +dependencies = [ + "libc", +] + +[[package]] +name = "futures" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" dependencies = [ @@ -1806,12 +2101,6 @@ version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" -[[package]] -name = "futures-timer" -version = "3.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c" - [[package]] name = "futures-util" version = "0.3.30" @@ -1836,14 +2125,15 @@ version = "0.1.0" dependencies = [ "bytes", "observability_deps", - "pbjson 0.6.0", + "pbjson", "pbjson-build", "pbjson-types", - "prost", + "prost 0.12.3", "prost-build", "serde", - "tonic", + "tonic 0.10.2", "tonic-build", + "uuid", "workspace-hack", ] @@ -1885,7 +2175,7 @@ name = "grpc-binary-logger" version = "0.1.0" dependencies = [ "assert_matches", - "base64 0.21.7", + "base64", "byteorder", "bytes", "futures", @@ -1895,11 +2185,11 @@ dependencies = [ "http-body", "hyper", "pin-project", - "prost", + "prost 0.12.3", "prost-build", "tokio", "tokio-stream", - "tonic", + "tonic 0.10.2", "tonic-build", "tower", "workspace-hack", @@ -1909,10 +2199,10 @@ dependencies = [ name = "grpc-binary-logger-proto" version = "0.1.0" dependencies = [ - "prost", + "prost 0.12.3", "prost-build", - "prost-types", - "tonic", + "prost-types 0.12.3", + "tonic 0.10.2", "tonic-build", "workspace-hack", ] @@ -1921,9 +2211,9 @@ dependencies = [ name = "grpc-binary-logger-test-proto" version = "0.1.0" dependencies = [ - "prost", + "prost 0.12.3", "prost-build", - "tonic", + "tonic 0.10.2", "tonic-build", "workspace-hack", ] @@ -1947,12 +2237,6 @@ dependencies = [ "tracing", ] -[[package]] -name = "half" -version = "1.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" - [[package]] name = "half" version = "2.3.1" @@ -1966,9 +2250,9 @@ dependencies = [ 
[[package]] name = "handlebars" -version = "4.5.0" +version = "5.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "faa67bab9ff362228eb3d00bd024a4965d8231bbb7921167f0cfa66c6626b225" +checksum = "c73166c591e67fb4bf9bc04011b4e35f12e89fe8d676193aa263df065955a379" dependencies = [ "log", "pest", @@ -2009,7 +2293,7 @@ version = "7.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d" dependencies = [ - "base64 0.21.7", + "base64", "byteorder", "flate2", "nom", @@ -2019,7 +2303,7 @@ dependencies = [ [[package]] name = "heappy" version = "0.1.0" -source = "git+https://github.com/mkmik/heappy?rev=1de977a241cdd768acc5b6c82c0728b30c7db7b4#1de977a241cdd768acc5b6c82c0728b30c7db7b4" +source = "git+https://github.com/mkmik/heappy?rev=01a1f88e1b404c5894f89eb1a57f813f713d7ad1#01a1f88e1b404c5894f89eb1a57f813f713d7ad1" dependencies = [ "backtrace", "bytes", @@ -2158,7 +2442,9 @@ dependencies = [ "futures-util", "http", "hyper", + "log", "rustls", + "rustls-native-certs", "tokio", "tokio-rustls", ] @@ -2198,6 +2484,12 @@ dependencies = [ "cc", ] +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "0.5.0" @@ -2269,13 +2561,13 @@ dependencies = [ [[package]] name = "influxdb-line-protocol" -version = "2.0.0" +version = "1.0.0" dependencies = [ "bytes", "log", "nom", "smallvec", - "snafu", + "snafu 0.8.0", "test_helpers", ] @@ -2291,7 +2583,7 @@ dependencies = [ "reqwest", "serde", "serde_json", - "snafu", + "snafu 0.8.0", "test_helpers", "tokio", "url", @@ -2339,6 +2631,8 @@ name = "influxdb3_server" version = "0.1.0" dependencies = [ "arrow", + "arrow-json", + "arrow-schema", "async-trait", "authz", "bytes", @@ -2372,7 +2666,7 @@ dependencies = [ "thiserror", "tokio", "tokio-util", - "tonic", + "tonic 0.10.2", "tower", "trace", "trace_exporters", @@ -2443,7 +2737,8 @@ dependencies = [ "generated_types", "influxdb-line-protocol", "insta", - "prost", + "iox_query_params", + "prost 0.12.3", "rand", "reqwest", "schema", @@ -2451,7 +2746,7 @@ dependencies = [ "thiserror", "tokio", "tokio-stream", - "tonic", + "tonic 0.10.2", ] [[package]] @@ -2462,8 +2757,8 @@ dependencies = [ "futures-util", "generated_types", "observability_deps", - "prost", - "tonic", + "prost 0.12.3", + "tonic 0.10.2", "workspace-hack", ] @@ -2476,7 +2771,7 @@ dependencies = [ "integer-encoding 4.0.0", "observability_deps", "rand", - "snafu", + "snafu 0.7.5", "snap", "test_helpers", "workspace-hack", @@ -2487,7 +2782,7 @@ name = "influxrpc_parser" version = "0.1.0" dependencies = [ "generated_types", - "snafu", + "snafu 0.8.0", "sqlparser", "workspace-hack", ] @@ -2497,25 +2792,45 @@ name = "ingester_query_grpc" version = "0.1.0" dependencies = [ "arrow", - "base64 0.21.7", + "base64", "bytes", "data_types", "datafusion", "datafusion-proto", "flatbuffers", - "pbjson 0.6.0", + "pbjson", "pbjson-build", "predicate", - "prost", + "prost 0.12.3", "prost-build", "query_functions", "serde", - "snafu", - "tonic", + "snafu 0.8.0", + "tonic 0.10.2", "tonic-build", "workspace-hack", ] +[[package]] +name = "inotify" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8069d3ec154eb856955c1c0fbffefbf5f3c40a104ec912d4797314c1801abff" +dependencies = [ + "bitflags 1.3.2", + "inotify-sys", + "libc", +] + 
+[[package]] +name = "inotify-sys" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e05c02b5e89bff3b946cedeca278abc628fe811e604f027c45a8aa3cf793d0eb" +dependencies = [ + "libc", +] + [[package]] name = "insta" version = "1.34.0" @@ -2557,6 +2872,8 @@ version = "0.1.0" dependencies = [ "assert_matches", "async-trait", + "backoff 0.1.0", + "catalog_cache", "data_types", "dotenvy", "futures", @@ -2575,13 +2892,15 @@ dependencies = [ "rand", "serde", "siphasher 1.0.0", - "snafu", + "snafu 0.8.0", "sqlx", "sqlx-hotswap-pool", "tempfile", "test_helpers", "thiserror", "tokio", + "tonic 0.10.2", + "trace_http", "uuid", "workspace-hack", ] @@ -2599,7 +2918,7 @@ dependencies = [ "handlebars", "humantime", "influxdb2_client", - "itertools 0.11.0", + "itertools 0.12.0", "mutable_batch", "mutable_batch_lp", "parquet_file", @@ -2608,7 +2927,7 @@ dependencies = [ "schema", "serde", "serde_json", - "snafu", + "snafu 0.8.0", "test_helpers", "tokio", "toml", @@ -2634,7 +2953,8 @@ dependencies = [ "hashbrown 0.14.3", "indexmap 2.1.0", "insta", - "itertools 0.11.0", + "iox_time", + "itertools 0.12.0", "metric", "object_store", "observability_deps", @@ -2645,11 +2965,13 @@ dependencies = [ "query_functions", "schema", "serde", - "snafu", + "snafu 0.8.0", "test_helpers", "tokio", "tokio-stream", "trace", + "tracker", + "uuid", "workspace-hack", ] @@ -2667,7 +2989,7 @@ dependencies = [ "influxdb_influxql_parser", "insta", "iox_query", - "itertools 0.11.0", + "itertools 0.12.0", "observability_deps", "once_cell", "predicate", @@ -2697,12 +3019,26 @@ dependencies = [ "predicate", "query_functions", "schema", - "snafu", + "snafu 0.8.0", "test_helpers", "tokio", "workspace-hack", ] +[[package]] +name = "iox_query_params" +version = "0.1.0" +dependencies = [ + "assert_matches", + "datafusion", + "generated_types", + "observability_deps", + "serde", + "serde_json", + "thiserror", + "workspace-hack", +] + [[package]] name = "iox_tests" version = "0.1.0" @@ -2762,15 +3098,16 @@ dependencies = [ "serde_json", "serde_urlencoded", "service_grpc_testing", - "snafu", + "snafu 0.8.0", "tokio", "tokio-stream", "tokio-util", - "tonic", + "tonic 0.10.2", "tonic-health", "tonic-reflection", "tower", "tower-http", + "tower_trailer", "trace", "trace_exporters", "trace_http", @@ -2786,7 +3123,7 @@ dependencies = [ "hyper", "ioxd_common", "metric", - "snafu", + "snafu 0.8.0", "tokio-util", "trace", "workspace-hack", @@ -2860,6 +3197,188 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "json-patch" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55ff1e1486799e3f64129f8ccad108b38290df9cd7015cd31bed17239f0789d6" +dependencies = [ + "serde", + "serde_json", + "thiserror", + "treediff", +] + +[[package]] +name = "jsonpath-rust" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06cc127b7c3d270be504572364f9569761a180b981919dd0d87693a7f5fb7829" +dependencies = [ + "pest", + "pest_derive", + "regex", + "serde_json", + "thiserror", +] + +[[package]] +name = "k8s-openapi" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edc3606fd16aca7989db2f84bb25684d0270c6d6fa1dbcd0025af7b4130523a6" +dependencies = [ + "base64", + "bytes", + "chrono", + "schemars", + "serde", + "serde-value", + "serde_json", +] + +[[package]] +name = "kqueue" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7447f1ca1b7b563588a205fe93dea8df60fd981423a768bc1c0ded35ed147d0c" +dependencies = [ + "kqueue-sys", + "libc", +] + +[[package]] +name = "kqueue-sys" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed9625ffda8729b85e45cf04090035ac368927b8cebc34898e7c120f52e4838b" +dependencies = [ + "bitflags 1.3.2", + "libc", +] + +[[package]] +name = "kube" +version = "0.87.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3499c8d60c763246c7a213f51caac1e9033f46026904cb89bc8951ae8601f26e" +dependencies = [ + "k8s-openapi", + "kube-client", + "kube-core", + "kube-derive", + "kube-runtime", +] + +[[package]] +name = "kube-client" +version = "0.87.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "033450dfa0762130565890dadf2f8835faedf749376ca13345bcd8ecd6b5f29f" +dependencies = [ + "base64", + "bytes", + "chrono", + "either", + "futures", + "home", + "http", + "http-body", + "hyper", + "hyper-rustls", + "hyper-timeout", + "jsonpath-rust", + "k8s-openapi", + "kube-core", + "pem", + "pin-project", + "rustls", + "rustls-pemfile", + "secrecy", + "serde", + "serde_json", + "serde_yaml", + "thiserror", + "tokio", + "tokio-util", + "tower", + "tower-http", + "tracing", +] + +[[package]] +name = "kube-core" +version = "0.87.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5bba93d054786eba7994d03ce522f368ef7d48c88a1826faa28478d85fb63ae" +dependencies = [ + "chrono", + "form_urlencoded", + "http", + "json-patch", + "k8s-openapi", + "once_cell", + "schemars", + "serde", + "serde_json", + "thiserror", +] + +[[package]] +name = "kube-derive" +version = "0.87.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e98dd5e5767c7b894c1f0e41fd628b145f808e981feb8b08ed66455d47f1a4" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "serde_json", + "syn 2.0.48", +] + +[[package]] +name = "kube-runtime" +version = "0.87.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d8893eb18fbf6bb6c80ef6ee7dd11ec32b1dc3c034c988ac1b3a84d46a230ae" +dependencies = [ + "ahash", + "async-trait", + "backoff 0.4.0", + "derivative", + "futures", + "hashbrown 0.14.3", + "json-patch", + "k8s-openapi", + "kube-client", + "parking_lot 0.12.1", + "pin-project", + "serde", + "serde_json", + "smallvec", + "thiserror", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "kube_test" +version = "0.1.0" +dependencies = [ + "http", + "hyper", + "k8s-openapi", + "kube-core", + "rand", + "serde", + "serde_json", + "serde_yaml", + "tower", + "workspace-hack", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -2997,23 +3516,12 @@ dependencies = [ ] [[package]] -name = "lz4" -version = "1.24.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e9e2dd86df36ce760a60f6ff6ad526f7ba1f14ba0356f8254fb6905e6494df1" -dependencies = [ - "libc", - "lz4-sys", -] - -[[package]] -name = "lz4-sys" -version = "1.9.4" +name = "lz4_flex" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57d27b317e207b10f69f5e75494119e391a96f48861ae870d1da6edac98ca900" +checksum = "912b45c753ff5f7f5208307e8ace7d2a2e30d024e26d3509f3dce546c044ce15" dependencies = [ - "cc", - "libc", + "twox-hash", ] [[package]] @@ -3060,9 +3568,9 @@ checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" [[package]] name = "memmap2" -version = "0.9.3" +version = "0.9.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "45fd3a57831bf88bc63f8cebc0cf956116276e97fef3966103e96416209f7c92" +checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322" dependencies = [ "libc", ] @@ -3114,6 +3622,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09" dependencies = [ "libc", + "log", "wasi", "windows-sys 0.48.0", ] @@ -3136,12 +3645,53 @@ dependencies = [ "tokio", ] +[[package]] +name = "moka" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad9dc9808102655926a6086abd0b9965ebefd4a39ef0d184f074c34ba5049ec6" +dependencies = [ + "async-lock", + "async-trait", + "crossbeam-channel", + "crossbeam-epoch", + "crossbeam-utils", + "futures-util", + "once_cell", + "parking_lot 0.12.1", + "quanta", + "rustc_version", + "skeptic", + "smallvec", + "tagptr", + "thiserror", + "triomphe", + "uuid", +] + +[[package]] +name = "mpchash" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdd8199faa645318222f8aeb383fca4216a3f75b144f1e264ac74c0835d871a9" +dependencies = [ + "num-traits", + "rand", + "xxhash-rust", +] + [[package]] name = "multimap" version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" +[[package]] +name = "murmur3" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9252111cf132ba0929b6f8e030cac2a24b507f3a4d6db6fb2896f27b354c714b" + [[package]] name = "mutable_batch" version = "0.1.0" @@ -3149,20 +3699,17 @@ dependencies = [ "arrow", "arrow_util", "assert_matches", - "chrono", "data_types", "hashbrown 0.14.3", "iox_time", - "itertools 0.11.0", + "itertools 0.12.0", "mutable_batch_lp", - "paste", - "percent-encoding", + "partition", + "pretty_assertions", "proptest", "rand", "schema", - "snafu", - "thiserror", - "unicode-segmentation", + "snafu 0.8.0", "workspace-hack", ] @@ -3175,9 +3722,11 @@ dependencies = [ "criterion", "hashbrown 0.14.3", "influxdb-line-protocol", + "itertools 0.12.0", "mutable_batch", "schema", - "snafu", + "snafu 0.8.0", + "test_helpers", "workspace-hack", ] @@ -3192,8 +3741,9 @@ dependencies = [ "hashbrown 0.14.3", "mutable_batch", "mutable_batch_lp", + "partition", "schema", - "snafu", + "snafu 0.8.0", "workspace-hack", ] @@ -3210,7 +3760,7 @@ dependencies = [ "mutable_batch", "mutable_batch_lp", "mutable_batch_pb", - "prost", + "prost 0.12.3", ] [[package]] @@ -3245,6 +3795,34 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "non-empty-string" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cf0f4060e345ae505219853da9ca1150564158a648a6aa6a528f0d5794bb33" +dependencies = [ + "delegate", +] + +[[package]] +name = "notify" +version = "6.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6205bd8bb1e454ad2e27422015fb5e4f2bcc7e08fa8f27058670d208324a4d2d" +dependencies = [ + "bitflags 2.4.2", + "crossbeam-channel", + "filetime", + "fsevent-sys", + "inotify", + "kqueue", + "libc", + "log", + "mio", + "walkdir", + "windows-sys 0.48.0", +] + [[package]] name = "ntapi" version = "0.4.1" @@ -3389,12 +3967,12 @@ dependencies = [ [[package]] name = "object_store" -version = "0.7.1" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"f930c88a43b1c3f6e776dfe495b4afab89882dbc81530c632db2ed65451ebcb4" +checksum = "2524735495ea1268be33d200e1ee97455096a0846295a21548cd2f3541de7050" dependencies = [ "async-trait", - "base64 0.21.7", + "base64", "bytes", "chrono", "futures", @@ -3403,14 +3981,14 @@ dependencies = [ "itertools 0.11.0", "parking_lot 0.12.1", "percent-encoding", - "quick-xml 0.30.0", + "quick-xml 0.31.0", "rand", "reqwest", - "ring 0.16.20", + "ring", "rustls-pemfile", "serde", "serde_json", - "snafu", + "snafu 0.7.5", "tokio", "tracing", "url", @@ -3428,7 +4006,7 @@ dependencies = [ "metric", "object_store", "pin-project", - "snafu", + "snafu 0.8.0", "tokio", "workspace-hack", ] @@ -3456,6 +4034,12 @@ version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + [[package]] name = "ordered-float" version = "2.10.1" @@ -3467,18 +4051,18 @@ dependencies = [ [[package]] name = "ordered-float" -version = "3.9.2" +version = "4.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc" +checksum = "a76df7075c7d4d01fdcb46c912dd17fba5b60c78ea480b475f2b6ab6f666584e" dependencies = [ "num-traits", ] [[package]] name = "ouroboros" -version = "0.18.2" +version = "0.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a50b637ffd883b2733a8483599fb6136b9dcedaa1850f7ac08b9b6f9f2061208" +checksum = "97b7be5a8a3462b752f4be3ff2b2bf2f7f1d00834902e46be2a4d68b87b0573c" dependencies = [ "aliasable", "ouroboros_macro", @@ -3487,9 +4071,9 @@ dependencies = [ [[package]] name = "ouroboros_macro" -version = "0.18.2" +version = "0.18.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3633d65683f13b9bcfaa3150880b018899fb0e5d0542f4adaea4f503fdb5eabf" +checksum = "b645dcde5f119c2c454a92d0dfa271a2a3b205da92e4292a68ead4bdbfde1f33" dependencies = [ "heck", "itertools 0.12.0", @@ -3515,6 +4099,12 @@ dependencies = [ "workspace-hack", ] +[[package]] +name = "parking" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb813b8af86854136c6922af0598d719255ecb2179515e6e7730d468f05c9cae" + [[package]] name = "parking_lot" version = "0.11.2" @@ -3565,8 +4155,9 @@ dependencies = [ [[package]] name = "parquet" -version = "46.0.0" -source = "git+https://github.com/alamb/arrow-rs.git?rev=7c236c06bfb78c0c877055c1617d9373971511a5#7c236c06bfb78c0c877055c1617d9373971511a5" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af88740a842787da39b3d69ce5fbf6fce97d20211d3b299fee0a0da6430c74d4" dependencies = [ "ahash", "arrow-array", @@ -3576,14 +4167,14 @@ dependencies = [ "arrow-ipc", "arrow-schema", "arrow-select", - "base64 0.21.7", + "base64", "brotli", "bytes", "chrono", "flate2", "futures", "hashbrown 0.14.3", - "lz4", + "lz4_flex", "num", "num-bigint", "object_store", @@ -3593,7 +4184,55 @@ dependencies = [ "thrift", "tokio", "twox-hash", - "zstd 0.12.4", + "zstd", +] + +[[package]] +name = "parquet_cache" +version = "0.1.0" +dependencies = [ + "ahash", + "arc-swap", + "assert_matches", + "async-channel", + "async-trait", + "backoff 0.1.0", + "bytes", + "chrono", + "data_types", + "fnv", + "futures", + "http", + "hyper", 
+ "iox_catalog", + "iox_tests", + "iox_time", + "k8s-openapi", + "kube", + "kube_test", + "lazy_static", + "moka", + "mpchash", + "notify", + "object_store", + "observability_deps", + "parking_lot 0.12.1", + "parquet_file", + "pin-project", + "rand", + "reqwest", + "schemars", + "serde", + "serde_json", + "tempfile", + "thiserror", + "tokio", + "tokio-stream", + "tokio-util", + "tower", + "url", + "uuid", + "workspace-hack", ] [[package]] @@ -3601,7 +4240,8 @@ name = "parquet_file" version = "0.1.0" dependencies = [ "arrow", - "base64 0.21.7", + "assert_matches", + "base64", "bytes", "data_types", "datafusion", @@ -3613,17 +4253,17 @@ dependencies = [ "observability_deps", "parquet", "pbjson-types", - "prost", + "prost 0.12.3", "rand", "schema", - "snafu", + "snafu 0.8.0", "test_helpers", "thiserror", "thrift", "tokio", "uuid", "workspace-hack", - "zstd 0.12.4", + "zstd", ] [[package]] @@ -3639,7 +4279,7 @@ dependencies = [ "object_store", "parquet_file", "schema", - "snafu", + "snafu 0.8.0", "tokio", "workspace-hack", ] @@ -3653,55 +4293,69 @@ dependencies = [ "regex", ] +[[package]] +name = "partition" +version = "0.1.0" +dependencies = [ + "arrow", + "assert_matches", + "chrono", + "criterion", + "data_types", + "generated_types", + "hashbrown 0.14.3", + "mutable_batch", + "mutable_batch_lp", + "paste", + "percent-encoding", + "proptest", + "rand", + "schema", + "test_helpers", + "thiserror", + "unicode-segmentation", + "workspace-hack", +] + [[package]] name = "paste" version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" -[[package]] -name = "pbjson" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "048f9ac93c1eab514f9470c4bc8d97ca2a0a236b84f45cc19d69a59fc11467f6" -dependencies = [ - "base64 0.13.1", - "serde", -] - [[package]] name = "pbjson" version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1030c719b0ec2a2d25a5df729d6cff1acf3cc230bf766f4f97833591f7577b90" dependencies = [ - "base64 0.21.7", + "base64", "serde", ] [[package]] name = "pbjson-build" -version = "0.5.1" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bdbb7b706f2afc610f3853550cdbbf6372fd324824a087806bd4480ea4996e24" +checksum = "2580e33f2292d34be285c5bc3dba5259542b083cfad6037b6d70345f24dcb735" dependencies = [ "heck", - "itertools 0.10.5", - "prost", - "prost-types", + "itertools 0.11.0", + "prost 0.12.3", + "prost-types 0.12.3", ] [[package]] name = "pbjson-types" -version = "0.5.1" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a88c8d87f99a4ac14325e7a4c24af190fca261956e3b82dd7ed67e77e6c7043" +checksum = "18f596653ba4ac51bdecbb4ef6773bc7f56042dc13927910de1684ad3d32aa12" dependencies = [ "bytes", "chrono", - "pbjson 0.5.1", + "pbjson", "pbjson-build", - "prost", + "prost 0.12.3", "prost-build", "serde", ] @@ -3715,6 +4369,16 @@ dependencies = [ "fixedbitset", ] +[[package]] +name = "pem" +version = "3.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310" +dependencies = [ + "base64", + "serde", +] + [[package]] name = "pem-rfc7468" version = "0.7.0" @@ -3825,18 +4489,18 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.1.3" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum 
= "fda4ed1c6c173e3fc7a83629421152e01d7b1f9b7f65fb301e490e8cfc656422" +checksum = "0302c4a0442c456bd56f841aee5c3bfd17967563f6fadc9ceb9f9c23cf3807e0" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.3" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405" +checksum = "266c042b60c9c76b8d53061e52b2e0d1116abc57cefc8c5cd671619a56ac3690" dependencies = [ "proc-macro2", "quote", @@ -3882,11 +4546,17 @@ version = "0.3.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2900ede94e305130c13ddd391e0ab7cbaeb783945ae07a279c268cb05109c6cb" +[[package]] +name = "platforms" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "626dec3cac7cc0e1577a2ec3fc496277ec2baa084bebad95bb6fdbfae235f84c" + [[package]] name = "pprof" -version = "0.12.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "978385d59daf9269189d052ca8a84c1acfd0715c0599a5d5188d4acc078ca46a" +checksum = "ef5c97c51bd34c7e742402e216abdeb44d415fbe6ae41d56b114723e953711cb" dependencies = [ "backtrace", "cfg-if", @@ -3897,9 +4567,9 @@ dependencies = [ "nix 0.26.4", "once_cell", "parking_lot 0.12.1", - "prost", + "prost 0.12.3", "prost-build", - "prost-derive", + "prost-derive 0.12.3", "protobuf", "sha2", "smallvec", @@ -3923,11 +4593,11 @@ dependencies = [ "data_types", "datafusion", "datafusion_util", - "itertools 0.11.0", + "itertools 0.12.0", "observability_deps", "query_functions", "schema", - "snafu", + "snafu 0.8.0", "sqlparser", "test_helpers", "workspace-hack", @@ -3972,19 +4642,19 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.1.25" +version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c8646e95016a7a6c4adea95bafa8a16baab64b583356217f2c85db4a39d9a86" +checksum = "a41cf62165e97c7f814d2221421dbb9afcbcdb0a88068e5ea206e19951c2cbb5" dependencies = [ "proc-macro2", - "syn 1.0.109", + "syn 2.0.48", ] [[package]] name = "proc-macro2" -version = "1.0.76" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95fc56cda0b5c3325f5fbbd7ff9fda9e02bb00bb3dac51252d2f1bfa1cb8cc8c" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] @@ -4039,27 +4709,37 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b82eaa1d779e9a4bc1c3217db8ffbeabaae1dca241bf70183242128d48681cd" dependencies = [ "bytes", - "prost-derive", + "prost-derive 0.11.9", +] + +[[package]] +name = "prost" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "146c289cda302b98a28d40c8b3b90498d6e526dd24ac2ecea73e4e491685b94a" +dependencies = [ + "bytes", + "prost-derive 0.12.3", ] [[package]] name = "prost-build" -version = "0.11.9" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" +checksum = "c55e02e35260070b6f716a2423c2ff1c3bb1642ddca6f99e1f26d06268a0e2d2" dependencies = [ "bytes", "heck", - "itertools 0.10.5", - "lazy_static", + "itertools 0.11.0", "log", "multimap", + "once_cell", "petgraph", "prettyplease", - "prost", - "prost-types", + "prost 0.12.3", + "prost-types 0.12.3", "regex", - "syn 1.0.109", + "syn 2.0.48", "tempfile", "which", ] @@ -4077,13 +4757,35 @@ 
dependencies = [ "syn 1.0.109", ] +[[package]] +name = "prost-derive" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efb6c9a1dd1def8e2124d17e83a20af56f1570d6c2d2bd9e266ccb768df3840e" +dependencies = [ + "anyhow", + "itertools 0.11.0", + "proc-macro2", + "quote", + "syn 2.0.48", +] + [[package]] name = "prost-types" version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "213622a1460818959ac1181aaeb2dc9c7f63df720db7d788b3e24eacd1983e13" dependencies = [ - "prost", + "prost 0.11.9", +] + +[[package]] +name = "prost-types" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "193898f59edcf43c26227dcd4c8427f00d99d61e95dcde58dabd49fa291d470e" +dependencies = [ + "prost 0.12.3", ] [[package]] @@ -4092,6 +4794,32 @@ version = "2.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "106dd99e98437432fed6519dedecfade6a06a73bb7b2a1e019fdd2bee5778d94" +[[package]] +name = "pulldown-cmark" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a1a2f1f0a7ecff9c31abbe177637be0e97a0aef46cf8738ece09327985d998" +dependencies = [ + "bitflags 1.3.2", + "memchr", + "unicase", +] + +[[package]] +name = "quanta" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ca0b7bac0b97248c40bb77288fc52029cf1459c0461ea1b05ee32ccf011de2c" +dependencies = [ + "crossbeam-utils", + "libc", + "once_cell", + "raw-cpuid", + "wasi", + "web-sys", + "winapi", +] + [[package]] name = "query_functions" version = "0.1.0" @@ -4100,12 +4828,12 @@ dependencies = [ "chrono", "datafusion", "datafusion_util", - "itertools 0.11.0", + "itertools 0.12.0", "once_cell", "regex", - "regex-syntax 0.7.5", + "regex-syntax 0.8.2", "schema", - "snafu", + "snafu 0.8.0", "tokio", "workspace-hack", ] @@ -4121,9 +4849,9 @@ dependencies = [ [[package]] name = "quick-xml" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eff6510e86862b57b210fd8cbe8ed3f0d7d600b9c2863cd4549a2e033c66e956" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" dependencies = [ "memchr", "serde", @@ -4177,6 +4905,15 @@ dependencies = [ "rand_core", ] +[[package]] +name = "raw-cpuid" +version = "11.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d86a7c4638d42c44551f4791a20e687dbb4c3de1f33c43dd71e355cd429def1" +dependencies = [ + "bitflags 2.4.2", +] + [[package]] name = "rayon" version = "1.8.1" @@ -4217,13 +4954,13 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.2" +version = "1.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.4.3", + "regex-automata 0.4.5", "regex-syntax 0.8.2", ] @@ -4238,9 +4975,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.3" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd" dependencies = [ "aho-corasick", "memchr", @@ -4253,31 +4990,19 @@ version = "0.6.29" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" -[[package]] -name = "regex-syntax" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" - [[package]] name = "regex-syntax" version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" -[[package]] -name = "relative-path" -version = "1.9.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e898588f33fdd5b9420719948f9f2a32c922a246964576f71ba7f24f80610fbc" - [[package]] name = "reqwest" version = "0.11.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37b1ae8d9ac08420c66222fb9096fc5de435c3c48542bc5336c51892cffafb41" dependencies = [ - "base64 0.21.7", + "base64", "bytes", "encoding_rs", "futures-core", @@ -4295,6 +5020,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "rustls", + "rustls-native-certs", "rustls-pemfile", "serde", "serde_json", @@ -4309,7 +5035,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams", "web-sys", - "webpki-roots 0.25.3", + "webpki-roots", "winreg", ] @@ -4322,21 +5048,6 @@ dependencies = [ "bytemuck", ] -[[package]] -name = "ring" -version = "0.16.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3053cf52e236a3ed746dfc745aa9cacf1b791d846bdaf412f60a8d7d6e17c8fc" -dependencies = [ - "cc", - "libc", - "once_cell", - "spin 0.5.2", - "untrusted 0.7.1", - "web-sys", - "winapi", -] - [[package]] name = "ring" version = "0.17.7" @@ -4347,7 +5058,7 @@ dependencies = [ "getrandom", "libc", "spin 0.9.8", - "untrusted 0.9.0", + "untrusted", "windows-sys 0.48.0", ] @@ -4371,35 +5082,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "rstest" -version = "0.18.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97eeab2f3c0a199bc4be135c36c924b6590b88c377d416494288c14f2db30199" -dependencies = [ - "futures", - "futures-timer", - "rstest_macros", - "rustc_version", -] - -[[package]] -name = "rstest_macros" -version = "0.18.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d428f8247852f894ee1be110b375111b586d4fa431f6c46e64ba5a0dcccbe605" -dependencies = [ - "cfg-if", - "glob", - "proc-macro2", - "quote", - "regex", - "relative-path", - "rustc_version", - "syn 2.0.48", - "unicode-ident", -] - [[package]] name = "rustc-demangle" version = "0.1.23" @@ -4435,28 +5117,30 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9d5a6813c0759e4609cd494e8e725babae6a2ca7b62a5536a13daaec6fcb7ba" dependencies = [ "log", - "ring 0.17.7", - "rustls-webpki 0.101.7", + "ring", + "rustls-webpki", "sct", ] [[package]] -name = "rustls-pemfile" -version = "1.0.4" +name = "rustls-native-certs" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" +checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" dependencies = [ - "base64 0.21.7", + "openssl-probe", + "rustls-pemfile", + "schannel", + "security-framework", ] [[package]] -name = "rustls-webpki" -version = "0.100.3" +name = "rustls-pemfile" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"5f6a5fc258f1c1276dfe3016516945546e2d5383911efc0fc4f1cdc5df3a4ae3" +checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" dependencies = [ - "ring 0.16.20", - "untrusted 0.7.1", + "base64", ] [[package]] @@ -4465,8 +5149,8 @@ version = "0.101.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" dependencies = [ - "ring 0.17.7", - "untrusted 0.9.0", + "ring", + "untrusted", ] [[package]] @@ -4485,9 +5169,18 @@ checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" name = "same-file" version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "schannel" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" dependencies = [ - "winapi-util", + "windows-sys 0.52.0", ] [[package]] @@ -4498,10 +5191,35 @@ dependencies = [ "hashbrown 0.14.3", "indexmap 2.1.0", "observability_deps", - "snafu", + "once_cell", + "snafu 0.8.0", "workspace-hack", ] +[[package]] +name = "schemars" +version = "0.8.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45a28f4c49489add4ce10783f7911893516f15afe45d015608d41faca6bc4d29" +dependencies = [ + "dyn-clone", + "schemars_derive", + "serde", + "serde_json", +] + +[[package]] +name = "schemars_derive" +version = "0.8.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c767fd6fa65d9ccf9cf026122c1b555f2ef9a4f0cea69da4d7dbc3e258d30967" +dependencies = [ + "proc-macro2", + "quote", + "serde_derive_internals", + "syn 1.0.109", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -4514,8 +5232,41 @@ version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" dependencies = [ - "ring 0.17.7", - "untrusted 0.9.0", + "ring", + "untrusted", +] + +[[package]] +name = "secrecy" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9bd1c54ea06cfd2f6b63219704de0b9b4f72dcc2b8fdef820be6cd799780e91e" +dependencies = [ + "serde", + "zeroize", +] + +[[package]] +name = "security-framework" +version = "2.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05b64fb303737d99b81884b2c63433e9ae28abebe5eb5045dcdd175dc2ecf4de" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e932934257d3b408ed8f30db49d85ea163bfe74961f017f405b025af298f0c7a" +dependencies = [ + "core-foundation-sys", + "libc", ] [[package]] @@ -4523,6 +5274,9 @@ name = "semver" version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b97ed7a9823b74f99c7742f5336af7be5ecd3eeafcb1507d1fa93347b1d589b0" +dependencies = [ + "serde", +] [[package]] name = "seq-macro" @@ -4539,6 +5293,16 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "serde-value" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f3a1a3341211875ef120e117ea7fd5228530ae7e7036a779fdc9117be6b3282c" +dependencies = [ + "ordered-float 2.10.1", + "serde", +] + [[package]] name = "serde_derive" version = "1.0.195" @@ -4550,6 +5314,17 @@ dependencies = [ "syn 2.0.48", ] +[[package]] +name = "serde_derive_internals" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85bf8229e7920a9f636479437026331ce11aa132b4dde37d121944a44d6e5f3c" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + [[package]] name = "serde_json" version = "1.0.111" @@ -4582,24 +5357,27 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_yaml" +version = "0.9.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1bf28c79a99f70ee1f1d83d10c875d2e70618417fda01ad1785e027579d9d38" +dependencies = [ + "indexmap 2.1.0", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + [[package]] name = "service_common" version = "0.1.0" dependencies = [ - "async-trait", - "bytes", + "arrow", "datafusion", "executor", - "flightsql", - "iox_query", - "iox_query_influxql", - "iox_query_influxrpc", - "metric", - "parking_lot 0.12.1", - "predicate", - "tonic", - "trace", - "tracker", + "tonic 0.10.2", "workspace-hack", ] @@ -4619,16 +5397,19 @@ dependencies = [ "futures", "generated_types", "iox_query", + "iox_query_influxql", + "iox_query_params", "metric", "observability_deps", - "prost", + "prost 0.12.3", "serde", "serde_json", "service_common", - "snafu", + "snafu 0.8.0", "test_helpers", "tokio", - "tonic", + "tonic 0.10.2", + "tower_trailer", "trace", "trace_http", "tracker", @@ -4641,7 +5422,7 @@ version = "0.1.0" dependencies = [ "generated_types", "observability_deps", - "tonic", + "tonic 0.10.2", "workspace-hack", ] @@ -4728,6 +5509,21 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "54ac45299ccbd390721be55b412d41931911f654fa99e2cb8bfb57184b2061fe" +[[package]] +name = "skeptic" +version = "0.13.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16d23b015676c90a0f01c197bfdc786c20342c73a0afdda9025adb0bc42940a8" +dependencies = [ + "bytecount", + "cargo_metadata", + "error-chain", + "glob", + "pulldown-cmark", + "tempfile", + "walkdir", +] + [[package]] name = "slab" version = "0.4.9" @@ -4750,7 +5546,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4de37ad025c587a29e8f3f5605c00f70b98715ef90b9061a815b9e59e9042d6" dependencies = [ "doc-comment", - "snafu-derive", + "snafu-derive 0.7.5", +] + +[[package]] +name = "snafu" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d342c51730e54029130d7dc9fd735d28c4cd360f1368c01981d4f03ff207f096" +dependencies = [ + "snafu-derive 0.8.0", ] [[package]] @@ -4765,6 +5570,18 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "snafu-derive" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "080c44971436b1af15d6f61ddd8b543995cf63ab8e677d46b00cc06f4ef267a0" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.48", +] + [[package]] name = "snap" version = "1.1.1" @@ -4819,9 +5636,9 @@ dependencies = [ [[package]] name = "sqlparser" -version = "0.37.0" +version = "0.41.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37ae05a8250b968a3f7db93155a84d68b2e6cea1583949af5ca5b5170c76c075" +checksum = "5cc2c25a6c66789625ef164b4c7d2e548d627902280c13710d33da8222169964" dependencies = [ "log", 
"sqlparser_derive", @@ -4829,13 +5646,13 @@ dependencies = [ [[package]] name = "sqlparser_derive" -version = "0.1.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55fe75cb4a364c7f7ae06c7dbbc8d84bddd85d6cdf9975963c3935bc1991761e" +checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.48", ] [[package]] @@ -4865,7 +5682,7 @@ dependencies = [ "crossbeam-queue", "dotenvy", "either", - "event-listener", + "event-listener 2.5.3", "futures-channel", "futures-core", "futures-intrusive", @@ -4892,7 +5709,7 @@ dependencies = [ "tracing", "url", "uuid", - "webpki-roots 0.25.3", + "webpki-roots", ] [[package]] @@ -4955,7 +5772,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e37195395df71fd068f6e2082247891bc11e3289624bbc776a0cdfa1ca7f1ea4" dependencies = [ "atoi", - "base64 0.21.7", + "base64", "bitflags 2.4.2", "byteorder", "bytes", @@ -4998,7 +5815,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d6ac0ac3b7ccd10cc96c7ab29791a7dd236bd94021f31eec7ba3d46a74aa1c24" dependencies = [ "atoi", - "base64 0.21.7", + "base64", "bitflags 2.4.2", "byteorder", "crc", @@ -5177,9 +5994,9 @@ checksum = "d3543ca0810e71767052bdcdd5653f23998b192642a22c5164bfa6581e40a4a2" [[package]] name = "sysinfo" -version = "0.29.11" +version = "0.30.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd727fc423c2060f6c92d9534cef765c65a6ed3f428a03d7def74a8c4348e666" +checksum = "1fb4f3438c8f6389c864e61221cbc97e9bca98b4daf39a5beb7bea660f528bb2" dependencies = [ "cfg-if", "core-foundation-sys", @@ -5187,7 +6004,7 @@ dependencies = [ "ntapi", "once_cell", "rayon", - "winapi", + "windows", ] [[package]] @@ -5211,6 +6028,12 @@ dependencies = [ "libc", ] +[[package]] +name = "tagptr" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" + [[package]] name = "tempfile" version = "3.9.0" @@ -5240,7 +6063,7 @@ dependencies = [ "parking_lot 0.12.1", "tempfile", "tokio", - "tracing-log 0.1.4", + "tracing-log", "tracing-subscriber", "workspace-hack", ] @@ -5253,6 +6076,7 @@ dependencies = [ "arrow-flight", "arrow_util", "assert_cmd", + "assert_matches", "bytes", "data_types", "dml", @@ -5262,24 +6086,27 @@ dependencies = [ "hyper", "influxdb_iox_client", "ingester_query_grpc", + "insta", "iox_catalog", + "iox_query_params", "mutable_batch_lp", "mutable_batch_pb", "nix 0.27.1", "observability_deps", "once_cell", "parking_lot 0.12.1", - "prost", + "prost 0.12.3", "rand", "regex", "reqwest", - "snafu", + "serde_json", + "snafu 0.8.0", "sqlx", "tempfile", "test_helpers", "tokio", "tokio-util", - "tonic", + "tonic 0.10.2", "workspace-hack", ] @@ -5463,6 +6290,7 @@ dependencies = [ "futures-io", "futures-sink", "pin-project-lite", + "slab", "tokio", "tracing", ] @@ -5477,6 +6305,17 @@ dependencies = [ "workspace-hack", ] +[[package]] +name = "tokio_watchdog" +version = "0.1.0" +dependencies = [ + "metric", + "observability_deps", + "test_helpers", + "tokio", + "workspace-hack", +] + [[package]] name = "toml" version = "0.8.8" @@ -5517,10 +6356,9 @@ version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3082666a3a6433f7f511c7192923fa1fe07c69332d3c6a2e6bb040b569199d5a" dependencies = [ - "async-stream", "async-trait", "axum", - "base64 0.21.7", + "base64", "bytes", 
"futures-core", "futures-util", @@ -5531,7 +6369,36 @@ dependencies = [ "hyper-timeout", "percent-encoding", "pin-project", - "prost", + "prost 0.11.9", + "tokio", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d560933a0de61cf715926b9cac824d4c883c2c43142f787595e48280c40a1d0e" +dependencies = [ + "async-stream", + "async-trait", + "axum", + "base64", + "bytes", + "h2", + "http", + "http-body", + "hyper", + "hyper-timeout", + "percent-encoding", + "pin-project", + "prost 0.12.3", + "rustls", + "rustls-native-certs", "rustls-pemfile", "tokio", "tokio-rustls", @@ -5540,46 +6407,45 @@ dependencies = [ "tower-layer", "tower-service", "tracing", - "webpki-roots 0.23.1", ] [[package]] name = "tonic-build" -version = "0.9.2" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6fdaae4c2c638bb70fe42803a26fbd6fc6ac8c72f5c59f67ecc2a2dcabf4b07" +checksum = "9d021fc044c18582b9a2408cd0dd05b1596e3ecdb5c4df822bb0183545683889" dependencies = [ "prettyplease", "proc-macro2", "prost-build", "quote", - "syn 1.0.109", + "syn 2.0.48", ] [[package]] name = "tonic-health" -version = "0.9.2" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "080964d45894b90273d2b1dd755fdd114560db8636bb41cea615213c45043c4d" +checksum = "f80db390246dfb46553481f6024f0082ba00178ea495dbb99e70ba9a4fafb5e1" dependencies = [ "async-stream", - "prost", + "prost 0.12.3", "tokio", "tokio-stream", - "tonic", + "tonic 0.10.2", ] [[package]] name = "tonic-reflection" -version = "0.9.2" +version = "0.10.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0543d7092032041fbeac1f2c84304537553421a11a623c2301b12ef0264862c7" +checksum = "3fa37c513df1339d197f4ba21d28c918b9ef1ac1768265f11ecb6b7f1cba1b76" dependencies = [ - "prost", - "prost-types", + "prost 0.12.3", + "prost-types 0.12.3", "tokio", "tokio-stream", - "tonic", + "tonic 0.10.2", ] [[package]] @@ -5608,6 +6474,7 @@ version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61c5bb1d698276a2443e5ecfabc1008bf15a36c12e6a7176e7bf089ea9131140" dependencies = [ + "base64", "bitflags 2.4.2", "bytes", "futures-core", @@ -5615,6 +6482,7 @@ dependencies = [ "http", "http-body", "http-range-header", + "mime", "pin-project-lite", "tower-layer", "tower-service", @@ -5633,6 +6501,19 @@ version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" +[[package]] +name = "tower_trailer" +version = "0.1.0" +dependencies = [ + "futures", + "http", + "http-body", + "parking_lot 0.12.1", + "pin-project", + "tower", + "workspace-hack", +] + [[package]] name = "trace" version = "0.1.0" @@ -5654,7 +6535,7 @@ dependencies = [ "futures", "iox_time", "observability_deps", - "snafu", + "snafu 0.8.0", "thrift", "tokio", "trace", @@ -5665,16 +6546,17 @@ dependencies = [ name = "trace_http" version = "0.1.0" dependencies = [ + "bytes", "futures", "hashbrown 0.14.3", "http", "http-body", - "itertools 0.11.0", + "itertools 0.12.0", "metric", "observability_deps", "parking_lot 0.12.1", "pin-project", - "snafu", + "snafu 0.8.0", "tower", "trace", "workspace-hack", @@ -5713,17 +6595,6 @@ dependencies = [ "valuable", ] -[[package]] -name = "tracing-log" -version = "0.1.4" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f751112709b4e791d8ce53e32c4ed2d353565a795ce84da2285393f41557bdf2" -dependencies = [ - "log", - "once_cell", - "tracing-core", -] - [[package]] name = "tracing-log" version = "0.2.0" @@ -5763,7 +6634,7 @@ dependencies = [ "thread_local", "tracing", "tracing-core", - "tracing-log 0.2.0", + "tracing-log", "tracing-serde", ] @@ -5788,6 +6659,21 @@ dependencies = [ "workspace-hack", ] +[[package]] +name = "treediff" +version = "4.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52984d277bdf2a751072b5df30ec0377febdb02f7696d64c2d7d54630bac4303" +dependencies = [ + "serde_json", +] + +[[package]] +name = "triomphe" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "859eb650cfee7434994602c3a68b25d77ad9e68c8a6cd491616ef86661382eb3" + [[package]] name = "trogging" version = "0.1.0" @@ -5798,7 +6684,7 @@ dependencies = [ "regex", "synchronized-writer", "thiserror", - "tracing-log 0.1.4", + "tracing-log", "tracing-subscriber", ] @@ -5836,6 +6722,15 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" +[[package]] +name = "unicase" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d2d4dafb69621809a81864c9c1b864479e1235c0dd4e199924b9742439ed89" +dependencies = [ + "version_check", +] + [[package]] name = "unicode-bidi" version = "0.3.15" @@ -5876,10 +6771,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" [[package]] -name = "untrusted" -version = "0.7.1" +name = "unsafe-libyaml" +version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" +checksum = "ab4c90930b95a82d00dc9e9ac071b4991924390d46cbd0dfe566148667605e4b" [[package]] name = "untrusted" @@ -5918,9 +6813,9 @@ checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" [[package]] name = "uuid" -version = "1.6.1" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e395fcf16a7a3d8127ec99782007af141946b4795001f876d54fb0d55978560" +checksum = "f00cc9702ca12d3c81455259621e676d0f7251cec66a21e98fe2e9a37db93b2a" dependencies = [ "getrandom", ] @@ -5968,8 +6863,8 @@ dependencies = [ "mutable_batch_pb", "observability_deps", "parking_lot 0.12.1", - "prost", - "snafu", + "prost 0.12.3", + "snafu 0.8.0", "snap", "test_helpers", "tokio", @@ -6110,15 +7005,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "webpki-roots" -version = "0.23.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b03058f88386e5ff5310d9111d53f48b17d732b401aeb83a8d5190f2ac459338" -dependencies = [ - "rustls-webpki 0.100.3", -] - [[package]] name = "webpki-roots" version = "0.25.3" @@ -6174,6 +7060,16 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e48a53791691ab099e5e2ad123536d0fff50652600abaf43bbf952894110d0be" +dependencies = [ + "windows-core", + "windows-targets 0.52.0", +] + [[package]] name = "windows-core" version = "0.52.0" @@ 
-6339,10 +7235,9 @@ name = "workspace-hack" version = "0.1.0" dependencies = [ "ahash", - "arrow", "arrow-array", - "arrow-flight", - "arrow-string", + "arrow-cast", + "arrow-ipc", "bitflags 2.4.2", "byteorder", "bytes", @@ -6352,8 +7247,6 @@ dependencies = [ "clap_builder", "crossbeam-utils", "crypto-common", - "datafusion", - "datafusion-optimizer", "digest", "either", "fixedbitset", @@ -6368,8 +7261,10 @@ dependencies = [ "getrandom", "hashbrown 0.14.3", "heck", + "hyper", "indexmap 2.1.0", "itertools 0.10.5", + "itertools 0.11.0", "libc", "lock_api", "log", @@ -6381,27 +7276,26 @@ dependencies = [ "object_store", "once_cell", "parking_lot 0.12.1", - "parquet", "petgraph", "phf_shared", "proptest", - "prost", - "prost-types", + "prost 0.11.9", + "prost 0.12.3", + "prost-types 0.11.9", + "prost-types 0.12.3", "rand", "rand_core", "regex", - "regex-automata 0.4.3", - "regex-syntax 0.7.5", + "regex-automata 0.4.5", "regex-syntax 0.8.2", "reqwest", - "ring 0.16.20", + "ring", "rustls", "serde", "serde_json", "sha2", "similar", "spin 0.9.8", - "sqlparser", "sqlx", "sqlx-core", "sqlx-macros", @@ -6415,7 +7309,6 @@ dependencies = [ "tokio", "tokio-stream", "tokio-util", - "tonic", "tower", "tracing", "tracing-core", @@ -6429,6 +7322,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "xxhash-rust" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53be06678ed9e83edb1745eb72efc0bbcd7b5c3c35711a860906aed827a13d61" + [[package]] name = "xz2" version = "0.1.7" @@ -6485,32 +7384,13 @@ version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" -[[package]] -name = "zstd" -version = "0.12.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" -dependencies = [ - "zstd-safe 6.0.6", -] - [[package]] name = "zstd" version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bffb3309596d527cfcba7dfc6ed6052f1d39dfbd7c867aa2e865e4a449c10110" dependencies = [ - "zstd-safe 7.0.0", -] - -[[package]] -name = "zstd-safe" -version = "6.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581" -dependencies = [ - "libc", - "zstd-sys", + "zstd-safe", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index d9cedab98ae..3094e83cc7b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -98,18 +98,41 @@ edition = "2021" license = "MIT OR Apache-2.0" [workspace.dependencies] -arrow = { version = "46.0.0" } -arrow-flight = { version = "46.0.0" } -datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "81f33b0e27f5694348cd953a937203d835b57178", default-features = false } -datafusion-proto = { git = "https://github.com/apache/arrow-datafusion.git", rev = "81f33b0e27f5694348cd953a937203d835b57178" } +arrow = { version = "49.0.0", features = ["prettyprint", "chrono-tz"] } +arrow-buffer = { version = "49.0.0" } +arrow-flight = { version = "49.0.0", features = ["flight-sql-experimental"] } +datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" } +datafusion-proto = { git = "https://github.com/apache/arrow-datafusion.git", rev = "0e53c6d816f3a9d3d27c6ebb6d25b1699e5553e7" } +hashbrown = { version = "0.14.3" } +object_store = { version = "0.8.0" } +parquet = { version = 
"49.0.0", features = ["object_store"] } +pbjson = { version = "0.6.0" } +pbjson-build = { version = "0.6.2" } +pbjson-types = { version = "0.6.0" } +prost = { version = "0.12.3" } +prost-build = { version = "0.12.2" } +prost-types = { version = "0.12.3" } +sqlparser = { version = "0.41.0" } +tonic = { version = "0.10.2", features = ["tls", "tls-roots"] } +tonic-build = { version = "0.10.2" } +tonic-health = { version = "0.10.2" } +tonic-reflection = { version = "0.10.2" } -hashbrown = { version = "0.14.0" } -object_store = { version = "0.7.0" } -parquet = { version = "46.0.0" } -tonic = { version = "0.9.2", features = ["tls", "tls-webpki-roots"] } -tonic-build = { version = "0.9.2" } -tonic-health = { version = "0.9.2" } -tonic-reflection = { version = "0.9.2" } +[workspace.lints.rust] +rust_2018_idioms = "deny" +unreachable_pub = "deny" +missing_debug_implementations = "deny" +missing_copy_implementations = "deny" + +[workspace.lints.clippy] +dbg_macro = "deny" +todo = "deny" +clone_on_ref_ptr = "deny" +future_not_send = "deny" + +[workspace.lints.rustdoc] +broken_intra_doc_links = "deny" +bare_urls = "deny" # This profile optimizes for runtime performance and small binary size at the expense of longer # build times. It's most suitable for final release builds. @@ -135,19 +158,3 @@ opt-level = 3 [profile.dev.package.similar] opt-level = 3 - -[patch.crates-io] -# Can remove after arrow 47 is released -# Pin to https://github.com/apache/arrow-rs/pull/4790 -# To get fixes for -# - https://github.com/apache/arrow-rs/issues/4788, -# - https://github.com/apache/arrow-rs/pull/4799 -arrow = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5" } -arrow-array = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5" } -arrow-buffer = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5" } -arrow-schema = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5" } -arrow-select = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5" } -arrow-string = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5" } -arrow-ord = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5" } -arrow-flight = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5" } -parquet = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5" } diff --git a/arrow_util/Cargo.toml b/arrow_util/Cargo.toml index 83a826f7e0a..18ac4bf7c2c 100644 --- a/arrow_util/Cargo.toml +++ b/arrow_util/Cargo.toml @@ -6,22 +6,24 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } -# need dyn_cmp_dict feature for comparing dictionary arrays -arrow = { workspace = true, features = ["prettyprint", "dyn_cmp_dict"] } +arrow = { workspace = true } # used by arrow anyway (needed for printing workaround) chrono = { version = "0.4", default-features = false } -comfy-table = { version = "7.0", default-features = false } +comfy-table = { version = "7.1", default-features = false } hashbrown = { workspace = true } num-traits = "0.2" -once_cell = { version = "1.18", features = ["parking_lot"] } -regex = "1.9.5" -snafu = "0.7" 
+once_cell = { version = "1.19", features = ["parking_lot"] } +regex = "1.10.2" +snafu = "0.8" uuid = "1" workspace-hack = { version = "0.1", path = "../workspace-hack" } - [dev-dependencies] datafusion = { workspace = true } +proptest = { version = "1.4.0", default-features = false, features = ["std"] } rand = "0.8.3" diff --git a/arrow_util/src/bitset.rs b/arrow_util/src/bitset.rs index 34f177915ef..7fecee6fe51 100644 --- a/arrow_util/src/bitset.rs +++ b/arrow_util/src/bitset.rs @@ -22,6 +22,15 @@ impl BitSet { Self::default() } + /// Construct an empty [`BitSet`] with a pre-allocated capacity for `n` + /// bits. + pub fn with_capacity(n: usize) -> Self { + Self { + buffer: Vec::with_capacity((n + 7) / 8), + len: 0, + } + } + /// Creates a new BitSet with `count` unset bits. pub fn with_size(count: usize) -> Self { let mut bitset = Self::default(); @@ -31,30 +40,30 @@ impl BitSet { /// Reserve space for `count` further bits pub fn reserve(&mut self, count: usize) { - let new_buf_len = (self.len + count + 7) >> 3; + let new_buf_len = (self.len + count + 7) / 8; self.buffer.reserve(new_buf_len); } /// Appends `count` unset bits pub fn append_unset(&mut self, count: usize) { self.len += count; - let new_buf_len = (self.len + 7) >> 3; + let new_buf_len = (self.len + 7) / 8; self.buffer.resize(new_buf_len, 0); } /// Appends `count` set bits pub fn append_set(&mut self, count: usize) { let new_len = self.len + count; - let new_buf_len = (new_len + 7) >> 3; + let new_buf_len = (new_len + 7) / 8; - let skew = self.len & 7; + let skew = self.len % 8; if skew != 0 { *self.buffer.last_mut().unwrap() |= 0xFF << skew; } self.buffer.resize(new_buf_len, 0xFF); - let rem = new_len & 7; + let rem = new_len % 8; if rem != 0 { *self.buffer.last_mut().unwrap() &= (1 << rem) - 1; } @@ -64,15 +73,27 @@ impl BitSet { /// Truncates the bitset to the provided length pub fn truncate(&mut self, len: usize) { - let new_buf_len = (len + 7) >> 3; + let new_buf_len = (len + 7) / 8; self.buffer.truncate(new_buf_len); - let overrun = len & 7; + let overrun = len % 8; if overrun > 0 { *self.buffer.last_mut().unwrap() &= (1 << overrun) - 1; } self.len = len; } + /// Split this bitmap at the specified bit boundary, such that after this + /// call, `self` contains the range `[0, n)` and the returned value contains + /// `[n, len)`. 
+ pub fn split_off(&mut self, n: usize) -> Self { + let mut right = Self::with_capacity(self.len - n); + right.extend_from_range(self, n..self.len); + + self.truncate(n); + + right + } + /// Extends this [`BitSet`] by the context of `other` pub fn extend_from(&mut self, other: &BitSet) { self.append_bits(other.len, &other.buffer) @@ -85,9 +106,9 @@ impl BitSet { return; } - let start_byte = range.start >> 3; - let end_byte = (range.end + 7) >> 3; - let skew = range.start & 7; + let start_byte = range.start / 8; + let end_byte = (range.end + 7) / 8; + let skew = range.start % 8; // `append_bits` requires the provided `to_set` to be byte aligned, therefore // if the range being copied is not byte aligned we must first append @@ -109,16 +130,16 @@ impl BitSet { /// Appends `count` boolean values from the slice of packed bits pub fn append_bits(&mut self, count: usize, to_set: &[u8]) { - assert_eq!((count + 7) >> 3, to_set.len()); + assert_eq!((count + 7) / 8, to_set.len()); let new_len = self.len + count; - let new_buf_len = (new_len + 7) >> 3; + let new_buf_len = (new_len + 7) / 8; self.buffer.reserve(new_buf_len - self.buffer.len()); - let whole_bytes = count >> 3; - let overrun = count & 7; + let whole_bytes = count / 8; + let overrun = count % 8; - let skew = self.len & 7; + let skew = self.len % 8; if skew == 0 { self.buffer.extend_from_slice(&to_set[..whole_bytes]); if overrun > 0 { @@ -158,8 +179,8 @@ impl BitSet { pub fn set(&mut self, idx: usize) { assert!(idx <= self.len); - let byte_idx = idx >> 3; - let bit_idx = idx & 7; + let byte_idx = idx / 8; + let bit_idx = idx % 8; self.buffer[byte_idx] |= 1 << bit_idx; } @@ -167,8 +188,8 @@ impl BitSet { pub fn get(&self, idx: usize) -> bool { assert!(idx <= self.len); - let byte_idx = idx >> 3; - let bit_idx = idx & 7; + let byte_idx = idx / 8; + let bit_idx = idx % 8; (self.buffer[byte_idx] >> bit_idx) & 1 != 0 } @@ -227,8 +248,97 @@ impl BitSet { pub fn is_all_unset(&self) -> bool { self.buffer.iter().all(|&v| v == 0) } + + /// Returns the number of set bits in this bitmap. + pub fn count_ones(&self) -> usize { + // Invariant: the bits outside of [0, self.len) are always 0 + self.buffer.iter().map(|v| v.count_ones() as usize).sum() + } + + /// Returns the number of unset bits in this bitmap. + pub fn count_zeros(&self) -> usize { + self.len() - self.count_ones() + } + + /// Returns true if any bit is set (short circuiting). + pub fn is_any_set(&self) -> bool { + self.buffer.iter().any(|&v| v != 0) + } + + /// Returns a value [`Iterator`] that yields boolean values encoded in the + /// bitmap. + pub fn iter(&self) -> Iter<'_> { + Iter::new(self) + } + + /// Returns the bitwise AND between the two [`BitSet`] instances. + /// + /// # Panics + /// + /// Panics if the two sets have differing lengths. + pub fn and(&self, other: &Self) -> Self { + assert_eq!(self.len, other.len); + + Self { + buffer: self + .buffer + .iter() + .zip(other.buffer.iter()) + .map(|(a, b)| a & b) + .collect(), + len: self.len, + } + } +} + +/// A value iterator yielding the boolean values encoded in the bitmap. +#[derive(Debug)] +pub struct Iter<'a> { + /// A reference to the bitmap buffer. + buffer: &'a [u8], + /// The index of the next yielded bit in `buffer`. + idx: usize, + /// The number of bits stored in buffer. 
+ len: usize, +} + +impl<'a> Iter<'a> { + fn new(b: &'a BitSet) -> Self { + Self { + buffer: &b.buffer, + idx: 0, + len: b.len(), + } + } +} + +impl<'a> Iterator for Iter<'a> { + type Item = bool; + + fn next(&mut self) -> Option<Self::Item> { + if self.idx >= self.len { + return None; + } + + let byte_idx = self.idx / 8; + let shift = self.idx % 8; + + self.idx += 1; + + let byte = self.buffer[byte_idx]; + let byte = byte >> shift; + + Some(byte & 1 == 1) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + let v = self.len - self.idx; + (v, Some(v)) + } } +impl<'a> ExactSizeIterator for Iter<'a> {} + /// Returns an iterator over set bit positions in increasing order pub fn iter_set_positions(bytes: &[u8]) -> impl Iterator<Item = usize> + '_ { iter_set_positions_with_offset(bytes, 0) @@ -240,17 +350,17 @@ pub fn iter_set_positions_with_offset( bytes: &[u8], offset: usize, ) -> impl Iterator<Item = usize> + '_ { - let mut byte_idx = offset >> 3; + let mut byte_idx = offset / 8; let mut in_progress = bytes.get(byte_idx).cloned().unwrap_or(0); - let skew = offset & 7; + let skew = offset % 8; in_progress &= 0xFF << skew; std::iter::from_fn(move || loop { if in_progress != 0 { let bit_pos = in_progress.trailing_zeros(); in_progress ^= 1 << bit_pos; - return Some((byte_idx << 3) + (bit_pos as usize)); + return Some((byte_idx * 8) + (bit_pos as usize)); } byte_idx += 1; in_progress = *bytes.get(byte_idx)?; @@ -259,11 +369,13 @@ pub fn iter_set_positions_with_offset( #[cfg(test)] mod tests { - use super::*; use arrow::array::BooleanBufferBuilder; + use proptest::prelude::*; use rand::prelude::*; use rand::rngs::OsRng; + use super::*; + /// Computes a compacted representation of a given bool array fn compact_bools(bools: &[bool]) -> Vec<u8> { bools @@ -284,9 +396,8 @@ mod tests { bools .iter() .enumerate() - // Filter out all y that are not true and then return only x - .filter(|&(_, y)| *y) - .map(|(x, _)| x) + .filter(|&(_x, y)| *y) + .map(|(x, _y)| x) } #[test] @@ -304,8 +415,11 @@ fn test_bit_mask() { let mut mask = BitSet::new(); + assert!(!mask.is_any_set()); + mask.append_bits(8, &[0b11111111]); let d1 = mask.buffer.clone(); + assert!(mask.is_any_set()); mask.append_bits(3, &[0b01010010]); let d2 = mask.buffer.clone(); @@ -522,9 +636,17 @@ fn test_all_set_unset() { for i in 1..100 { let mut v = BitSet::new(); + assert!(!v.is_any_set()); v.append_set(i); assert!(v.is_all_set()); assert!(!v.is_all_unset()); + assert!(v.is_any_set()); + + let mut v = BitSet::new(); + v.append_unset(i); + assert!(!v.is_any_set()); + v.append_set(1); + assert!(v.is_any_set()); } } @@ -590,4 +712,168 @@ mod tests { assert!(!v.is_all_set()); assert!(v.is_all_unset()); } + + #[test] + fn test_split_byte_boundary() { + let mut a = BitSet::new(); + + a.append_set(16); + a.append_unset(8); + a.append_set(8); + + let b = a.split_off(16); + + assert_eq!(a.len(), 16); + assert_eq!(b.len(), 16); + + // All the bits in A are set. + assert!(a.is_all_set()); + for i in 0..16 { + assert!(a.get(i)); + } + + // The first 8 bits in b are unset, and the next 8 bits are set. + for i in 0..8 { + assert!(!b.get(i)); + } + for i in 8..16 { + assert!(b.get(i)); + } + } + + #[test] + fn test_split_sub_byte_boundary() { + let mut a = BitSet::new(); + + a.append_set(3); + a.append_unset(3); + a.append_set(1); + + assert_eq!(a.bytes(), &[0b01000111]); + + let b = a.split_off(5); + + assert_eq!(a.len(), 5); + assert_eq!(b.len(), 2); + + // A contains 3 set bits & 2 unset bits, with the rest masked out.
+ assert_eq!(a.bytes(), &[0b00000111]); + + // B contains 1 unset bit, and then 1 set bit + assert_eq!(b.bytes(), &[0b0000010]); + } + + #[test] + fn test_split_multi_byte_unclean_boundary() { + let mut a = BitSet::new(); + + a.append_set(8); + a.append_unset(1); + a.append_set(1); + a.append_unset(1); + a.append_set(1); + + assert_eq!(a.bytes(), &[0b11111111, 0b00001010]); + + let b = a.split_off(10); + + assert_eq!(a.len(), 10); + assert_eq!(b.len(), 2); + + assert_eq!(a.bytes(), &[0b11111111, 0b00000010]); + assert_eq!(b.bytes(), &[0b0000010]); + } + + #[test] + fn test_count_ones_with_truncate() { + // For varying sizes of bitmaps. + for i in 1..150 { + let mut b = BitSet::new(); + + // Set "i" number of bits in 2*i values. + for _ in 0..i { + b.append_unset(1); + b.append_set(1); + } + + assert_eq!(b.len(), 2 * i); + assert_eq!(b.count_ones(), i); + assert_eq!(b.count_zeros(), i); + + // Split it such that the last bit is removed. + let other = b.split_off((2 * i) - 1); + assert_eq!(other.len(), 1); + assert_eq!(other.count_ones(), 1); + assert_eq!(other.count_zeros(), 0); + + // Which means the original bitmap must now have 1 less 1 bit. + assert_eq!(b.len(), (2 * i) - 1); + assert_eq!(b.count_ones(), i - 1); + assert_eq!(b.count_zeros(), i); + } + } + + prop_compose! { + /// Returns a [`BitSet`] of random length and content. + fn arbitrary_bitset()( + values in prop::collection::vec(any::<bool>(), 0..20) + ) -> BitSet { + let mut b = BitSet::new(); + + for v in &values { + match v { + true => b.append_set(1), + false => b.append_unset(1), + } + } + + b + } + } + + proptest! { + #[test] + fn prop_iter( + values in prop::collection::vec(any::<bool>(), 0..20), + ) { + let mut b = BitSet::new(); + + for v in &values { + match v { + true => b.append_set(1), + false => b.append_unset(1), + } + } + + assert_eq!(values.len(), b.len()); + + let got = b.iter().collect::<Vec<_>>(); + assert_eq!(values, got); + + // Exact size iter + assert_eq!(b.iter().len(), values.len()); + } + + #[test] + fn prop_and( + mut a in arbitrary_bitset(), + mut b in arbitrary_bitset(), + ) { + let min_len = a.len().min(b.len()); + // Truncate a and b to the same length. + a.truncate(min_len); + b.truncate(min_len); + + let want = a + .iter() + .zip(b.iter()) + .map(|(a, b)| a & b) + .collect::<Vec<_>>(); + + let c = a.and(&b); + let got = c.iter().collect::<Vec<_>>(); + + assert_eq!(got, want); + } + } } diff --git a/arrow_util/src/string.rs b/arrow_util/src/string.rs index fe3dcc225a7..5460a38b7ff 100644 --- a/arrow_util/src/string.rs +++ b/arrow_util/src/string.rs @@ -154,6 +154,37 @@ impl<K: AsPrimitive<usize> + FromPrimitive + Zero> PackedStringArray<K> { pub fn into_inner(self) -> (Vec<K>, String) { (self.offsets, self.storage) } + + /// Split this [`PackedStringArray`] at `n`, such that `self` contains the + /// elements `[0, n)` and the returned [`PackedStringArray`] contains + /// elements `[n, len)`. + pub fn split_off(&mut self, n: usize) -> Self { + if n > self.len() { + return Default::default(); + } + + let offsets = self.offsets.split_off(n + 1); + + // Figure out where to split the string storage. + let split_point = self.offsets.last().map(|v| v.as_()).unwrap(); + + // Split the storage at the split point, such that the first N values + // appear in self. + let storage = self.storage.split_off(split_point); + + // The new "offsets" now needs remapping such that the first offset + // starts at 0, so that indexing into the new storage string will hit + // the right start point.
+ let offsets = std::iter::once(K::zero()) + .chain( + offsets + .into_iter() + .map(|v| K::from_usize(v.as_() - split_point).unwrap()), + ) + .collect::>(); + + Self { offsets, storage } + } } impl PackedStringArray { @@ -201,6 +232,8 @@ impl<'a, K: AsPrimitive + FromPrimitive + Zero> Iterator for PackedString mod tests { use crate::string::PackedStringArray; + use proptest::prelude::*; + #[test] fn test_storage() { let mut array = PackedStringArray::::new(); @@ -316,4 +349,36 @@ mod tests { vec!["hello", "world", "cupcake", "", "bar", "", "foo", "bar", "", "fiz"] ); } + + proptest! { + #[test] + fn prop_split_off( + a in prop::collection::vec(any::(), 0..20), + b in prop::collection::vec(any::(), 0..20), + ) { + let mut p = PackedStringArray::::new(); + + // Add all the elements in "a" and "b" to the string array. + for v in a.iter().chain(b.iter()) { + p.append(v); + } + + // Split the packed string array at the boundary of "a". + let p2 = p.split_off(a.len()); + + assert_eq!(p.iter().collect::>(), a, "parent"); + assert_eq!(p2.iter().collect::>(), b, "child"); + } + } + + #[test] + fn test_split_off_oob() { + let mut p = PackedStringArray::::new(); + + p.append("bananas"); + + let got = p.split_off(42); + assert_eq!(p.len(), 1); + assert_eq!(got.len(), 0); + } } diff --git a/arrow_util/src/test_util.rs b/arrow_util/src/test_util.rs index 17e80f88c46..8126e251787 100644 --- a/arrow_util/src/test_util.rs +++ b/arrow_util/src/test_util.rs @@ -240,7 +240,7 @@ static REGEX_FILTER: Lazy = Lazy::new(|| { /// Matches things like `time@3 < -9223372036854775808` and `time_min@2 > 1641031200399937022` static REGEX_TIME_OP: Lazy = Lazy::new(|| { - Regex::new("(?Ptime((_min)|(_max))?@[0-9]+ [<>=]=? )(?P-?[0-9]+)") + Regex::new("(?Ptime((_min)|(_max))?@[0-9]+ [<>=]=? (CAST\\()?)(?P-?[0-9]+)(?P AS Timestamp\\(Nanosecond, \"[^\"]\"\\)\\))?") .expect("time opt regex") }); @@ -258,7 +258,8 @@ fn normalize_time_ops(s: &str) -> String { REGEX_TIME_OP .replace_all(s, |c: &Captures<'_>| { let prefix = c.name("prefix").expect("always captures").as_str(); - format!("{prefix}") + let suffix = c.name("suffix").map_or("", |m| m.as_str()); + format!("{prefix}{suffix}") }) .to_string() } diff --git a/authz/Cargo.toml b/authz/Cargo.toml index 06b0a68a5b5..9fc5ed9a961 100644 --- a/authz/Cargo.toml +++ b/authz/Cargo.toml @@ -6,11 +6,12 @@ authors.workspace = true edition.workspace = true license.workspace = true -# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[lints] +workspace = true [dependencies] backoff = { path = "../backoff" } -http = {version = "0.2.9", optional = true } +http = {version = "0.2.11", optional = true } iox_time = { version = "0.1.0", path = "../iox_time" } generated_types = { path = "../generated_types" } metric = { version = "0.1.0", path = "../metric" } @@ -19,8 +20,8 @@ workspace-hack = { version = "0.1", path = "../workspace-hack" } # crates.io dependencies in alphabetical order. 
async-trait = "0.1" -base64 = "0.21.4" -snafu = "0.7" +base64 = "0.21.7" +snafu = "0.8" tonic = { workspace = true } [dev-dependencies] @@ -28,7 +29,7 @@ assert_matches = "1.5.0" parking_lot = "0.12.1" paste = "1.0.14" test_helpers_end_to_end = { path = "../test_helpers_end_to_end" } -tokio = "1.32.0" +tokio = "1.35.1" [features] http = ["dep:http"] diff --git a/authz/src/permission.rs b/authz/src/permission.rs index 1836e655cd7..9ffced0e4b1 100644 --- a/authz/src/permission.rs +++ b/authz/src/permission.rs @@ -75,13 +75,13 @@ impl TryFrom for Permission { match value.permission_one_of { Some(proto::permission::PermissionOneOf::ResourceAction(ra)) => { let r = Resource::try_from_proto( - proto::resource_action_permission::ResourceType::from_i32(ra.resource_type) - .ok_or(IncompatiblePermissionError {})?, + proto::resource_action_permission::ResourceType::try_from(ra.resource_type) + .map_err(|_| IncompatiblePermissionError {})?, ra.resource_id, )?; let a = Action::try_from( - proto::resource_action_permission::Action::from_i32(ra.action) - .ok_or(IncompatiblePermissionError {})?, + proto::resource_action_permission::Action::try_from(ra.action) + .map_err(|_| IncompatiblePermissionError {})?, )?; Ok(Self::ResourceAction(r, a)) } diff --git a/backoff/Cargo.toml b/backoff/Cargo.toml index 1bd3cb34351..484412fb187 100644 --- a/backoff/Cargo.toml +++ b/backoff/Cargo.toml @@ -5,9 +5,12 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] -tokio = { version = "1.32", features = ["macros", "time"] } +tokio = { version = "1.35", features = ["macros", "time"] } observability_deps = { path = "../observability_deps" } rand = "0.8" -snafu = "0.7" +snafu = "0.8" workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/backoff/src/lib.rs b/backoff/src/lib.rs index e1264840265..907847b380f 100644 --- a/backoff/src/lib.rs +++ b/backoff/src/lib.rs @@ -163,6 +163,7 @@ impl Backoff { F1: std::future::Future> + Send, E: std::error::Error + Send + 'static, { + let mut fail_count = 0_usize; loop { // first execute `F` and then use it, so we can avoid `F: Sync`. 
let do_stuff = do_stuff(); @@ -182,10 +183,13 @@ impl Backoff { } }; + fail_count += 1; + warn!( error=%e, task_name, backoff_secs = backoff.as_secs(), + fail_count, "request encountered non-fatal error - backing off", ); tokio::time::sleep(backoff).await; diff --git a/cache_system/Cargo.toml b/cache_system/Cargo.toml index d17d0ed6272..bb07eba7480 100644 --- a/cache_system/Cargo.toml +++ b/cache_system/Cargo.toml @@ -5,8 +5,11 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] -async-trait = "0.1.73" +async-trait = "0.1.77" backoff = { path = "../backoff" } futures = "0.3" iox_time = { path = "../iox_time" } @@ -16,9 +19,10 @@ ouroboros = "0.18" parking_lot = { version = "0.12", features = ["arc_lock"] } pdatastructs = { version = "0.7", default-features = false, features = ["fixedbitset"] } rand = "0.8.3" -tokio = { version = "1.32", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } -tokio-util = { version = "0.7.9" } +tokio = { version = "1.35", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } +tokio-util = { version = "0.7.10" } trace = { path = "../trace"} +tracker = { path = "../tracker"} workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] diff --git a/cache_system/src/addressable_heap.rs b/cache_system/src/addressable_heap.rs index 5ce84b6e41c..4f3466fe3d3 100644 --- a/cache_system/src/addressable_heap.rs +++ b/cache_system/src/addressable_heap.rs @@ -171,6 +171,7 @@ fn project_tuple(t: &(A, B)) -> (&A, &B) { } /// Iterator of [`AddressableHeap::iter`]. +#[derive(Debug)] pub struct AddressableHeapIter<'a, K, V, O> where K: Clone + Eq + Hash + Ord, @@ -477,13 +478,11 @@ mod tests { res } + #[allow(clippy::map_identity)] // https://github.com/rust-lang/rust-clippy/issues/11764 fn peek(&self) -> Option<(&u8, &String, &i8)> { - #[allow(clippy::map_identity)] self.inner .iter() .min_by_key(|(k, _v, o)| (o, k)) - // This is a false positive as this actually changes - // Option<&(u8, String, i8)> -> Option<(&u8, &String, &i8)> .map(|(k, v, o)| (k, v, o)) } diff --git a/cache_system/src/backend/mod.rs b/cache_system/src/backend/mod.rs index 09855233e34..8395c830d63 100644 --- a/cache_system/src/backend/mod.rs +++ b/cache_system/src/backend/mod.rs @@ -10,7 +10,7 @@ mod test_util; /// Backend to keep and manage stored entries. /// /// A backend might remove entries at any point, e.g. due to memory pressure or expiration. -pub trait CacheBackend: Debug + Send + 'static { +pub trait CacheBackend: Debug { /// Cache key. 
type K: Clone + Eq + Hash + Ord + Debug + Send + 'static; @@ -37,13 +37,12 @@ pub trait CacheBackend: Debug + Send + 'static { fn as_any(&self) -> &dyn Any; } -impl<K, V> CacheBackend for Box<dyn CacheBackend<K = K, V = V>> +impl<T> CacheBackend for Box<T> where - K: Clone + Eq + Hash + Ord + Debug + Send + 'static, - V: Clone + Debug + Send + 'static, + T: CacheBackend + ?Sized + 'static, { - type K = K; - type V = V; + type K = T::K; + type V = T::V; fn get(&mut self, k: &Self::K) -> Option<Self::V> { self.as_mut().get(k) diff --git a/cache_system/src/backend/policy/mod.rs b/cache_system/src/backend/policy/mod.rs index d525d284394..c503c2a849a 100644 --- a/cache_system/src/backend/policy/mod.rs +++ b/cache_system/src/backend/policy/mod.rs @@ -11,7 +11,7 @@ use std::{ }; use iox_time::{Time, TimeProvider}; -use parking_lot::{lock_api::ArcMutexGuard, Mutex, RawMutex, ReentrantMutex}; +use parking_lot::{lock_api::ArcReentrantMutexGuard, RawMutex, RawThreadId, ReentrantMutex}; use super::CacheBackend; @@ -274,14 +274,14 @@ where /// /// Panics if `inner` is not empty. pub fn new( - inner: Box<dyn CacheBackend<K = K, V = V>>, + inner: Box<dyn CacheBackend<K = K, V = V> + Send>, time_provider: Arc<dyn TimeProvider>, ) -> Self { assert!(inner.is_empty(), "inner backend is not empty"); Self { inner: Arc::new(ReentrantMutex::new(RefCell::new(PolicyBackendInner { - inner: Arc::new(Mutex::new(inner)), + inner, subscribers: Vec::new(), time_provider, }))), @@ -324,11 +324,8 @@ where pub fn inner_ref(&mut self) -> InnerBackendRef<'_, K, V> { // NOTE: We deliberately use a mutable reference here to prevent users from using `` while we hold a lock to the underlying backend. - lock_inner!(guard = self.inner); - InnerBackendRef { - inner: guard.inner.lock_arc(), - _phantom: PhantomData, - } + + inner_ref::build(Arc::clone(&self.inner)) } } @@ -345,8 +342,7 @@ where perform_changes(&mut guard, vec![ChangeRequest::get(k.clone())]); // poll inner backend AFTER everything has settled - let mut inner = guard.inner.lock(); - inner.get(k) + guard.inner.get(k) } fn set(&mut self, k: Self::K, v: Self::V) { @@ -361,8 +357,7 @@ where fn is_empty(&self) -> bool { lock_inner!(guard = self.inner); - let inner = guard.inner.lock(); - inner.is_empty() + guard.inner.is_empty() } fn as_any(&self) -> &dyn std::any::Any { @@ -410,11 +405,7 @@ where V: Clone + Debug + Send + 'static, { /// Underlying cache backend. - /// - /// This is wrapped into another `Arc<Mutex<...>>` construct even though [`PolicyBackendInner`] - /// is already guarded by a lock because we need to reference the underlying backend from - /// [`Recorder`], and [`Recorder`] implements [`CacheBackend`] which is `'static`. - inner: Arc<Mutex<Box<dyn CacheBackend<K = K, V = V>>>>, + inner: Box<dyn CacheBackend<K = K, V = V> + Send>, /// List of subscribers. subscribers: Vec<Arc<dyn Subscriber<K = K, V = V>>>, @@ -439,7 +430,7 @@ fn perform_changes<K, V>( while let Some(change_request) = tasks.pop_front() { let mut recorder = Recorder { - inner: Arc::clone(&inner.inner), + inner: inner.inner.as_mut(), records: vec![], }; @@ -542,7 +533,7 @@ where /// patterns work out of the box without the need to fear interleaving modifications. pub fn from_fn<F>(f: F) -> Self where - F: for<'b> FnOnce(&'b mut Recorder<K, V>) + 'a, + F: for<'b, 'c> FnOnce(&'c mut Recorder<'b, K, V>) + 'a, { Self { fun: Box::new(f) } } @@ -578,13 +569,13 @@ where } /// Execute this change request. - pub fn eval(self, backend: &mut Recorder<K, V>) { - (self.fun)(backend) + pub fn eval(self, backend: &mut Recorder<'_, K, V>) { + (self.fun)(backend); } } /// Function captured within [`ChangeRequest`].
-type ChangeRequestFn<'a, K, V> = Box<dyn for<'b> FnOnce(&'b mut Recorder<K, V>) + 'a>; +type ChangeRequestFn<'a, K, V> = Box<dyn for<'b, 'c> FnOnce(&'c mut Recorder<'b, K, V>) + 'a>; /// Records of interactions with the callback [`CacheBackend`]. #[derive(Debug, PartialEq)] @@ -614,16 +605,16 @@ enum Record { /// Specialized [`CacheBackend`] that forwards changes and requests to the underlying backend of /// [`PolicyBackend`] but also records all changes into [`Record`]s. #[derive(Debug)] -pub struct Recorder<K, V> +pub struct Recorder<'a, K, V> where K: Clone + Eq + Hash + Ord + Debug + Send + 'static, V: Clone + Debug + Send + 'static, { - inner: Arc<Mutex<Box<dyn CacheBackend<K = K, V = V>>>>, + inner: &'a mut (dyn CacheBackend<K = K, V = V> + Send), records: Vec<Record<K, V>>, } -impl<K, V> Recorder<K, V> +impl<'a, K, V> Recorder<'a, K, V> where K: Clone + Eq + Hash + Ord + Debug + Send + 'static, V: Clone + Debug + Send + 'static, @@ -637,11 +628,11 @@ where /// modifying requests like [`SET`](CacheBackend::set) or [`REMOVE`](CacheBackend::remove) /// since they always require policies to be in-sync. pub fn get_untracked(&mut self, k: &K) -> Option<V> { - self.inner.lock().get(k) + self.inner.get(k) } } -impl<K, V> CacheBackend for Recorder<K, V> +impl<'a, K, V> CacheBackend for Recorder<'a, K, V> where K: Clone + Eq + Hash + Ord + Debug + Send + 'static, V: Clone + Debug + Send + 'static, @@ -651,7 +642,7 @@ where fn get(&mut self, k: &Self::K) -> Option<Self::V> { self.records.push(Record::Get { k: k.clone() }); - self.inner.lock().get(k) + self.inner.get(k) } fn set(&mut self, k: Self::K, v: Self::V) { @@ -659,64 +650,75 @@ where k: k.clone(), v: v.clone(), }); - self.inner.lock().set(k, v); + self.inner.set(k, v); } fn remove(&mut self, k: &Self::K) { self.records.push(Record::Remove { k: k.clone() }); - self.inner.lock().remove(k); + self.inner.remove(k); } fn is_empty(&self) -> bool { - self.inner.lock().is_empty() + self.inner.is_empty() } fn as_any(&self) -> &dyn std::any::Any { - self + panic!("don't any-cast the recorder please") } } -/// Read-only ref to the inner backend of [`PolicyBackend`] for debugging. -pub struct InnerBackendRef<'a, K, V> -where - K: Clone + Eq + Hash + Ord + Debug + Send + 'static, - V: Clone + Debug + Send + 'static, -{ - inner: ArcMutexGuard<RawMutex, Box<dyn CacheBackend<K = K, V = V>>>, - _phantom: PhantomData<&'a mut ()>, -} +/// Helper module that wraps the implementation of [`InnerBackendRef`]. +/// +/// This is required because [`ouroboros`] generates a bunch of code that we do not want to leak all over the place. +mod inner_ref { + #![allow(non_snake_case, clippy::future_not_send)] -// Workaround for . -impl<'a, K, V> Drop for InnerBackendRef<'a, K, V> -where - K: Clone + Eq + Hash + Ord + Debug + Send + 'static, - V: Clone + Debug + Send + 'static, -{ - fn drop(&mut self) {} -} + use super::*; + use ouroboros::self_referencing; -impl<'a, K, V> Debug for InnerBackendRef<'a, K, V> -where - K: Clone + Eq + Hash + Ord + Debug + Send + 'static, - V: Clone + Debug + Send + 'static, -{ - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("InnerBackendRef").finish_non_exhaustive() + /// Read-only ref to the inner backend of [`PolicyBackend`] for debugging.
+ #[self_referencing] + pub struct InnerBackendRef<'a, K, V> + where + K: Clone + Eq + Hash + Ord + Debug + Send + 'static, + V: Clone + Debug + Send + 'static, + { + l1: ArcReentrantMutexGuard>>, + #[borrows(l1)] + #[covariant] + l2: std::cell::RefMut<'this, PolicyBackendInner>, + _phantom: PhantomData<&'a mut ()>, } -} -impl<'a, K, V> Deref for InnerBackendRef<'a, K, V> -where - K: Clone + Eq + Hash + Ord + Debug + Send + 'static, - V: Clone + Debug + Send + 'static, -{ - type Target = dyn CacheBackend; + impl<'a, K, V> Deref for InnerBackendRef<'a, K, V> + where + K: Clone + Eq + Hash + Ord + Debug + Send + 'static, + V: Clone + Debug + Send + 'static, + { + type Target = dyn CacheBackend; + + fn deref(&self) -> &Self::Target { + self.borrow_l2().inner.as_ref() + } + } - fn deref(&self) -> &Self::Target { - self.inner.as_ref() + pub(super) fn build<'a, K, V>(inner: StrongSharedInner) -> InnerBackendRef<'a, K, V> + where + K: Clone + Eq + Hash + Ord + Debug + Send + 'static, + V: Clone + Debug + Send + 'static, + { + let inner = inner.lock_arc(); + InnerBackendRefBuilder { + l1: inner, + l2_builder: |l1| l1.borrow_mut(), + _phantom: PhantomData, + } + .build() } } +pub use inner_ref::InnerBackendRef; + #[cfg(test)] mod tests { use std::{collections::HashMap, sync::Barrier, thread::JoinHandle}; @@ -1923,7 +1925,7 @@ mod tests { /// Same as [`ChangeRequestFn`] but implements `Send`. type SendableChangeRequestFn = - Box FnOnce(&'a mut Recorder) + Send + 'static>; + Box FnOnce(&'b mut Recorder<'a, String, usize>) + Send + 'static>; /// Same as [`ChangeRequest`] but implements `Send`. struct SendableChangeRequest { @@ -1940,7 +1942,7 @@ mod tests { impl SendableChangeRequest { fn from_fn(f: F) -> Self where - F: for<'b> FnOnce(&'b mut Recorder) + Send + 'static, + F: for<'b, 'c> FnOnce(&'c mut Recorder<'b, String, usize>) + Send + 'static, { Self { fun: Box::new(f) } } diff --git a/cache_system/src/cache/driver.rs b/cache_system/src/cache/driver.rs index 2b5e17ba06f..c0c9773b677 100644 --- a/cache_system/src/cache/driver.rs +++ b/cache_system/src/cache/driver.rs @@ -12,8 +12,8 @@ use futures::{ FutureExt, TryFutureExt, }; use observability_deps::tracing::debug; -use parking_lot::Mutex; use std::{collections::HashMap, fmt::Debug, future::Future, sync::Arc}; +use tracker::{LockMetrics, Mutex}; use super::{Cache, CacheGetStatus, CachePeekStatus}; @@ -21,7 +21,7 @@ use super::{Cache, CacheGetStatus, CachePeekStatus}; #[derive(Debug)] pub struct CacheDriver where - B: CacheBackend, + B: CacheBackend + Send + 'static, L: Loader, { state: Arc>>, @@ -30,13 +30,18 @@ where impl CacheDriver where - B: CacheBackend, + B: CacheBackend + Send + 'static, L: Loader, { /// Create new, empty cache with given loader function. 
- pub fn new(loader: Arc, backend: B) -> Self { + pub fn new(loader: Arc, backend: B, metrics: &metric::Registry, name: &'static str) -> Self { + let metrics = Arc::new(LockMetrics::new( + metrics, + &[("what", "cache_driver_state"), ("cache", name)], + )); + Self { - state: Arc::new(Mutex::new(CacheState { + state: Arc::new(metrics.new_mutex(CacheState { cached_entries: backend, running_queries: HashMap::new(), tag_counter: 0, @@ -140,7 +145,7 @@ where #[async_trait] impl Cache for CacheDriver where - B: CacheBackend, + B: CacheBackend + Send, L: Loader, { type K = B::K; @@ -257,7 +262,7 @@ where impl Drop for CacheDriver where - B: CacheBackend, + B: CacheBackend + Send, L: Loader, { fn drop(&mut self) { @@ -430,7 +435,12 @@ mod tests { type Cache = CacheDriver, TestLoader>; fn construct(&self, loader: Arc) -> Arc { - Arc::new(CacheDriver::new(Arc::clone(&loader) as _, HashMap::new())) + Arc::new(CacheDriver::new( + Arc::clone(&loader) as _, + HashMap::new(), + &metric::Registry::default(), + "test", + )) } fn get_extra(&self, inner: bool) -> Self::GetExtra { diff --git a/cache_system/src/cache/metrics.rs b/cache_system/src/cache/metrics.rs index 7ebad842a16..c72364aef43 100644 --- a/cache_system/src/cache/metrics.rs +++ b/cache_system/src/cache/metrics.rs @@ -645,7 +645,12 @@ mod tests { } fn new_with_loader(loader: Arc) -> Self { - let inner = CacheDriver::new(Arc::clone(&loader) as _, HashMap::new()); + let inner = CacheDriver::new( + Arc::clone(&loader) as _, + HashMap::new(), + &metric::Registry::default(), + "test", + ); let time_provider = Arc::new(MockProvider::new(Time::from_timestamp_millis(0).unwrap())); let metric_registry = metric::Registry::new(); diff --git a/cache_system/src/lib.rs b/cache_system/src/lib.rs index f20f1a9c8b3..68e60ae3e2c 100644 --- a/cache_system/src/lib.rs +++ b/cache_system/src/lib.rs @@ -12,6 +12,7 @@ clippy::dbg_macro, unused_crate_dependencies )] +#![allow(unreachable_pub)] // Workaround for "unused crate" lint false positives. 
#[cfg(test)] diff --git a/cache_system/src/loader/batch.rs b/cache_system/src/loader/batch.rs index 4d30196a0dc..36ab123929b 100644 --- a/cache_system/src/loader/batch.rs +++ b/cache_system/src/loader/batch.rs @@ -464,7 +464,12 @@ mod tests { #[tokio::test] async fn test_auto_flush_integration_with_cache_driver() { let (inner, batch) = setup(); - let cache = CacheDriver::new(Arc::clone(&batch), HashMap::new()); + let cache = CacheDriver::new( + Arc::clone(&batch), + HashMap::new(), + &metric::Registry::default(), + "test", + ); inner.mock_next(vec![1, 2], vec![String::from("foo"), String::from("bar")]); inner.mock_next(vec![3], vec![String::from("baz")]); diff --git a/catalog_cache/Cargo.toml b/catalog_cache/Cargo.toml new file mode 100644 index 00000000000..cdb79c5347b --- /dev/null +++ b/catalog_cache/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "catalog_cache" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] +bytes = "1.5" +dashmap = "5.5" +futures = "0.3" +hyper = "0.14" +url = "2.5" +reqwest = { version = "0.11", default-features = false } +snafu = "0.8" +tokio = { version = "1.35", default-features = false, features = ["macros", "rt"] } +tokio-util = "0.7" +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] diff --git a/catalog_cache/src/api/client.rs b/catalog_cache/src/api/client.rs new file mode 100644 index 00000000000..94e9bf9b47c --- /dev/null +++ b/catalog_cache/src/api/client.rs @@ -0,0 +1,176 @@ +//! Client for the cache HTTP API + +use crate::api::list::{ListDecoder, ListEntry, MAX_VALUE_SIZE}; +use crate::api::{RequestPath, GENERATION}; +use crate::{CacheKey, CacheValue}; +use bytes::{Buf, Bytes}; +use futures::prelude::*; +use futures::stream::BoxStream; +use reqwest::{Client, Response, StatusCode, Url}; +use snafu::{OptionExt, ResultExt, Snafu}; +use std::time::Duration; + +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(display("Creating client: {source}"))] + Client { source: reqwest::Error }, + + #[snafu(display("Put Reqwest error: {source}"))] + Put { source: reqwest::Error }, + + #[snafu(display("Get Reqwest error: {source}"))] + Get { source: reqwest::Error }, + + #[snafu(display("List Reqwest error: {source}"))] + List { source: reqwest::Error }, + + #[snafu(display("Health Reqwest error: {source}"))] + Health { source: reqwest::Error }, + + #[snafu(display("Missing generation header"))] + MissingGeneration, + + #[snafu(display("Invalid generation value"))] + InvalidGeneration, + + #[snafu(display("Error decoding list stream: {source}"), context(false))] + ListStream { source: crate::api::list::Error }, +} + +/// Result type for [`CatalogCacheClient`] +pub type Result<T, E = Error> = std::result::Result<T, E>; + +/// The type returned by [`CatalogCacheClient::list`] +pub type ListStream = BoxStream<'static, Result<ListEntry>>; + +const RESOURCE_REQUEST_TIMEOUT: Duration = Duration::from_secs(1); + +/// We use a longer timeout for list request as they may transfer a non-trivial amount of data +const LIST_REQUEST_TIMEOUT: Duration = Duration::from_secs(20); + +/// A client for accessing a remote catalog cache +#[derive(Debug)] +pub struct CatalogCacheClient { + client: Client, + endpoint: Url, +} + +impl CatalogCacheClient { + /// Create a new [`CatalogCacheClient`] with the given remote endpoint + pub fn try_new(endpoint: Url) -> Result<Self> { + let client = Client::builder() + .connect_timeout(Duration::from_secs(2)) + .build() +
.context(ClientSnafu)?; + + Ok(Self { endpoint, client }) + } + + /// Retrieve the given value from the remote cache, if present + pub async fn get(&self, key: CacheKey) -> Result<Option<CacheValue>> { + let url = format!("{}{}", self.endpoint, RequestPath::Resource(key)); + let timeout = RESOURCE_REQUEST_TIMEOUT; + let req = self.client.get(url).timeout(timeout); + let resp = req.send().await.context(GetSnafu)?; + + if resp.status() == StatusCode::NOT_FOUND { + return Ok(None); + } + let resp = resp.error_for_status().context(GetSnafu)?; + + let generation = resp + .headers() + .get(&GENERATION) + .context(MissingGenerationSnafu)?; + + let generation = generation + .to_str() + .ok() + .and_then(|v| v.parse().ok()) + .context(InvalidGenerationSnafu)?; + + let data = resp.bytes().await.context(GetSnafu)?; + + Ok(Some(CacheValue::new(data, generation))) + } + + /// Upsert the given key-value pair to the remote cache + /// + /// Returns false if the value had a generation less than or equal to + /// an existing value + pub async fn put(&self, key: CacheKey, value: &CacheValue) -> Result<bool> { + let url = format!("{}{}", self.endpoint, RequestPath::Resource(key)); + + let response = self + .client + .put(url) + .timeout(RESOURCE_REQUEST_TIMEOUT) + .header(&GENERATION, value.generation) + .body(value.data.clone()) + .send() + .await + .context(PutSnafu)? + .error_for_status() + .context(PutSnafu)?; + + Ok(matches!(response.status(), StatusCode::OK)) + } + + /// List the contents of the remote cache + /// + /// Values larger than `max_value_size` will not be returned inline, with only the key + /// and generation returned instead. Defaults to [`MAX_VALUE_SIZE`] + pub fn list(&self, max_value_size: Option<usize>) -> ListStream { + let size = max_value_size.unwrap_or(MAX_VALUE_SIZE); + let url = format!("{}{}?size={size}", self.endpoint, RequestPath::List); + let fut = self.client.get(url).timeout(LIST_REQUEST_TIMEOUT).send(); + + futures::stream::once(fut.map_err(|source| Error::List { source })) + .and_then(move |response| futures::future::ready(list_stream(response, size))) + .try_flatten() + .boxed() + } +} + +struct ListStreamState { + response: Response, + current: Bytes, + decoder: ListDecoder, +} + +impl ListStreamState { + fn new(response: Response, max_value_size: usize) -> Self { + Self { + response, + current: Default::default(), + decoder: ListDecoder::new().with_max_value_size(max_value_size), + } + } +} + +fn list_stream( + response: Response, + max_value_size: usize, +) -> Result<impl Stream<Item = Result<ListEntry>>> { + let resp = response.error_for_status().context(ListSnafu)?; + let state = ListStreamState::new(resp, max_value_size); + Ok(stream::try_unfold(state, |mut state| async move { loop { if state.current.is_empty() { + match state.response.chunk().await.context(ListSnafu)? { + Some(new) => state.current = new, + None => break, + } + } + + let to_read = state.current.len(); + let read = state.decoder.decode(&state.current)?; + state.current.advance(read); + if read != to_read { + break; + } + } + Ok(state.decoder.flush()?.map(|entry| (entry, state))) + })) +} diff --git a/catalog_cache/src/api/list.rs b/catalog_cache/src/api/list.rs new file mode 100644 index 00000000000..155f7949196 --- /dev/null +++ b/catalog_cache/src/api/list.rs @@ -0,0 +1,467 @@ +//! The encoding mechanism for list streams +//! +//! This is capable of streaming both keys and values, this saves round-trips when hydrating +//!
a cache from a remote, and avoids creating a flood of HTTP GET requests + +use bytes::Bytes; +use snafu::{ensure, Snafu}; + +use crate::{CacheKey, CacheValue}; + +/// Error type for list streams +#[derive(Debug, Snafu)] +#[allow(missing_copy_implementations, missing_docs)] +pub enum Error { + #[snafu(display("Unexpected EOF whilst decoding list stream"))] + UnexpectedEOF, + + #[snafu(display("List value of {size} bytes too large"))] + ValueTooLarge { size: usize }, +} + +/// Result type for list streams +pub type Result<T, E = Error> = std::result::Result<T, E>; + +/// The size at which to flush [`Bytes`] from [`ListEncoder`] +pub const FLUSH_SIZE: usize = 1024 * 1024; // Flush in 1MB blocks + +/// The maximum value size to send in a [`ListEntry`] +/// +/// This primarily exists as a self-protection limit to prevent large or corrupted streams +/// from swamping the client, but also mitigates Head-Of-Line blocking resulting from +/// large cache values +pub const MAX_VALUE_SIZE: usize = 1024 * 10; + +/// Encodes [`ListEntry`] as an iterator of [`Bytes`] +/// +/// Each [`ListEntry`] is encoded as a `ListHeader`, followed by the value data +#[derive(Debug)] +pub struct ListEncoder { + /// The current offset into entries + offset: usize, + /// The ListEntry to encode + entries: Vec<ListEntry>, + /// The flush size, made configurable for testing + flush_size: usize, + /// The maximum value size to write + max_value_size: usize, +} + +impl ListEncoder { + /// Create a new [`ListEncoder`] from the provided [`ListEntry`] + pub fn new(entries: Vec<ListEntry>) -> Self { + Self { + entries, + offset: 0, + flush_size: FLUSH_SIZE, + max_value_size: MAX_VALUE_SIZE, + } + } + + /// Override the maximum value size to write + pub fn with_max_value_size(mut self, size: usize) -> Self { + self.max_value_size = size; + self + } +} + +impl Iterator for ListEncoder { + type Item = Bytes; + + fn next(&mut self) -> Option<Self::Item> { + if self.offset == self.entries.len() { + return None; + } + + let mut cap = 0; + let mut end_offset = self.offset; + while end_offset < self.entries.len() && cap < self.flush_size { + match &self.entries[end_offset].data { + Some(d) if d.len() <= self.max_value_size => cap += ListHeader::SIZE + d.len(), + _ => cap += ListHeader::SIZE, + }; + end_offset += 1; + } + + let mut buf = Vec::with_capacity(cap); + for entry in self.entries.iter().take(end_offset).skip(self.offset) { + match &entry.data { + Some(d) if d.len() <= self.max_value_size => { + buf.extend_from_slice(&entry.header(false).encode()); + buf.extend_from_slice(d) + } + _ => buf.extend_from_slice(&entry.header(true).encode()), + } + } + self.offset = end_offset; + Some(buf.into()) + } +} + +#[allow(non_snake_case)] +mod Flags { + /// The value is not included in this response + /// + /// [`ListEncoder`](super::ListEncoder) only sends inline values for values smaller than a + /// configured threshold + pub(crate) const HEAD: u8 = 1; +} + +/// The header encoded in a list stream +#[derive(Debug)] +struct ListHeader { + /// The size of the value + size: u32, + /// Reserved for future usage + reserved: u16, + /// A bitmask of [`Flags`] + flags: u8, + /// The variant of [`CacheKey`] + variant: u8, + /// The generation of this value + generation: u64, + /// The key contents of [`CacheKey`] + key: u128, +} + +impl ListHeader { + /// The encoded size of [`ListHeader`] + const SIZE: usize = 32; + + /// Encodes [`ListHeader`] to an array + fn encode(&self) -> [u8; Self::SIZE] { + let mut out = [0; Self::SIZE]; + out[..4].copy_from_slice(&self.size.to_le_bytes());
out[4..6].copy_from_slice(&self.reserved.to_le_bytes()); + out[6] = self.flags; + out[7] = self.variant; + out[8..16].copy_from_slice(&self.generation.to_le_bytes()); + out[16..32].copy_from_slice(&self.key.to_le_bytes()); + out + } + + /// Decodes [`ListHeader`] from an array + fn decode(buf: [u8; Self::SIZE]) -> Self { + Self { + size: u32::from_le_bytes(buf[..4].try_into().unwrap()), + reserved: u16::from_le_bytes(buf[4..6].try_into().unwrap()), + flags: buf[6], + variant: buf[7], + generation: u64::from_le_bytes(buf[8..16].try_into().unwrap()), + key: u128::from_le_bytes(buf[16..32].try_into().unwrap()), + } + } +} + +/// The state for [`ListDecoder`] +#[derive(Debug)] +enum DecoderState { + /// Decoding a header, contains the decoded data and the current offset + Header([u8; ListHeader::SIZE], usize), + /// Decoding value data for the given [`ListHeader`] + Body(ListHeader, Vec<u8>), +} + +impl Default for DecoderState { + fn default() -> Self { + Self::Header([0; ListHeader::SIZE], 0) + } +} + +/// Decodes [`ListEntry`] from a stream of bytes +#[derive(Debug)] +pub struct ListDecoder { + state: DecoderState, + max_size: usize, +} + +impl Default for ListDecoder { + fn default() -> Self { + Self { + state: DecoderState::default(), + max_size: MAX_VALUE_SIZE, + } + } +} + +impl ListDecoder { + /// Create a new [`ListDecoder`] + pub fn new() -> Self { + Self::default() + } + + /// Overrides the maximum value to deserialize + /// + /// Values larger than this will result in an error + /// Defaults to [`MAX_VALUE_SIZE`] + pub fn with_max_value_size(mut self, size: usize) -> Self { + self.max_size = size; + self + } + + /// Decode an entry from `buf`, returning the number of bytes consumed + /// + /// This is meant to be used in combination with [`Self::flush`] + pub fn decode(&mut self, mut buf: &[u8]) -> Result<usize> { + let initial = buf.len(); + while !buf.is_empty() { + match &mut self.state { + DecoderState::Header(header, offset) => { + let to_read = buf.len().min(ListHeader::SIZE - *offset); + header[*offset..*offset + to_read].copy_from_slice(&buf[..to_read]); + *offset += to_read; + buf = &buf[to_read..]; + + if *offset == ListHeader::SIZE { + let header = ListHeader::decode(*header); + let size = header.size as _; + ensure!(size <= self.max_size, ValueTooLargeSnafu { size }); + self.state = DecoderState::Body(header, Vec::with_capacity(size)) + } + } + DecoderState::Body(header, value) => { + let to_read = buf.len().min(header.size as usize - value.len()); + if to_read == 0 { + break; + } + value.extend_from_slice(&buf[..to_read]); + buf = &buf[to_read..]; + } + } + } + Ok(initial - buf.len()) + } + + /// Flush the contents of this [`ListDecoder`] + /// + /// Returns `Ok(Some(entry))` if a record is fully decoded + /// Returns `Ok(None)` if no in-progress record + /// Otherwise returns an error + pub fn flush(&mut self) -> Result<Option<ListEntry>> { + match std::mem::take(&mut self.state) { + DecoderState::Body(header, value) if value.len() == header.size as usize => { + Ok(Some(ListEntry { + variant: header.variant, + key: header.key, + generation: header.generation, + data: ((header.flags & Flags::HEAD) == 0).then(|| value.into()), + })) + } + DecoderState::Header(_, 0) => Ok(None), + _ => Err(Error::UnexpectedEOF), + } + } +} + +/// A key value pair encoded as part of a list +/// +/// Unlike [`CacheKey`] and [`CacheValue`] this allows: +/// +/// * Non-fatal handling of unknown key variants +/// * The option to not include the value data, e.g.
if too large +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct ListEntry { + variant: u8, + generation: u64, + key: u128, + data: Option<Bytes>, +} + +impl ListEntry { + /// Create a new [`ListEntry`] from the provided key and value + pub fn new(key: CacheKey, value: CacheValue) -> Self { + let (variant, key) = match key { + CacheKey::Namespace(v) => (b'n', v as _), + CacheKey::Table(v) => (b't', v as _), + CacheKey::Partition(v) => (b'p', v as _), + }; + + Self { + key, + variant, + generation: value.generation, + data: Some(value.data), + } + } + + /// The key if it matches a known variant of [`CacheKey`] + /// + /// Returns `None` otherwise + pub fn key(&self) -> Option<CacheKey> { + match self.variant { + b't' => Some(CacheKey::Table(self.key as _)), + b'n' => Some(CacheKey::Namespace(self.key as _)), + b'p' => Some(CacheKey::Partition(self.key as _)), + _ => None, + } + } + + /// The generation of this entry + pub fn generation(&self) -> u64 { + self.generation + } + + /// Returns the value data if present + pub fn value(&self) -> Option<&Bytes> { + self.data.as_ref() + } + + /// Returns the [`ListHeader`] for a given [`ListEntry`] + fn header(&self, head: bool) -> ListHeader { + let generation = self.generation; + let (flags, size) = match (head, &self.data) { + (false, Some(data)) => (0, data.len() as u32), + _ => (Flags::HEAD, 0), + }; + + ListHeader { + size, + flags, + variant: self.variant, + key: self.key, + generation, + reserved: 0, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use bytes::Buf; + use std::io::BufRead; + + fn decode_entries<R: BufRead>(mut r: R) -> Result<Vec<ListEntry>> { + let mut decoder = ListDecoder::default(); + let iter = std::iter::from_fn(move || { + loop { + let buf = r.fill_buf().unwrap(); + if buf.is_empty() { + break; + } + let to_read = buf.len(); + let read = decoder.decode(buf).unwrap(); + r.consume(read); + if read != to_read { + break; + } + } + decoder.flush().transpose() + }); + iter.collect() + } + + #[test] + fn test_roundtrip() { + let expected = vec![ + ListEntry::new(CacheKey::Namespace(2), CacheValue::new("test".into(), 32)), + ListEntry::new(CacheKey::Namespace(6), CacheValue::new("3".into(), 4)), + ListEntry { + variant: 0, + key: u128::MAX, + generation: u64::MAX, + data: Some("unknown".into()), + }, + ListEntry::new(CacheKey::Table(6), CacheValue::new("3".into(), 23)), + ListEntry { + variant: b'p', + key: 45, + generation: 23, + data: None, + }, + ListEntry::new( + CacheKey::Partition(3), + CacheValue::new("bananas".into(), 23), + ), + ]; + + let encoded: Vec<_> = ListEncoder::new(expected.clone()).collect(); + assert_eq!(encoded.len(), 1); // Expect entries to be encoded in single flush + + for buf_size in [3, 5, 12] { + let reader = std::io::BufReader::with_capacity(buf_size, encoded[0].clone().reader()); + let entries = decode_entries(reader).unwrap(); + assert_eq!(entries, expected); + + // Invalid key should not be fatal + assert_eq!(entries[2].key(), None); + // Head response should not be fatal + assert_eq!(entries[4].value(), None); + } + } + + #[test] + fn test_empty() { + let data: Vec<_> = ListEncoder::new(vec![]).collect(); + assert_eq!(data.len(), 0); + + let entries = decode_entries(std::io::Cursor::new([0_u8; 0])).unwrap(); + assert_eq!(entries.len(), 0); + } + + #[test] + fn test_flush_size() { + let data = Bytes::from(vec![0; 128]); + let entries = (0..1024) + .map(|x| ListEntry::new(CacheKey::Namespace(x), CacheValue::new(data.clone(), 0))) + .collect(); + + let mut encoder = ListEncoder::new(entries); + encoder.flush_size = 1024; // Lower
limit for test + + let mut remaining = 1024; + for block in encoder { + let expected = remaining.min(7); + assert_eq!(block.len(), (data.len() + ListHeader::SIZE) * expected); + let decoded = decode_entries(block.reader()).unwrap(); + assert_eq!(decoded.len(), expected); + remaining -= expected; + } + } + + #[test] + fn test_size_limit() { + let entries = vec![ + ListEntry::new( + CacheKey::Namespace(0), + CacheValue::new(vec![0; 128].into(), 0), + ), + ListEntry::new( + CacheKey::Namespace(1), + CacheValue::new(vec![0; 129].into(), 0), + ), + ListEntry::new( + CacheKey::Namespace(2), + CacheValue::new(vec![0; 128].into(), 0), + ), + ]; + + let mut encoder = ListEncoder::new(entries); + encoder.max_value_size = 128; // Artificially lower limit for test + + let encoded: Vec<_> = encoder.collect(); + assert_eq!(encoded.len(), 1); + + let decoded = decode_entries(encoded[0].clone().reader()).unwrap(); + assert_eq!(decoded[0].value().unwrap().len(), 128); + assert_eq!(decoded[1].value(), None); // Should omit value that is too large + assert_eq!(decoded[2].value().unwrap().len(), 128); + + let mut decoder = ListDecoder::new(); + decoder.max_size = 12; + let err = decoder.decode(&encoded[0]).unwrap_err().to_string(); + assert_eq!(err, "List value of 128 bytes too large"); + + let mut decoder = ListDecoder::new(); + decoder.max_size = 128; + + let consumed = decoder.decode(&encoded[0]).unwrap(); + let r = decoder.flush().unwrap().unwrap(); + assert_eq!(r.value().unwrap().len(), 128); + + // Next record skipped by encoder as too large + decoder.decode(&encoded[0][consumed..]).unwrap(); + let r = decoder.flush().unwrap().unwrap(); + assert_eq!(r.value(), None); + } +} diff --git a/catalog_cache/src/api/mod.rs b/catalog_cache/src/api/mod.rs new file mode 100644 index 00000000000..66d404229ae --- /dev/null +++ b/catalog_cache/src/api/mod.rs @@ -0,0 +1,159 @@ +//! 
The remote API for the catalog cache + +use crate::CacheKey; +use hyper::http::HeaderName; + +pub mod client; + +pub mod quorum; + +pub mod server; + +pub mod list; + +/// The header used to encode the generation in a get response +static GENERATION: HeaderName = HeaderName::from_static("x-influx-generation"); + +/// Defines the mapping to HTTP paths for given request types +#[derive(Debug, Eq, PartialEq)] +enum RequestPath { + /// A request addressing a resource identified by [`CacheKey`] + Resource(CacheKey), + /// A list request + List, +} + +impl RequestPath { + fn parse(s: &str) -> Option<Self> { + let s = s.strip_prefix('/').unwrap_or(s); + if s == "v1/" { + return Some(Self::List); + } + + let (prefix, value) = s.rsplit_once('/')?; + let value = u64::from_str_radix(value, 16).ok()?; + match prefix { + "v1/n" => Some(Self::Resource(CacheKey::Namespace(value as i64))), + "v1/t" => Some(Self::Resource(CacheKey::Table(value as i64))), + "v1/p" => Some(Self::Resource(CacheKey::Partition(value as i64))), + _ => None, + } + } +} + +impl std::fmt::Display for RequestPath { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::List => write!(f, "v1/"), + Self::Resource(CacheKey::Namespace(v)) => write!(f, "v1/n/{v:016x}"), + Self::Resource(CacheKey::Table(v)) => write!(f, "v1/t/{v:016x}"), + Self::Resource(CacheKey::Partition(v)) => write!(f, "v1/p/{v:016x}"), + } + } +} + +#[cfg(test)] +mod tests { + use crate::api::list::ListEntry; + use crate::api::server::test_util::TestCacheServer; + use crate::api::RequestPath; + use crate::{CacheKey, CacheValue}; + use futures::TryStreamExt; + use std::collections::HashSet; + + #[test] + fn test_request_path() { + let paths = [ + RequestPath::List, + RequestPath::Resource(CacheKey::Partition(12)), + RequestPath::Resource(CacheKey::Partition(i64::MAX)), + RequestPath::Resource(CacheKey::Partition(i64::MIN)), + RequestPath::Resource(CacheKey::Namespace(12)), + RequestPath::Resource(CacheKey::Namespace(i64::MAX)), + RequestPath::Resource(CacheKey::Namespace(i64::MIN)), + RequestPath::Resource(CacheKey::Table(12)), + RequestPath::Resource(CacheKey::Table(i64::MAX)), + RequestPath::Resource(CacheKey::Table(i64::MIN)), + ]; + + let mut set = HashSet::with_capacity(paths.len()); + for path in paths { + let s = path.to_string(); + let back = RequestPath::parse(&s).unwrap(); + assert_eq!(back, path); + assert!(set.insert(s), "should be unique"); + } + } + + #[tokio::test] + async fn test_basic() { + let serve = TestCacheServer::bind_ephemeral(); + let client = serve.client(); + + let key = CacheKey::Partition(1); + + let v1 = CacheValue::new("1".into(), 2); + assert!(client.put(key, &v1).await.unwrap()); + + let returned = client.get(key).await.unwrap().unwrap(); + assert_eq!(v1, returned); + + // Duplicate upsert ignored + assert!(!client.put(key, &v1).await.unwrap()); + + // Stale upsert ignored + let v2 = CacheValue::new("2".into(), 1); + assert!(!client.put(key, &v2).await.unwrap()); + + let returned = client.get(key).await.unwrap().unwrap(); + assert_eq!(v1, returned); + + let v3 = CacheValue::new("3".into(), 3); + assert!(client.put(key, &v3).await.unwrap()); + + let returned = client.get(key).await.unwrap().unwrap(); + assert_eq!(v3, returned); + + let key2 = CacheKey::Partition(5); + assert!(client.put(key2, &v1).await.unwrap()); + + let mut result = client.list(None).try_collect::<Vec<_>>().await.unwrap(); + result.sort_unstable_by_key(|entry| entry.key()); + + let expected = vec![ListEntry::new(key, v3),
ListEntry::new(key2, v1)]; + assert_eq!(result, expected); + + serve.shutdown().await; + } + + #[tokio::test] + async fn test_list_size() { + let serve = TestCacheServer::bind_ephemeral(); + let client = serve.client(); + + let v1 = CacheValue::new("123".into(), 2); + client.put(CacheKey::Table(1), &v1).await.unwrap(); + + let v2 = CacheValue::new("13".into(), 2); + client.put(CacheKey::Table(2), &v2).await.unwrap(); + + let v3 = CacheValue::new("1".into(), 2); + client.put(CacheKey::Table(3), &v3).await.unwrap(); + + let mut res = client.list(Some(2)).try_collect::<Vec<_>>().await.unwrap(); + res.sort_unstable_by_key(|x| x.key()); + + assert_eq!(res.len(), 3); + + assert_eq!(res[0].value(), None); + assert_eq!(res[1].value(), Some(&v2.data)); + assert_eq!(res[2].value(), Some(&v3.data)); + + let mut res = client.list(Some(3)).try_collect::<Vec<_>>().await.unwrap(); + res.sort_unstable_by_key(|x| x.key()); + + assert_eq!(res[0].value(), Some(&v1.data)); + assert_eq!(res[1].value(), Some(&v2.data)); + assert_eq!(res[2].value(), Some(&v3.data)); + } +} diff --git a/catalog_cache/src/api/quorum.rs b/catalog_cache/src/api/quorum.rs new file mode 100644 index 00000000000..17c4edf8bdd --- /dev/null +++ b/catalog_cache/src/api/quorum.rs @@ -0,0 +1,459 @@ +//! Client for performing quorum catalog reads/writes + +use crate::api::client::{CatalogCacheClient, Error as ClientError}; +use crate::local::CatalogCache; +use crate::{CacheKey, CacheValue}; +use futures::channel::oneshot; +use futures::future::{select, Either}; +use futures::{pin_mut, StreamExt}; +use snafu::{ResultExt, Snafu}; +use std::collections::HashMap; +use std::sync::Arc; +use tokio::task::JoinError; +use tokio_util::sync::CancellationToken; + +/// Error for [`QuorumCatalogCache`] +#[allow(missing_docs)] +#[derive(Debug, Snafu)] +pub enum Error { + #[snafu(display("Failed to communicate with any remote replica: {source}"))] + NoRemote { source: ClientError }, + + #[snafu(display("Write task was aborted"))] + Cancelled, + + #[snafu(display("Join Error: {source}"))] + Join { source: JoinError }, + + #[snafu(display("Failed to establish a read quorum: {generations:?}"))] + Quorum { + generations: [Result<Option<u64>, ClientError>; 3], + }, + + #[snafu(display("Failed to list replica: {source}"))] + List { source: ClientError }, + + #[snafu(display("Local cache error: {source}"), context(false))] + Local { source: crate::local::Error }, +} + +/// Result for [`QuorumCatalogCache`] +pub type Result<T, E = Error> = std::result::Result<T, E>; + +/// Performs quorum reads and writes across a local [`CatalogCache`] and two [`CatalogCacheClient`] +#[derive(Debug)] +pub struct QuorumCatalogCache { + local: Arc<CatalogCache>, + replicas: Arc<[CatalogCacheClient; 2]>, + shutdown: CancellationToken, +} + +impl Drop for QuorumCatalogCache { + fn drop(&mut self) { + self.shutdown.cancel() + } +} + +impl QuorumCatalogCache { + /// Create a new [`QuorumCatalogCache`] + pub fn new(local: Arc<CatalogCache>, replicas: Arc<[CatalogCacheClient; 2]>) -> Self { + Self { + local, + replicas, + shutdown: CancellationToken::new(), + } + } + + /// Retrieve the given value from the remote cache + /// + /// Returns `None` if value is not present in a quorum of replicas + /// Returns [`Error::Quorum`] if cannot establish a read quorum + pub async fn get(&self, key: CacheKey) -> Result<Option<CacheValue>> { + let local = self.local.get(key); + + let fut1 = self.replicas[0].get(key); + let fut2 = self.replicas[1].get(key); + pin_mut!(fut1); + pin_mut!(fut2); + + match select(fut1, fut2).await { + Either::Left((result, fut)) | Either::Right((result, fut)) =>
match (local, result) { + (None, Ok(None)) => Ok(None), + (Some(l), Ok(Some(r))) if l.generation <= r.generation => { + // preempt write from remote to local that arrives late + if l.generation < r.generation { + self.local.insert(key, r.clone())?; + } + Ok(Some(r)) + } + (local, r1) => { + // r1 either failed or did not return anything + let r2 = fut.await; + match (local, r1, r2) { + (None, _, Ok(None)) | (_, Ok(None), Ok(None)) => Ok(None), + (Some(l), _, Ok(Some(r))) if l.generation <= r.generation => { + // preempt write from remote to local that arrives late + if l.generation < r.generation { + self.local.insert(key, r.clone())?; + } + Ok(Some(r)) + } + (local, Ok(Some(l)), Ok(Some(r))) if l.generation == r.generation => { + if local.map(|x| x.generation < l.generation).unwrap_or(true) { + self.local.insert(key, l.clone())?; + } + Ok(Some(l)) + } + (l, r1, r2) => Err(Error::Quorum { + generations: [ + Ok(l.map(|x| x.generation)), + r1.map(|x| x.map(|x| x.generation)), + r2.map(|x| x.map(|x| x.generation)), + ], + }), + } + } + }, + } + } + + /// Upsert the given key-value pair + /// + /// Returns Ok if able to replicate the write to a quorum + pub async fn put(&self, key: CacheKey, value: CacheValue) -> Result<()> { + self.local.insert(key, value.clone())?; + + let replicas = Arc::clone(&self.replicas); + let (sender, receiver) = oneshot::channel(); + + let fut = async move { + let fut1 = replicas[0].put(key, &value); + let fut2 = replicas[1].put(key, &value); + pin_mut!(fut1); + pin_mut!(fut2); + + match select(fut1, fut2).await { + Either::Left((r, fut)) | Either::Right((r, fut)) => { + let _ = sender.send(r); + fut.await + } + } + }; + + // We spawn a tokio task so that we can potentially continue to replicate + // to the second replica asynchronously once we receive an ok response + let cancel = self.shutdown.child_token(); + let handle = tokio::spawn(async move { + let cancelled = cancel.cancelled(); + pin_mut!(fut); + pin_mut!(cancelled); + match select(cancelled, fut).await { + Either::Left(_) => Err(Error::Cancelled), + Either::Right((Ok(_), _)) => Ok(()), + Either::Right((Err(source), _)) => Err(Error::NoRemote { source }), + } + }); + + match receiver.await { + Ok(Ok(_)) => Ok(()), + _ => match handle.await { + Ok(r) => r, + Err(source) => Err(Error::Join { source }), + }, + } + } + + /// Warm the local cache by performing quorum reads from the other two replicas + /// + /// This method should be called after this server has been participating in the write quorum + /// for a period of time, e.g. 1 minute. This avoids an issue where a quorum cannot be + /// established for in-progress writes. + pub async fn warm(&self) -> Result<()> { + // List doesn't return keys in any particular order + // + // We therefore build a hashmap with the keys from one replica and compare + // this against those returned by the other + // + // We don't need to consult the local `CatalogCache`, as we only need to insert + // if a read quorum can be established between the replicas and isn't present locally + let mut generations = HashMap::with_capacity(128); + let mut list = self.replicas[0].list(Some(0)); + while let Some(entry) = list.next().await.transpose().context(ListSnafu)? { + if let Some(k) = entry.key() { + generations.insert(k, entry.generation()); + } + } + + let mut list = self.replicas[1].list(None); + while let Some(entry) = list.next().await.transpose().context(ListSnafu)? 
{ + if let Some(k) = entry.key() { + match (generations.get(&k), entry.value()) { + (Some(generation), Some(v)) if *generation == entry.generation() => { + let value = CacheValue::new(v.clone(), *generation); + // In the case that local already has the given version + // this will be a no-op + self.local.insert(k, value)?; + } + _ => {} + } + } + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::api::server::test_util::TestCacheServer; + use std::future::Future; + use std::task::Context; + use std::time::Duration; + + #[tokio::test] + async fn test_basic() { + let local = Arc::new(CatalogCache::default()); + let r1 = TestCacheServer::bind_ephemeral(); + let r2 = TestCacheServer::bind_ephemeral(); + + let replicas = Arc::new([r1.client(), r2.client()]); + let quorum = QuorumCatalogCache::new(Arc::clone(&local), Arc::clone(&replicas)); + + assert_eq!(quorum.get(CacheKey::Table(1)).await.unwrap(), None); + + let k1 = CacheKey::Table(1); + let k2 = CacheKey::Table(2); + let k3 = CacheKey::Table(3); + + let v1 = CacheValue::new("foo".into(), 2); + quorum.put(k1, v1.clone()).await.unwrap(); + quorum.put(k2, v1.clone()).await.unwrap(); + + let r = quorum.get(k2).await.unwrap().unwrap(); + assert_eq!(r, v1); + + // New value + let v2 = CacheValue::new("foo".into(), 4); + quorum.put(k2, v2.clone()).await.unwrap(); + + let r = quorum.get(k1).await.unwrap().unwrap(); + assert_eq!(r, v1); + + let r = quorum.get(k2).await.unwrap().unwrap(); + assert_eq!(r, v2); + + // Can remove value from one replica and still get quorum + r2.cache().delete(k2).unwrap(); + let r = quorum.get(k2).await.unwrap().unwrap(); + assert_eq!(r, v2); + + // Loss of two copies results in not found + r1.cache().delete(k2).unwrap(); + let r = quorum.get(k2).await.unwrap(); + assert_eq!(r, None); + + // Simulate stale value in r1 + r1.cache().insert(k2, v1.clone()).unwrap(); + let err = quorum.get(k2).await.unwrap_err(); + assert!(matches!(err, Error::Quorum { .. }), "{err}"); + + // If quorum has stale value follows quorum + r2.cache().delete(k2); + r2.cache().insert(k2, v1.clone()).unwrap(); + let r = quorum.get(k2).await.unwrap().unwrap(); + assert_eq!(r, v1); + + // Simulate loss of replica 2 + r2.shutdown().await; + + // Can still establish a write quorum + quorum.put(k3, v1.clone()).await.unwrap(); + + // Can read newly inserted value + let r = quorum.get(k3).await.unwrap().unwrap(); + assert_eq!(r, v1); + + // Can still read from quorum of k1 + let r = quorum.get(k1).await.unwrap().unwrap(); + assert_eq!(r, v1); + + // Cannot get quorum as lost single node and local disagrees with replica 1 + let err = quorum.get(k2).await.unwrap_err(); + assert!(matches!(err, Error::Quorum { .. }), "{err}"); + + // Can establish quorum following write + quorum.put(k2, v2.clone()).await.unwrap(); + let r = quorum.get(k2).await.unwrap().unwrap(); + assert_eq!(r, v2); + + // Still cannot establish quorum + r1.cache().delete(k2); + let err = quorum.get(k2).await.unwrap_err(); + assert!(matches!(err, Error::Quorum { .. }), "{err}"); + + // k2 is now no longer present anywhere, can establish quorum + local.delete(k2); + let r = quorum.get(k2).await.unwrap(); + assert_eq!(r, None); + + // Simulate loss of replica 1 (in addition to replica 2) + r1.shutdown().await; + + // Can no longer get quorum for anything + let err = quorum.get(k1).await.unwrap_err(); + assert!(matches!(err, Error::Quorum { .. 
}), "{err}"); + } + + #[tokio::test] + async fn test_read_through() { + let local = Arc::new(CatalogCache::default()); + let r1 = TestCacheServer::bind_ephemeral(); + let r2 = TestCacheServer::bind_ephemeral(); + + let replicas = Arc::new([r1.client(), r2.client()]); + let quorum = QuorumCatalogCache::new(Arc::clone(&local), Arc::clone(&replicas)); + + let key = CacheKey::Table(1); + let v0 = CacheValue::new("v0".into(), 0); + + r1.cache().insert(key, v0.clone()).unwrap(); + r2.cache().insert(key, v0.clone()).unwrap(); + + let result = quorum.get(key).await.unwrap().unwrap(); + assert_eq!(result, v0); + + // Should have read-through to local + assert_eq!(local.get(key).unwrap(), v0); + + let v1 = CacheValue::new("v1".into(), 1); + let v2 = CacheValue::new("v2".into(), 2); + + r1.cache().insert(key, v1.clone()).unwrap(); + r2.cache().insert(key, v2.clone()).unwrap(); + + // A quorum request will get either v1 or v2 depending on which it contacts first + let result = quorum.get(key).await.unwrap().unwrap(); + assert!(result == v1 || result == v2, "{result:?}"); + + // Should read-through + assert_eq!(local.get(key).unwrap(), result); + + // Update r1 with version 2 + r1.cache().insert(key, v2.clone()).unwrap(); + + let result = quorum.get(key).await.unwrap().unwrap(); + assert_eq!(result, v2); + + // Should read-through + assert_eq!(local.get(key).unwrap(), v2); + + let v3 = CacheValue::new("v3".into(), 3); + local.insert(key, v3.clone()).unwrap(); + + // Should establish quorum for v2 even though local is v3 + let result = quorum.get(key).await.unwrap().unwrap(); + assert_eq!(result, v2); + + // Should not read-through + assert_eq!(local.get(key).unwrap(), v3); + + let v4 = CacheValue::new("v4".into(), 4); + let v5 = CacheValue::new("v5".into(), 5); + + local.insert(key, v5.clone()).unwrap(); + r1.cache().insert(key, v4.clone()).unwrap(); + + // Should fail as cannot establish quorum of three different versions of `[5, 4, 2]` + // and has latest version locally + let err = quorum.get(key).await.unwrap_err(); + assert!(matches!(err, Error::Quorum { .. 
}), "{err}"); + assert_eq!(local.get(key).unwrap(), v5); + + let v6 = CacheValue::new("v6".into(), 6); + r1.cache().insert(key, v6.clone()).unwrap(); + + // Should succeed as r1 has newer version than local + let result = quorum.get(key).await.unwrap().unwrap(); + assert_eq!(result, v6); + + // Should read-through + assert_eq!(local.get(key).unwrap(), v6); + } + + #[tokio::test] + async fn test_warm() { + let local = Arc::new(CatalogCache::default()); + let r1 = TestCacheServer::bind_ephemeral(); + let r2 = TestCacheServer::bind_ephemeral(); + + let replicas = Arc::new([r1.client(), r2.client()]); + let quorum = QuorumCatalogCache::new(local, Arc::clone(&replicas)); + + let k1 = CacheKey::Table(1); + let v1 = CacheValue::new("v1".into(), 1); + quorum.put(k1, v1.clone()).await.unwrap(); + + let k2 = CacheKey::Table(2); + let v2 = CacheValue::new("v2".into(), 1); + quorum.put(k2, v2.clone()).await.unwrap(); + + // Simulate local restart + let local = Arc::new(CatalogCache::default()); + let quorum = QuorumCatalogCache::new(Arc::clone(&local), Arc::clone(&replicas)); + + assert_eq!(local.list().count(), 0); + + quorum.warm().await.unwrap(); + + // Should populate both entries + let mut entries: Vec<_> = local.list().collect(); + entries.sort_unstable_by_key(|(k, _)| *k); + assert_eq!(entries, vec![(k1, v1.clone()), (k2, v2.clone())]); + + // Simulate local restart + let local = Arc::new(CatalogCache::default()); + let quorum = QuorumCatalogCache::new(Arc::clone(&local), Arc::clone(&replicas)); + + // Simulate in-progress write + let v3 = CacheValue::new("v3".into(), 2); + assert!(r1.cache().insert(k2, v3.clone()).unwrap()); + + // Cannot establish quorum for k1 so should skip over + quorum.warm().await.unwrap(); + let entries: Vec<_> = local.list().collect(); + assert_eq!(entries.len(), 1); + assert_eq!(entries[0], (k1, v1.clone())); + + // If r2 updated warming should pick up new quorum + assert!(r2.cache().insert(k2, v3.clone()).unwrap()); + quorum.warm().await.unwrap(); + let mut entries: Vec<_> = local.list().collect(); + entries.sort_unstable_by_key(|(k, _)| *k); + assert_eq!(entries, vec![(k1, v1), (k2, v3)]); + + // Test cancellation safety + let k3 = CacheKey::Table(3); + let fut = quorum.put(k3, v2.clone()); + { + // `fut` is dropped (cancelled) on exit from this code block + pin_mut!(fut); + + let noop_waker = futures::task::noop_waker(); + let mut cx = Context::from_waker(&noop_waker); + assert!(fut.poll(&mut cx).is_pending()); + } + + // Write should still propagate asynchronously + let mut attempts = 0; + loop { + tokio::time::sleep(Duration::from_millis(1)).await; + match quorum.get(k3).await { + Ok(Some(_)) => break, + _ => { + assert!(attempts < 100); + attempts += 1; + } + } + } + } +} diff --git a/catalog_cache/src/api/server.rs b/catalog_cache/src/api/server.rs new file mode 100644 index 00000000000..b29d841f880 --- /dev/null +++ b/catalog_cache/src/api/server.rs @@ -0,0 +1,300 @@ +//! 
Server for the cache HTTP API + +use crate::api::list::{ListEncoder, ListEntry}; +use crate::api::{RequestPath, GENERATION}; +use crate::local::CatalogCache; +use crate::CacheValue; +use futures::ready; +use hyper::body::HttpBody; +use hyper::header::ToStrError; +use hyper::http::request::Parts; +use hyper::service::Service; +use hyper::{Body, Method, Request, Response, StatusCode}; +use snafu::{OptionExt, ResultExt, Snafu}; +use std::convert::Infallible; +use std::future::Future; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +enum Error { + #[snafu(display("Http error: {source}"), context(false))] + Http { source: hyper::http::Error }, + + #[snafu(display("Hyper error: {source}"), context(false))] + Hyper { source: hyper::Error }, + + #[snafu(display("Local cache error: {source}"), context(false))] + Local { source: crate::local::Error }, + + #[snafu(display("Non UTF-8 Header: {source}"))] + BadHeader { source: ToStrError }, + + #[snafu(display("Request missing generation header"))] + MissingGeneration, + + #[snafu(display("Invalid generation header: {source}"))] + InvalidGeneration { source: std::num::ParseIntError }, + + #[snafu(display("List query missing size"))] + MissingSize, + + #[snafu(display("List query invalid size: {source}"))] + InvalidSize { source: std::num::ParseIntError }, +} + +impl Error { + /// Convert an error into a [`Response`] + fn response(self) -> Response { + let mut response = Response::new(Body::from(self.to_string())); + *response.status_mut() = match &self { + Self::Http { .. } | Self::Hyper { .. } | Self::Local { .. } => { + StatusCode::INTERNAL_SERVER_ERROR + } + Self::InvalidGeneration { .. } + | Self::MissingGeneration + | Self::InvalidSize { .. } + | Self::MissingSize + | Self::BadHeader { .. 
} => StatusCode::BAD_REQUEST, + }; + response + } +} + +/// A [`Service`] that wraps a [`CatalogCache`] +#[derive(Debug, Clone)] +pub struct CatalogCacheService(Arc); + +/// Shared state for [`CatalogCacheService`] +#[derive(Debug)] +struct ServiceState { + cache: Arc, +} + +impl Service> for CatalogCacheService { + type Response = Response; + + type Error = Infallible; + type Future = CatalogRequestFuture; + + fn poll_ready(&mut self, _cx: &mut Context<'_>) -> Poll> { + Poll::Ready(Ok(())) + } + + fn call(&mut self, req: Request) -> Self::Future { + let (parts, body) = req.into_parts(); + CatalogRequestFuture { + parts, + body, + buffer: vec![], + state: Arc::clone(&self.0), + } + } +} + +/// The future for [`CatalogCacheService`] +#[derive(Debug)] +pub struct CatalogRequestFuture { + /// The request body + body: Body, + /// The request parts + parts: Parts, + /// The in-progress body + /// + /// We use Vec not Bytes to ensure the cache isn't storing slices of large allocations + buffer: Vec, + /// The cache to service requests + state: Arc, +} + +impl Future for CatalogRequestFuture { + type Output = Result, Infallible>; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let r = loop { + match ready!(Pin::new(&mut self.body).poll_data(cx)) { + Some(Ok(b)) => self.buffer.extend_from_slice(&b), + Some(Err(e)) => break Err(e.into()), + None => break Ok(()), + } + }; + Poll::Ready(Ok(match r.and_then(|_| self.call()) { + Ok(resp) => resp, + Err(e) => e.response(), + })) + } +} + +impl CatalogRequestFuture { + fn call(&mut self) -> Result, Error> { + let body = std::mem::take(&mut self.buffer); + + let status = match RequestPath::parse(self.parts.uri.path()) { + Some(RequestPath::List) => match self.parts.method { + Method::GET => { + let query = self.parts.uri.query().context(MissingSizeSnafu)?; + let mut parts = url::form_urlencoded::parse(query.as_bytes()); + let (_, size) = parts.find(|(k, _)| k == "size").context(MissingSizeSnafu)?; + let size = size.parse().context(InvalidSizeSnafu)?; + + let iter = self.state.cache.list(); + let entries = iter.map(|(k, v)| ListEntry::new(k, v)).collect(); + let encoder = ListEncoder::new(entries).with_max_value_size(size); + + let stream = futures::stream::iter(encoder.map(Ok::<_, Error>)); + let response = Response::builder().body(Body::wrap_stream(stream))?; + return Ok(response); + } + _ => StatusCode::METHOD_NOT_ALLOWED, + }, + Some(RequestPath::Resource(key)) => match self.parts.method { + Method::GET => match self.state.cache.get(key) { + Some(value) => { + let response = Response::builder() + .header(&GENERATION, value.generation) + .body(value.data.into())?; + return Ok(response); + } + None => StatusCode::NOT_FOUND, + }, + Method::PUT => { + let headers = &self.parts.headers; + let generation = headers.get(&GENERATION).context(MissingGenerationSnafu)?; + let generation = generation.to_str().context(BadHeaderSnafu)?; + let generation = generation.parse().context(InvalidGenerationSnafu)?; + let value = CacheValue::new(body.into(), generation); + + match self.state.cache.insert(key, value)? 
{ + true => StatusCode::OK, + false => StatusCode::NOT_MODIFIED, + } + } + Method::DELETE => { + self.state.cache.delete(key); + StatusCode::OK + } + _ => StatusCode::METHOD_NOT_ALLOWED, + }, + None => StatusCode::NOT_FOUND, + }; + + let mut response = Response::new(Body::empty()); + *response.status_mut() = status; + Ok(response) + } +} + +/// Runs a [`CatalogCacheService`] in a background task +/// +/// Will abort the background task on drop +#[derive(Debug)] +pub struct CatalogCacheServer { + state: Arc, +} + +impl CatalogCacheServer { + /// Create a new [`CatalogCacheServer`]. + /// + /// Note that the HTTP interface needs to be wired up in some higher-level structure. Use [`service`](Self::service) + /// for that. + pub fn new(cache: Arc) -> Self { + let state = Arc::new(ServiceState { cache }); + + Self { state } + } + + /// Returns HTTP service. + pub fn service(&self) -> CatalogCacheService { + CatalogCacheService(Arc::clone(&self.state)) + } + + /// Returns a reference to the [`CatalogCache`] of this server + pub fn cache(&self) -> &Arc { + &self.state.cache + } +} + +/// Test utilities. +pub mod test_util { + use std::{net::SocketAddr, ops::Deref}; + + use hyper::{service::make_service_fn, Server}; + use tokio::task::JoinHandle; + use tokio_util::sync::CancellationToken; + + use crate::api::client::CatalogCacheClient; + + use super::*; + + /// Test runner for a [`CatalogCacheServer`]. + #[derive(Debug)] + pub struct TestCacheServer { + addr: SocketAddr, + server: CatalogCacheServer, + shutdown: CancellationToken, + handle: Option>, + } + + impl TestCacheServer { + /// Create a new [`TestCacheServer`] bound to an ephemeral port + pub fn bind_ephemeral() -> Self { + Self::bind(&SocketAddr::from(([127, 0, 0, 1], 0))) + } + + /// Create a new [`CatalogCacheServer`] bound to the provided [`SocketAddr`] + pub fn bind(addr: &SocketAddr) -> Self { + let server = CatalogCacheServer::new(Arc::new(CatalogCache::default())); + let service = server.service(); + let make_service = make_service_fn(move |_conn| { + futures::future::ready(Ok::<_, Infallible>(service.clone())) + }); + + let hyper_server = Server::bind(addr).serve(make_service); + let addr = hyper_server.local_addr(); + + let shutdown = CancellationToken::new(); + let signal = shutdown.clone().cancelled_owned(); + let graceful = hyper_server.with_graceful_shutdown(signal); + let handle = Some(tokio::spawn(async move { graceful.await.unwrap() })); + + Self { + addr, + server, + shutdown, + handle, + } + } + + /// Returns a [`CatalogCacheClient`] for communicating with this server + pub fn client(&self) -> CatalogCacheClient { + let addr = format!("http://{}", self.addr); + CatalogCacheClient::try_new(addr.parse().unwrap()).unwrap() + } + + /// Triggers and waits for graceful shutdown + pub async fn shutdown(mut self) { + self.shutdown.cancel(); + if let Some(x) = self.handle.take() { + x.await.unwrap() + } + } + } + + impl Deref for TestCacheServer { + type Target = CatalogCacheServer; + + fn deref(&self) -> &Self::Target { + &self.server + } + } + + impl Drop for TestCacheServer { + fn drop(&mut self) { + if let Some(x) = &self.handle { + x.abort() + } + } + } +} diff --git a/catalog_cache/src/lib.rs b/catalog_cache/src/lib.rs new file mode 100644 index 00000000000..037044899ee --- /dev/null +++ b/catalog_cache/src/lib.rs @@ -0,0 +1,143 @@ +//! Consistent cache system used by the catalog service +//! +//! # Design +//! +//! The catalog service needs to be able to service queries without needing to communicate +//! 
with its underlying backing store. This serves the dual purpose of reducing load on this +//! backing store, and also returning results in a more timely manner. +//! +//! This caching must be transparent to the users of the catalog service, and therefore must not +//! introduce eventually consistent behaviour, or other consistency effects. +//! +//! As such this crate provides a strongly-consistent, distributed key-value cache. +//! +//! In order to keep things simple, this only provides a mapping from [`CacheKey`] to opaque +//! binary payloads, with no support for structured payloads. +//! +//! This avoids: +//! +//! * Complex replicated state machines +//! * Forward compatibility challenges where newer data can't roundtrip through older servers +//! * Simple to introspect, debug and reason about +//! * Predictable and easily quantifiable memory usage +//! +//! However, it does have the following implications: +//! +//! * Care must be taken to ensure that parsing of the cached payloads does not become a bottleneck +//! * Large values (> 1MB) should be avoided, as updates will resend the entire value +//! +//! ## Components +//! +//! This crate is broken into multiple parts +//! +//! * [`CatalogCache`] provides a local key value store +//! * [`CatalogCacheService`] exposes this [`CatalogCache`] over an HTTP API +//! * [`CatalogCacheClient`] communicates with a remote [`CatalogCacheService`] +//! * [`QuorumCatalogCache`] combines the above into a strongly-consistent distributed cache +//! +//! [`CatalogCache`]: local::CatalogCache +//! [`CatalogCacheClient`]: api::client::CatalogCacheClient +//! [`CatalogCacheService`]: api::server::CatalogCacheService +//! [`QuorumCatalogCache`]: api::quorum::QuorumCatalogCache +//! +#![deny(rustdoc::broken_intra_doc_links, rust_2018_idioms)] +#![warn( + missing_copy_implementations, + missing_docs, + clippy::explicit_iter_loop, + // See https://github.com/influxdata/influxdb_iox/pull/1671 + clippy::future_not_send, + clippy::use_self, + clippy::clone_on_ref_ptr, + clippy::todo, + clippy::dbg_macro, + unused_crate_dependencies +)] + +// Workaround for "unused crate" lint false positives. +use workspace_hack as _; + +use bytes::Bytes; +use std::sync::atomic::AtomicBool; + +pub mod api; +pub mod local; + +/// The types of catalog cache key +#[derive(Debug, Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd)] +pub enum CacheKey { + /// A catalog namespace + Namespace(i64), + /// A catalog table + Table(i64), + /// A catalog partition + Partition(i64), +} + +impl CacheKey { + /// Variant as string. + /// + /// This can be used for logging and metrics. + pub fn variant(&self) -> &'static str { + match self { + Self::Namespace(_) => "namespace", + Self::Table(_) => "table", + Self::Partition(_) => "partition", + } + } + + /// Untyped ID. 
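How these pieces compose can be hard to see from the individual files, so here is a minimal usage sketch. It is illustrative only and not part of this patch: the peer URLs are assumptions, and it relies solely on APIs that appear in this diff (`CatalogCache`, `CatalogCacheClient::try_new`, `QuorumCatalogCache::new`, `put`, `get`).

```rust
use std::sync::Arc;

use catalog_cache::api::client::CatalogCacheClient;
use catalog_cache::api::quorum::QuorumCatalogCache;
use catalog_cache::local::CatalogCache;
use catalog_cache::{CacheKey, CacheValue};

async fn example() -> Result<(), Box<dyn std::error::Error>> {
    // The local node participates in the quorum alongside two remote peers
    // (peer addresses here are placeholders).
    let local = Arc::new(CatalogCache::default());
    let replicas = Arc::new([
        CatalogCacheClient::try_new("http://peer1:8080".parse()?)?,
        CatalogCacheClient::try_new("http://peer2:8080".parse()?)?,
    ]);
    let quorum = QuorumCatalogCache::new(Arc::clone(&local), replicas);

    // A write lands in the local cache and at least one replica before returning.
    quorum
        .put(CacheKey::Table(42), CacheValue::new("payload".into(), 1))
        .await?;

    // A read succeeds once two of the three copies agree on a generation.
    let value = quorum.get(CacheKey::Table(42)).await?;
    assert_eq!(value.map(|v| v.generation()), Some(1));
    Ok(())
}
```

In a full deployment the local cache would also be exposed over HTTP via `CatalogCacheServer`, so that the peers can in turn read from this node.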
+ pub fn id(&self) -> i64 { + match self { + Self::Namespace(id) => *id, + Self::Table(id) => *id, + Self::Partition(id) => *id, + } + } +} + +/// A value stored in [`CatalogCache`](local::CatalogCache) +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct CacheValue { + /// The data stored for this cache + data: Bytes, + /// The generation of this cache data + generation: u64, +} + +impl CacheValue { + /// Create a new [`CacheValue`] with the provided `data` and `generation` + pub fn new(data: Bytes, generation: u64) -> Self { + Self { data, generation } + } + + /// The data stored for this cache + pub fn data(&self) -> &Bytes { + &self.data + } + + /// The generation of this cache data + pub fn generation(&self) -> u64 { + self.generation + } +} + +/// Combines a [`CacheValue`] with an [`AtomicBool`] for the purposes of NRU-eviction +#[derive(Debug)] +struct CacheEntry { + /// The value of this cache entry + value: CacheValue, + /// An atomic flag that is set to `true` by `CatalogCache::get` and + /// cleared by `CatalogCache::evict_unused` + used: AtomicBool, +} + +impl From for CacheEntry { + fn from(value: CacheValue) -> Self { + Self { + value, + // Values start used to prevent racing with `evict_unused` + used: AtomicBool::new(true), + } + } +} diff --git a/catalog_cache/src/local/limit.rs b/catalog_cache/src/local/limit.rs new file mode 100644 index 00000000000..6c38fee5a82 --- /dev/null +++ b/catalog_cache/src/local/limit.rs @@ -0,0 +1,82 @@ +//! A memory limiter + +use super::{Error, Result}; +use std::sync::atomic::{AtomicUsize, Ordering}; + +#[derive(Debug)] +pub(crate) struct MemoryLimiter { + current: AtomicUsize, + limit: usize, +} + +impl MemoryLimiter { + /// Create a new [`MemoryLimiter`] limited to `limit` bytes + pub(crate) fn new(limit: usize) -> Self { + Self { + current: AtomicUsize::new(0), + limit, + } + } + + /// Reserve `size` bytes, returning an error if this would exceed the limit + pub(crate) fn reserve(&self, size: usize) -> Result<()> { + let limit = self.limit; + let max = limit + .checked_sub(size) + .ok_or(Error::TooLarge { size, limit })?; + + // We can use relaxed ordering as not relying on this to + // synchronise memory accesses beyond itself + self.current + .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |current| { + // This cannot overflow as current + size <= limit + (current <= max).then_some(current + size) + }) + .map_err(|current| Error::OutOfMemory { + size, + current, + limit, + })?; + Ok(()) + } + + /// Free `size` bytes + pub(crate) fn free(&self, size: usize) { + self.current.fetch_sub(size, Ordering::Relaxed); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_limiter() { + let limiter = MemoryLimiter::new(100); + + limiter.reserve(20).unwrap(); + limiter.reserve(70).unwrap(); + + let err = limiter.reserve(20).unwrap_err().to_string(); + assert_eq!(err, "Cannot reserve additional 20 bytes for cache containing 90 bytes as would exceed limit of 100 bytes"); + + limiter.reserve(10).unwrap(); + limiter.reserve(0).unwrap(); + + let err = limiter.reserve(1).unwrap_err().to_string(); + assert_eq!(err, "Cannot reserve additional 1 bytes for cache containing 100 bytes as would exceed limit of 100 bytes"); + + limiter.free(10); + limiter.reserve(10).unwrap(); + + limiter.free(100); + + // Can add single value taking entire range + limiter.reserve(100).unwrap(); + limiter.free(100); + + // Protected against overflow + let err = limiter.reserve(usize::MAX).unwrap_err(); + assert!(matches!(err, Error::TooLarge { .. 
}), "{err}"); + } +} diff --git a/catalog_cache/src/local/mod.rs b/catalog_cache/src/local/mod.rs new file mode 100644 index 00000000000..373dd628112 --- /dev/null +++ b/catalog_cache/src/local/mod.rs @@ -0,0 +1,355 @@ +//! A local in-memory cache + +mod limit; + +use crate::local::limit::MemoryLimiter; +use crate::{CacheEntry, CacheKey, CacheValue}; +use dashmap::mapref::entry::Entry; +use dashmap::DashMap; +use snafu::Snafu; +use std::sync::atomic::Ordering; +use std::sync::Arc; + +/// Error for [`CatalogCache`] +#[derive(Debug, Snafu)] +#[allow(missing_docs, missing_copy_implementations)] +pub enum Error { + #[snafu(display("Cannot reserve additional {size} bytes for cache containing {current} bytes as would exceed limit of {limit} bytes"))] + OutOfMemory { + size: usize, + current: usize, + limit: usize, + }, + + #[snafu(display("Cannot reserve additional {size} bytes for cache as request exceeds total memory limit of {limit} bytes"))] + TooLarge { size: usize, limit: usize }, +} + +/// Result for [`CatalogCache`] +pub type Result = std::result::Result; + +/// A trait for observing updated to [`CatalogCache`] +/// +/// This can be used for injecting metrics, maintaining secondary indices or otherwise +/// +/// Note: members are invoked under locks in [`CatalogCache`] and should therefore +/// be short-running and not call back into [`CatalogCache`] +pub trait CatalogCacheObserver: std::fmt::Debug + Send + Sync { + /// Called before a value is potentially inserted into [`CatalogCache`] + /// + /// This is called regardless of it [`CatalogCache`] already contains the value + fn insert(&self, key: CacheKey, new: &CacheValue, old: Option<&CacheValue>); + + /// A key removed from the [`CatalogCache`] + fn evict(&self, key: CacheKey, value: &CacheValue); +} + +/// A concurrent Not-Recently-Used cache mapping [`CacheKey`] to [`CacheValue`] +#[derive(Debug, Default)] +pub struct CatalogCache { + map: DashMap, + observer: Option>, + limit: Option, +} + +impl CatalogCache { + /// Create a new `CatalogCache` with an optional memory limit + pub fn new(limit: Option) -> Self { + Self { + limit: limit.map(MemoryLimiter::new), + ..Default::default() + } + } + + /// Sets a [`CatalogCacheObserver`] for this [`CatalogCache`] + pub fn with_observer(self, observer: Arc) -> Self { + Self { + observer: Some(observer), + ..self + } + } + + /// Returns the value for `key` if it exists + pub fn get(&self, key: CacheKey) -> Option { + let entry = self.map.get(&key)?; + entry.used.store(true, Ordering::Relaxed); + Some(entry.value.clone()) + } + + /// Insert the given `value` into the cache + /// + /// Skips insertion and returns false iff an entry already exists with the + /// same or greater generation + pub fn insert(&self, key: CacheKey, value: CacheValue) -> Result { + match self.map.entry(key) { + Entry::Occupied(mut o) => { + let old = &o.get().value; + if value.generation <= old.generation { + return Ok(false); + } + if let Some(l) = &self.limit { + let new_len = value.data.len(); + let cur_len = old.data.len(); + match new_len > cur_len { + true => l.reserve(new_len - cur_len)?, + false => l.free(cur_len - new_len), + } + } + if let Some(v) = &self.observer { + v.insert(key, &value, Some(old)); + } + o.insert(value.into()); + } + Entry::Vacant(v) => { + if let Some(l) = &self.limit { + l.reserve(value.data.len())?; + } + if let Some(v) = &self.observer { + v.insert(key, &value, None); + } + v.insert(value.into()); + } + } + Ok(true) + } + + /// Removes the [`CacheValue`] for the given `key` if any + pub 
fn delete(&self, key: CacheKey) -> Option { + match self.map.entry(key) { + Entry::Occupied(o) => { + let old = &o.get().value; + if let Some(v) = &self.observer { + v.evict(key, old) + } + if let Some(l) = &self.limit { + l.free(old.data.len()) + } + Some(o.remove().value) + } + _ => None, + } + } + + /// Returns an iterator over the items in this cache + pub fn list(&self) -> CacheIterator<'_> { + CacheIterator(self.map.iter()) + } + + /// Evict all entries not accessed with [`CatalogCache::get`] or updated since + /// the last call to this function + /// + /// Periodically calling this provides a Not-Recently-Used eviction policy + pub fn evict_unused(&self) { + self.map.retain(|key, entry| { + let retain = entry.used.swap(false, Ordering::Relaxed); + if !retain { + if let Some(v) = &self.observer { + v.evict(*key, &entry.value); + } + if let Some(l) = &self.limit { + l.free(entry.value.data.len()); + } + } + retain + }); + } +} + +/// Iterator for [`CatalogCache`] +#[allow(missing_debug_implementations)] +pub struct CacheIterator<'a>(dashmap::iter::Iter<'a, CacheKey, CacheEntry>); + +impl<'a> Iterator for CacheIterator<'a> { + type Item = (CacheKey, CacheValue); + + fn next(&mut self) -> Option { + let value = self.0.next()?; + Some((*value.key(), value.value().value.clone())) + } + + fn size_hint(&self) -> (usize, Option) { + self.0.size_hint() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use bytes::Bytes; + use dashmap::DashSet; + + #[derive(Debug, Default)] + struct KeyObserver { + keys: DashSet, + } + + impl KeyObserver { + fn keys(&self) -> Vec { + let mut keys: Vec<_> = self.keys.iter().map(|k| *k).collect(); + keys.sort_unstable(); + keys + } + } + + impl CatalogCacheObserver for KeyObserver { + fn insert(&self, key: CacheKey, _new: &CacheValue, _old: Option<&CacheValue>) { + self.keys.insert(key); + } + + fn evict(&self, key: CacheKey, _value: &CacheValue) { + self.keys.remove(&key); + } + } + + #[test] + fn test_basic() { + let observer = Arc::new(KeyObserver::default()); + let cache = CatalogCache::default().with_observer(Arc::clone(&observer) as _); + + let v1 = CacheValue::new("1".into(), 5); + assert!(cache.insert(CacheKey::Table(0), v1.clone()).unwrap()); + assert_eq!(cache.get(CacheKey::Table(0)).unwrap(), v1); + + // Older generation rejected + assert!(!cache + .insert(CacheKey::Table(0), CacheValue::new("2".into(), 3)) + .unwrap()); + + // Value unchanged + assert_eq!(cache.get(CacheKey::Table(0)).unwrap(), v1); + + // Different key accepted + let v2 = CacheValue::new("2".into(), 5); + assert!(cache.insert(CacheKey::Table(1), v2.clone()).unwrap()); + assert_eq!(cache.get(CacheKey::Table(1)).unwrap(), v2); + + let v3 = CacheValue::new("3".into(), 0); + assert!(cache.insert(CacheKey::Partition(0), v3.clone()).unwrap()); + + // Newer generation updates + let v4 = CacheValue::new("4".into(), 6); + assert!(cache.insert(CacheKey::Table(0), v4.clone()).unwrap()); + + let mut values: Vec<_> = cache.list().collect(); + values.sort_unstable_by(|(a, _), (b, _)| a.cmp(b)); + + assert_eq!( + values, + vec![ + (CacheKey::Table(0), v4.clone()), + (CacheKey::Table(1), v2), + (CacheKey::Partition(0), v3), + ] + ); + assert_eq!( + observer.keys(), + vec![ + CacheKey::Table(0), + CacheKey::Table(1), + CacheKey::Partition(0) + ] + ); + + assert_eq!(cache.get(CacheKey::Namespace(0)), None); + assert_eq!(cache.delete(CacheKey::Namespace(0)), None); + + assert_eq!(cache.get(CacheKey::Table(0)).unwrap(), v4); + assert_eq!(cache.delete(CacheKey::Table(0)).unwrap(), v4); + 
assert_eq!(cache.get(CacheKey::Table(0)), None); + + assert_eq!(cache.list().count(), 2); + assert_eq!(observer.keys.len(), 2); + } + + #[test] + fn test_nru() { + let observer = Arc::new(KeyObserver::default()); + let cache = CatalogCache::default().with_observer(Arc::clone(&observer) as _); + + let value = CacheValue::new("1".into(), 0); + cache.insert(CacheKey::Namespace(0), value.clone()).unwrap(); + cache.insert(CacheKey::Partition(0), value.clone()).unwrap(); + cache.insert(CacheKey::Table(0), value.clone()).unwrap(); + + cache.evict_unused(); + // Inserted records should only be evicted on the next pass + assert_eq!(cache.list().count(), 3); + assert_eq!(observer.keys.len(), 3); + + // Updating a record marks it used + cache + .insert(CacheKey::Table(0), CacheValue::new("2".into(), 1)) + .unwrap(); + + // Fetching a record marks it used + cache.get(CacheKey::Partition(0)).unwrap(); + + // Insert a new record is used + cache.insert(CacheKey::Partition(1), value.clone()).unwrap(); + + cache.evict_unused(); + + // Namespace(0) evicted + let mut values: Vec<_> = cache.list().map(|(k, _)| k).collect(); + values.sort_unstable(); + let expected = vec![ + CacheKey::Table(0), + CacheKey::Partition(0), + CacheKey::Partition(1), + ]; + assert_eq!(values, expected); + assert_eq!(observer.keys(), expected); + + // Stale updates don't count as usage + assert!(!cache.insert(CacheKey::Partition(0), value).unwrap()); + + // Listing does not preserve recently used + assert_eq!(cache.list().count(), 3); + + cache.evict_unused(); + assert_eq!(cache.list().count(), 0); + assert_eq!(observer.keys.len(), 0) + } + + #[test] + fn test_limit() { + let cache = CatalogCache::new(Some(200)); + + let k1 = CacheKey::Table(1); + let k2 = CacheKey::Table(2); + let k3 = CacheKey::Table(3); + + let v_100 = Bytes::from(vec![0; 100]); + let v_20 = Bytes::from(vec![0; 20]); + + cache.insert(k1, CacheValue::new(v_100.clone(), 0)).unwrap(); + cache.insert(k2, CacheValue::new(v_100.clone(), 0)).unwrap(); + + let r = cache.insert(k3, CacheValue::new(v_20.clone(), 0)); + assert_eq!(r.unwrap_err().to_string(), "Cannot reserve additional 20 bytes for cache containing 200 bytes as would exceed limit of 200 bytes"); + + // Upsert k1 to 20 bytes + cache.insert(k1, CacheValue::new(v_20.clone(), 1)).unwrap(); + + // Can now insert k3 + cache.insert(k3, CacheValue::new(v_20.clone(), 0)).unwrap(); + + // Should evict nothing + cache.evict_unused(); + + // Cannot increase size of k3 to 100 + let r = cache.insert(k3, CacheValue::new(v_100.clone(), 1)); + assert_eq!(r.unwrap_err().to_string(), "Cannot reserve additional 80 bytes for cache containing 140 bytes as would exceed limit of 200 bytes"); + + cache.delete(k2).unwrap(); + cache.insert(k3, CacheValue::new(v_100.clone(), 1)).unwrap(); + + let r = cache.insert(k2, CacheValue::new(v_100.clone(), 1)); + assert_eq!(r.unwrap_err().to_string(), "Cannot reserve additional 100 bytes for cache containing 120 bytes as would exceed limit of 200 bytes"); + + // Should evict everything apart from k3 + cache.evict_unused(); + + cache.insert(k2, CacheValue::new(v_100.clone(), 1)).unwrap(); + } +} diff --git a/clap_blocks/Cargo.toml b/clap_blocks/Cargo.toml index f7707ac0848..de5d836c42a 100644 --- a/clap_blocks/Cargo.toml +++ b/clap_blocks/Cargo.toml @@ -5,24 +5,33 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] clap = { version = "4", features = ["derive", "env"] } +ed25519-dalek = { version = "2", features = ["pem"] } 
futures = "0.3" -http = "0.2.9" +http = "0.2.11" humantime = "2.1.0" iox_catalog = { path = "../iox_catalog" } +iox_time = { path = "../iox_time" } +itertools = "0.12.0" metric = { path = "../metric" } +non-empty-string = "0.2.4" object_store = { workspace = true } observability_deps = { path = "../observability_deps" } -snafu = "0.7" -sysinfo = "0.29.10" +parquet_cache = { path = "../parquet_cache" } +snafu = "0.8" +sysinfo = "0.30.5" trace_exporters = { path = "../trace_exporters" } trogging = { path = "../trogging", default-features = false, features = ["clap"] } +url = "2.4" uuid = { version = "1", features = ["v4"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] -tempfile = "3.8.0" +tempfile = "3.9.0" test_helpers = { path = "../test_helpers" } [features] diff --git a/clap_blocks/src/bulk_ingest.rs b/clap_blocks/src/bulk_ingest.rs new file mode 100644 index 00000000000..df383b5f8e0 --- /dev/null +++ b/clap_blocks/src/bulk_ingest.rs @@ -0,0 +1,274 @@ +//! CLI config for the router to enable bulk ingest APIs + +use ed25519_dalek::{ + pkcs8::{DecodePrivateKey, DecodePublicKey}, + SigningKey, VerifyingKey, +}; +use snafu::{ResultExt, Snafu}; +use std::{fs, io, path::PathBuf}; + +/// CLI config for bulk ingest. +#[derive(Debug, Clone, Default, clap::Parser)] +pub struct BulkIngestConfig { + /// Private signing key used for Parquet metadata returned from the `NewParquetMetadata` gRPC + /// API to prevent tampering/corruption of Parquet metadata provided by IOx to the process + /// preparing Parquet files for bulk ingest. + /// + /// This is a path to an Ed25519 private key file generated by OpenSSL with the command: + /// `openssl genpkey -algorithm ed25519 -out private-key-filename.pem` + /// + /// The public key used to verify signatures will be derived from this private key. Additional + /// public verification keys can be specified with + /// `-bulk-ingest-additional-verification-key-files` to support key rotation. + /// + /// If not specified, the `NewParquetMetadata` gRPC API will return unimplemented. + #[clap( + long = "bulk-ingest-metadata-signing-key-file", + env = "INFLUXDB_IOX_BULK_INGEST_METADATA_SIGNING_KEY_FILE" + )] + metadata_signing_key_file: Option, + + /// When in the process of rotating keys, specify paths to files containing public verification + /// keys of previously used private signing keys used for signing metadata in the past. + /// + /// These files can be derived from private key files with this OpenSSL command: + /// `openssl pkey -in private-key-filename.pem -pubout -out public-key-filename.pem` + /// + /// Example: "public-key-1.pem,public-key-2.pem" + /// + /// If verification of the metadata signature fails with the current public key derived from + /// the current signing key, these verification keys will be tested in order to allow older + /// signatures generated with the old key to still be validated. For best performance of + /// signature verification, specify the additional verification keys in order of most likely + /// candidates first (probably most recently used first). + /// + /// If no additional verification keys are specified, only the verification key associated with + /// the current metadata signing key will be used to validate signatures. 
+ #[clap( + long = "bulk-ingest-additional-verification-key-files", + env = "INFLUXDB_IOX_BULK_INGEST_ADDITIONAL_VERIFICATION_KEY_FILES", + required = false, + num_args=1.., + value_delimiter = ',', + )] + additional_verification_key_files: Vec, + + /// Rather than using whatever object store configuration may have been specified as a source + /// of presigned upload URLs for bulk ingest, use a mock implementation that returns an upload + /// URL value that can be inspected but not used. + /// + /// Only useful for testing bulk ingest without setting up S3! Do not use this in production! + #[clap( + hide = true, + long = "bulk-ingest-use-mock-presigned-url-signer", + env = "INFLUXDB_IOX_BULK_INGEST_USE_MOCK_PRESIGNED_URL_SIGNER", + default_value = "false" + )] + pub use_mock_presigned_url_signer: bool, +} + +impl BulkIngestConfig { + /// Constructor for bulk ingest configuration. + pub fn new( + metadata_signing_key_file: Option, + additional_verification_key_files: Vec, + use_mock_presigned_url_signer: bool, + ) -> Self { + Self { + metadata_signing_key_file, + additional_verification_key_files, + use_mock_presigned_url_signer, + } + } +} + +impl TryFrom<&BulkIngestConfig> for Option { + type Error = BulkIngestConfigError; + + fn try_from(config: &BulkIngestConfig) -> Result { + config + .metadata_signing_key_file + .as_ref() + .map(|signing_key_file| { + let signing_key: SigningKey = fs::read_to_string(signing_key_file) + .context(ReadingSigningKeyFileSnafu { + filename: &signing_key_file, + }) + .and_then(|file_contents| { + DecodePrivateKey::from_pkcs8_pem(&file_contents).context( + DecodingSigningKeySnafu { + filename: signing_key_file, + }, + ) + })?; + + let additional_verifying_keys: Vec<_> = config + .additional_verification_key_files + .iter() + .map(|verification_key_file| { + fs::read_to_string(verification_key_file) + .context(ReadingVerifyingKeyFileSnafu { + filename: &verification_key_file, + }) + .and_then(|file_contents| { + DecodePublicKey::from_public_key_pem(&file_contents).context( + DecodingVerifyingKeySnafu { + filename: verification_key_file, + }, + ) + }) + }) + .collect::, _>>()?; + + Ok(BulkIngestKeys { + signing_key, + additional_verifying_keys, + }) + }) + .transpose() + } +} + +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum BulkIngestConfigError { + #[snafu(display("Could not read signing key from {}: {source}", filename.display()))] + ReadingSigningKeyFile { + filename: PathBuf, + source: io::Error, + }, + + #[snafu(display("Could not decode signing key from {}: {source}", filename.display()))] + DecodingSigningKey { + filename: PathBuf, + source: ed25519_dalek::pkcs8::Error, + }, + + #[snafu(display("Could not read verifying key from {}: {source}", filename.display()))] + ReadingVerifyingKeyFile { + filename: PathBuf, + source: io::Error, + }, + + #[snafu(display("Could not decode verifying key from {}: {source}", filename.display()))] + DecodingVerifyingKey { + filename: PathBuf, + source: ed25519_dalek::pkcs8::spki::Error, + }, +} + +/// Key values extracted from the files specified to the CLI. To get an instance, first create a +/// `BulkIngestConfig`, then call `try_from` to get a `Result` containing an +/// `Option` where the `Option` will be `Some` if the `BulkIngestConfig`'s +/// `metadata_signing_key_file` value is `Some`. +/// +/// If any filenames specified anywhere in the `BulkIngestConfig` can't be read or don't contain +/// valid key values, the `try_from` implementation will return an error. 
+#[derive(Debug)] +pub struct BulkIngestKeys { + /// The parsed private signing key value contained in the file specified to + /// `--bulk-ingest-metadata-signing-key-file`. + pub signing_key: SigningKey, + + /// If any files were specified in `--bulk-ingest-additional-verification-key-files`, this list + /// will contain their parsed public verification key values. + pub additional_verifying_keys: Vec, +} + +#[cfg(test)] +mod tests { + use super::*; + use clap::Parser; + use std::process::Command; + use test_helpers::{assert_contains, make_temp_file, tmp_dir}; + + #[test] + fn missing_signing_key_param() { + // No signing key file -> no keys + let config = BulkIngestConfig::try_parse_from(["something"]).unwrap(); + let keys: Option = (&config).try_into().unwrap(); + assert!(keys.is_none(), "expected None, got: {:?}", keys); + + // Even if there are additional verification key files; no signing key file means no keys + let config = BulkIngestConfig::try_parse_from([ + "something", + "--bulk-ingest-additional-verification-key-files", + "some-public-key-filename.pem", + ]) + .unwrap(); + let keys: Option = (&config).try_into().unwrap(); + assert!(keys.is_none(), "expected None, got: {:?}", keys); + } + + #[test] + fn signing_key_file_not_found() { + let nonexistent_filename = "do-not-create-a-file-with-this-name-or-this-test-will-fail"; + let config = BulkIngestConfig::try_parse_from([ + "something", + "--bulk-ingest-metadata-signing-key-file", + nonexistent_filename, + ]) + .unwrap(); + + let keys: Result, _> = (&config).try_into(); + let err = keys.unwrap_err(); + assert_contains!( + err.to_string(), + format!("Could not read signing key from {nonexistent_filename}") + ); + } + + #[test] + fn signing_key_file_contents_invalid() { + let signing_key_file = make_temp_file("not a valid signing key"); + let signing_key_filename = signing_key_file.path().display().to_string(); + + let config = BulkIngestConfig::try_parse_from([ + "something", + "--bulk-ingest-metadata-signing-key-file", + &signing_key_filename, + ]) + .unwrap(); + + let keys: Result, _> = (&config).try_into(); + let err = keys.unwrap_err(); + assert_contains!( + err.to_string(), + format!("Could not decode signing key from {signing_key_filename}") + ); + } + + #[test] + fn valid_signing_key_file_no_additional_key_files() { + let tmp_dir = tmp_dir().unwrap(); + let signing_key_filename = tmp_dir + .path() + .join("test-private-key.pem") + .display() + .to_string(); + Command::new("openssl") + .arg("genpkey") + .arg("-algorithm") + .arg("ed25519") + .arg("-out") + .arg(&signing_key_filename) + .output() + .unwrap(); + + let config = BulkIngestConfig::try_parse_from([ + "something", + "--bulk-ingest-metadata-signing-key-file", + &signing_key_filename, + ]) + .unwrap(); + + let keys: Result, _> = (&config).try_into(); + let keys = keys.unwrap().unwrap(); + let additional_keys = keys.additional_verifying_keys; + assert!( + additional_keys.is_empty(), + "expected additional keys to be empty, got {:?}", + additional_keys + ); + } +} diff --git a/clap_blocks/src/catalog_cache.rs b/clap_blocks/src/catalog_cache.rs new file mode 100644 index 00000000000..a9b85435a8c --- /dev/null +++ b/clap_blocks/src/catalog_cache.rs @@ -0,0 +1,154 @@ +//! Config for the catalog cache server mode. 
+ +use std::time::Duration; + +use itertools::Itertools; +use snafu::{OptionExt, Snafu}; +use url::{Host, Url}; + +use crate::memory_size::MemorySize; + +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(display("host '{host}' is not a prefix of '{prefix}'"))] + NotAPrefix { host: String, prefix: String }, + + #[snafu(display("host '{host}' is not a valid host"))] + NotAValidHost { host: String }, + + #[snafu(display("invalid url: {source}"))] + InvalidUrl { source: url::ParseError }, + + #[snafu(display("Expected exactly two peers"))] + InvalidPeers, +} + +/// CLI config for catalog configuration +#[derive(Debug, Clone, PartialEq, Eq, clap::Parser)] +pub struct CatalogConfig { + /// Host Name + /// + /// If provided, any matching entries in peers will be ignored + #[clap(long = "hostname", env = "INFLUXDB_IOX_HOSTNAME", value_parser = Host::parse)] + pub hostname: Option<Host>, + + /// Peers + /// + /// Can be provided as a comma-separated list, or on the command line multiple times + #[clap( + long = "catalog-cache-peers", + env = "INFLUXDB_IOX_CATALOG_CACHE_PEERS", + required = false, + value_delimiter = ',' + )] + pub peers: Vec<Url>, + + /// Warmup delay. + /// + /// The warm-up (via dumping the cache of our peers) is delayed by the given time to make sure that we already + /// receive quorum writes. This ensures a gapless transition / roll-out w/o any cache MISSes (esp. w/o any backend requests). + #[clap( + long = "catalog-cache-warmup-delay", + env = "INFLUXDB_IOX_CATALOG_CACHE_WARMUP_DELAY", + default_value = default_warmup_delay(), + value_parser = humantime::parse_duration, + )] + pub warmup_delay: Duration, + + /// Garbage collection interval. + /// + /// Every time this interval passes, cache elements that have not been used (i.e. read or updated) since the last time + /// are evicted from the cache. + #[clap( + long = "catalog-cache-gc-interval", + env = "INFLUXDB_IOX_CATALOG_CACHE_GC_INTERVAL", + default_value = default_gc_interval(), + value_parser = humantime::parse_duration, + )] + pub gc_interval: Duration, + + /// Maximum number of bytes that should be cached within the catalog cache. + /// + /// If that limit is exceeded, no new values are accepted. This is meant as a safety measure. You should adjust + /// your pod size and the GC interval (`--catalog-cache-gc-interval` / `INFLUXDB_IOX_CATALOG_CACHE_GC_INTERVAL`) to + /// your workload. + /// + /// Can be given as an absolute value or as a percentage of the total available memory (e.g. `10%`). + #[clap( + long = "catalog-cache-size-limit", + env = "INFLUXDB_IOX_CATALOG_CACHE_SIZE_LIMIT", + default_value = "1073741824", // 1GB + action + )] + pub cache_size_limit: MemorySize, + + /// Number of concurrent quorum operations that a single request can trigger. + #[clap( + long = "catalog-cache-quorum-fanout", + env = "INFLUXDB_IOX_CATALOG_CACHE_QUORUM_FANOUT", + default_value_t = 10 + )] + pub quorum_fanout: usize, +} + +impl CatalogConfig { + /// Returns the URLs of the other catalog cache nodes.
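The GC interval and size limit only take effect if something periodically drives the local cache's NRU eviction against the configured limit. Below is a minimal sketch of such a driver; it assumes the `MemorySize` limit has already been converted to a plain byte count (that conversion is not shown in this diff), and the loop itself is illustrative rather than the server's actual wiring.

```rust
use std::sync::Arc;
use std::time::Duration;

use catalog_cache::local::CatalogCache;

/// Build a size-limited local cache from an already-resolved byte limit.
fn build_cache(cache_size_limit_bytes: usize) -> Arc<CatalogCache> {
    Arc::new(CatalogCache::new(Some(cache_size_limit_bytes)))
}

/// Hypothetical driver: evict not-recently-used entries on every GC tick.
async fn run_gc_loop(cache: Arc<CatalogCache>, gc_interval: Duration) {
    let mut ticker = tokio::time::interval(gc_interval);
    loop {
        ticker.tick().await;
        // Entries neither read nor updated since the previous tick are dropped,
        // which also releases their reservation against the memory limit.
        cache.evict_unused();
    }
}
```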
+ pub fn peers(&self) -> Result<[Url; 2], Error> { + let (peer1, peer2) = self + .peers + .iter() + .filter(|x| match (x.host(), &self.hostname) { + (Some(a), Some(r)) => &a != r, + _ => true, + }) + .collect_tuple() + .context(InvalidPeersSnafu)?; + + Ok([peer1.clone(), peer2.clone()]) + } +} + +fn default_warmup_delay() -> &'static str { + let s = humantime::format_duration(Duration::from_secs(60 * 5)).to_string(); + Box::leak(Box::new(s)) +} + +fn default_gc_interval() -> &'static str { + let s = humantime::format_duration(Duration::from_secs(60 * 15)).to_string(); + Box::leak(Box::new(s)) +} + +#[cfg(test)] +mod tests { + use super::*; + use clap::Parser; + + #[test] + fn test_peers() { + let config = CatalogConfig::parse_from([ + "binary", + "--catalog-cache-peers", + "http://peer1:8080", + "--catalog-cache-peers", + "http://peer2:9090", + ]); + let peer1 = Url::parse("http://peer1:8080").unwrap(); + let peer2 = Url::parse("http://peer2:9090").unwrap(); + + let peers = config.peers().unwrap(); + assert_eq!(peers, [peer1.clone(), peer2.clone()]); + + let mut config = CatalogConfig::parse_from([ + "binary", + "--catalog-cache-peers", + "http://peer1:8080,http://peer2:9090,http://peer3:9091", + ]); + let err = config.peers().unwrap_err(); + assert!(matches!(err, Error::InvalidPeers), "{err}"); + + config.hostname = Some(Host::parse("peer3").unwrap()); + let peers = config.peers().unwrap(); + assert_eq!(peers, [peer1.clone(), peer2.clone()]); + } +} diff --git a/clap_blocks/src/catalog_dsn.rs b/clap_blocks/src/catalog_dsn.rs index a7c9e13cd91..74e84bc73bf 100644 --- a/clap_blocks/src/catalog_dsn.rs +++ b/clap_blocks/src/catalog_dsn.rs @@ -1,10 +1,13 @@ //! Catalog-DSN-related configs. +use http::uri::InvalidUri; +use iox_catalog::grpc::client::GrpcCatalogClient; use iox_catalog::sqlite::{SqliteCatalog, SqliteConnectionOptions}; use iox_catalog::{ interface::Catalog, mem::MemCatalog, postgres::{PostgresCatalog, PostgresConnectionOptions}, }; +use iox_time::TimeProvider; use observability_deps::tracing::*; use snafu::{ResultExt, Snafu}; use std::{sync::Arc, time::Duration}; @@ -18,6 +21,9 @@ pub enum Error { #[snafu(display("Catalog DSN not specified. Expected a string like 'postgresql://postgres@localhost:5432/postgres' or 'sqlite:///tmp/catalog.sqlite'"))] DsnNotSpecified {}, + #[snafu(display("Invalid URI: {source}"))] + InvalidUri { source: InvalidUri }, + #[snafu(display("A catalog error occurred: {}", source))] Catalog { source: iox_catalog::interface::Error, @@ -55,7 +61,9 @@ pub struct CatalogDsnConfig { /// /// PostgreSQL: `postgresql://postgres@localhost:5432/postgres` /// - /// Sqlite (a local filename /tmp/foo.sqlite): `sqlite:///tmp/foo.sqlite` + /// Sqlite (a local filename /tmp/foo.sqlite): `sqlite:///tmp/foo.sqlite` - + /// note sqlite is for development/testing only and should not be used for + /// production workloads. 
/// /// Memory (ephemeral, only useful for testing): `memory` /// @@ -117,6 +125,7 @@ impl CatalogDsnConfig { &self, app_name: &'static str, metrics: Arc, + time_provider: Arc, ) -> Result, Error> { let Some(dsn) = self.dsn.as_ref() else { return Err(Error::DsnNotSpecified {}); @@ -141,7 +150,7 @@ impl CatalogDsnConfig { )) } else if dsn == "memory" { info!("Catalog: In-memory"); - let mem = MemCatalog::new(metrics); + let mem = MemCatalog::new(metrics, time_provider); Ok(Arc::new(mem)) } else if let Some(file_path) = dsn.strip_prefix("sqlite://") { info!(file_path, "Catalog: Sqlite"); @@ -153,6 +162,11 @@ impl CatalogDsnConfig { .await .context(CatalogSnafu)?, )) + } else if dsn.starts_with("http://") || dsn.starts_with("https://") { + info!("Catalog: gRPC"); + let uri = dsn.parse().context(InvalidUriSnafu)?; + let grpc = GrpcCatalogClient::new(uri, metrics, time_provider); + Ok(Arc::new(grpc)) } else { Err(Error::UnknownCatalogDsn { dsn: dsn.to_string(), diff --git a/clap_blocks/src/compactor.rs b/clap_blocks/src/compactor.rs index 55ab6c15d8b..9b63bc82f1a 100644 --- a/clap_blocks/src/compactor.rs +++ b/clap_blocks/src/compactor.rs @@ -73,80 +73,29 @@ pub struct CompactorConfig { #[clap( long = "exec-mem-pool-bytes", env = "INFLUXDB_IOX_EXEC_MEM_POOL_BYTES", - default_value = "8589934592", // 8GB + default_value = "17179869184", // 16GB action )] pub exec_mem_pool_bytes: MemorySize, - /// Desired max size of compacted parquet files. + /// Overrides INFLUXDB_IOX_EXEC_MEM_POOL_BYTES to set the size of memory pool + /// used during compaction DF plan execution. This value is expressed as a percent + /// of the memory limit for the cgroup (e.g. 70 = 70% of the cgroup memory limit). + /// This is converted to a byte limit as the compactor starts. /// - /// Note this is a target desired value, rather than a guarantee. - /// 1024 * 1024 * 100 = 104,857,600 - #[clap( - long = "compaction-max-desired-size-bytes", - env = "INFLUXDB_IOX_COMPACTION_MAX_DESIRED_FILE_SIZE_BYTES", - default_value = "104857600", - action - )] - pub max_desired_file_size_bytes: u64, - - /// Percentage of desired max file size for "leading edge split" - /// optimization. - /// - /// This setting controls the estimated output file size at which - /// the compactor will apply the "leading edge" optimization. - /// - /// When compacting files together, if the output size is - /// estimated to be greater than the following quantity, the - /// "leading edge split" optimization will be applied: - /// - /// percentage_max_file_size * max_desired_file_size_bytes - /// - /// This value must be between (0, 100) - /// - /// Default is 20 - #[clap( - long = "compaction-percentage-max-file_size", - env = "INFLUXDB_IOX_COMPACTION_PERCENTAGE_MAX_FILE_SIZE", - default_value = "20", - action - )] - pub percentage_max_file_size: u16, - - /// Split file percentage for "leading edge split" - /// - /// To reduce the likelihood of recompacting the same data too many - /// times, the compactor uses the "leading edge split" - /// optimization for the common case where the new data written - /// into a partition also has the most recent timestamps. 
- /// - /// When compacting multiple files together, if the compactor - /// estimates the resulting file will be large enough (see - /// `percentage_max_file_size`) it creates two output files - /// rather than one, split by time, like this: - /// - /// `|-------------- older_data -----------------||---- newer_data ----|` - /// - /// In the common case, the file containing `older_data` is less - /// likely to overlap with new data written in. - /// - /// This setting controls what percentage of data is placed into - /// the `older_data` portion. + /// Extreme values (<20% or >90%) are ignored and INFLUXDB_IOX_EXEC_MEM_POOL_BYTES + /// is used. It will also use INFLUXDB_IOX_EXEC_MEM_POOL_BYTES if we fail to read + /// the cgroup limit, or it doesn't parse to a sane value. /// - /// Increasing this value increases the average size of compacted - /// files after the first round of compaction. However, doing so - /// also increase the likelihood that late arriving data will - /// overlap with larger existing files, necessitating additional - /// compaction rounds. - /// - /// This value must be between (0, 100) + /// If compaction plans attempt to allocate more than the computed byte limit + /// during execution, they will error with "ResourcesExhausted". #[clap( - long = "compaction-split-percentage", - env = "INFLUXDB_IOX_COMPACTION_SPLIT_PERCENTAGE", - default_value = "80", + long = "exec-mem-pool-percent", + env = "INFLUXDB_IOX_EXEC_MEM_POOL_PERCENT", + default_value = "70", action )] - pub split_percentage: u16, + pub exec_mem_pool_percent: u64, /// Maximum duration of the per-partition compaction task in seconds. #[clap( @@ -182,39 +131,6 @@ pub struct CompactorConfig { )] pub enable_scratchpad: bool, - /// Maximum number of files that the compactor will try and - /// compact in a single plan. - /// - /// The higher this setting is the fewer compactor plans are run - /// and thus fewer resources over time are consumed by the - /// compactor. Increasing this setting also increases the peak - /// memory used for each compaction plan, and thus if it is set - /// too high, the compactor plans may exceed available memory. - #[clap( - long = "compaction-max-num-files-per-plan", - env = "INFLUXDB_IOX_COMPACTION_MAX_NUM_FILES_PER_PLAN", - default_value = "20", - action - )] - pub max_num_files_per_plan: usize, - - /// Minimum number of L1 files to compact to L2. - /// - /// If there are more than this many L1 (by definition non - /// overlapping) files in a partition, the compactor will compact - /// them together into one or more larger L2 files. - /// - /// Setting this value higher in general results in fewer overall - /// resources spent on compaction but more files per partition (and - /// thus less optimal compression and query performance). - #[clap( - long = "compaction-min-num-l1-files-to-compact", - env = "INFLUXDB_IOX_COMPACTION_MIN_NUM_L1_FILES_TO_COMPACT", - default_value = "10", - action - )] - pub min_num_l1_files_to_compact: usize, - /// Only process all discovered partitions once. /// /// By default the compactor will continuously loop over all @@ -227,19 +143,6 @@ pub struct CompactorConfig { )] pub process_once: bool, - /// Maximum number of columns in a table of a partition that - /// will be able to considered to get compacted - /// - /// If a table has more than this many columns, the compactor will - /// not compact it, to avoid large memory use. 
- #[clap( - long = "compaction-max-num-columns-per-table", - env = "INFLUXDB_IOX_COMPACTION_MAX_NUM_COLUMNS_PER_TABLE", - default_value = "10000", - action - )] - pub max_num_columns_per_table: usize, - /// Limit the number of partition fetch queries to at most the specified /// number of queries per second. /// diff --git a/clap_blocks/src/compactor_scheduler.rs b/clap_blocks/src/compactor_scheduler.rs index 25aeecc6aed..e2b3c8f3cac 100644 --- a/clap_blocks/src/compactor_scheduler.rs +++ b/clap_blocks/src/compactor_scheduler.rs @@ -1,5 +1,8 @@ //! Compactor-Scheduler-related configs. +use crate::socket_addr::SocketAddr; +use std::str::FromStr; + /// Compaction Scheduler type. #[derive(Debug, Default, Clone, Copy, PartialEq, clap::ValueEnum)] pub enum CompactorSchedulerType { @@ -90,6 +93,62 @@ pub struct PartitionSourceConfigForLocalScheduler { pub ignore_partition_skip_marker: bool, } +/// CLI config for scheduler's gossip. +#[derive(Debug, Clone, clap::Parser)] +pub struct CompactorSchedulerGossipConfig { + /// A comma-delimited set of seed gossip peer addresses. + /// + /// Example: "10.0.0.1:4242,10.0.0.2:4242" + /// + /// These seeds will be used to discover all other peers that talk to the + /// same seeds. Typically all nodes in the cluster should use the same set + /// of seeds. + #[clap( + long = "compactor-scheduler-gossip-seed-list", + env = "INFLUXDB_IOX_COMPACTOR_SCHEDULER_GOSSIP_SEED_LIST", + required = false, + num_args=1.., + value_delimiter = ',', + requires = "scheduler_gossip_bind_address", // Field name, not flag + )] + pub scheduler_seed_list: Vec, + + /// The UDP socket address IOx will use for gossip communication between + /// peers. + /// + /// Example: "0.0.0.0:4242" + /// + /// If not provided, the gossip sub-system is disabled. + #[clap( + long = "compactor-scheduler-gossip-bind-address", + env = "INFLUXDB_IOX_COMPACTOR_SCHEDULER_GOSSIP_BIND_ADDR", + default_value = "0.0.0.0:0", + required = false, + action + )] + pub scheduler_gossip_bind_address: SocketAddr, +} + +impl Default for CompactorSchedulerGossipConfig { + fn default() -> Self { + Self { + scheduler_seed_list: vec![], + scheduler_gossip_bind_address: SocketAddr::from_str("0.0.0.0:4324").unwrap(), + } + } +} + +impl CompactorSchedulerGossipConfig { + /// constructor for GossipConfig + /// + pub fn new(bind_address: &str, seed_list: Vec) -> Self { + Self { + scheduler_seed_list: seed_list, + scheduler_gossip_bind_address: SocketAddr::from_str(bind_address).unwrap(), + } + } +} + /// CLI config for compactor scheduler. #[derive(Debug, Clone, Default, clap::Parser)] pub struct CompactorSchedulerConfig { @@ -103,6 +162,135 @@ pub struct CompactorSchedulerConfig { )] pub compactor_scheduler_type: CompactorSchedulerType, + /// Maximum number of files that the compactor will try and + /// compact in a single plan. + /// + /// The higher this setting is the fewer compactor plans are run + /// and thus fewer resources over time are consumed by the + /// compactor. Increasing this setting also increases the peak + /// memory used for each compaction plan, and thus if it is set + /// too high, the compactor plans may exceed available memory. + #[clap( + long = "compaction-max-num-files-per-plan", + env = "INFLUXDB_IOX_COMPACTION_MAX_NUM_FILES_PER_PLAN", + default_value = "20", + action + )] + pub max_num_files_per_plan: usize, + + /// Desired max size of compacted parquet files. + /// + /// Note this is a target desired value, rather than a guarantee. 
+ /// 1024 * 1024 * 100 = 104,857,600 + #[clap( + long = "compaction-max-desired-size-bytes", + env = "INFLUXDB_IOX_COMPACTION_MAX_DESIRED_FILE_SIZE_BYTES", + default_value = "104857600", + action + )] + pub max_desired_file_size_bytes: u64, + + /// Minimum number of L1 files to compact to L2. + /// + /// If there are more than this many L1 (by definition non + /// overlapping) files in a partition, the compactor will compact + /// them together into one or more larger L2 files. + /// + /// Setting this value higher in general results in fewer overall + /// resources spent on compaction but more files per partition (and + /// thus less optimal compression and query performance). + #[clap( + long = "compaction-min-num-l1-files-to-compact", + env = "INFLUXDB_IOX_COMPACTION_MIN_NUM_L1_FILES_TO_COMPACT", + default_value = "10", + action + )] + pub min_num_l1_files_to_compact: usize, + + /// Maximum number of columns in a table of a partition that + /// will be able to considered to get compacted + /// + /// If a table has more than this many columns, the compactor will + /// not compact it, to avoid large memory use. + #[clap( + long = "compaction-max-num-columns-per-table", + env = "INFLUXDB_IOX_COMPACTION_MAX_NUM_COLUMNS_PER_TABLE", + default_value = "10000", + action + )] + pub max_num_columns_per_table: usize, + + /// Percentage of desired max file size for "leading edge split" + /// optimization. + /// + /// This setting controls the estimated output file size at which + /// the compactor will apply the "leading edge" optimization. + /// + /// When compacting files together, if the output size is + /// estimated to be greater than the following quantity, the + /// "leading edge split" optimization will be applied: + /// + /// percentage_max_file_size * target_file_size + /// + /// This value must be between (0, 100) + /// + /// Default is 20 + #[clap( + long = "compaction-percentage-max-file_size", + env = "INFLUXDB_IOX_COMPACTION_PERCENTAGE_MAX_FILE_SIZE", + default_value = "20", + action + )] + pub percentage_max_file_size: u16, + + /// Enable new priority-based compaction selection. + /// + /// Eventually, this will be the only way to select partitions. + /// + /// Default is false + #[clap( + long = "compaction-priority-based-selection", + env = "INFLUXDB_IOX_COMPACTION_PRIORITY_BASED_SELECTION", + default_value = "false", + action + )] + pub priority_based_selection: bool, + + /// Split file percentage for "leading edge split" + /// + /// To reduce the likelihood of recompacting the same data too many + /// times, the compactor uses the "leading edge split" + /// optimization for the common case where the new data written + /// into a partition also has the most recent timestamps. + /// + /// When compacting multiple files together, if the compactor + /// estimates the resulting file will be large enough (see + /// `percentage_max_file_size`) it creates two output files + /// rather than one, split by time, like this: + /// + /// `|-------------- older_data -----------------||---- newer_data ----|` + /// + /// In the common case, the file containing `older_data` is less + /// likely to overlap with new data written in. + /// + /// This setting controls what percentage of data is placed into + /// the `older_data` portion. + /// + /// Increasing this value increases the average size of compacted + /// files after the first round of compaction. 
However, doing so + /// also increase the likelihood that late arriving data will + /// overlap with larger existing files, necessitating additional + /// compaction rounds. + /// + /// This value must be between (0, 100) + #[clap( + long = "compaction-split-percentage", + env = "INFLUXDB_IOX_COMPACTION_SPLIT_PERCENTAGE", + default_value = "80", + action + )] + pub split_percentage: u16, + /// Partition source config used by the local scheduler. #[clap(flatten)] pub partition_source_config: PartitionSourceConfigForLocalScheduler, @@ -110,6 +298,10 @@ pub struct CompactorSchedulerConfig { /// Shard config used by the local scheduler. #[clap(flatten)] pub shard_config: ShardConfigForLocalScheduler, + + /// Gossip config. + #[clap(flatten)] + pub gossip_config: CompactorSchedulerGossipConfig, } #[cfg(test)] diff --git a/clap_blocks/src/garbage_collector.rs b/clap_blocks/src/garbage_collector.rs index 95e6aa3a7d8..0b10d785456 100644 --- a/clap_blocks/src/garbage_collector.rs +++ b/clap_blocks/src/garbage_collector.rs @@ -24,14 +24,6 @@ pub struct GarbageCollectorConfig { )] pub objectstore_cutoff: Duration, - /// Number of concurrent object store deletion tasks - #[clap( - long, - default_value_t = 5, - env = "INFLUXDB_IOX_GC_OBJECTSTORE_CONCURRENT_DELETES" - )] - pub objectstore_concurrent_deletes: usize, - /// Number of minutes to sleep between iterations of the objectstore list loop. /// This is the sleep between entirely fresh list operations. /// Defaults to 30 minutes. @@ -65,13 +57,26 @@ pub struct GarbageCollectorConfig { pub parquetfile_cutoff: Duration, /// Number of minutes to sleep between iterations of the parquet file deletion loop. + /// /// Defaults to 30 minutes. + /// + /// If both INFLUXDB_IOX_GC_PARQUETFILE_SLEEP_INTERVAL_MINUTES and + /// INFLUXDB_IOX_GC_PARQUETFILE_SLEEP_INTERVAL are specified, the smaller is chosen + #[clap(long, env = "INFLUXDB_IOX_GC_PARQUETFILE_SLEEP_INTERVAL_MINUTES")] + pub parquetfile_sleep_interval_minutes: Option, + + /// Duration to sleep between iterations of the parquet file deletion loop. + /// + /// Defaults to 30 minutes. + /// + /// If both INFLUXDB_IOX_GC_PARQUETFILE_SLEEP_INTERVAL_MINUTES and + /// INFLUXDB_IOX_GC_PARQUETFILE_SLEEP_INTERVAL are specified, the smaller is chosen #[clap( long, - default_value_t = 30, - env = "INFLUXDB_IOX_GC_PARQUETFILE_SLEEP_INTERVAL_MINUTES" + value_parser = parse_duration, + env = "INFLUXDB_IOX_GC_PARQUETFILE_SLEEP_INTERVAL" )] - pub parquetfile_sleep_interval_minutes: u64, + pub parquetfile_sleep_interval: Option, /// Number of minutes to sleep between iterations of the retention code. 
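The new INFLUXDB_IOX_GC_PARQUETFILE_SLEEP_INTERVAL flag above is wired through `value_parser = parse_duration`, whose definition lies outside this hunk. A rough sketch of such a parser plus the documented "smaller interval wins" resolution rule, assuming the `humantime` crate; the real helper may be implemented differently.

use std::time::Duration;

// Accepts human-readable durations such as "35m" or "3d".
fn parse_duration(s: &str) -> Result<Duration, humantime::DurationError> {
    humantime::parse_duration(s)
}

// Mirrors the documented rule: default 30 minutes, otherwise whichever of the
// two configured values is smaller.
fn resolve_sleep(minutes: Option<u64>, interval: Option<Duration>) -> Duration {
    match (interval, minutes) {
        (None, None) => Duration::from_secs(30 * 60),
        (Some(d), None) => d,
        (None, Some(m)) => Duration::from_secs(m * 60),
        (Some(d), Some(m)) => d.min(Duration::from_secs(m * 60)),
    }
}

fn main() {
    assert_eq!(parse_duration("35m").unwrap(), Duration::from_secs(35 * 60));
    assert_eq!(
        resolve_sleep(Some(34), Some(Duration::from_secs(35 * 60))),
        Duration::from_secs(34 * 60)
    );
}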
/// Defaults to 35 minutes to reduce incidence of it running at the same time as the parquet @@ -83,3 +88,63 @@ pub struct GarbageCollectorConfig { )] pub retention_sleep_interval_minutes: u64, } + +impl GarbageCollectorConfig { + /// Returns the parquet_file sleep interval + pub fn parquetfile_sleep_interval(&self) -> Duration { + match ( + self.parquetfile_sleep_interval, + self.parquetfile_sleep_interval_minutes, + ) { + (None, None) => Duration::from_secs(30 * 60), + (Some(d), None) => d, + (None, Some(m)) => Duration::from_secs(m * 60), + (Some(d), Some(m)) => d.min(Duration::from_secs(m * 60)), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_gc_config() { + let a: &[&str] = &[]; + let config = GarbageCollectorConfig::parse_from(a); + assert_eq!( + config.parquetfile_sleep_interval(), + Duration::from_secs(30 * 60) + ); + + let config = + GarbageCollectorConfig::parse_from(["something", "--parquetfile-sleep-interval", "3d"]); + + assert_eq!( + config.parquetfile_sleep_interval(), + Duration::from_secs(24 * 60 * 60 * 3) + ); + + let config = GarbageCollectorConfig::parse_from([ + "something", + "--parquetfile-sleep-interval-minutes", + "34", + ]); + assert_eq!( + config.parquetfile_sleep_interval(), + Duration::from_secs(34 * 60) + ); + + let config = GarbageCollectorConfig::parse_from([ + "something", + "--parquetfile-sleep-interval-minutes", + "34", + "--parquetfile-sleep-interval", + "35m", + ]); + assert_eq!( + config.parquetfile_sleep_interval(), + Duration::from_secs(34 * 60) + ); + } +} diff --git a/clap_blocks/src/gossip.rs b/clap_blocks/src/gossip.rs index 90623631c46..47365baec4d 100644 --- a/clap_blocks/src/gossip.rs +++ b/clap_blocks/src/gossip.rs @@ -1,6 +1,7 @@ //! CLI config for cluster gossip communication. use crate::socket_addr::SocketAddr; +use std::str::FromStr; /// Configuration parameters for the cluster gossip communication mechanism. #[derive(Debug, Clone, clap::Parser)] @@ -32,18 +33,20 @@ pub struct GossipConfig { #[clap( long = "gossip-bind-address", env = "INFLUXDB_IOX_GOSSIP_BIND_ADDR", - requires = "seed_list", // Field name, not flag + default_value = "0.0.0.0:4242", + required = false, action )] - pub gossip_bind_address: Option, + pub gossip_bind_address: SocketAddr, } impl GossipConfig { - /// Initialise the gossip config to be disabled. - pub fn disabled() -> Self { + /// constructor for GossipConfig + /// + pub fn new(bind_address: &str, seed_list: Vec) -> Self { Self { - seed_list: vec![], - gossip_bind_address: None, + seed_list, + gossip_bind_address: SocketAddr::from_str(bind_address).unwrap(), } } } diff --git a/clap_blocks/src/ingester.rs b/clap_blocks/src/ingester.rs index d736b16d921..be2ab26022a 100644 --- a/clap_blocks/src/ingester.rs +++ b/clap_blocks/src/ingester.rs @@ -76,6 +76,19 @@ pub struct IngesterConfig { )] pub persist_hot_partition_cost: usize, + /// An optional lower bound byte size limit that buffered data within a + /// partition must reach in order to be converted into an incremental + /// snapshot at query time. + /// + /// Snapshots improve query performance by amortising response generation at + /// the expense of a small memory overhead. Snapshots are retained until the + /// buffer is persisted. + #[clap( + long = "min-partition-snapshot-size", + env = "INFLUXDB_IOX_MIN_PARTITION_SNAPSHOT_SIZE" + )] + pub min_partition_snapshot_size: Option, + /// Limit the number of partitions that may be buffered in a single /// namespace (across all tables) at any one time. 
/// diff --git a/clap_blocks/src/ingester_address.rs b/clap_blocks/src/ingester_address.rs index 96300e92352..90a8e8d489f 100644 --- a/clap_blocks/src/ingester_address.rs +++ b/clap_blocks/src/ingester_address.rs @@ -1,7 +1,7 @@ //! Shared configuration and tests for accepting ingester addresses as arguments. use http::uri::{InvalidUri, InvalidUriParts, Uri}; -use snafu::Snafu; +use snafu::{ResultExt, Snafu}; use std::{fmt::Display, str::FromStr}; /// An address to an ingester's gRPC API. Create by using `IngesterAddress::from_str`. @@ -14,7 +14,7 @@ pub struct IngesterAddress { #[allow(missing_docs)] #[derive(Snafu, Debug)] pub enum Error { - #[snafu(context(false))] + #[snafu(display("{source}"))] Invalid { source: InvalidUri }, #[snafu(display("Port is required; no port found in `{value}`"))] @@ -28,14 +28,14 @@ impl FromStr for IngesterAddress { type Err = Error; fn from_str(s: &str) -> Result { - let uri = Uri::from_str(s)?; + let uri = Uri::from_str(s).context(InvalidSnafu)?; if uri.port().is_none() { return MissingPortSnafu { value: s }.fail(); } let uri = if uri.scheme().is_none() { - Uri::from_str(&format!("http://{s}"))? + Uri::from_str(&format!("http://{s}")).context(InvalidSnafu)? } else { uri }; @@ -67,7 +67,7 @@ mod tests { num_args=1.., value_delimiter = ',' )] - pub ingester_addresses: Vec, + pub(crate) ingester_addresses: Vec, } #[test] @@ -89,7 +89,7 @@ mod tests { num_args=0.., value_delimiter = ',' )] - pub ingester_addresses: Vec, + pub(crate) ingester_addresses: Vec, } #[test] @@ -243,7 +243,7 @@ mod tests { num_args=1.., value_delimiter = ',' )] - pub ingester_addresses: Vec, + pub(crate) ingester_addresses: Vec, } #[test] @@ -281,7 +281,7 @@ mod tests { num_args=0.., value_delimiter = ',' )] - pub ingester_addresses: Vec, + pub(crate) ingester_addresses: Vec, } #[test] diff --git a/clap_blocks/src/lib.rs b/clap_blocks/src/lib.rs index 255870036fc..d9f689133ce 100644 --- a/clap_blocks/src/lib.rs +++ b/clap_blocks/src/lib.rs @@ -18,6 +18,8 @@ // Workaround for "unused crate" lint false positives. use workspace_hack as _; +pub mod bulk_ingest; +pub mod catalog_cache; pub mod catalog_dsn; pub mod compactor; pub mod compactor_scheduler; @@ -27,6 +29,7 @@ pub mod ingester; pub mod ingester_address; pub mod memory_size; pub mod object_store; +pub mod parquet_cache; pub mod querier; pub mod router; pub mod run_config; diff --git a/clap_blocks/src/memory_size.rs b/clap_blocks/src/memory_size.rs index 6204472d5aa..6e7515df599 100644 --- a/clap_blocks/src/memory_size.rs +++ b/clap_blocks/src/memory_size.rs @@ -2,7 +2,7 @@ use std::{str::FromStr, sync::OnceLock}; -use sysinfo::{RefreshKind, System, SystemExt}; +use sysinfo::{MemoryRefreshKind, RefreshKind, System}; /// Memory size. /// @@ -46,10 +46,7 @@ impl FromStr for MemorySize { "relative memory size must be in [0, 100] but is {percentage}" )); } - let total = *TOTAL_MEM_BYTES.get_or_init(|| { - let sys = System::new_with_specifics(RefreshKind::new().with_memory()); - sys.total_memory() as usize - }); + let total = total_mem_bytes(); let bytes = (percentage as f64 / 100f64 * total as f64).round() as usize; Ok(Self(bytes)) } @@ -62,9 +59,17 @@ impl FromStr for MemorySize { } /// Totally available memory size in bytes. -/// -/// Keep this in a global state so that we only need to inspect the system once during IOx startup. -static TOTAL_MEM_BYTES: OnceLock = OnceLock::new(); +pub fn total_mem_bytes() -> usize { + // Keep this in a global state so that we only need to inspect the system once during IOx startup. 
+ static TOTAL_MEM_BYTES: OnceLock = OnceLock::new(); + + *TOTAL_MEM_BYTES.get_or_init(|| { + let sys = System::new_with_specifics( + RefreshKind::new().with_memory(MemoryRefreshKind::everything()), + ); + sys.total_memory() as usize + }) +} #[cfg(test)] mod tests { diff --git a/clap_blocks/src/object_store.rs b/clap_blocks/src/object_store.rs index 38a96bc9aa4..e961357a30c 100644 --- a/clap_blocks/src/object_store.rs +++ b/clap_blocks/src/object_store.rs @@ -1,16 +1,20 @@ //! CLI handling for object store config (via CLI arguments and environment variables). use futures::TryStreamExt; -use object_store::memory::InMemory; -use object_store::path::Path; -use object_store::throttle::ThrottledStore; -use object_store::{throttle::ThrottleConfig, DynObjectStore}; +use non_empty_string::NonEmptyString; +use object_store::{ + memory::InMemory, + path::Path, + throttle::{ThrottleConfig, ThrottledStore}, + DynObjectStore, +}; use observability_deps::tracing::{info, warn}; use snafu::{ResultExt, Snafu}; -use std::sync::Arc; -use std::{fs, num::NonZeroUsize, path::PathBuf, time::Duration}; +use std::{convert::Infallible, fs, num::NonZeroUsize, path::PathBuf, sync::Arc, time::Duration}; use uuid::Uuid; +use crate::parquet_cache::ParquetCacheClientConfig; + #[derive(Debug, Snafu)] #[allow(missing_docs)] pub enum ParseError { @@ -53,6 +57,12 @@ pub enum ParseError { /// specified. pub const FALLBACK_AWS_REGION: &str = "us-east-1"; +/// A `clap` `value_parser` which returns `None` when given an empty string and +/// `Some(NonEmptyString)` otherwise. +fn parse_optional_string(s: &str) -> Result, Infallible> { + Ok(NonEmptyString::new(s.to_string()).ok()) +} + /// CLI config for object stores. #[derive(Debug, Clone, clap::Parser)] pub struct ObjectStoreConfig { @@ -74,7 +84,8 @@ pub struct ObjectStoreConfig { long = "object-store", env = "INFLUXDB_IOX_OBJECT_STORE", ignore_case = true, - action + action, + verbatim_doc_comment )] pub object_store: Option, @@ -108,8 +119,11 @@ pub struct ObjectStoreConfig { /// /// Prefer the environment variable over the command line flag in shared /// environments. - #[clap(long = "aws-access-key-id", env = "AWS_ACCESS_KEY_ID", action)] - pub aws_access_key_id: Option, + /// + /// An empty string value is equivalent to omitting the flag. + /// Note: must refer to std::option::Option explicitly, see + #[clap(long = "aws-access-key-id", env = "AWS_ACCESS_KEY_ID", value_parser = parse_optional_string, default_value="", action)] + pub aws_access_key_id: std::option::Option, /// When using Amazon S3 as the object store, set this to the secret access /// key that goes with the specified access key ID. @@ -119,8 +133,11 @@ pub struct ObjectStoreConfig { /// /// Prefer the environment variable over the command line flag in shared /// environments. - #[clap(long = "aws-secret-access-key", env = "AWS_SECRET_ACCESS_KEY", action)] - pub aws_secret_access_key: Option, + /// + /// An empty string value is equivalent to omitting the flag. + /// Note: must refer to std::option::Option explicitly, see + #[clap(long = "aws-secret-access-key", env = "AWS_SECRET_ACCESS_KEY", value_parser = parse_optional_string, default_value = "", action)] + pub aws_secret_access_key: std::option::Option, /// When using Amazon S3 as the object store, set this to the region /// that goes with the specified bucket if different from the fallback @@ -203,6 +220,10 @@ pub struct ObjectStoreConfig { action )] pub object_store_connection_limit: NonZeroUsize, + + /// Optional config for the cache client. 
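The AWS credential flags above use a value parser that turns an empty string (the `default_value`) into `None`, so an unset environment variable behaves like an omitted flag. A self-contained sketch of the same pattern using plain `String` in place of `NonEmptyString`; the demo struct and env var name are illustrative only.

use clap::Parser;
use std::convert::Infallible;

fn parse_optional_string(s: &str) -> Result<Option<String>, Infallible> {
    Ok((!s.is_empty()).then(|| s.to_string()))
}

#[derive(Debug, Parser)]
struct DemoCreds {
    // `std::option::Option` is spelled out so clap treats the whole Option as
    // the value type produced by the parser (the same trick as the real config).
    #[clap(
        long = "access-key-id",
        env = "DEMO_ACCESS_KEY_ID",
        value_parser = parse_optional_string,
        default_value = ""
    )]
    access_key_id: std::option::Option<String>,
}

fn main() {
    // Assumes DEMO_ACCESS_KEY_ID is not set in the environment.
    let unset = DemoCreds::parse_from(["demo"]);
    assert_eq!(unset.access_key_id, None);

    let set = DemoCreds::parse_from(["demo", "--access-key-id", "abc123"]);
    assert_eq!(set.access_key_id, Some("abc123".to_string()));
}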
+ #[clap(flatten)] + pub cache_config: Option, } impl ObjectStoreConfig { @@ -229,6 +250,7 @@ impl ObjectStoreConfig { google_service_account: Default::default(), object_store, object_store_connection_limit: NonZeroUsize::new(16).unwrap(), + cache_config: Default::default(), } } } @@ -284,10 +306,24 @@ fn new_gcs(_: &ObjectStoreConfig) -> Result, ParseError> { #[cfg(feature = "aws")] fn new_s3(config: &ObjectStoreConfig) -> Result, ParseError> { - use object_store::aws::AmazonS3Builder; use object_store::limit::LimitStore; - info!(bucket=?config.bucket, endpoint=?config.aws_endpoint, object_store_type="S3", "Object Store"); + info!( + bucket=?config.bucket, + endpoint=?config.aws_endpoint, + object_store_type="S3", + "Object Store" + ); + + Ok(Arc::new(LimitStore::new( + build_s3(config)?, + config.object_store_connection_limit.get(), + ))) +} + +#[cfg(feature = "aws")] +fn build_s3(config: &ObjectStoreConfig) -> Result { + use object_store::aws::AmazonS3Builder; let mut builder = AmazonS3Builder::new() .with_allow_http(config.aws_allow_http) @@ -298,22 +334,19 @@ fn new_s3(config: &ObjectStoreConfig) -> Result, ParseError> builder = builder.with_bucket_name(bucket); } if let Some(key_id) = &config.aws_access_key_id { - builder = builder.with_access_key_id(key_id); + builder = builder.with_access_key_id(key_id.get()); } if let Some(token) = &config.aws_session_token { builder = builder.with_token(token); } if let Some(secret) = &config.aws_secret_access_key { - builder = builder.with_secret_access_key(secret); + builder = builder.with_secret_access_key(secret.get()); } if let Some(endpoint) = &config.aws_endpoint { builder = builder.with_endpoint(endpoint); } - Ok(Arc::new(LimitStore::new( - builder.build().context(InvalidS3ConfigSnafu)?, - config.object_store_connection_limit.get(), - ))) + builder.build().context(InvalidS3ConfigSnafu) } #[cfg(not(feature = "aws"))] @@ -361,10 +394,10 @@ pub fn make_object_store(config: &ObjectStoreConfig) -> Result = match &config.object_store { Some(ObjectStoreType::Memory) | None => { info!(object_store_type = "Memory", "Object Store"); - Ok(Arc::new(InMemory::new())) + Arc::new(InMemory::new()) } Some(ObjectStoreType::MemoryThrottled) => { let config = ThrottleConfig { @@ -384,12 +417,12 @@ pub fn make_object_store(config: &ObjectStoreConfig) -> Result new_gcs(config), - Some(ObjectStoreType::S3) => new_s3(config), - Some(ObjectStoreType::Azure) => new_azure(config), + Some(ObjectStoreType::Google) => new_gcs(config)?, + Some(ObjectStoreType::S3) => new_s3(config)?, + Some(ObjectStoreType::Azure) => new_azure(config)?, Some(ObjectStoreType::File) => match config.database_directory.as_ref() { Some(db_dir) => { info!(?db_dir, object_store_type = "Directory", "Object Store"); @@ -398,17 +431,49 @@ pub fn make_object_store(config: &ObjectStoreConfig) -> Result MissingObjectStoreConfigSnafu { object_store: ObjectStoreType::File, missing: "data-dir", } - .fail(), + .fail()?, }, + }; + + if let Some(cache_config) = &config.cache_config { + let cache = parquet_cache::make_client( + cache_config.namespace_addr.clone(), + Arc::clone(&remote_store), + ); + info!(?cache_config, "Parquet cache enabled"); + Ok(cache) + } else { + Ok(remote_store) + } +} + +/// The `object_store::signer::Signer` trait is only implemented for AWS currently, so when the AWS +/// feature is enabled and the configured object store is S3, return a signer. 
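The presigned-URL signer plumbing below follows a cfg-gating pattern used in several places in this diff: the feature-enabled build returns a real implementation, while the non-feature build compiles a stub returning `None`, so callers never need feature checks of their own. A generic sketch of that shape; the names and types are placeholders, not the real `Signer` machinery.

/// Placeholder standing in for a presigned-URL signer implementation.
#[derive(Debug)]
struct DemoSigner;

#[cfg(feature = "aws")]
fn make_demo_signer(object_store: &str) -> Option<DemoSigner> {
    // Only the S3-backed store can sign URLs in this sketch.
    (object_store == "s3").then(|| DemoSigner)
}

#[cfg(not(feature = "aws"))]
fn make_demo_signer(_object_store: &str) -> Option<DemoSigner> {
    None
}

fn main() {
    // Callers handle Option uniformly, regardless of which features were built.
    if let Some(signer) = make_demo_signer("s3") {
        println!("got a signer: {signer:?}");
    } else {
        println!("URL signing unavailable in this build/config");
    }
}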
+#[cfg(feature = "aws")] +pub fn make_presigned_url_signer( + config: &ObjectStoreConfig, +) -> Result>, ParseError> { + match &config.object_store { + Some(ObjectStoreType::S3) => Ok(Some(Arc::new(build_s3(config)?))), + _ => Ok(None), } } +/// The `object_store::signer::Signer` trait is only implemented for AWS currently, so if the AWS +/// feature isn't enabled, don't return a signer. +#[cfg(not(feature = "aws"))] +pub fn make_presigned_url_signer( + _config: &ObjectStoreConfig, +) -> Result>, ParseError> { + Ok(None) +} + #[derive(Debug, Snafu)] #[allow(missing_docs)] pub enum CheckError { @@ -425,10 +490,7 @@ pub async fn check_object_store(object_store: &DynObjectStore) -> Result<(), Che let prefix = Path::from_iter([uuid]); // create stream (this might fail if the store is not readable) - let mut stream = object_store - .list(Some(&prefix)) - .await - .context(CannotReadObjectStoreSnafu)?; + let mut stream = object_store.list(Some(&prefix)); // ... but sometimes it fails only if we use the resulting stream, so try that once stream @@ -464,6 +526,14 @@ mod tests { assert_eq!(&object_store.to_string(), "InMemory") } + #[test] + fn default_url_signer_is_none() { + let config = ObjectStoreConfig::try_parse_from(["server"]).unwrap(); + + let signer = make_presigned_url_signer(&config).unwrap(); + assert!(signer.is_none(), "Expected None, got {signer:?}"); + } + #[test] #[cfg(feature = "aws")] fn valid_s3_config() { @@ -481,7 +551,10 @@ mod tests { .unwrap(); let object_store = make_object_store(&config).unwrap(); - assert_eq!(&object_store.to_string(), "AmazonS3(mybucket)") + assert_eq!( + &object_store.to_string(), + "LimitStore(16, AmazonS3(mybucket))" + ) } #[test] @@ -497,13 +570,73 @@ mod tests { assert_eq!( err, - "Specified S3 for the object store, required configuration missing for bucket" + "Error configuring Amazon S3: Generic S3 error: Missing bucket name" + ); + } + + #[test] + #[cfg(feature = "aws")] + fn valid_s3_url_signer() { + let config = ObjectStoreConfig::try_parse_from([ + "server", + "--object-store", + "s3", + "--bucket", + "mybucket", + "--aws-access-key-id", + "NotARealAWSAccessKey", + "--aws-secret-access-key", + "NotARealAWSSecretAccessKey", + ]) + .unwrap(); + + assert!(make_presigned_url_signer(&config).unwrap().is_some()); + + // Even with the aws feature on, any other object store shouldn't create a signer. 
+ let root = TempDir::new().unwrap(); + let root_path = root.path().to_str().unwrap(); + + let config = ObjectStoreConfig::try_parse_from([ + "server", + "--object-store", + "file", + "--data-dir", + root_path, + ]) + .unwrap(); + + let signer = make_presigned_url_signer(&config).unwrap(); + assert!(signer.is_none(), "Expected None, got {signer:?}"); + } + + #[test] + #[cfg(feature = "aws")] + fn s3_url_signer_config_missing_params() { + let mut config = + ObjectStoreConfig::try_parse_from(["server", "--object-store", "s3"]).unwrap(); + + // clean out eventual leaks via env variables + config.bucket = None; + + let err = make_presigned_url_signer(&config).unwrap_err().to_string(); + + assert_eq!( + err, + "Error configuring Amazon S3: Generic S3 error: Missing bucket name" ); } #[test] #[cfg(feature = "gcp")] fn valid_google_config() { + use std::io::Write; + use tempfile::NamedTempFile; + + let mut file = NamedTempFile::new().expect("tempfile should be created"); + const FAKE_KEY: &str = r#"{"private_key": "private_key", "private_key_id": "private_key_id", "client_email":"client_email", "disable_oauth":true}"#; + writeln!(file, "{FAKE_KEY}").unwrap(); + let path = file.path().to_str().expect("file path should exist"); + let config = ObjectStoreConfig::try_parse_from([ "server", "--object-store", @@ -511,12 +644,15 @@ mod tests { "--bucket", "mybucket", "--google-service-account", - "~/Not/A/Real/path.json", + path, ]) .unwrap(); let object_store = make_object_store(&config).unwrap(); - assert_eq!(&object_store.to_string(), "GoogleCloudStorage(mybucket)") + assert_eq!( + &object_store.to_string(), + "LimitStore(16, GoogleCloudStorage(mybucket))" + ) } #[test] @@ -532,8 +668,7 @@ mod tests { assert_eq!( err, - "Specified Google for the object store, required configuration missing for \ - bucket, google-service-account" + "Error configuring GCS: Generic GCS error: Missing bucket name" ); } @@ -549,12 +684,12 @@ mod tests { "--azure-storage-account", "NotARealStorageAccount", "--azure-storage-access-key", - "NotARealKey", + "Zm9vYmFy", // base64 encoded "foobar" ]) .unwrap(); let object_store = make_object_store(&config).unwrap(); - assert_eq!(&object_store.to_string(), "MicrosoftAzure(mybucket)") + assert_eq!(&object_store.to_string(), "LimitStore(16, MicrosoftAzure { account: NotARealStorageAccount, container: mybucket })") } #[test] @@ -570,8 +705,7 @@ mod tests { assert_eq!( err, - "Specified Azure for the object store, required configuration missing for \ - bucket, azure-storage-account, azure-storage-access-key" + "Error configuring Microsoft Azure: Generic MicrosoftAzure error: Container name must be specified" ); } @@ -614,4 +748,28 @@ mod tests { data-dir" ); } + + #[test] + fn valid_cache_config() { + let root = TempDir::new().unwrap(); + let root_path = root.path().to_str().unwrap(); + + let config = ObjectStoreConfig::try_parse_from([ + "server", + "--object-store", + "file", + "--data-dir", + root_path, + "--parquet-cache-namespace-addr", + "http://k8s-noninstance-general-service-route:8080", + ]) + .unwrap(); + + let object_store = make_object_store(&config).unwrap().to_string(); + assert!( + object_store.starts_with("DataCacheObjectStore"), + "{}", + object_store + ) + } } diff --git a/clap_blocks/src/parquet_cache.rs b/clap_blocks/src/parquet_cache.rs new file mode 100644 index 00000000000..d93aa944a4c --- /dev/null +++ b/clap_blocks/src/parquet_cache.rs @@ -0,0 +1,57 @@ +//! CLI handling for parquet data cache config (via CLI arguments and environment variables). 
+ +/// Config for cache client. +#[derive(Debug, Clone, Default, clap::Parser)] +pub struct ParquetCacheClientConfig { + /// The address for the service namespace (not a given instance). + /// + /// When the client comes online, it discovers the keyspace + /// by issue requests to this address. + #[clap( + long = "parquet-cache-namespace-addr", + env = "INFLUXDB_IOX_PARQUET_CACHE_NAMESPACE_ADDR", + required = false + )] + pub namespace_addr: String, +} + +/// Config for cache instance. +#[derive(Debug, Clone, Default, clap::Parser)] +pub struct ParquetCacheInstanceConfig { + /// The path to the config file for the keyspace. + #[clap( + long = "parquet-cache-keyspace-config-path", + env = "INFLUXDB_IOX_PARQUET_CACHE_KEYSPACE_CONFIG_PATH", + required = true + )] + pub keyspace_config_path: String, + + /// The hostname of the cache instance (k8s pod) running this process. + /// + /// Cache controller should be setting this env var. + #[clap( + long = "parquet-cache-instance-hostname", + env = "HOSTNAME", + required = true + )] + pub instance_hostname: String, + + /// The local directory to store data. + #[clap( + long = "parquet-cache-local-dir", + env = "INFLUXDB_IOX_PARQUET_CACHE_LOCAL_DIR", + required = true + )] + pub local_dir: String, +} + +impl From for parquet_cache::ParquetCacheServerConfig { + fn from(instance_config: ParquetCacheInstanceConfig) -> Self { + Self { + keyspace_config_path: instance_config.keyspace_config_path, + hostname: instance_config.instance_hostname, + local_dir: instance_config.local_dir, + policy_config: Default::default(), + } + } +} diff --git a/clap_blocks/src/querier.rs b/clap_blocks/src/querier.rs index e92b55b3189..4a62455b0ee 100644 --- a/clap_blocks/src/querier.rs +++ b/clap_blocks/src/querier.rs @@ -120,6 +120,14 @@ pub struct QuerierConfig { action )] pub datafusion_config: HashMap, + + /// Use the new V2 API to talk to the ingester. + /// + /// Defaults to "no". + /// + /// See . + #[clap(long = "v2-ingester-api", env = "INFLUXDB_IOX_V2_INGESTER_API", action)] + pub v2_ingester_api: bool, } fn parse_datafusion_config( @@ -213,7 +221,7 @@ mod tests { "error: \ invalid value '\\ingester-0:8082' \ for '--ingester-addresses [...]': \ - Invalid: invalid uri character" + invalid uri character" ); } diff --git a/clap_blocks/src/router.rs b/clap_blocks/src/router.rs index 68381407baf..28442d79e72 100644 --- a/clap_blocks/src/router.rs +++ b/clap_blocks/src/router.rs @@ -1,6 +1,7 @@ //! CLI config for the router using the RPC write path use crate::{ + bulk_ingest::BulkIngestConfig, gossip::GossipConfig, ingester_address::IngesterAddress, single_tenant::{ @@ -20,6 +21,10 @@ pub struct RouterConfig { #[clap(flatten)] pub gossip_config: GossipConfig, + /// Bulk ingest API config. + #[clap(flatten)] + pub bulk_ingest_config: BulkIngestConfig, + /// Addr for connection to authz #[clap( long = CONFIG_AUTHZ_FLAG, @@ -57,6 +62,17 @@ pub struct RouterConfig { )] pub http_request_limit: usize, + /// When writing line protocol data, does an error on a single line + /// reject the write? Or will all individual valid lines be written? + /// Set to true to enable all valid lines to write. + #[clap( + long = "partial-writes-enabled", + env = "INFLUXDB_IOX_PARTIAL_WRITES_ENABLED", + default_value = "false", + action + )] + pub permit_partial_writes: bool, + /// gRPC address for the router to talk with the ingesters. 
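Several structs in this patch are composed via `#[clap(flatten)]` (gossip, bulk ingest, and the parquet cache client config above). A self-contained sketch of that composition pattern; the demo types and flags are invented for illustration.

use clap::Parser;

#[derive(Debug, Parser)]
struct DemoCacheClientConfig {
    /// Address used to discover the cache keyspace.
    #[clap(long = "parquet-cache-namespace-addr", default_value = "")]
    namespace_addr: String,
}

#[derive(Debug, Parser)]
struct DemoServerConfig {
    #[clap(long = "http-bind", default_value = "0.0.0.0:8080")]
    http_bind: String,

    /// All flags of the flattened struct become flags of this command too.
    #[clap(flatten)]
    cache: DemoCacheClientConfig,
}

fn main() {
    let cfg = DemoServerConfig::parse_from([
        "demo",
        "--parquet-cache-namespace-addr",
        "http://cache.svc:8080",
    ]);
    assert_eq!(cfg.cache.namespace_addr, "http://cache.svc:8080");
    assert_eq!(cfg.http_bind, "0.0.0.0:8080");
}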
For /// example: /// diff --git a/client_util/Cargo.toml b/client_util/Cargo.toml index 803a11001fc..8b2e12f6663 100644 --- a/client_util/Cargo.toml +++ b/client_util/Cargo.toml @@ -6,14 +6,17 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] -http = "0.2.9" -reqwest = { version = "0.11", default-features = false, features = ["stream", "rustls-tls"] } -thiserror = "1.0.48" +http = "0.2.11" +reqwest = { version = "0.11", default-features = false, features = ["stream", "rustls-tls-native-roots"] } +thiserror = "1.0.56" tonic = { workspace = true } tower = "0.4" workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] -tokio = { version = "1.32", features = ["macros", "parking_lot", "rt-multi-thread"] } +tokio = { version = "1.35", features = ["macros", "parking_lot", "rt-multi-thread"] } mockito = { version = "1.2", default-features = false } diff --git a/data_types/Cargo.toml b/data_types/Cargo.toml index 5eebe4050da..c38745ce4e5 100644 --- a/data_types/Cargo.toml +++ b/data_types/Cargo.toml @@ -6,27 +6,36 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] +arrow-buffer = { workspace = true } +bytes = "1.5" chrono = { version = "0.4", default-features = false } croaring = "1.0.0" influxdb-line-protocol = { path = "../influxdb_line_protocol" } iox_time = { path = "../iox_time" } generated_types = { path = "../generated_types" } +murmur3 = "0.5.2" observability_deps = { path = "../observability_deps" } once_cell = "1" -ordered-float = "3" +ordered-float = "4" +percent-encoding = "2.3.1" +prost = { workspace = true } schema = { path = "../schema" } -sha2 = "0.10" -sqlx = { version = "0.7.1", features = ["runtime-tokio-rustls", "postgres", "uuid"] } -thiserror = "1.0.48" +serde_json = "1.0" +siphasher = "1.0" +sha2 = { version = "0.10", default-features = false } +snafu = "0.8" +sqlx = { version = "0.7.3", features = ["runtime-tokio-rustls", "postgres", "uuid"] } +thiserror = "1.0.56" uuid = { version = "1", features = ["v4"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } -percent-encoding = "2.2.0" -serde = { version = "1.0.188", features = ["derive"] } [dev-dependencies] # In alphabetical order assert_matches = "1" paste = "1.0.14" -proptest = { version = "1.2.0", default-features = false } +proptest = { version = "1.4.0", default-features = false } test_helpers = { path = "../test_helpers" } hex = "0.4.2" diff --git a/data_types/src/columns.rs b/data_types/src/columns.rs index 958287317ca..1c6b0a91adc 100644 --- a/data_types/src/columns.rs +++ b/data_types/src/columns.rs @@ -1,14 +1,17 @@ //! Types having to do with columns. 
use super::TableId; -use generated_types::influxdata::iox::{gossip, schema::v1 as proto}; +use generated_types::influxdata::iox::{column_type::v1 as proto, gossip}; use influxdb_line_protocol::FieldValue; -use schema::{builder::SchemaBuilder, InfluxColumnType, InfluxFieldType, Schema}; -use serde::{Deserialize, Serialize}; +use schema::{builder::SchemaBuilder, sort::SortKey, InfluxColumnType, InfluxFieldType, Schema}; +use snafu::Snafu; +use std::cmp::Ordering; +use std::collections::HashSet; use std::{ collections::{BTreeMap, BTreeSet, HashMap}, convert::TryFrom, ops::Deref, + sync::Arc, }; /// Unique ID for a `Column` @@ -27,11 +30,11 @@ impl ColumnId { } /// Column definitions for a table indexed by their name -#[derive(Debug, Clone, Eq, PartialEq, Hash)] -pub struct ColumnsByName(BTreeMap); +#[derive(Debug, Clone, Eq, PartialEq, Hash, Default)] +pub struct ColumnsByName(BTreeMap, ColumnSchema>); -impl From> for ColumnsByName { - fn from(value: BTreeMap) -> Self { +impl From, ColumnSchema>> for ColumnsByName { + fn from(value: BTreeMap, ColumnSchema>) -> Self { Self(value) } } @@ -44,7 +47,7 @@ impl ColumnsByName { .into_iter() .map(|c| { ( - c.name, + Arc::from(c.name), ColumnSchema { id: c.id, column_type: c.column_type, @@ -60,13 +63,13 @@ impl ColumnsByName { /// # Panics /// /// This method panics if a column of the same name already exists in `self`. - pub fn add_column(&mut self, column_name: String, column_schema: ColumnSchema) { - let old = self.0.insert(column_name, column_schema); + pub fn add_column(&mut self, column_name: impl Into>, column_schema: ColumnSchema) { + let old = self.0.insert(column_name.into(), column_schema); assert!(old.is_none()); } /// Iterate over the names and columns. - pub fn iter(&self) -> impl Iterator { + pub fn iter(&self) -> impl Iterator, &ColumnSchema)> { self.0.iter() } @@ -83,7 +86,7 @@ impl ColumnsByName { /// Return the set of column names. Used in combination with a write operation's /// column names to determine whether a write would exceed the max allowed columns. pub fn names(&self) -> BTreeSet<&str> { - self.0.keys().map(|name| name.as_str()).collect() + self.0.keys().map(|name| name.as_ref()).collect() } /// Return an iterator of the set of column IDs. @@ -92,9 +95,16 @@ impl ColumnsByName { } /// Return column ids of the given column names - /// Will panic if any of the names are not found - pub fn ids_for_names(&self, names: &[&str]) -> SortedColumnSet { - SortedColumnSet::from(names.iter().map(|name| { + /// + /// # Panics + /// + /// Panics if any of the names are not found in this set. + pub fn ids_for_names(&self, names: impl IntoIterator + Send) -> SortKeyIds + where + T: AsRef, + { + SortKeyIds::from(names.into_iter().map(|name| { + let name = name.as_ref(); self.get(name) .unwrap_or_else(|| panic!("column name not found: {}", name)) .id @@ -107,26 +117,32 @@ impl ColumnsByName { self.0.get(name) } + /// Get the `ColumnId` for the time column, if present (a table created through + /// `table_load_or_create` will always have a time column). + pub fn time_column_id(&self) -> Option { + self.get(schema::TIME_COLUMN_NAME).map(|column| column.id) + } + /// Create `ID->name` map for columns. 
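`ids_for_names` above now accepts any iterator of string-like names rather than `&[&str]`. A standalone sketch of that generic lookup shape, using a plain map of `Arc<str>` keys to integer IDs in place of `ColumnsByName`/`SortKeyIds`.

use std::{collections::BTreeMap, sync::Arc};

fn ids_for_names<T>(
    columns: &BTreeMap<Arc<str>, i64>,
    names: impl IntoIterator<Item = T>,
) -> Vec<i64>
where
    T: AsRef<str>,
{
    names
        .into_iter()
        .map(|name| {
            let name = name.as_ref();
            *columns
                .get(name)
                .unwrap_or_else(|| panic!("column name not found: {name}"))
        })
        .collect()
}

fn main() {
    let columns: BTreeMap<Arc<str>, i64> =
        [("foo".into(), 1), ("bar".into(), 2)].into_iter().collect();

    // &str, String and Arc<str> callers all work through AsRef<str>.
    assert_eq!(ids_for_names(&columns, ["bar", "foo"]), vec![2, 1]);
    assert_eq!(ids_for_names(&columns, vec!["foo".to_string()]), vec![1]);
}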
- pub fn id_map(&self) -> HashMap { + pub fn id_map(&self) -> HashMap> { self.0 .iter() - .map(|(name, c)| (c.id, name.as_str())) + .map(|(name, c)| (c.id, Arc::clone(name))) .collect() } } impl IntoIterator for ColumnsByName { - type Item = (String, ColumnSchema); - type IntoIter = std::collections::btree_map::IntoIter; + type Item = (Arc, ColumnSchema); + type IntoIter = std::collections::btree_map::IntoIter, ColumnSchema>; fn into_iter(self) -> Self::IntoIter { self.0.into_iter() } } -impl FromIterator<(String, ColumnSchema)> for ColumnsByName { - fn from_iter>(iter: T) -> Self { +impl FromIterator<(Arc, ColumnSchema)> for ColumnsByName { + fn from_iter, ColumnSchema)>>(iter: T) -> Self { Self(BTreeMap::from_iter(iter)) } } @@ -140,7 +156,7 @@ impl TryFrom for Schema { for (column_name, column_schema) in value.into_iter() { let t = InfluxColumnType::from(column_schema.column_type); - builder.influx_column(column_name, t); + builder.influx_column(column_name.as_ref(), t); } builder.build() @@ -167,7 +183,7 @@ impl Column { } /// returns true if the column type matches the line protocol field value type - pub fn matches_field_type(&self, field_value: &FieldValue) -> bool { + pub fn matches_field_type(&self, field_value: &FieldValue<'_>) -> bool { match field_value { FieldValue::I64(_) => self.column_type == ColumnType::I64, FieldValue::U64(_) => self.column_type == ColumnType::U64, @@ -194,7 +210,7 @@ impl ColumnSchema { } /// returns true if the column matches the line protocol field value type - pub fn matches_field_type(&self, field_value: &FieldValue) -> bool { + pub fn matches_field_type(&self, field_value: &FieldValue<'_>) -> bool { matches!( (field_value, self.column_type), (FieldValue::I64(_), ColumnType::I64) @@ -224,9 +240,7 @@ impl TryFrom<&gossip::v1::Column> for ColumnSchema { /// The column data type #[allow(missing_docs)] -#[derive( - Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash, sqlx::Type, Serialize, Deserialize, -)] +#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash, sqlx::Type)] #[repr(i16)] pub enum ColumnType { I64 = 1, @@ -261,8 +275,14 @@ impl std::fmt::Display for ColumnType { } } +/// Errors deserialising a protobuf serialised [`ColumnType`]. 
+#[derive(Debug, Snafu)] +#[snafu(display("invalid column value"))] +#[allow(missing_copy_implementations)] +pub struct ColumnTypeProtoError {} + impl TryFrom for ColumnType { - type Error = Box; + type Error = ColumnTypeProtoError; fn try_from(value: i16) -> Result { match value { @@ -273,7 +293,7 @@ impl TryFrom for ColumnType { x if x == Self::String as i16 => Ok(Self::String), x if x == Self::Time as i16 => Ok(Self::Time), x if x == Self::Tag as i16 => Ok(Self::Tag), - _ => Err("invalid column value".into()), + _ => Err(ColumnTypeProtoError {}), } } } @@ -321,7 +341,7 @@ impl PartialEq for ColumnType { } /// Returns the `ColumnType` for the passed in line protocol `FieldValue` type -pub fn column_type_from_field(field_value: &FieldValue) -> ColumnType { +pub fn column_type_from_field(field_value: &FieldValue<'_>) -> ColumnType { match field_value { FieldValue::I64(_) => ColumnType::I64, FieldValue::U64(_) => ColumnType::U64, @@ -331,27 +351,43 @@ pub fn column_type_from_field(field_value: &FieldValue) -> ColumnType { } } -impl TryFrom for ColumnType { - type Error = Box; +impl TryFrom for ColumnType { + type Error = &'static str; - fn try_from(value: proto::column_schema::ColumnType) -> Result { + fn try_from(value: proto::ColumnType) -> Result { Ok(match value { - proto::column_schema::ColumnType::I64 => ColumnType::I64, - proto::column_schema::ColumnType::U64 => ColumnType::U64, - proto::column_schema::ColumnType::F64 => ColumnType::F64, - proto::column_schema::ColumnType::Bool => ColumnType::Bool, - proto::column_schema::ColumnType::String => ColumnType::String, - proto::column_schema::ColumnType::Time => ColumnType::Time, - proto::column_schema::ColumnType::Tag => ColumnType::Tag, - proto::column_schema::ColumnType::Unspecified => { - return Err("unknown column type".into()) - } + proto::ColumnType::I64 => Self::I64, + proto::ColumnType::U64 => Self::U64, + proto::ColumnType::F64 => Self::F64, + proto::ColumnType::Bool => Self::Bool, + proto::ColumnType::String => Self::String, + proto::ColumnType::Time => Self::Time, + proto::ColumnType::Tag => Self::Tag, + proto::ColumnType::Unspecified => return Err("unknown column type"), }) } } +impl From for proto::ColumnType { + fn from(value: ColumnType) -> Self { + match value { + ColumnType::I64 => Self::I64, + ColumnType::U64 => Self::U64, + ColumnType::F64 => Self::F64, + ColumnType::Bool => Self::Bool, + ColumnType::String => Self::String, + ColumnType::Time => Self::Time, + ColumnType::Tag => Self::Tag, + } + } +} + /// Set of columns and used as Set data type. -/// Its inner is implemneted as a vector because postgres does not have set type +/// +/// # Data Structure +/// This is internally implemented as a sorted vector. The sorting allows for fast [`PartialEq`]/[`Eq`]/[`Hash`] and +/// ensures that the PostgreSQL data is deterministic. Note that PostgreSQL does NOT have a set type at the moment, so +/// this is stored as an array. 
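The doc comment above notes that `ColumnSet` keeps its IDs in a sorted `Vec`, which is what makes cheap equality/hashing and merge-style set operations possible. A simplified, standalone illustration of union over two sorted ID lists; the implementation that follows merges in place rather than allocating a new vector.

use std::cmp::Ordering;

/// Union of two ascending, duplicate-free ID lists, also ascending.
fn union_sorted(a: &[i64], b: &[i64]) -> Vec<i64> {
    let mut out = Vec::with_capacity(a.len() + b.len());
    let (mut i, mut j) = (0, 0);
    while i < a.len() && j < b.len() {
        match a[i].cmp(&b[j]) {
            Ordering::Less => {
                out.push(a[i]);
                i += 1;
            }
            Ordering::Greater => {
                out.push(b[j]);
                j += 1;
            }
            Ordering::Equal => {
                out.push(a[i]);
                i += 1;
                j += 1;
            }
        }
    }
    out.extend_from_slice(&a[i..]);
    out.extend_from_slice(&b[j..]);
    out
}

fn main() {
    assert_eq!(
        union_sorted(&[1, 2, 5, 7], &[1, 5, 6, 7, 8]),
        vec![1, 2, 5, 6, 7, 8]
    );
}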
#[derive(Debug, Clone, PartialEq, Eq, Hash, sqlx::Type)] #[sqlx(transparent, no_pg_array)] pub struct ColumnSet(Vec); @@ -370,16 +406,21 @@ impl ColumnSet { let mut columns: Vec = columns.into_iter().collect(); columns.sort(); - let len_pre_dedup = columns.len(); - columns.dedup(); - let len_post_dedup = columns.len(); - assert_eq!(len_pre_dedup, len_post_dedup, "set contains duplicates"); + assert!( + columns.windows(2).all(|w| w[0] != w[1]), + "set contains duplicates" + ); columns.shrink_to_fit(); Self(columns) } + /// Create a new empty [`ColumnSet`] + pub fn empty() -> Self { + Self(Vec::new()) + } + /// Estimate the memory consumption of this object and its contents pub fn size(&self) -> usize { std::mem::size_of_val(self) + (std::mem::size_of::() * self.0.capacity()) @@ -389,6 +430,66 @@ impl ColumnSet { pub fn is_empty(&self) -> bool { self.0.is_empty() } + + /// Computes the union of `self` and `other` + pub fn union(&mut self, other: &Self) { + let mut insert_idx = 0; + let mut src_idx = 0; + + while insert_idx < self.0.len() && src_idx < other.0.len() { + let s = self.0[insert_idx]; + let o = other.0[src_idx]; + + match s.cmp(&o) { + Ordering::Less => insert_idx += 1, + Ordering::Equal => { + insert_idx += 1; + src_idx += 1; + } + Ordering::Greater => { + self.0.insert(insert_idx, o); + insert_idx += 1; + src_idx += 1; + } + } + } + self.0.extend_from_slice(&other.0[src_idx..]); + } + + /// Returns the indices and ids in `self` that are present in both `self` and `other` + /// + /// ``` + /// # use data_types::{ColumnId, ColumnSet}; + /// let a = ColumnSet::new([1, 2, 4, 6, 7].into_iter().map(ColumnId::new)); + /// let b = ColumnSet::new([2, 4, 6].into_iter().map(ColumnId::new)); + /// + /// assert_eq!( + /// a.intersect(&b).collect::>(), + /// vec![(1, b[0]), (2, b[1]), (3, b[2])] + /// ) + /// ``` + pub fn intersect<'a>( + &'a self, + other: &'a Self, + ) -> impl Iterator + 'a { + let mut left_idx = 0; + let mut right_idx = 0; + std::iter::from_fn(move || loop { + let s = self.0.get(left_idx)?; + let o = other.get(right_idx)?; + + match s.cmp(o) { + Ordering::Less => left_idx += 1, + Ordering::Greater => right_idx += 1, + Ordering::Equal => { + let t = left_idx; + left_idx += 1; + right_idx += 1; + return Some((t, *s)); + } + } + }) + } } impl From for Vec { @@ -405,12 +506,13 @@ impl Deref for ColumnSet { } } -/// Set of sorted columns in a specific given order at created time +/// Set of sorted column IDs in a specific given order at creation time, to be used as a +/// [`SortKey`] by looking up the column names in the table's schema. #[derive(Debug, Clone, PartialEq, Eq, Hash, sqlx::Type, Default)] #[sqlx(transparent, no_pg_array)] -pub struct SortedColumnSet(Vec); +pub struct SortKeyIds(Vec); -impl SortedColumnSet { +impl SortKeyIds { /// Create new sorted column set. /// /// The order of the passed columns will be preserved. @@ -423,32 +525,143 @@ impl SortedColumnSet { { let mut columns: Vec = columns.into_iter().collect(); - // verify if there are duplicates - let mut columns_sorted = columns.clone(); - columns_sorted.sort(); - let len_pre_dedup = columns_sorted.len(); - columns_sorted.dedup(); - let len_post_dedup = columns_sorted.len(); - assert_eq!(len_pre_dedup, len_post_dedup, "set contains duplicates"); + // Validate the ID set contains no duplicates. + // + // This validates an invariant in debug builds, skipping the cost + // for release builds. 
+ if cfg!(debug_assertions) { + SortKeyIds::check_for_deplicates(&columns); + } // Must continue with columns in original order columns.shrink_to_fit(); + Self(columns) } + /// Given another set of sort key IDs, merge them together and, if needed, return a value to + /// use to update the catalog. + /// + /// If `other` contains any column IDs that are not present in `self`, create a new + /// `SortKeyIds` instance that includes the new columns in `other` (in the same order they + /// appear in `other`) appended to the existing columns, but keeping the time column ID last. + /// + /// If existing columns appear in `self` in a different order than they appear in `other`, the + /// order in `self` takes precedence and remains unchanged. + /// + /// If `self` contains all the sort keys in `other` already (regardless of order), this will + /// return `None` as no update to the catalog is needed. + pub fn maybe_append(&self, other: &Self, time_column_id: ColumnId) -> Option { + let existing_columns_without_time = self + .iter() + .cloned() + .filter(|&column_id| column_id != time_column_id); + + let mut new_columns = other + .iter() + .cloned() + .filter(|column_id| !self.contains(column_id)) + .peekable(); + + if new_columns.peek().is_none() { + None + } else { + Some(SortKeyIds::new( + existing_columns_without_time + .chain(new_columns) + .chain(std::iter::once(time_column_id)), + )) + } + } + /// Estimate the memory consumption of this object and its contents pub fn size(&self) -> usize { std::mem::size_of_val(self) + (std::mem::size_of::() * self.0.capacity()) } + + /// Build a [`SortKey`] from [`SortKeyIds`]; looking up column names in the provided + /// [`ColumnsByName`] map by converting it to a `HashMap. If you already have + /// an id-to-name column map, use [`SortKeyIds::to_sort_key_using_map`] instead. + /// + /// If you have a [`Partition`][super::Partition], it may be more convenient to call the + /// [`Partition::sort_key`][super::Partition::sort_key] method instead! + /// + /// # Panics + /// + /// Will panic if an ID isn't found in the column map. + pub fn to_sort_key(&self, columns: &ColumnsByName) -> SortKey { + let column_id_map = columns.id_map(); + self.to_sort_key_using_map(&column_id_map) + } + + /// Build a [`SortKey`] from [`SortKeyIds`]; looking up column names in the provided + /// [`HashMap`] map. + /// + /// If you have a [`Partition`][super::Partition], it may be more convenient to call the + /// [`Partition::sort_key`][super::Partition::sort_key] method instead! + /// + /// # Panics + /// + /// Will panic if an ID isn't found in the column map. + pub fn to_sort_key_using_map(&self, column_id_map: &HashMap>) -> SortKey { + SortKey::from_columns(self.0.iter().map(|id| { + Arc::clone( + column_id_map.get(id).unwrap_or_else(|| { + panic!("cannot find column names for sort key id {}", id.get()) + }), + ) + })) + } + + /// Returns `true` if `other` is a monotonic update of `self`. + /// + /// # Panics + /// + /// Assumes "time" is the last column in both sets, and panics if the last + /// columns are not identical. + pub fn is_monotonic_update(&self, other: &Self) -> bool { + // The SortKeyIds always reference the time column last (if set). + if self.0.last().is_some() { + assert_eq!( + self.0.last(), + other.last(), + "last column in sort IDs must be time, and cannot change" + ); + } + + // Ensure the values in other are a prefix match, with the exception of + // the last "time" column. 
+ self.0.len() <= other.len() + && self + .0 + .iter() + .take(self.0.len().saturating_sub(1)) + .zip(other.iter()) + .all(|(a, b)| a == b) + } + + fn check_for_deplicates(columns: &[ColumnId]) { + let mut column_ids: HashSet = HashSet::with_capacity(columns.len()); + for c in columns { + match column_ids.get(&c.0) { + Some(_) => { + panic!("set contains duplicates"); + } + _ => { + column_ids.insert(c.0); + } + } + } + } } -impl From for Vec { - fn from(set: SortedColumnSet) -> Self { +impl From for Vec { + fn from(set: SortKeyIds) -> Self { set.0 } } -impl Deref for SortedColumnSet { +impl Deref for SortKeyIds { type Target = [ColumnId]; fn deref(&self) -> &Self::Target { @@ -456,7 +669,7 @@ impl Deref for SortedColumnSet { } } -impl From for SortedColumnSet +impl From for SortKeyIds where I: IntoIterator, { @@ -465,9 +678,17 @@ where } } -impl From for Vec { - fn from(val: SortedColumnSet) -> Self { - val.0.into_iter().map(|id| id.get()).collect() +impl From<&SortKeyIds> for Vec { + fn from(val: &SortKeyIds) -> Self { + val.0.iter().map(|id| id.get()).collect() + } +} + +impl From<&SortKeyIds> for generated_types::influxdata::iox::catalog::v1::SortKeyIds { + fn from(val: &SortKeyIds) -> Self { + generated_types::influxdata::iox::catalog::v1::SortKeyIds { + array_sort_key_ids: val.into(), + } } } @@ -483,10 +704,46 @@ mod tests { ColumnSet::new([ColumnId::new(1), ColumnId::new(2), ColumnId::new(1)]); } + #[test] + fn test_column_set_eq() { + let set_1 = ColumnSet::new([ColumnId::new(1), ColumnId::new(2)]); + let set_2 = ColumnSet::new([ColumnId::new(2), ColumnId::new(1)]); + assert_eq!(set_1, set_2); + } + + #[test] + fn test_column_set_union_intersect() { + let a = ColumnSet::new([1, 2, 5, 7].into_iter().map(ColumnId::new)); + let b = ColumnSet::new([1, 5, 6, 7, 8].into_iter().map(ColumnId::new)); + + let mut t = ColumnSet::empty(); + t.union(&a); + assert_eq!(t, a); + + assert_eq!( + t.intersect(&a).collect::>(), + vec![(0, a[0]), (1, a[1]), (2, a[2]), (3, a[3])] + ); + + t.union(&b); + let expected = ColumnSet::new([1, 2, 5, 6, 7, 8].into_iter().map(ColumnId::new)); + assert_eq!(t, expected); + + assert_eq!( + t.intersect(&a).collect::>(), + vec![(0, a[0]), (1, a[1]), (2, a[2]), (4, a[3])] + ); + + assert_eq!( + t.intersect(&b).collect::>(), + vec![(0, b[0]), (2, b[1]), (3, b[2]), (4, b[3]), (5, b[4])] + ); + } + #[test] #[should_panic = "set contains duplicates"] fn test_sorted_column_set_duplicates() { - SortedColumnSet::new([ + SortKeyIds::new([ ColumnId::new(2), ColumnId::new(1), ColumnId::new(3), @@ -496,7 +753,7 @@ mod tests { #[test] fn test_sorted_column_set() { - let set = SortedColumnSet::new([ColumnId::new(2), ColumnId::new(1), ColumnId::new(3)]); + let set = SortKeyIds::new([ColumnId::new(2), ColumnId::new(1), ColumnId::new(3)]); // verify the order is preserved assert_eq!(set[0], ColumnId::new(2)); assert_eq!(set[1], ColumnId::new(1)); @@ -506,35 +763,35 @@ mod tests { #[test] fn test_column_schema() { assert_eq!( - ColumnType::try_from(proto::column_schema::ColumnType::I64).unwrap(), + ColumnType::try_from(proto::ColumnType::I64).unwrap(), ColumnType::I64, ); assert_eq!( - ColumnType::try_from(proto::column_schema::ColumnType::U64).unwrap(), + ColumnType::try_from(proto::ColumnType::U64).unwrap(), ColumnType::U64, ); assert_eq!( - ColumnType::try_from(proto::column_schema::ColumnType::F64).unwrap(), + ColumnType::try_from(proto::ColumnType::F64).unwrap(), ColumnType::F64, ); assert_eq!( - ColumnType::try_from(proto::column_schema::ColumnType::Bool).unwrap(), + 
ColumnType::try_from(proto::ColumnType::Bool).unwrap(), ColumnType::Bool, ); assert_eq!( - ColumnType::try_from(proto::column_schema::ColumnType::String).unwrap(), + ColumnType::try_from(proto::ColumnType::String).unwrap(), ColumnType::String, ); assert_eq!( - ColumnType::try_from(proto::column_schema::ColumnType::Time).unwrap(), + ColumnType::try_from(proto::ColumnType::Time).unwrap(), ColumnType::Time, ); assert_eq!( - ColumnType::try_from(proto::column_schema::ColumnType::Tag).unwrap(), + ColumnType::try_from(proto::ColumnType::Tag).unwrap(), ColumnType::Tag, ); - assert!(ColumnType::try_from(proto::column_schema::ColumnType::Unspecified).is_err()); + assert!(ColumnType::try_from(proto::ColumnType::Unspecified).is_err()); } #[test] @@ -567,50 +824,50 @@ mod tests { fn test_columns_by_names_exist() { let columns = build_columns_by_names(); - let ids = columns.ids_for_names(&["foo", "bar"]); - assert_eq!(ids, SortedColumnSet::from([1, 2])); + let ids = columns.ids_for_names(["foo", "bar"]); + assert_eq!(ids, SortKeyIds::from([1, 2])); } #[test] fn test_columns_by_names_exist_different_order() { let columns = build_columns_by_names(); - let ids = columns.ids_for_names(&["bar", "foo"]); - assert_eq!(ids, SortedColumnSet::from([2, 1])); + let ids = columns.ids_for_names(["bar", "foo"]); + assert_eq!(ids, SortKeyIds::from([2, 1])); } #[test] #[should_panic = "column name not found: baz"] fn test_columns_by_names_not_exist() { let columns = build_columns_by_names(); - columns.ids_for_names(&["foo", "baz"]); + columns.ids_for_names(["foo", "baz"]); } fn build_columns_by_names() -> ColumnsByName { - let mut columns: BTreeMap = BTreeMap::new(); + let mut columns: BTreeMap, ColumnSchema> = BTreeMap::new(); columns.insert( - "foo".to_string(), + "foo".into(), ColumnSchema { id: ColumnId::new(1), column_type: ColumnType::I64, }, ); columns.insert( - "bar".to_string(), + "bar".into(), ColumnSchema { id: ColumnId::new(2), column_type: ColumnType::I64, }, ); columns.insert( - "time".to_string(), + "time".into(), ColumnSchema { id: ColumnId::new(3), column_type: ColumnType::Time, }, ); columns.insert( - "tag1".to_string(), + "tag1".into(), ColumnSchema { id: ColumnId::new(4), column_type: ColumnType::Tag, @@ -619,4 +876,122 @@ mod tests { ColumnsByName(columns) } + + // panic if the sort_key_ids are not found in the columns + #[test] + #[should_panic(expected = "cannot find column names for sort key id 3")] + fn test_panic_build_sort_key_from_ids_and_map() { + // table columns + let uno = ColumnSchema { + id: ColumnId::new(1), + column_type: ColumnType::Tag, + }; + let dos = ColumnSchema { + id: ColumnId::new(2), + column_type: ColumnType::Tag, + }; + let mut column_map = ColumnsByName::default(); + column_map.add_column("uno", uno); + column_map.add_column("dos", dos); + + // sort_key_ids include some columns that are not in the columns + let sort_key_ids = SortKeyIds::from([2, 3]); + sort_key_ids.to_sort_key(&column_map); + } + + #[test] + fn test_build_sort_key_from_ids_and_map() { + // table columns + let uno = ColumnSchema { + id: ColumnId::new(1), + column_type: ColumnType::Tag, + }; + let dos = ColumnSchema { + id: ColumnId::new(2), + column_type: ColumnType::Tag, + }; + let tres = ColumnSchema { + id: ColumnId::new(3), + column_type: ColumnType::Tag, + }; + let mut column_map = ColumnsByName::default(); + column_map.add_column("uno", uno); + column_map.add_column("dos", dos); + column_map.add_column("tres", tres); + + // sort_key_ids is empty + let sort_key_ids = SortKeyIds::default(); + let 
sort_key = sort_key_ids.to_sort_key(&column_map); + assert_eq!(sort_key, SortKey::empty()); + + // sort_key_ids include all columns and in the same order + let sort_key_ids = SortKeyIds::from([1, 2, 3]); + let sort_key = sort_key_ids.to_sort_key(&column_map); + assert_eq!(sort_key, SortKey::from_columns(vec!["uno", "dos", "tres"])); + + // sort_key_ids include all columns but in different order + let sort_key_ids = SortKeyIds::from([2, 3, 1]); + let sort_key = sort_key_ids.to_sort_key(&column_map); + assert_eq!(sort_key, SortKey::from_columns(vec!["dos", "tres", "uno"])); + + // sort_key_ids include some columns + let sort_key_ids = SortKeyIds::from([2, 3]); + let sort_key = sort_key_ids.to_sort_key(&column_map); + assert_eq!(sort_key, SortKey::from_columns(vec!["dos", "tres"])); + + // sort_key_ids include some columns in different order + let sort_key_ids = SortKeyIds::from([3, 1]); + let sort_key = sort_key_ids.to_sort_key(&column_map); + assert_eq!(sort_key, SortKey::from_columns(vec!["tres", "uno"])); + } + + #[test] + fn test_sort_key_ids_round_trip_encoding() { + let original = SortKeyIds::from([1, 2, 3]); + + let encoded: generated_types::influxdata::iox::catalog::v1::SortKeyIds = (&original).into(); + + let decoded: SortKeyIds = encoded.array_sort_key_ids.into(); + assert_eq!(decoded, original); + } + + macro_rules! test_is_monotonic_update { + ( + $name:ident, + a = $a:expr, + b = $b:expr, + want = $want:expr + ) => { + paste::paste! { + #[test] + fn []() { + let a = SortKeyIds::new($a.into_iter().map(ColumnId::new)); + let b = SortKeyIds::new($b.into_iter().map(ColumnId::new)); + assert_eq!(a.is_monotonic_update(&b), $want) + } + } + }; + } + + test_is_monotonic_update!(equal, a = [42, 24, 1], b = [42, 24, 1], want = true); + + test_is_monotonic_update!(empty, a = [], b = [42, 24, 1], want = true); + + test_is_monotonic_update!( + longer_with_time, + a = [42, 24, 1], + b = [42, 24, 13, 1], + want = true + ); + + test_is_monotonic_update!(shorter_with_time, a = [42, 24, 1], b = [1], want = false); + + test_is_monotonic_update!( + mismatch_with_time, + a = [42, 24, 1], + b = [24, 42, 1], + want = false + ); + + test_is_monotonic_update!(mismatch, a = [42, 24, 1], b = [24, 42, 1], want = false); } diff --git a/data_types/src/lib.rs b/data_types/src/lib.rs index a158e704497..951af51d965 100644 --- a/data_types/src/lib.rs +++ b/data_types/src/lib.rs @@ -29,10 +29,13 @@ pub mod partition; pub use partition::*; pub mod sequence_number_set; pub mod service_limits; +pub mod snapshot; + pub use service_limits::*; use observability_deps::tracing::warn; use schema::TIME_COLUMN_NAME; +use snafu::Snafu; use std::{ borrow::Borrow, collections::{BTreeMap, BTreeSet, HashMap}, @@ -41,9 +44,16 @@ use std::{ mem::{self, size_of_val}, num::{FpCategory, NonZeroU64}, ops::{Add, Deref, Sub}, + sync::Arc, }; use uuid::Uuid; +/// Errors deserialising a protobuf serialised [`ParquetFile`]. 
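The macro-generated cases above exercise `SortKeyIds::is_monotonic_update`, defined earlier in this hunk: an update is monotonic when the existing key, ignoring its trailing time column, is a prefix of the new key. A standalone restatement of that check over plain integers.

/// `new` monotonically extends `current` if `current` minus its trailing time
/// column is a prefix of `new`, and `new` is at least as long.
fn is_monotonic_update(current: &[i64], new: &[i64]) -> bool {
    current.len() <= new.len()
        && current
            .iter()
            .take(current.len().saturating_sub(1))
            .zip(new.iter())
            .all(|(a, b)| a == b)
}

fn main() {
    // Same cases as the tests above (1 plays the role of the time column).
    assert!(is_monotonic_update(&[42, 24, 1], &[42, 24, 1]));
    assert!(is_monotonic_update(&[], &[42, 24, 1]));
    assert!(is_monotonic_update(&[42, 24, 1], &[42, 24, 13, 1]));
    assert!(!is_monotonic_update(&[42, 24, 1], &[1]));
    assert!(!is_monotonic_update(&[42, 24, 1], &[24, 42, 1]));
}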
+#[derive(Debug, Snafu)] +#[snafu(display("invalid compaction level value"))] +#[allow(missing_copy_implementations)] +pub struct CompactionLevelProtoError {} + /// Compaction levels #[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash, sqlx::Type)] #[repr(i16)] @@ -68,14 +78,14 @@ impl Display for CompactionLevel { } impl TryFrom for CompactionLevel { - type Error = Box; + type Error = CompactionLevelProtoError; fn try_from(value: i32) -> Result { match value { x if x == Self::Initial as i32 => Ok(Self::Initial), x if x == Self::FileNonOverlapped as i32 => Ok(Self::FileNonOverlapped), x if x == Self::Final as i32 => Ok(Self::Final), - _ => Err("invalid compaction level value".into()), + _ => Err(CompactionLevelProtoError {}), } } } @@ -131,7 +141,7 @@ impl NamespaceId { } impl std::fmt::Display for NamespaceId { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0) } } @@ -157,7 +167,7 @@ impl TableId { } impl std::fmt::Display for TableId { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0) } } @@ -268,12 +278,48 @@ impl ParquetFileId { } impl std::fmt::Display for ParquetFileId { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { // Use `self.number` to refer to each positional data point. write!(f, "{}", self.0) } } +/// Unique store UUID for a [`ParquetFile`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::Type)] +#[sqlx(transparent)] +pub struct ObjectStoreId(Uuid); + +#[allow(missing_docs)] +impl ObjectStoreId { + #[allow(clippy::new_without_default)] + pub fn new() -> Self { + Self::from_uuid(Uuid::new_v4()) + } + + pub fn from_uuid(uuid: Uuid) -> Self { + Self(uuid) + } + + pub fn get_uuid(&self) -> Uuid { + self.0 + } +} + +impl std::fmt::Display for ObjectStoreId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } +} + +impl std::str::FromStr for ObjectStoreId { + type Err = uuid::Error; + + fn from_str(s: &str) -> Result { + let uuid = Uuid::parse_str(s)?; + Ok(Self::from_uuid(uuid)) + } +} + /// Data object for a namespace #[derive(Debug, Clone, PartialEq, sqlx::FromRow)] pub struct Namespace { @@ -352,35 +398,23 @@ impl NamespaceSchema { impl From<&NamespaceSchema> for generated_types::influxdata::iox::schema::v1::NamespaceSchema { fn from(schema: &NamespaceSchema) -> Self { - use generated_types::influxdata::iox::schema::v1 as proto; - Self { - id: schema.id.get(), - tables: schema - .tables - .iter() - .map(|(name, t)| { - ( - name.clone(), - proto::TableSchema { - id: t.id.get(), - columns: t - .columns - .iter() - .map(|(name, c)| { - ( - name.clone(), - proto::ColumnSchema { - id: c.id.get(), - column_type: c.column_type as i32, - }, - ) - }) - .collect(), - }, - ) - }) - .collect(), - } + namespace_schema_proto(schema.id, schema.tables.iter()) + } +} + +/// Generate [`NamespaceSchema`] protobuf from a `NamespaceId` and a list of tables. Useful to +/// filter the tables returned from an API request to a particular table without needing to clone +/// the whole `NamespaceSchema` to use the `From` impl. 
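As a rough usage sketch (not part of this patch) of the helper described in the doc comment above — assuming `namespace_schema_proto` is re-exported from `data_types` and that `tables` yields `(&String, &TableSchema)` pairs, as the `From<&NamespaceSchema>` impl above suggests — a caller can serialise a single table without cloning the whole schema. The function definition itself continues below.

    use data_types::{namespace_schema_proto, NamespaceSchema};
    use generated_types::influxdata::iox::schema::v1 as proto;

    /// Build a proto `NamespaceSchema` containing only `table_name`, without
    /// cloning the in-memory `NamespaceSchema`. Hypothetical helper, for
    /// illustration only.
    fn single_table_proto(schema: &NamespaceSchema, table_name: &str) -> proto::NamespaceSchema {
        namespace_schema_proto(
            schema.id,
            schema
                .tables
                .iter()
                .filter(|(name, _)| name.as_str() == table_name),
        )
    }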
+pub fn namespace_schema_proto<'a>( + id: NamespaceId, + tables: impl Iterator, +) -> generated_types::influxdata::iox::schema::v1::NamespaceSchema { + use generated_types::influxdata::iox::schema::v1 as proto; + proto::NamespaceSchema { + id: id.get(), + tables: tables + .map(|(name, t)| (name.clone(), proto::TableSchema::from(t))) + .collect(), } } @@ -428,7 +462,7 @@ impl TableSchema { Self { id: table.id, partition_template: table.partition_template.clone(), - columns: ColumnsByName::new([]), + columns: ColumnsByName::default(), } } @@ -458,7 +492,11 @@ impl TableSchema { /// /// This method panics if a column of the same name already exists in /// `self`. - pub fn add_column_schema(&mut self, column_name: String, column_schema: ColumnSchema) { + pub fn add_column_schema( + &mut self, + column_name: impl Into>, + column_schema: ColumnSchema, + ) { self.columns.add_column(column_name, column_schema); } @@ -468,12 +506,12 @@ impl TableSchema { + self .columns .iter() - .map(|(k, v)| size_of_val(k) + k.capacity() + size_of_val(v)) + .map(|(k, v)| size_of_val(k) + k.as_ref().len() + size_of_val(v)) .sum::() } /// Create `ID->name` map for columns. - pub fn column_id_map(&self) -> HashMap { + pub fn column_id_map(&self) -> HashMap> { self.columns.id_map() } @@ -494,6 +532,29 @@ impl TableSchema { } } +impl From<&TableSchema> for generated_types::influxdata::iox::schema::v1::TableSchema { + fn from(table_schema: &TableSchema) -> Self { + use generated_types::influxdata::iox::schema::v1 as proto; + + Self { + id: table_schema.id.get(), + columns: table_schema + .columns + .iter() + .map(|(name, c)| { + ( + name.to_string(), + proto::ColumnSchema { + id: c.id.get(), + column_type: c.column_type as i32, + }, + ) + }) + .collect(), + } + } +} + /// Data recorded when compaction skips a partition. 
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::FromRow)] pub struct SkippedCompaction { @@ -515,8 +576,9 @@ pub struct SkippedCompaction { pub limit_num_files_first_in_partition: i64, } -use generated_types::influxdata::iox::compactor::v1 as compactor_proto; -impl From for compactor_proto::SkippedCompaction { +impl From + for generated_types::influxdata::iox::skipped_compaction::v1::SkippedCompaction +{ fn from(skipped_compaction: SkippedCompaction) -> Self { let SkippedCompaction { partition_id, @@ -537,7 +599,27 @@ impl From for compactor_proto::SkippedCompaction { limit_bytes, num_files, limit_num_files, - limit_num_files_first_in_partition: Some(limit_num_files_first_in_partition), + limit_num_files_first_in_partition, + } + } +} + +impl From + for SkippedCompaction +{ + fn from( + skipped_compaction: generated_types::influxdata::iox::skipped_compaction::v1::SkippedCompaction, + ) -> Self { + Self { + partition_id: PartitionId::new(skipped_compaction.partition_id), + reason: skipped_compaction.reason, + skipped_at: Timestamp::new(skipped_compaction.skipped_at), + estimated_bytes: skipped_compaction.estimated_bytes, + limit_bytes: skipped_compaction.limit_bytes, + num_files: skipped_compaction.num_files, + limit_num_files: skipped_compaction.limit_num_files, + limit_num_files_first_in_partition: skipped_compaction + .limit_num_files_first_in_partition, } } } @@ -552,10 +634,11 @@ pub struct ParquetFile { /// the table pub table_id: TableId, /// the partition identifier - #[sqlx(flatten)] - pub partition_id: TransitionPartitionId, + pub partition_id: PartitionId, + /// the optional partition hash id + pub partition_hash_id: Option, /// the uuid used in the object store path for this file - pub object_store_id: Uuid, + pub object_store_id: ObjectStoreId, /// the min timestamp of data in this file pub min_time: Timestamp, /// the max timestamp of data in this file @@ -608,9 +691,10 @@ impl ParquetFile { pub fn from_params(params: ParquetFileParams, id: ParquetFileId) -> Self { Self { id, + partition_id: params.partition_id, + partition_hash_id: params.partition_hash_id, namespace_id: params.namespace_id, table_id: params.table_id, - partition_id: params.partition_id, object_store_id: params.object_store_id, min_time: params.min_time, max_time: params.max_time, @@ -626,7 +710,13 @@ impl ParquetFile { /// Estimate the memory consumption of this object and its contents pub fn size(&self) -> usize { - std::mem::size_of_val(self) + self.partition_id.size() + self.column_set.size() + let hash_id = self + .partition_hash_id + .as_ref() + .map(|x| x.size()) + .unwrap_or_default(); + + std::mem::size_of_val(self) + hash_id + self.column_set.size() - std::mem::size_of_val(&self.column_set) } @@ -661,6 +751,11 @@ impl ParquetFile { } false } + + /// Temporary to aid incremental migration + pub fn transition_partition_id(&self) -> TransitionPartitionId { + TransitionPartitionId::from_parts(self.partition_id, self.partition_hash_id.clone()) + } } impl From for generated_types::influxdata::iox::catalog::v1::ParquetFile { @@ -669,7 +764,11 @@ impl From for generated_types::influxdata::iox::catalog::v1::Parque id: v.id.get(), namespace_id: v.namespace_id.get(), table_id: v.table_id.get(), - partition_identifier: Some(v.partition_id.into()), + partition_id: v.partition_id.get(), + partition_hash_id: v + .partition_hash_id + .map(|x| x.as_bytes().to_vec()) + .unwrap_or_default(), object_store_id: v.object_store_id.to_string(), min_time: v.min_time.get(), max_time: v.max_time.get(), @@ 
-700,40 +799,8 @@ pub enum ParquetFileProtoError { InvalidObjectStoreId(uuid::Error), /// The specified compaction level value is invalid. - #[error("invalid compaction level: {0}")] - InvalidCompactionLevel(Box), -} - -impl TryFrom for ParquetFile { - type Error = ParquetFileProtoError; - - fn try_from( - v: generated_types::influxdata::iox::catalog::v1::ParquetFile, - ) -> Result { - Ok(Self { - id: ParquetFileId::new(v.id), - namespace_id: NamespaceId::new(v.namespace_id), - table_id: TableId::new(v.table_id), - partition_id: TransitionPartitionId::try_from( - v.partition_identifier - .ok_or(ParquetFileProtoError::NoPartitionId)?, - )?, - object_store_id: v - .object_store_id - .parse() - .map_err(ParquetFileProtoError::InvalidObjectStoreId)?, - min_time: Timestamp::new(v.min_time), - max_time: Timestamp::new(v.max_time), - to_delete: v.to_delete.map(Timestamp::new), - file_size_bytes: v.file_size_bytes, - row_count: v.row_count, - compaction_level: CompactionLevel::try_from(v.compaction_level) - .map_err(ParquetFileProtoError::InvalidCompactionLevel)?, - created_at: Timestamp::new(v.created_at), - column_set: ColumnSet::new(v.column_set.into_iter().map(ColumnId::new)), - max_l0_created_at: Timestamp::new(v.max_l0_created_at), - }) - } + #[error(transparent)] + InvalidCompactionLevel(#[from] CompactionLevelProtoError), } /// Data for a parquet file to be inserted into the catalog. @@ -744,9 +811,11 @@ pub struct ParquetFileParams { /// the table pub table_id: TableId, /// the partition identifier - pub partition_id: TransitionPartitionId, + pub partition_id: PartitionId, + /// the partition hash ID + pub partition_hash_id: Option, /// the uuid used in the object store path for this file - pub object_store_id: Uuid, + pub object_store_id: ObjectStoreId, /// the min timestamp of data in this file pub min_time: Timestamp, /// the max timestamp of data in this file @@ -765,25 +834,6 @@ pub struct ParquetFileParams { pub max_l0_created_at: Timestamp, } -impl From for ParquetFileParams { - fn from(value: ParquetFile) -> Self { - Self { - namespace_id: value.namespace_id, - table_id: value.table_id, - partition_id: value.partition_id, - object_store_id: value.object_store_id, - min_time: value.min_time, - max_time: value.max_time, - file_size_bytes: value.file_size_bytes, - row_count: value.row_count, - compaction_level: value.compaction_level, - created_at: value.created_at, - column_set: value.column_set, - max_l0_created_at: value.max_l0_created_at, - } - } -} - /// ID of a chunk. /// /// This ID is unique within a single partition. @@ -835,9 +885,9 @@ impl std::fmt::Display for ChunkId { } } -impl From for ChunkId { - fn from(uuid: Uuid) -> Self { - Self(uuid) +impl From for ChunkId { + fn from(id: ObjectStoreId) -> Self { + Self(id.get_uuid()) } } @@ -1405,9 +1455,12 @@ impl IsNan for f64 { pub enum Statistics { I64(StatValues), U64(StatValues), - F64(StatValues), Bool(StatValues), String(StatValues), + + /// For the purposes of min/max values of floats, NaN values are ignored (no + /// ordering is applied to NaNs). 
+ F64(StatValues), } impl Statistics { @@ -1706,6 +1759,16 @@ impl TimestampMinMax { || range.contains(self.max) || (self.min <= range.start && self.max >= range.end) } + + /// Returns the union of this range with `other` with the minimum of the `min`s + /// and the maximum of the `max`es + + pub fn union(&self, other: &Self) -> Self { + Self { + min: self.min.min(other.min), + max: self.max.max(other.max), + } + } } /// FileRange describes a range of files by the min/max time and the sum of their capacities. @@ -1726,7 +1789,6 @@ mod tests { use std::borrow::Cow; use ordered_float::OrderedFloat; - use proptest::{prelude::*, proptest}; #[test] fn test_chunk_id_new() { @@ -2661,7 +2723,7 @@ mod tests { let schema1 = TableSchema { id: TableId::new(1), partition_template: Default::default(), - columns: ColumnsByName::new([]), + columns: ColumnsByName::default(), }; let schema2 = TableSchema { id: TableId::new(2), @@ -2681,8 +2743,8 @@ mod tests { let schema1 = NamespaceSchema { id: NamespaceId::new(1), tables: BTreeMap::from([]), - max_tables: MaxTables::new(42), - max_columns_per_table: MaxColumnsPerTable::new(4), + max_tables: MaxTables::try_from(42).unwrap(), + max_columns_per_table: MaxColumnsPerTable::try_from(4).unwrap(), retention_period_ns: None, partition_template: Default::default(), }; @@ -2692,12 +2754,12 @@ mod tests { String::from("foo"), TableSchema { id: TableId::new(1), - columns: ColumnsByName::new([]), + columns: ColumnsByName::default(), partition_template: Default::default(), }, )]), - max_tables: MaxTables::new(42), - max_columns_per_table: MaxColumnsPerTable::new(4), + max_tables: MaxTables::try_from(42).unwrap(), + max_columns_per_table: MaxColumnsPerTable::try_from(4).unwrap(), retention_period_ns: None, partition_template: Default::default(), }; @@ -2734,77 +2796,4 @@ mod tests { assert_eq!(tr.start(), 1); assert_eq!(tr.end(), 1); } - - use crate::partition::tests::arbitrary_partition_id; - - prop_compose! { - /// Return an arbitrary [`Timestamp`]. - pub fn arbitrary_timestamp()(value in any::()) -> Timestamp { - Timestamp::new(value) - } - } - - fn arbitrary_compaction_level() -> impl prop::strategy::Strategy { - prop_oneof![ - Just(CompactionLevel::Initial), - Just(CompactionLevel::FileNonOverlapped), - Just(CompactionLevel::Final), - ] - } - - prop_compose! { - /// Return an arbitrary [`ParquetFile`] with a randomised values. - fn arbitrary_parquet_file()( - partition_id in arbitrary_partition_id(), - parquet_file_id in any::(), - namespace_id in any::(), - table_id in any::(), - min_time in arbitrary_timestamp(), - max_time in arbitrary_timestamp(), - to_delete in prop::option::of(arbitrary_timestamp()), - file_size_bytes in any::(), - row_count in any::(), - compaction_level in arbitrary_compaction_level(), - created_at in arbitrary_timestamp(), - column_set in prop::collection::vec(any::(), 0..10), - max_l0_created_at in arbitrary_timestamp(), - ) -> ParquetFile { - let column_set = ColumnSet::new(column_set.into_iter().map(ColumnId::new)); - - ParquetFile { - id: ParquetFileId::new(parquet_file_id), - namespace_id: NamespaceId::new(namespace_id), - table_id: TableId::new(table_id), - partition_id, - object_store_id: Uuid::new_v4(), - min_time, - max_time, - to_delete, - file_size_bytes, - row_count, - compaction_level, - created_at, - column_set, - max_l0_created_at, - } - } - } - - proptest! { - /// Assert a [`ParquetFile`] is round-trippable through proto - /// serialisation. 
- #[test] - fn prop_parquet_file_proto_round_trip(file in arbitrary_parquet_file()) { - use generated_types::influxdata::iox::catalog::v1 as proto; - - // Encoding is infallible - let encoded = proto::ParquetFile::from(file.clone()); - - // Decoding a valid proto ParquetFile is infallible. - let decoded = ParquetFile::try_from(encoded).unwrap(); - - // The deserialised value must match the input (round trippable) - assert_eq!(decoded, file); - } - } } diff --git a/data_types/src/namespace_name.rs b/data_types/src/namespace_name.rs index 04f462a7f90..e9e2e580512 100644 --- a/data_types/src/namespace_name.rs +++ b/data_types/src/namespace_name.rs @@ -131,6 +131,14 @@ impl<'a> NamespaceName<'a> { Ok(Self::new(format!("{}_{}", org, bucket))?) } + + /// Efficiently returns the string representation of this [`NamespaceName`]. + /// + /// If this [`NamespaceName`] contains an owned string, it is returned + /// without cloning. + pub fn into_string(self) -> String { + self.0.into_owned() + } } impl<'a> std::convert::From> for String { @@ -191,6 +199,23 @@ mod tests { .expect("failed on valid DB mapping"); assert_eq!(got.as_str(), "org_bucket"); + assert_eq!(got.into_string(), "org_bucket"); + } + + #[test] + fn test_into_string() { + // Ref type str + assert_eq!( + NamespaceName::new("bananas").unwrap().into_string(), + "bananas" + ); + // Owned type string + assert_eq!( + NamespaceName::new("bananas".to_string()) + .unwrap() + .into_string(), + "bananas" + ); } #[test] diff --git a/data_types/src/partition.rs b/data_types/src/partition.rs index 825fda0d22a..eb095241a28 100644 --- a/data_types/src/partition.rs +++ b/data_types/src/partition.rs @@ -1,8 +1,6 @@ //! Types having to do with partitions. -use crate::SortedColumnSet; - -use super::{TableId, Timestamp}; +use super::{ColumnsByName, SortKeyIds, TableId, Timestamp}; use schema::sort::SortKey; use sha2::Digest; @@ -22,6 +20,14 @@ pub enum TransitionPartitionId { } impl TransitionPartitionId { + /// Create a [`TransitionPartitionId`] from a [`PartitionId`] and optional [`PartitionHashId`] + pub fn from_parts(id: PartitionId, hash_id: Option) -> Self { + match hash_id { + Some(x) => Self::Deterministic(x), + None => Self::Deprecated(id), + } + } + /// Size in bytes including `self`. 
pub fn size(&self) -> usize { match self { @@ -63,15 +69,12 @@ where impl From<(PartitionId, Option<&PartitionHashId>)> for TransitionPartitionId { fn from((partition_id, partition_hash_id): (PartitionId, Option<&PartitionHashId>)) -> Self { - partition_hash_id - .cloned() - .map(TransitionPartitionId::Deterministic) - .unwrap_or_else(|| TransitionPartitionId::Deprecated(partition_id)) + Self::from_parts(partition_id, partition_hash_id.cloned()) } } impl std::fmt::Display for TransitionPartitionId { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Deprecated(old_partition_id) => write!(f, "{}", old_partition_id.0), Self::Deterministic(partition_hash_id) => write!(f, "{}", partition_hash_id), @@ -169,7 +172,7 @@ impl PartitionId { } impl std::fmt::Display for PartitionId { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0) } } @@ -279,7 +282,7 @@ const PARTITION_HASH_ID_SIZE_BYTES: usize = 32; pub struct PartitionHashId(Arc<[u8; PARTITION_HASH_ID_SIZE_BYTES]>); impl std::fmt::Display for PartitionHashId { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { for byte in &*self.0 { write!(f, "{:02x}", byte)?; } @@ -340,6 +343,11 @@ impl TryFrom<&[u8]> for PartitionHashId { impl PartitionHashId { /// Create a new `PartitionHashId`. pub fn new(table_id: TableId, partition_key: &PartitionKey) -> Self { + Self::from_raw(table_id, partition_key.as_bytes()) + } + + /// Create a new `PartitionHashId` + pub fn from_raw(table_id: TableId, key: &[u8]) -> Self { // The hash ID of a partition is the SHA-256 of the `TableId` then the `PartitionKey`. This // particular hash format was chosen so that there won't be collisions and this value can // be used to uniquely identify a Partition without needing to go to the catalog to get a @@ -357,7 +365,7 @@ impl PartitionHashId { assert_eq!(table_bytes.len(), 8); inner.update(table_bytes); - inner.update(partition_key.as_bytes()); + inner.update(key); Self(Arc::new(inner.finalize().into())) } @@ -440,37 +448,32 @@ pub struct Partition { /// the string key of the partition pub partition_key: PartitionKey, - // TODO: remove this field once the sort_key_ids is fully imlemented - /// vector of column names that describes how *every* parquet file - /// in this [`Partition`] is sorted. - pub sort_key: Vec, - - /// vector of column ids that describes how *every* parquet file - /// in this [`Partition`] is sorted. The sort_key contains all the + /// Vector of column IDs that describes how *every* parquet file + /// in this [`Partition`] is sorted. The sort key contains all the /// primary key (PK) columns that have been persisted, and nothing /// else. The PK columns are all `tag` columns and the `time` /// column. /// /// Even though it is possible for both the unpersisted data /// and/or multiple parquet files to contain different subsets of - /// columns, the partition's sort_key is guaranteed to be + /// columns, the partition's sort key is guaranteed to be /// "compatible" across all files. Compatible means that the /// parquet file is sorted in the same order as the partition - /// sort_key after removing any missing columns. + /// sort key after removing any missing columns. 
/// /// Partitions are initially created before any data is persisted - /// with an empty sort_key. The partition sort_key is updated as + /// with an empty sort key. The partition sort key is updated as /// needed when data is persisted to parquet files: both on the /// first persist when the sort key is empty, as on subsequent /// persist operations when new tags occur in newly inserted data. /// - /// Updating inserts new column into the existing order. The order + /// Updating inserts new columns into the existing sort key. The order /// of the existing columns relative to each other is NOT changed. /// /// For example, updating `A,B,C` to either `A,D,B,C` or `A,B,C,D` /// is legal. However, updating to `A,C,D,B` is not because the - /// relative order of B and C have been reversed. - pub sort_key_ids: SortedColumnSet, + /// relative order of B and C has been reversed. + sort_key_ids: SortKeyIds, /// The time at which the newest file of the partition is created pub new_file_at: Option, @@ -480,40 +483,13 @@ impl Partition { /// Create a new Partition data object from the given attributes. This constructor will take /// care of computing the [`PartitionHashId`]. /// - /// This is only appropriate to use in the in-memory catalog or in tests. - pub fn new_in_memory_only( - id: PartitionId, - table_id: TableId, - partition_key: PartitionKey, - sort_key: Vec, - sort_key_ids: SortedColumnSet, - new_file_at: Option, - ) -> Self { - let hash_id = PartitionHashId::new(table_id, &partition_key); - Self { - id, - hash_id: Some(hash_id), - table_id, - partition_key, - sort_key, - sort_key_ids, - new_file_at, - } - } - - /// The sqlite catalog has to define a `PartitionPod` type that's slightly different than - /// `Partition` because of what sqlite serialization is supported. This function is for - /// conversion between the `PartitionPod` type and `Partition` and should not be used anywhere - /// else. - /// - /// The in-memory catalog also creates the `Partition` directly from w - pub fn new_with_hash_id_from_sqlite_catalog_only( + /// This is only appropriate to use in the catalog or in tests. + pub fn new_catalog_only( id: PartitionId, hash_id: Option, table_id: TableId, partition_key: PartitionKey, - sort_key: Vec, - sort_key_ids: SortedColumnSet, + sort_key_ids: SortKeyIds, new_file_at: Option, ) -> Self { Self { @@ -521,7 +497,6 @@ impl Partition { hash_id, table_id, partition_key, - sort_key, sort_key_ids, new_file_at, } @@ -538,30 +513,31 @@ impl Partition { self.hash_id.as_ref() } - // TODO: remove this function after all PRs that teach compactor, ingester, - // and querier to use sort_key_ids are merged. - /// The sort key for the partition, if present, structured as a `SortKey` - pub fn sort_key(&self) -> Option { - if self.sort_key.is_empty() { - return None; - } - - Some(SortKey::from_columns(self.sort_key.iter().map(|s| &**s))) - } - - /// The sort_key_ids if present - pub fn sort_key_ids(&self) -> &SortedColumnSet { - &self.sort_key_ids - } - - /// The sort_key_ids if not empty and None if empty - pub fn sort_key_ids_none_if_empty(&self) -> Option<&SortedColumnSet> { + /// The sort key IDs, if the sort key has been set + pub fn sort_key_ids(&self) -> Option<&SortKeyIds> { if self.sort_key_ids.is_empty() { None } else { Some(&self.sort_key_ids) } } + + /// The sort key containing the column names found in the specified column map. + /// + /// # Panics + /// + /// Will panic if an ID isn't found in the column map. 
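A rough sketch (not part of this patch) of how the ID-based sort key is resolved back to column names, mirroring `test_build_sort_key_from_ids_and_map` earlier in this diff; the `sort_key` accessor it illustrates follows immediately below, and the import paths are assumptions.

    use data_types::{ColumnId, ColumnSchema, ColumnType, ColumnsByName, SortKeyIds};
    use schema::sort::SortKey;

    fn resolve_sort_key() -> SortKey {
        // Table columns keyed by name, as the catalog would supply them.
        let mut columns = ColumnsByName::default();
        columns.add_column(
            "region",
            ColumnSchema {
                id: ColumnId::new(1),
                column_type: ColumnType::Tag,
            },
        );
        columns.add_column(
            "time",
            ColumnSchema {
                id: ColumnId::new(2),
                column_type: ColumnType::Time,
            },
        );

        // The partition stores only column IDs; map them back to names.
        // This panics if an ID is missing from `columns`, per the doc above.
        SortKeyIds::from([1, 2]).to_sort_key(&columns)
        // == SortKey::from_columns(vec!["region", "time"])
    }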
+ pub fn sort_key(&self, columns_by_name: &ColumnsByName) -> Option { + self.sort_key_ids() + .map(|sort_key_ids| sort_key_ids.to_sort_key(columns_by_name)) + } + + /// Change the sort key IDs to the given sort key IDs. This should only be used in the + /// in-memory catalog or in tests; all other sort key updates should go through the catalog + /// functions. + pub fn set_sort_key_ids(&mut self, sort_key_ids: &SortKeyIds) { + self.sort_key_ids = sort_key_ids.clone(); + } } #[cfg(test)] diff --git a/data_types/src/partition_template.rs b/data_types/src/partition_template.rs index 48d82e48cac..bbd063302bd 100644 --- a/data_types/src/partition_template.rs +++ b/data_types/src/partition_template.rs @@ -129,36 +129,43 @@ //! [ //! TemplatePart::TimeFormat("%Y"), //! TemplatePart::TagValue("a"), -//! TemplatePart::TagValue("b") +//! TemplatePart::TagValue("b"), +//! TemplatePart::Bucket("c", 10) //! ] //! ``` //! //! The following partition keys are derived: //! -//! * `time=2023-01-01, a=bananas, b=plátanos` -> `2023|bananas|plátanos` -//! * `time=2023-01-01, b=plátanos` -> `2023|!|plátanos` -//! * `time=2023-01-01, another=cat, b=plátanos` -> `2023|!|plátanos` -//! * `time=2023-01-01` -> `2023|!|!` -//! * `time=2023-01-01, a=cat|dog, b=!` -> `2023|cat%7Cdog|%21` -//! * `time=2023-01-01, a=%50` -> `2023|%2550|!` -//! * `time=2023-01-01, a=` -> `2023|^|!` -//! * `time=2023-01-01, a=` -> `2023|#|!` +//! * `time=2023-01-01, a=bananas, b=plátanos, c=ananas` -> `2023|bananas|plátanos|5` +//! * `time=2023-01-01, b=plátanos` -> `2023|!|plátanos|!` +//! * `time=2023-01-01, another=cat, b=plátanos` -> `2023|!|plátanos|!` +//! * `time=2023-01-01` -> `2023|!|!|!` +//! * `time=2023-01-01, a=cat|dog, b=!, c=!` -> `2023|cat%7Cdog|%21|8` +//! * `time=2023-01-01, a=%50, c=%50` -> `2023|%2550|!|9` +//! * `time=2023-01-01, a=, c=` -> `2023|^|!|0` +//! * `time=2023-01-01, a=` -> `2023|#|!|!` //! //! When using the default partitioning template (YYYY-MM-DD) there is no //! encoding necessary, as the derived partition key contains a single part, and //! no reserved characters. //! //! [percent encoded]: https://url.spec.whatwg.org/#percent-encoded-bytes +use std::{ + borrow::Cow, + fmt::{Display, Formatter}, + ops::Range, + sync::Arc, +}; use chrono::{ format::{Numeric, StrftimeItems}, DateTime, Days, Months, Utc, }; use generated_types::influxdata::iox::partition_template::v1 as proto; +use murmur3::murmur3_32; use once_cell::sync::Lazy; use percent_encoding::{percent_decode_str, AsciiSet, CONTROLS}; use schema::TIME_COLUMN_NAME; -use std::{borrow::Cow, sync::Arc}; use thiserror::Error; /// Reasons a user-specified partition template isn't valid. @@ -187,12 +194,32 @@ pub enum ValidationError { #[error("invalid strftime format in partition template: {0}")] InvalidStrftime(String), - /// The partition template defines a [`TagValue`] part, but the provided - /// value is invalid. + /// The partition template defines a [`TagValue`] part or [`Bucket`] part, + /// but the provided tag name value is invalid. /// /// [`TagValue`]: [`proto::template_part::Part::TagValue`] - #[error("invalid tag value in partition template: {0}")] + /// [`Bucket`]: [`proto::template_part::Part::Bucket`] + #[error("invalid tag name value in partition template: {0}")] InvalidTagValue(String), + + /// The partition template defines a [`Bucket`] part, but the provided + /// number of buckets is invalid. 
+ /// + /// [`Bucket`]: [`proto::template_part::Part::Bucket`] + #[error( + "number of buckets in partition template must be in range \ + [{ALLOWED_BUCKET_QUANTITIES:?}), number specified: {0}" + )] + InvalidNumberOfBuckets(u32), + + /// The partition template defines a [`TagValue`] or [`Bucket`] part + /// which repeats a tag name used in another [`TagValue`] or [`Bucket`] part. + /// This is not allowed + /// + /// [`TagValue`]: [`proto::template_part::Part::TagValue`] + /// [`Bucket`]: [`proto::template_part::Part::Bucket`] + #[error("tag name value cannot be repeated in partition template: {0}")] + RepeatedTagValue(String), } /// The maximum number of template parts a custom partition template may specify, to limit the @@ -234,6 +261,14 @@ pub const PARTITION_KEY_PART_TRUNCATED: char = '#'; /// data point. pub const TAG_VALUE_KEY_TIME: &str = "time"; +/// The range of bucket quantities allowed for [`Bucket`] template parts. +/// +/// [`Bucket`]: [`proto::template_part::Part::Bucket`] +pub const ALLOWED_BUCKET_QUANTITIES: Range = Range { + start: 1, + end: 100_000, +}; + /// The minimal set of characters that must be encoded during partition key /// generation when they form part of a partition key part, in order to be /// unambiguously reversible. @@ -249,10 +284,23 @@ pub const ENCODED_PARTITION_KEY_CHARS: AsciiSet = CONTROLS /// Allocationless and protobufless access to the parts of a template needed to /// actually do partitioning. #[derive(Debug, Clone)] -#[allow(missing_docs)] pub enum TemplatePart<'a> { + /// A tag-value partition part. + /// + /// Specifies the name of the tag column. TagValue(&'a str), + + /// A strftime formatter. + /// + /// Specifies the formatter spec applied to the [`TIME_COLUMN_NAME`] column. TimeFormat(&'a str), + + /// A bucketing partition part. + /// + /// Specifies the name of the tag column used to derive which of the `n` + /// buckets the data belongs in, through the mechanism implemented by the + /// [`bucket_for_tag_value`] function. + Bucket(&'a str, u32), } /// The default partitioning scheme is by each day according to the "time" column. @@ -266,6 +314,37 @@ pub static PARTITION_BY_DAY_PROTO: Lazy> = Lazy::n }) }); +// This applies murmur3 32 bit hashing to the tag value string, as Iceberg would. +// +// * +fn iceberg_hash(tag_value: &str) -> u32 { + murmur3_32(&mut tag_value.as_bytes(), 0).expect("read of tag value string must never error") +} + +/// Hash bucket the provided tag value to a bucket ID in the range `[0,num_buckets)`. +/// +/// This applies murmur3 32 bit hashing to the tag value string, zero-ing the sign bit +/// then modulo assigning it to a bucket as Iceberg would. +/// +/// * +/// * +/// +/// +/// # Panics +/// +/// If `num_buckets` is zero, this will panic. Validation MUST prevent +/// [`TemplatePart::Bucket`] from being constructed with a zero bucket count. It just +/// makes no sense and shouldn't need to be checked here. +#[inline(always)] +pub fn bucket_for_tag_value(tag_value: &str, num_buckets: u32) -> u32 { + // Hash the tag value as iceberg would. + let hash = iceberg_hash(tag_value); + // Then bucket it as iceberg would, by removing the sign bit from the + // 32 bit murmur hash and modulo by the number of buckets to assign + // across. + (hash & i32::MAX as u32) % num_buckets +} + /// A partition template specified by a namespace record. 
/// /// Internally this type is [`None`] when no namespace-level override is @@ -344,6 +423,10 @@ impl TablePartitionTemplateOverride { .map(|part| match part { proto::template_part::Part::TagValue(value) => TemplatePart::TagValue(value), proto::template_part::Part::TimeFormat(fmt) => TemplatePart::TimeFormat(fmt), + proto::template_part::Part::Bucket(proto::Bucket { + tag_name, + num_buckets, + }) => TemplatePart::Bucket(tag_name, *num_buckets), }) } @@ -370,6 +453,10 @@ impl TablePartitionTemplateOverride { .map(|part| match part { proto::template_part::Part::TagValue(s) => s.capacity(), proto::template_part::Part::TimeFormat(s) => s.capacity(), + proto::template_part::Part::Bucket(proto::Bucket { + tag_name, + num_buckets: _, + }) => tag_name.capacity() + std::mem::size_of::(), }) .unwrap_or_default() }) @@ -384,15 +471,42 @@ impl TablePartitionTemplateOverride { } } +/// Display the serde_json representation so that the output +/// can be copy/pasted into CLI tools, etc as the partition +/// template is specified as JSON +impl Display for TablePartitionTemplateOverride { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}", + self.as_proto() + .map(|proto| serde_json::to_string(proto) + .expect("serialization should be infallible")) + .unwrap_or_default() + ) + } +} + +impl TryFrom> for TablePartitionTemplateOverride { + type Error = ValidationError; + + fn try_from(p: Option) -> Result { + Ok(Self(p.map(serialization::Wrapper::try_from).transpose()?)) + } +} + /// This manages the serialization/deserialization of the `proto::PartitionTemplate` type to and /// from the database through `sqlx` for the `NamespacePartitionTemplateOverride` and /// `TablePartitionTemplateOverride` types. It's an internal implementation detail to minimize code /// duplication. mod serialization { - use super::{ValidationError, MAXIMUM_NUMBER_OF_TEMPLATE_PARTS, TAG_VALUE_KEY_TIME}; + use super::{ + ValidationError, ALLOWED_BUCKET_QUANTITIES, MAXIMUM_NUMBER_OF_TEMPLATE_PARTS, + TAG_VALUE_KEY_TIME, + }; use chrono::{format::StrftimeItems, Utc}; use generated_types::influxdata::iox::partition_template::v1 as proto; - use std::{fmt::Write, sync::Arc}; + use std::{collections::HashSet, fmt::Write, sync::Arc}; #[derive(Debug, Clone, PartialEq, Hash)] pub struct Wrapper(Arc); @@ -437,6 +551,8 @@ mod serialization { return Err(ValidationError::TooManyParts { specified }); } + let mut seen_tags: HashSet<&str> = HashSet::with_capacity(specified); + // All time formats must be valid and tag values may not specify any // restricted values. for part in &partition_template.parts { @@ -479,6 +595,32 @@ mod serialization { "{TAG_VALUE_KEY_TIME} cannot be used" ))); } + + if !seen_tags.insert(value.as_str()) { + return Err(ValidationError::RepeatedTagValue(value.into())); + } + } + Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name, + num_buckets, + })) => { + if tag_name.is_empty() { + return Err(ValidationError::InvalidTagValue(tag_name.into())); + } + + if tag_name.contains(TAG_VALUE_KEY_TIME) { + return Err(ValidationError::InvalidTagValue(format!( + "{TAG_VALUE_KEY_TIME} cannot be used" + ))); + } + + if !seen_tags.insert(tag_name.as_str()) { + return Err(ValidationError::RepeatedTagValue(tag_name.into())); + } + + if !ALLOWED_BUCKET_QUANTITIES.contains(num_buckets) { + return Err(ValidationError::InvalidNumberOfBuckets(*num_buckets)); + } } None => {} } @@ -558,6 +700,10 @@ pub enum ColumnValue<'a> { /// Exclusive end of the datatime partition range. 
end: DateTime, }, + + /// The inner value is the ID of the bucket selected through a modulo hash + /// of the input column value. + Bucket(u32), } impl<'a> ColumnValue<'a> { @@ -572,7 +718,7 @@ impl<'a> ColumnValue<'a> { let this = match self { ColumnValue::Identity(v) => v.as_bytes(), ColumnValue::Prefix(v) => v.as_bytes(), - ColumnValue::Datetime { .. } => { + ColumnValue::Datetime { .. } | ColumnValue::Bucket(..) => { return false; } }; @@ -590,6 +736,7 @@ where ColumnValue::Identity(v) => other.as_ref().eq(v.as_ref()), ColumnValue::Prefix(_) => false, ColumnValue::Datetime { .. } => false, + ColumnValue::Bucket(..) => false, } } } @@ -605,7 +752,9 @@ where /// /// # Panics /// -/// This method panics if a column value is not valid UTF8 after decoding. +/// This method panics if a column value is not valid UTF8 after decoding, or +/// when a bucket ID is not valid (not a u32 or within the expected number of +/// buckets). pub fn build_column_values<'a>( template: &'a TablePartitionTemplateOverride, partition_key: &'a str, @@ -629,10 +778,21 @@ pub fn build_column_values<'a>( // Produce an iterator of (template_part, template_value) template_parts .zip(key_parts) - .filter_map(|(template, value)| match template { - TemplatePart::TagValue(col_name) => Some((col_name, parse_part_tag_value(value)?)), - TemplatePart::TimeFormat(format) => { - Some((TIME_COLUMN_NAME, parse_part_time_format(value, format)?)) + .filter_map(|(template, value)| { + if value == PARTITION_KEY_VALUE_NULL_STR { + None + } else { + match template { + TemplatePart::TagValue(col_name) => { + Some((col_name, parse_part_tag_value(value)?)) + } + TemplatePart::TimeFormat(format) => { + Some((TIME_COLUMN_NAME, parse_part_time_format(value, format)?)) + } + TemplatePart::Bucket(col_name, num_buckets) => { + Some((col_name, parse_part_bucket(value, num_buckets)?)) + } + } } }) } @@ -640,11 +800,6 @@ pub fn build_column_values<'a>( fn parse_part_tag_value(value: &str) -> Option> { // Perform re-mapping of sentinel values. let value = match value { - PARTITION_KEY_VALUE_NULL_STR => { - // Skip null or empty partition key parts, indicated by the - // presence of a single "!" character as the part value. - return None; - } PARTITION_KEY_VALUE_EMPTY_STR => { // Re-map the empty string sentinel "^"" to an empty string // value. @@ -736,6 +891,18 @@ fn parse_part_time_format(value: &str, format: &str) -> Option Option> { + // Parse the bucket ID from the given value string. + let bucket_id = value + .parse::() + .expect("invalid partition key bucket encoding"); + // Invariant: If the bucket ID (0 indexed) is greater than the number of + // buckets to spread data across the partition key is invalid. 
+ assert!(bucket_id < num_buckets); + + Some(ColumnValue::Bucket(bucket_id)) +} + fn parsed_implicit_defaults(mut parsed: chrono::format::Parsed) -> Option { parsed.year?; @@ -800,6 +967,12 @@ pub fn test_table_partition_override( let part = match part { TemplatePart::TagValue(value) => proto::template_part::Part::TagValue(value.into()), TemplatePart::TimeFormat(fmt) => proto::template_part::Part::TimeFormat(fmt.into()), + TemplatePart::Bucket(value, num_buckets) => { + proto::template_part::Part::Bucket(proto::Bucket { + tag_name: value.into(), + num_buckets, + }) + } }; proto::TemplatePart { part: Some(part) } @@ -814,12 +987,32 @@ pub fn test_table_partition_override( #[cfg(test)] mod tests { - use super::*; use assert_matches::assert_matches; use chrono::TimeZone; + use proptest::prelude::*; use sqlx::Encode; use test_helpers::assert_error; + use super::*; + + #[test] + fn test_partition_template_to_string() { + let template_empty: TablePartitionTemplateOverride = + TablePartitionTemplateOverride::default(); + + let template: Vec> = + [TemplatePart::TimeFormat("%Y"), TemplatePart::TagValue("a")] + .into_iter() + .collect::>(); + let template: TablePartitionTemplateOverride = test_table_partition_override(template); + + assert_eq!(template_empty.to_string(), ""); + assert_eq!( + template.to_string(), + "{\"parts\":[{\"timeFormat\":\"%Y\"},{\"tagValue\":\"a\"}]}" + ); + } + #[test] fn test_max_partition_key_len() { let max_len: usize = @@ -879,6 +1072,60 @@ mod tests { assert_error!(err, ValidationError::TooManyParts { specified } if specified == 9); } + #[test] + fn repeated_tag_name_value_is_invalid() { + // Test [`TagValue`] + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![ + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("bananas".into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("bananas".into())), + }, + ], + }); + + assert_error!(err, ValidationError::RepeatedTagValue ( ref specified ) if specified == "bananas"); + + // Test [`Bucket`] + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![ + proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: "bananas".into(), + num_buckets: 42, + })), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: "bananas".into(), + num_buckets: 42, + })), + }, + ], + }); + + assert_error!(err, ValidationError::RepeatedTagValue ( ref specified ) if specified == "bananas"); + + // Test a combination of [`TagValue`] and [`Bucket`] + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![ + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("bananas".into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: "bananas".into(), + num_buckets: 42, + })), + }, + ], + }); + + assert_error!(err, ValidationError::RepeatedTagValue ( ref specified ) if specified == "bananas"); + } + /// Chrono will panic when formatting a timestamp if the "%#z" formatting /// directive is used... #[test] @@ -947,10 +1194,74 @@ mod tests { assert_error!(err, ValidationError::InvalidTagValue(ref value) if value.is_empty()); } + /// "time" is a special column already covered by strftime, being a time + /// series database and all. 
+ #[test] + fn bucket_time_tag_name_is_invalid() { + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: "time".into(), + num_buckets: 42, + })), + }], + }); + + assert_error!(err, ValidationError::InvalidTagValue(_)); + } + + #[test] + fn bucket_empty_tag_name_is_invalid() { + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: "".into(), + num_buckets: 42, + })), + }], + }); + + assert_error!(err, ValidationError::InvalidTagValue(ref value) if value.is_empty()); + } + + #[test] + fn bucket_zero_num_buckets_is_invalid() { + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: "arán".into(), + num_buckets: 0, + })), + }], + }); + + assert_error!(err, ValidationError::InvalidNumberOfBuckets(0)); + } + + #[test] + fn bucket_too_high_num_buckets_is_invalid() { + const TOO_HIGH: u32 = 100_000; + + let err = serialization::Wrapper::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: "arán".into(), + num_buckets: TOO_HIGH, + })), + }], + }); + + assert_error!(err, ValidationError::InvalidNumberOfBuckets(TOO_HIGH)); + } + fn identity(s: &str) -> ColumnValue<'_> { ColumnValue::Identity(s.into()) } + fn bucket(bucket_id: u32) -> ColumnValue<'static> { + ColumnValue::Bucket(bucket_id) + } + fn prefix<'a, T>(s: T) -> ColumnValue<'a> where T: Into>, @@ -965,14 +1276,76 @@ mod tests { } } + #[test] + fn test_iceberg_string_hash() { + assert_eq!(iceberg_hash("iceberg"), 1210000089); + } + + // This is a test fixture designed to catch accidental changes to the + // Iceberg-like hash-bucket partitioning behaviour. + // + // You shouldn't be changing this! + #[test] + fn test_hash_bucket_fixture() { + // These are values lifted from the iceberg spark test suite for + // `BucketString`, sadly not provided in the reference/spec: + // + // https://github.com/apache/iceberg/blob/31e31fd819c846f49d2bd459b8bfadfdc3c2bc3a/spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/sql/TestSparkBucketFunction.java#L151-L169 + // + assert_eq!(bucket_for_tag_value("abcdefg", 5), 4); + assert_eq!(bucket_for_tag_value("abc", 128), 122); + assert_eq!(bucket_for_tag_value("abcde", 64), 54); + assert_eq!(bucket_for_tag_value("测试", 12), 8); + assert_eq!(bucket_for_tag_value("测试raul试测", 16), 1); + assert_eq!(bucket_for_tag_value("", 16), 0); + + // These are pre-existing arbitrary fixture values + assert_eq!(bucket_for_tag_value("bananas", 10), 1); + assert_eq!(bucket_for_tag_value("plátanos", 100), 98); + assert_eq!(bucket_for_tag_value("crobhaing bananaí", 1000), 166); + assert_eq!(bucket_for_tag_value("bread", 42), 9); + assert_eq!(bucket_for_tag_value("arán", 76), 72); + assert_eq!(bucket_for_tag_value("banana arán", 1337), 1284); + assert_eq!( + bucket_for_tag_value("uasmhéid bananaí", u32::MAX), + 1109892861 + ); + } + + /// Test to approximate and show how the tag value maps to the partition key + /// for the example cases in the mod-doc. The behaviour that renders the key + /// itself is a combination of this bucket assignment and the render logic. 
+ #[test] + fn test_bucket_for_mod_doc() { + assert_eq!(bucket_for_tag_value("ananas", 10), 5); + assert_eq!(bucket_for_tag_value("!", 10), 8); + assert_eq!(bucket_for_tag_value("%50", 10), 9); + assert_eq!(bucket_for_tag_value("", 10), 0); + } + + proptest! { + #[test] + fn prop_consistent_bucketing_within_limits(tag_values in proptest::collection::vec(any::(), (1, 10)), num_buckets in any::()) { + for value in tag_values { + // First pass assign + let want_bucket = bucket_for_tag_value(&value, num_buckets); + // The assigned bucket must fit within the domain given to the bucketer. + assert!(want_bucket < num_buckets); + // Feed in the same tag value, expect the same result. + let got_bucket = bucket_for_tag_value(&value, num_buckets); + assert_eq!(want_bucket, got_bucket); + } + } + } + /// Generate a test that asserts "partition_key" is reversible, yielding /// "want" assuming the partition "template" was used. macro_rules! test_build_column_values { ( $name:ident, - template = $template:expr, // Array/vec of TemplatePart - partition_key = $partition_key:expr, // String derived partition key - want = $want:expr // Expected build_column_values() output + template = $template:expr, // Array/vec of TemplatePart + partition_key = $partition_key:expr, // String derived partition key + want = $want:expr // Expected build_column_values() output ) => { paste::paste! { #[test] @@ -1001,23 +1374,26 @@ mod tests { TemplatePart::TimeFormat("%Y"), TemplatePart::TagValue("a"), TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10), ], - partition_key = "2023|bananas|plátanos", + partition_key = "2023|bananas|plátanos|5", want = [ (TIME_COLUMN_NAME, year(2023)), ("a", identity("bananas")), ("b", identity("plátanos")), + ("c", bucket(5)), ] ); test_build_column_values!( - module_doc_example_2, + module_doc_example_2, // Examples 2 and 3 are the same partition key template = [ TemplatePart::TimeFormat("%Y"), TemplatePart::TagValue("a"), TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10), ], - partition_key = "2023|!|plátanos", + partition_key = "2023|!|plátanos|!", want = [(TIME_COLUMN_NAME, year(2023)), ("b", identity("plátanos")),] ); @@ -1027,8 +1403,9 @@ mod tests { TemplatePart::TimeFormat("%Y"), TemplatePart::TagValue("a"), TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10), ], - partition_key = "2023|!|!", + partition_key = "2023|!|!|!", want = [(TIME_COLUMN_NAME, year(2023)),] ); @@ -1038,12 +1415,14 @@ mod tests { TemplatePart::TimeFormat("%Y"), TemplatePart::TagValue("a"), TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10), ], - partition_key = "2023|cat%7Cdog|%21", + partition_key = "2023|cat%7Cdog|%21|8", want = [ (TIME_COLUMN_NAME, year(2023)), ("a", identity("cat|dog")), ("b", identity("!")), + ("c", bucket(8)), ] ); @@ -1053,9 +1432,14 @@ mod tests { TemplatePart::TimeFormat("%Y"), TemplatePart::TagValue("a"), TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10), ], - partition_key = "2023|%2550|!", - want = [(TIME_COLUMN_NAME, year(2023)), ("a", identity("%50")),] + partition_key = "2023|%2550|!|9", + want = [ + (TIME_COLUMN_NAME, year(2023)), + ("a", identity("%50")), + ("c", bucket(9)), + ] ); test_build_column_values!( @@ -1064,8 +1448,25 @@ mod tests { TemplatePart::TimeFormat("%Y"), TemplatePart::TagValue("a"), TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10), ], - partition_key = "2023|BANANAS#|!", + partition_key = "2023|^|!|0", + want = [ + (TIME_COLUMN_NAME, year(2023)), + ("a", identity("")), + ("c", bucket(0)), + ] + ); + 
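A rough end-to-end sketch (not part of this patch) of how a `Bucket` template part appears in a derived partition key and how `build_column_values` recovers it. The "bananas" → bucket 1 value comes from `test_hash_bucket_fixture` above; the import paths and the exact rendered key are assumptions.

    use data_types::partition_template::{
        bucket_for_tag_value, build_column_values, test_table_partition_override, ColumnValue,
        TemplatePart,
    };

    fn bucket_round_trip() {
        // A template whose only part buckets tag "c" across 10 buckets.
        let template = test_table_partition_override(vec![TemplatePart::Bucket("c", 10)]);

        // A write with c=bananas hashes into bucket 1, so the derived partition
        // key for this single-part template would simply be "1".
        assert_eq!(bucket_for_tag_value("bananas", 10), 1);

        // Reversing the key yields the bucket ID, not the original tag value.
        let got: Vec<_> = build_column_values(&template, "1").collect();
        assert!(matches!(got.as_slice(), [("c", ColumnValue::Bucket(1))]));
    }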
+ test_build_column_values!( + module_doc_example_8, + template = [ + TemplatePart::TimeFormat("%Y"), + TemplatePart::TagValue("a"), + TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10), + ], + partition_key = "2023|BANANAS#|!|!|!", want = [(TIME_COLUMN_NAME, year(2023)), ("a", prefix("BANANAS")),] ); @@ -1075,8 +1476,9 @@ mod tests { TemplatePart::TimeFormat("%Y"), TemplatePart::TagValue("a"), TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10), ], - partition_key = "2023|%28%E3%83%8E%E0%B2%A0%E7%9B%8A%E0%B2%A0%29%E3%83%8E%E5%BD%A1%E2%94%BB%E2%94%81%E2%94%BB#|!", + partition_key = "2023|%28%E3%83%8E%E0%B2%A0%E7%9B%8A%E0%B2%A0%29%E3%83%8E%E5%BD%A1%E2%94%BB%E2%94%81%E2%94%BB#|!|!", want = [ (TIME_COLUMN_NAME, year(2023)), ("a", prefix("(ノಠ益ಠ)ノ彡┻━┻")), @@ -1115,6 +1517,13 @@ mod tests { want = [] ); + test_build_column_values!( + datetime_null, + template = [TemplatePart::TimeFormat("%Y"),], + partition_key = "!", + want = [] + ); + test_build_column_values!( datetime_range_y, template = [TemplatePart::TimeFormat("%Y"),], @@ -1206,6 +1615,51 @@ mod tests { )] ); + test_build_column_values!( + bucket_part_fixture, + template = [ + TemplatePart::Bucket("a", 41), + TemplatePart::Bucket("b", 91), + TemplatePart::Bucket("c", 144) + ], + partition_key = "1|2|3", + want = [("a", bucket(1)), ("b", bucket(2)), ("c", bucket(3)),] + ); + + #[test] + #[should_panic] + fn test_build_column_values_bucket_part_out_of_range_panics() { + let template = [ + TemplatePart::Bucket("a", 42), + TemplatePart::Bucket("b", 42), + TemplatePart::Bucket("c", 42), + ] + .into_iter() + .collect::>(); + let template = test_table_partition_override(template); + + // normalise the values into a (str, ColumnValue) for the comparison + let input = String::from("1|1|43"); + let _ = build_column_values(&template, input.as_str()).collect::>(); + } + + #[test] + #[should_panic] + fn test_build_column_values_bucket_part_not_u32_panics() { + let template = [ + TemplatePart::Bucket("a", 42), + TemplatePart::Bucket("b", 42), + TemplatePart::Bucket("c", 42), + ] + .into_iter() + .collect::>(); + let template = test_table_partition_override(template); + + // normalise the values into a (str, ColumnValue) for the comparison + let input = String::from("1|1|bananas"); + let _ = build_column_values(&template, input.as_str()).collect::>(); + } + test_build_column_values!( datetime_not_compact_y_d, template = [TemplatePart::TimeFormat("%Y-%d"),], @@ -1369,11 +1823,18 @@ mod tests { proto::TemplatePart { part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())), }, + proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: "bananas".into(), + num_buckets: 42, + })), + }, ], }; let expected_json_str = "{\"parts\":[\ {\"tagValue\":\"region\"},\ - {\"timeFormat\":\"year-%Y\"}\ + {\"timeFormat\":\"year-%Y\"},\ + {\"bucket\":{\"tagName\":\"bananas\",\"numBuckets\":42}}\ ]}"; let namespace = NamespacePartitionTemplateOverride::try_from(custom_template).unwrap(); @@ -1383,7 +1844,7 @@ mod tests { ); fn extract_sqlite_argument_text( - argument_value: &sqlx::sqlite::SqliteArgumentValue, + argument_value: &sqlx::sqlite::SqliteArgumentValue<'_>, ) -> String { match argument_value { sqlx::sqlite::SqliteArgumentValue::Text(cow) => cow.to_string(), @@ -1401,6 +1862,88 @@ mod tests { ); let table_json_str: String = buf.iter().map(extract_sqlite_argument_text).collect(); assert_eq!(table_json_str, expected_json_str); - assert_eq!(table.len(), 2); + assert_eq!(table.len(), 3); + } + + 
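A small sketch (not part of this patch) of the `Display` behaviour added for `TablePartitionTemplateOverride` earlier in this file: the template renders as the serde_json form of its protobuf, ready to paste into CLI tooling. The expected output mirrors `test_partition_template_to_string` and the sqlite encoding test above; the import paths are assumptions.

    use data_types::partition_template::{test_table_partition_override, TemplatePart};

    fn print_template_json() {
        let template = test_table_partition_override(vec![
            TemplatePart::TimeFormat("%Y"),
            TemplatePart::Bucket("region", 42),
        ]);

        // Prints:
        // {"parts":[{"timeFormat":"%Y"},{"bucket":{"tagName":"region","numBuckets":42}}]}
        println!("{template}");
    }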
#[test] + fn test_template_size_reporting() { + const BASE_SIZE: usize = std::mem::size_of::() + + std::mem::size_of::(); + + let first_string = "^"; + let template = TablePartitionTemplateOverride::try_new( + Some(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue(first_string.into())), + }], + }), + &NamespacePartitionTemplateOverride::default(), + ) + .expect("failed to create table partition template "); + + assert_eq!( + template.size(), + BASE_SIZE + std::mem::size_of::() + first_string.len() + ); + + let second_string = "region"; + let template = TablePartitionTemplateOverride::try_new( + Some(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue(second_string.into())), + }], + }), + &NamespacePartitionTemplateOverride::default(), + ) + .expect("failed to create table partition template "); + + assert_eq!( + template.size(), + BASE_SIZE + std::mem::size_of::() + second_string.len() + ); + + let time_string = "year-%Y"; + let template = TablePartitionTemplateOverride::try_new( + Some(proto::PartitionTemplate { + parts: vec![ + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue(second_string.into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat(time_string.into())), + }, + ], + }), + &NamespacePartitionTemplateOverride::default(), + ) + .expect("failed to create table partition template "); + assert_eq!( + template.size(), + BASE_SIZE + + std::mem::size_of::() + + second_string.len() + + std::mem::size_of::() + + time_string.len() + ); + + let template = TablePartitionTemplateOverride::try_new( + Some(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(proto::Bucket { + tag_name: second_string.into(), + num_buckets: 42, + })), + }], + }), + &NamespacePartitionTemplateOverride::default(), + ) + .expect("failed to create table partition template"); + assert_eq!( + template.size(), + BASE_SIZE + + std::mem::size_of::() + + second_string.len() + + std::mem::size_of::() + ); } } diff --git a/data_types/src/service_limits.rs b/data_types/src/service_limits.rs index 4e4acc14914..7c00b6a1e90 100644 --- a/data_types/src/service_limits.rs +++ b/data_types/src/service_limits.rs @@ -3,74 +3,164 @@ use generated_types::influxdata::iox::namespace::{ v1 as namespace_proto, v1::update_namespace_service_protection_limit_request::LimitUpdate, }; +use observability_deps::tracing::*; +use std::num::NonZeroUsize; use thiserror::Error; -/// Max tables allowed in a namespace. -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::Type)] -#[sqlx(transparent)] -pub struct MaxTables(i32); +/// Definitions that apply to both MaxColumnsPerTable and MaxTables. Note that the hardcoded +/// default value specified in the macro invocation must be greater than 0 and fit in an `i32`. +macro_rules! 
define_service_limit { + ($type_name:ident, $default_value:expr, $documentation:expr) => { + /// $documentation + #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] + pub struct $type_name(NonZeroUsize); -#[allow(missing_docs)] -impl MaxTables { - pub const fn new(v: i32) -> Self { - Self(v) - } + impl TryFrom for $type_name { + type Error = ServiceLimitError; - pub fn get(&self) -> i32 { - self.0 - } + fn try_from(value: usize) -> Result { + // Even though the value is stored as a `usize`, service limits are stored as `i32` + // in the database and transferred as i32 over protobuf. So try to convert to an + // `i32` (and throw away the result) so that we know about invalid values before + // trying to use them. + if i32::try_from(value).is_err() { + return Err(ServiceLimitError::MustFitInI32); + } - /// Default per-namespace table count service protection limit. - pub const fn const_default() -> Self { - Self(500) - } -} + let nonzero_value = + NonZeroUsize::new(value).ok_or(ServiceLimitError::MustBeGreaterThanZero)?; -impl Default for MaxTables { - fn default() -> Self { - Self::const_default() - } -} + Ok(Self(nonzero_value)) + } + } -impl std::fmt::Display for MaxTables { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{}", self.0) - } -} + impl TryFrom for $type_name { + type Error = ServiceLimitError; -/// Max columns per table allowed in a namespace. -#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, sqlx::Type)] -#[sqlx(transparent)] -pub struct MaxColumnsPerTable(i32); + fn try_from(value: u64) -> Result { + // Even though the value is stored as a `usize`, service limits are stored as `i32` + // in the database and transferred as i32 over protobuf. So try to convert to an + // `i32` (and throw away the result) so that we know about invalid values before + // trying to use them. + if i32::try_from(value).is_err() { + return Err(ServiceLimitError::MustFitInI32); + } -#[allow(missing_docs)] -impl MaxColumnsPerTable { - pub const fn new(v: i32) -> Self { - Self(v) - } + let nonzero_value = usize::try_from(value) + .ok() + .and_then(NonZeroUsize::new) + .ok_or(ServiceLimitError::MustBeGreaterThanZero)?; - pub fn get(&self) -> i32 { - self.0 - } + Ok(Self(nonzero_value)) + } + } - /// Default per-table column count service protection limit. - pub const fn const_default() -> Self { - Self(200) - } -} + impl TryFrom for $type_name { + type Error = ServiceLimitError; -impl Default for MaxColumnsPerTable { - fn default() -> Self { - Self::const_default() - } -} + fn try_from(value: i32) -> Result { + let nonzero_value = usize::try_from(value) + .ok() + .and_then(NonZeroUsize::new) + .ok_or(ServiceLimitError::MustBeGreaterThanZero)?; -impl std::fmt::Display for MaxColumnsPerTable { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{}", self.0) - } + Ok(Self(nonzero_value)) + } + } + + #[allow(missing_docs)] + impl $type_name { + pub fn get(&self) -> usize { + self.0.get() + } + + /// For use by the database and some protobuf representations. It should not be + /// possible to construct an instance that contains a `NonZeroUsize` that won't fit in + /// an `i32`. + pub fn get_i32(&self) -> i32 { + self.0.get() as i32 + } + + /// Constant-time default for use in constructing test constants. + pub const fn const_default() -> Self { + // This is safe because the hardcoded value is not 0. 
+ let value = unsafe { NonZeroUsize::new_unchecked($default_value) }; + + Self(value) + } + } + + impl Default for $type_name { + fn default() -> Self { + Self::const_default() + } + } + + impl std::fmt::Display for $type_name { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.0) + } + } + + // Tell sqlx this is an i32 in the database. + impl sqlx::Type for $type_name + where + i32: sqlx::Type, + DB: sqlx::Database, + { + fn type_info() -> DB::TypeInfo { + >::type_info() + } + } + + impl<'q, DB> sqlx::Encode<'q, DB> for $type_name + where + DB: sqlx::Database, + i32: sqlx::Encode<'q, DB>, + { + fn encode_by_ref( + &self, + buf: &mut >::ArgumentBuffer, + ) -> sqlx::encode::IsNull { + >::encode_by_ref(&self.get_i32(), buf) + } + } + + // The database stores i32s, so there's a chance of invalid values already being stored in + // there. When deserializing those values, rather than panicking or returning an error, log + // and use the default instead. + impl<'r, DB: ::sqlx::Database> ::sqlx::decode::Decode<'r, DB> for $type_name + where + i32: sqlx::Decode<'r, DB>, + { + fn decode( + value: >::ValueRef, + ) -> ::std::result::Result< + Self, + ::std::boxed::Box< + dyn ::std::error::Error + 'static + ::std::marker::Send + ::std::marker::Sync, + >, + > { + let data = >::decode(value)?; + + let data = Self::try_from(data).unwrap_or_else(|_| { + error!("database contains invalid $type_name value {data}, using default value"); + Self::default() + }); + + Ok(data) + } + } + }; } +define_service_limit!(MaxTables, 500, "Max tables allowed in a namespace."); +define_service_limit!( + MaxColumnsPerTable, + 200, + "Max columns per table allowed in a namespace." +); + /// Overrides for service protection limits. #[derive(Debug, Copy, Clone)] pub struct NamespaceServiceProtectionLimitsOverride { @@ -80,16 +170,23 @@ pub struct NamespaceServiceProtectionLimitsOverride { pub max_columns_per_table: Option, } -impl From for NamespaceServiceProtectionLimitsOverride { - fn from(value: namespace_proto::ServiceProtectionLimits) -> Self { +impl TryFrom + for NamespaceServiceProtectionLimitsOverride +{ + type Error = ServiceLimitError; + + fn try_from(value: namespace_proto::ServiceProtectionLimits) -> Result { let namespace_proto::ServiceProtectionLimits { max_tables, max_columns_per_table, } = value; - Self { - max_tables: max_tables.map(MaxTables::new), - max_columns_per_table: max_columns_per_table.map(MaxColumnsPerTable::new), - } + + Ok(Self { + max_tables: max_tables.map(MaxTables::try_from).transpose()?, + max_columns_per_table: max_columns_per_table + .map(MaxColumnsPerTable::try_from) + .transpose()?, + }) } } @@ -114,6 +211,11 @@ pub enum ServiceLimitError { /// No value was provided so we can't update anything #[error("a supported service limit value is required")] NoValueSpecified, + + /// Limits are stored as `i32` in the database and transferred as i32 over protobuf, so even + /// though they are stored as `usize` in Rust, the `usize` value must be less than `i32::MAX`. 
+ #[error("service limit values must fit in a 32-bit signed integer (`i32`)")] + MustFitInI32, } impl TryFrom> for ServiceLimitUpdate { @@ -122,20 +224,88 @@ impl TryFrom> for ServiceLimitUpdate { fn try_from(limit_update: Option) -> Result { match limit_update { Some(LimitUpdate::MaxTables(n)) => { - if n == 0 { - return Err(ServiceLimitError::MustBeGreaterThanZero); - } - Ok(ServiceLimitUpdate::MaxTables(MaxTables::new(n))) - } - Some(LimitUpdate::MaxColumnsPerTable(n)) => { - if n == 0 { - return Err(ServiceLimitError::MustBeGreaterThanZero); - } - Ok(ServiceLimitUpdate::MaxColumnsPerTable( - MaxColumnsPerTable::new(n), - )) + Ok(ServiceLimitUpdate::MaxTables(MaxTables::try_from(n)?)) } + Some(LimitUpdate::MaxColumnsPerTable(n)) => Ok(ServiceLimitUpdate::MaxColumnsPerTable( + MaxColumnsPerTable::try_from(n)?, + )), None => Err(ServiceLimitError::NoValueSpecified), } } } + +#[cfg(test)] +mod tests { + use super::*; + + fn extract_sqlite_argument_i32(argument_value: &sqlx::sqlite::SqliteArgumentValue<'_>) -> i32 { + match argument_value { + sqlx::sqlite::SqliteArgumentValue::Int(i) => *i, + other => panic!("Expected Int values, got: {other:?}"), + } + } + + macro_rules! service_limit_test { + ($type_name:ident, $module_name: ident) => { + mod $module_name { + use super::*; + + fn success>(value: T, expected: usize) + where + >::Error: std::fmt::Debug, + { + assert_eq!(value.try_into().unwrap().get(), expected); + } + + #[test] + fn successful_conversions() { + success(1usize, 1); + success(1u64, 1); + success(1i32, 1); + success(i32::MAX, i32::MAX as usize); + } + + fn failure>(value: T, expected_error_message: &str) + where + >::Error: std::fmt::Debug + std::fmt::Display, + { + assert_eq!( + value.try_into().unwrap_err().to_string(), + expected_error_message + ); + } + + #[test] + fn failed_conversions() { + failure(0usize, "service limit values must be greater than 0"); + failure(0u64, "service limit values must be greater than 0"); + failure(0i32, "service limit values must be greater than 0"); + failure(-1i32, "service limit values must be greater than 0"); + failure( + i32::MAX as usize + 1, + "service limit values must fit in a 32-bit signed integer (`i32`)", + ); + failure( + i32::MAX as u64 + 1, + "service limit values must fit in a 32-bit signed integer (`i32`)", + ); + } + + #[test] + fn encode() { + let value = $type_name::try_from(10).unwrap(); + let mut buf = Default::default(); + let _ = <$type_name as sqlx::Encode<'_, sqlx::Sqlite>>::encode_by_ref( + &value, &mut buf, + ); + + let encoded: Vec<_> = buf.iter().map(extract_sqlite_argument_i32).collect(); + assert_eq!(encoded, &[value.get_i32()]); + } + } + }; + } + + service_limit_test!(MaxTables, max_tables); + service_limit_test!(MaxColumnsPerTable, max_columns_per_table); +} diff --git a/data_types/src/snapshot/hash.rs b/data_types/src/snapshot/hash.rs new file mode 100644 index 00000000000..adf8c24c96f --- /dev/null +++ b/data_types/src/snapshot/hash.rs @@ -0,0 +1,219 @@ +//! 
A primitive hash table supporting linear probing + +use bytes::Bytes; +use generated_types::influxdata::iox::catalog_cache::v1 as generated; +use siphasher::sip::SipHasher24; + +use snafu::{ensure, Snafu}; + +/// Error for [`HashBuckets`] +#[derive(Debug, Snafu)] +#[allow(missing_docs, missing_copy_implementations)] +pub enum Error { + #[snafu(display("Bucket length not a power of two"))] + BucketsNotPower, + #[snafu(display("Unrecognized hash function"))] + UnrecognizedHash, +} + +/// Result for [`HashBuckets`] +pub type Result = std::result::Result; + +/// A primitive hash table supporting [linear probing] +/// +/// [linear probing](https://en.wikipedia.org/wiki/Linear_probing) +#[derive(Debug, Clone)] +pub struct HashBuckets { + /// The mask to yield index in `buckets` from a u64 hash + mask: usize, + /// A sequence of u32 encoding the value index + 1, or 0 if empty + buckets: Bytes, + /// The hash function to use + hash: SipHasher24, +} + +impl HashBuckets { + /// Performs a lookup of `value` + pub fn lookup(&self, value: &[u8]) -> HashProbe<'_> { + self.lookup_raw(self.hash.hash(value)) + } + + fn lookup_raw(&self, hash: u64) -> HashProbe<'_> { + let idx = (hash as usize) & self.mask; + HashProbe { + idx, + buckets: self, + mask: self.mask as _, + } + } +} + +impl TryFrom for HashBuckets { + type Error = Error; + + fn try_from(value: generated::HashBuckets) -> std::result::Result { + let buckets_len = value.buckets.len(); + ensure!(buckets_len.count_ones() == 1, BucketsNotPowerSnafu); + let mask = buckets_len.wrapping_sub(1) ^ 3; + match value.hash_function { + Some(generated::hash_buckets::HashFunction::SipHash24(s)) => Ok(Self { + mask, + buckets: value.buckets, + hash: SipHasher24::new_with_keys(s.key0, s.key1), + }), + _ => Err(Error::UnrecognizedHash), + } + } +} + +impl From for generated::HashBuckets { + fn from(value: HashBuckets) -> Self { + let (key0, key1) = value.hash.keys(); + Self { + buckets: value.buckets, + hash_function: Some(generated::hash_buckets::HashFunction::SipHash24( + generated::SipHash24 { key0, key1 }, + )), + } + } +} + +/// Yields the indices to probe for equality +#[derive(Debug)] +pub struct HashProbe<'a> { + buckets: &'a HashBuckets, + idx: usize, + mask: usize, +} + +impl<'a> Iterator for HashProbe<'a> { + type Item = usize; + + fn next(&mut self) -> Option { + let slice = self.buckets.buckets.get(self.idx..self.idx + 4)?; + let entry = u32::from_le_bytes(slice.try_into().unwrap()); + self.idx = (self.idx + 4) & self.mask; + + // Empty entries are encoded as 0 + Some(entry.checked_sub(1)? 
as usize) + } +} + +/// An encoder for [`HashBuckets`] +#[derive(Debug)] +pub struct HashBucketsEncoder { + mask: usize, + buckets: Vec, + hash: SipHasher24, + len: u32, + capacity: u32, +} + +impl HashBucketsEncoder { + /// Create a new [`HashBucketsEncoder`] + /// + /// # Panics + /// + /// Panics if capacity >= u32::MAX + pub fn new(capacity: usize) -> Self { + assert!(capacity < u32::MAX as usize); + + let buckets_len = (capacity * 2).next_power_of_two() * 4; + let mask = buckets_len.wrapping_sub(1) ^ 3; + Self { + mask, + len: 0, + capacity: capacity as u32, + buckets: vec![0; buckets_len], + // Note: this uses keys (0, 0) + hash: SipHasher24::new(), + } + } + + /// Append a new value + /// + /// # Panics + /// + /// Panics if this would exceed the capacity provided to new + pub fn push(&mut self, v: &[u8]) { + self.push_raw(self.hash.hash(v)); + } + + /// Append a new value by hash, returning the bucket index + fn push_raw(&mut self, hash: u64) -> usize { + assert_ne!(self.len, self.capacity); + self.len += 1; + let entry = self.len; + let mut idx = (hash as usize) & self.mask; + loop { + let s = &mut self.buckets[idx..idx + 4]; + let s: &mut [u8; 4] = s.try_into().unwrap(); + if s.iter().all(|x| *x == 0) { + *s = entry.to_le_bytes(); + return idx / 4; + } + idx = (idx + 4) & self.mask; + } + } + + /// Construct the output [`HashBuckets`] + pub fn finish(self) -> HashBuckets { + HashBuckets { + mask: self.mask, + hash: self.hash, + buckets: self.buckets.into(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_collision() { + let mut builder = HashBucketsEncoder::new(6); + + assert_eq!(builder.push_raw(14), 3); + assert_eq!(builder.push_raw(297), 10); + assert_eq!(builder.push_raw(43), 11); // Hashes to occupied bucket 10 + assert_eq!(builder.push_raw(60), 15); + assert_eq!(builder.push_raw(124), 0); // Hashes to occupied bucket 15 + assert_eq!(builder.push_raw(0), 1); // Hashes to occupied bucket 0 + + let buckets = builder.finish(); + + let l = buckets.lookup_raw(14).collect::>(); + assert_eq!(l, vec![0]); + + let l = buckets.lookup_raw(297).collect::>(); + assert_eq!(l, vec![1, 2]); + + let l = buckets.lookup_raw(43).collect::>(); + assert_eq!(l, vec![1, 2]); + + let l = buckets.lookup_raw(60).collect::>(); + assert_eq!(l, vec![3, 4, 5]); + + let l = buckets.lookup_raw(0).collect::>(); + assert_eq!(l, vec![4, 5]); + } + + #[test] + fn test_basic() { + let data = ["a", "", "bongos", "cupcakes", "bananas"]; + let mut builder = HashBucketsEncoder::new(data.len()); + for s in &data { + builder.push(s.as_bytes()); + } + let buckets = builder.finish(); + + let contains = |s: &str| -> bool { buckets.lookup(s.as_bytes()).any(|idx| data[idx] == s) }; + + assert!(contains("a")); + assert!(contains("")); + assert!(contains("bongos")); + assert!(contains("bananas")); + assert!(!contains("windows")); + } +} diff --git a/data_types/src/snapshot/list.rs b/data_types/src/snapshot/list.rs new file mode 100644 index 00000000000..bd86b98dd30 --- /dev/null +++ b/data_types/src/snapshot/list.rs @@ -0,0 +1,192 @@ +//! 
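The sizing in `HashBucketsEncoder::new` above is what makes the probe arithmetic cheap: the bucket array holds a power-of-two number of 4-byte slots, so a single AND with `(buckets_len - 1) ^ 3` wraps an index and keeps it 4-byte aligned at the same time. A self-contained illustration, with the concrete numbers chosen for the example only:

fn main() {
    let slots = 8usize; // a power of two, as produced by next_power_of_two()
    let buckets_len = slots * 4; // 32 bytes of 4-byte little-endian entries
    let mask = buckets_len.wrapping_sub(1) ^ 3; // 0b11111 ^ 0b00011 = 0b11100

    // An arbitrary hash is aligned down to the start of its slot...
    assert_eq!(13 & mask, 12);
    // ...the last slot maps to itself...
    assert_eq!(28 & mask, 28);
    // ...and stepping 4 bytes past the end wraps back to slot 0, which is what
    // lets the linear probe in push_raw/HashProbe walk the table circularly.
    assert_eq!((28 + 4) & mask, 0);
}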
A list of [`Message`] supporting efficient skipping + +use bytes::Bytes; +use prost::Message; +use snafu::{ensure, Snafu}; +use std::marker::PhantomData; +use std::ops::Range; + +use generated_types::influxdata::iox::catalog_cache::v1 as generated; + +/// Error type for [`MessageList`] +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(context(false), display("PackedList decode error: {source}"))] + DecodeError { source: prost::DecodeError }, + + #[snafu(context(false), display("PackedList encode error: {source}"))] + EncodeError { source: prost::EncodeError }, + + #[snafu(display("Invalid MessageList offsets: {start}..{end}"))] + InvalidSlice { start: usize, end: usize }, + + #[snafu(display("MessageList slice {start}..{end} out of bounds 0..{bounds}"))] + SliceOutOfBounds { + start: usize, + end: usize, + bounds: usize, + }, +} + +/// Error type for [`MessageList`] +pub type Result = std::result::Result; + +/// A packed list of [`Message`] +/// +/// Normally protobuf encodes repeated fields by simply encoding the tag multiple times, +/// see [here](https://protobuf.dev/programming-guides/encoding/#optional). +/// +/// Unfortunately this means it is not possible to locate a value at a given index without +/// decoding all prior records. [`MessageList`] therefore provides a list encoding, inspired +/// by arrow, that provides this and is designed to be combined with [`prost`]'s support +/// for zero-copy decoding of [`Bytes`] +#[derive(Debug, Clone, Eq, PartialEq)] +pub struct MessageList { + len: usize, + offsets: Bytes, + values: Bytes, + phantom: PhantomData, +} + +impl MessageList { + /// Encode `values` to a [`MessageList`] + pub fn encode(values: &[T]) -> Result { + let cap = (values.len() + 1) * 4; + let mut offsets: Vec = Vec::with_capacity(cap); + offsets.extend_from_slice(&0_u32.to_le_bytes()); + + let mut cap = 0; + for x in values { + cap += x.encoded_len(); + let offset = u32::try_from(cap).unwrap(); + offsets.extend_from_slice(&offset.to_le_bytes()); + } + + let mut data = Vec::with_capacity(cap); + values.iter().try_for_each(|x| x.encode(&mut data))?; + + Ok(Self { + len: values.len(), + offsets: offsets.into(), + values: data.into(), + phantom: Default::default(), + }) + } + + /// Returns true if this list is empty + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + /// Returns the number of elements in this list + pub fn len(&self) -> usize { + self.len + } + + /// Returns the element at index `idx` + pub fn get(&self, idx: usize) -> Result { + let offset_start = idx * 4; + let offset_slice = &self.offsets[offset_start..offset_start + 8]; + let start = u32::from_le_bytes(offset_slice[0..4].try_into().unwrap()) as usize; + let end = u32::from_le_bytes(offset_slice[4..8].try_into().unwrap()) as usize; + + let bounds = self.values.len(); + ensure!(end >= start, InvalidSliceSnafu { start, end }); + ensure!(end <= bounds, SliceOutOfBoundsSnafu { start, end, bounds }); + + // We slice `Bytes` to preserve zero-copy + let data = self.values.slice(start..end); + Ok(T::decode(data)?) 
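The `offsets`/`values` split above is what gives `get` constant-time access: element `i` occupies the byte range between offsets `i` and `i + 1`, so nothing before it has to be decoded. The same layout in miniature over plain byte slices; the helper names are illustrative only:

// (n + 1) little-endian u32 offsets followed by the concatenated values.
fn encode_packed(values: &[&[u8]]) -> (Vec<u8>, Vec<u8>) {
    let mut offsets = Vec::with_capacity((values.len() + 1) * 4);
    offsets.extend_from_slice(&0u32.to_le_bytes());
    let mut data = Vec::new();
    for v in values {
        data.extend_from_slice(v);
        offsets.extend_from_slice(&u32::try_from(data.len()).unwrap().to_le_bytes());
    }
    (offsets, data)
}

// Random access: element `idx` is the byte range offsets[idx]..offsets[idx + 1].
fn get_packed<'a>(offsets: &[u8], data: &'a [u8], idx: usize) -> &'a [u8] {
    let at = |i: usize| u32::from_le_bytes(offsets[i * 4..i * 4 + 4].try_into().unwrap()) as usize;
    &data[at(idx)..at(idx + 1)]
}

fn main() {
    let (offsets, data) = encode_packed(&[b"abc".as_slice(), b"".as_slice(), b"de".as_slice()]);
    assert_eq!(get_packed(&offsets, &data, 0), b"abc".as_slice());
    assert_eq!(get_packed(&offsets, &data, 2), b"de".as_slice());
}

The real `MessageList` additionally keeps both buffers in `Bytes`, so that slicing, and prost's decoding of those slices, stays zero-copy.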
+ } +} + +impl From for MessageList { + fn from(proto: generated::MessageList) -> Self { + let len = (proto.offsets.len() / 4).saturating_sub(1); + Self { + len, + offsets: proto.offsets, + values: proto.values, + phantom: Default::default(), + } + } +} + +impl From> for generated::MessageList { + fn from(value: MessageList) -> Self { + Self { + offsets: value.offsets, + values: value.values, + } + } +} + +impl IntoIterator for MessageList { + type Item = Result; + type IntoIter = MessageListIter; + + fn into_iter(self) -> Self::IntoIter { + MessageListIter { + iter: (0..self.len), + list: self, + } + } +} + +/// [`Iterator`] for [`MessageList`] +#[derive(Debug)] +pub struct MessageListIter { + iter: Range, + list: MessageList, +} + +impl Iterator for MessageListIter { + type Item = Result; + + fn next(&mut self) -> Option { + Some(self.list.get(self.iter.next()?)) + } + + fn size_hint(&self) -> (usize, Option) { + self.iter.size_hint() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simple() { + let strings = ["", "test", "foo", "abc", "", "skd"]; + let strings: Vec<_> = strings.into_iter().map(ToString::to_string).collect(); + + let encoded = MessageList::encode(&strings).unwrap(); + + assert_eq!(encoded.get(5).unwrap().as_str(), "skd"); + assert_eq!(encoded.get(2).unwrap().as_str(), "foo"); + assert_eq!(encoded.get(0).unwrap().as_str(), ""); + + let decoded: Vec<_> = encoded.clone().into_iter().map(Result::unwrap).collect(); + assert_eq!(strings, decoded); + + let proto = generated::MessageList::from(encoded.clone()); + let back = MessageList::::from(proto.clone()); + assert_eq!(encoded, back); + + // Invalid decode should return error not panic + let invalid = MessageList::::from(proto); + invalid.get(2).unwrap_err(); + + let strings: Vec = vec![]; + let encoded = MessageList::encode(&strings).unwrap(); + assert_eq!(encoded.len(), 0); + assert!(encoded.is_empty()); + + let proto = generated::MessageList::default(); + let encoded = MessageList::::from(proto); + assert_eq!(encoded.len(), 0); + assert!(encoded.is_empty()); + } +} diff --git a/data_types/src/snapshot/mask.rs b/data_types/src/snapshot/mask.rs new file mode 100644 index 00000000000..ae9dc3bc0ba --- /dev/null +++ b/data_types/src/snapshot/mask.rs @@ -0,0 +1,71 @@ +//! 
A packed bitmask + +use arrow_buffer::bit_iterator::BitIndexIterator; +use arrow_buffer::bit_util::{ceil, set_bit}; +use bytes::Bytes; +use generated_types::influxdata::iox::catalog_cache::v1 as generated; + +/// A packed bitmask +#[derive(Debug, Clone)] +pub struct BitMask { + mask: Bytes, + len: usize, +} + +impl BitMask { + /// Returns an iterator of the set indices in this mask + pub fn set_indices(&self) -> BitIndexIterator<'_> { + BitIndexIterator::new(&self.mask, 0, self.len) + } +} + +impl From for BitMask { + fn from(value: generated::BitMask) -> Self { + Self { + mask: value.mask, + len: value.len as _, + } + } +} + +impl From for generated::BitMask { + fn from(value: BitMask) -> Self { + Self { + mask: value.mask, + len: value.len as _, + } + } +} + +/// A builder for [`BitMask`] +#[derive(Debug)] +pub struct BitMaskBuilder { + values: Vec, + len: usize, +} + +impl BitMaskBuilder { + /// Create a new bitmask able to store `len` boolean values + #[inline] + pub fn new(len: usize) -> Self { + Self { + values: vec![0; ceil(len, 8)], + len, + } + } + + /// Set the bit at index `idx` + #[inline] + pub fn set_bit(&mut self, idx: usize) { + set_bit(&mut self.values, idx) + } + + /// Return the built [`BitMask`] + #[inline] + pub fn finish(self) -> BitMask { + BitMask { + mask: self.values.into(), + len: self.len, + } + } +} diff --git a/data_types/src/snapshot/mod.rs b/data_types/src/snapshot/mod.rs new file mode 100644 index 00000000000..7be5a937954 --- /dev/null +++ b/data_types/src/snapshot/mod.rs @@ -0,0 +1,11 @@ +//! Definitions of catalog snapshots +//! +//! Snapshots are read-optimised, that is they are designed to be inexpensive to +//! decode, making extensive use of zero-copy [`Bytes`](bytes::Bytes) in place of +//! allocating structures such as `String` and `Vec` + +pub mod hash; +pub mod list; +pub mod mask; +pub mod partition; +pub mod table; diff --git a/data_types/src/snapshot/partition.rs b/data_types/src/snapshot/partition.rs new file mode 100644 index 00000000000..d1838e57acf --- /dev/null +++ b/data_types/src/snapshot/partition.rs @@ -0,0 +1,246 @@ +//! 
Snapshot definition for partitions + +use crate::snapshot::list::MessageList; +use crate::snapshot::mask::{BitMask, BitMaskBuilder}; +use crate::{ + ColumnId, ColumnSet, CompactionLevelProtoError, NamespaceId, ObjectStoreId, ParquetFile, + ParquetFileId, Partition, PartitionHashId, PartitionHashIdError, PartitionId, + SkippedCompaction, SortKeyIds, TableId, Timestamp, +}; +use bytes::Bytes; +use generated_types::influxdata::iox::{ + catalog_cache::v1 as proto, skipped_compaction::v1 as skipped_compaction_proto, +}; +use snafu::{OptionExt, ResultExt, Snafu}; + +/// Error for [`PartitionSnapshot`] +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(display("Error decoding PartitionFile: {source}"))] + FileDecode { + source: crate::snapshot::list::Error, + }, + + #[snafu(display("Error encoding ParquetFile: {source}"))] + FileEncode { + source: crate::snapshot::list::Error, + }, + + #[snafu(display("Missing required field {field}"))] + RequiredField { field: &'static str }, + + #[snafu(context(false))] + CompactionLevel { source: CompactionLevelProtoError }, + + #[snafu(context(false))] + PartitionHashId { source: PartitionHashIdError }, + + #[snafu(display("Invalid partition key: {source}"))] + PartitionKey { source: std::str::Utf8Error }, +} + +/// Result for [`PartitionSnapshot`] +pub type Result = std::result::Result; + +/// A snapshot of a partition +#[derive(Debug, Clone)] +pub struct PartitionSnapshot { + /// The [`NamespaceId`] + namespace_id: NamespaceId, + /// The [`TableId`] + table_id: TableId, + /// The [`PartitionId`] + partition_id: PartitionId, + /// The [`PartitionHashId`] + partition_hash_id: Option, + /// The generation of this snapshot + generation: u64, + /// The partition key + key: Bytes, + /// The files + files: MessageList, + /// The columns for this partition + columns: ColumnSet, + /// The sort key ids + sort_key: SortKeyIds, + /// The time of a new file + new_file_at: Option, + /// Skipped compaction. 
+ skipped_compaction: Option, +} + +impl PartitionSnapshot { + /// Create a new [`PartitionSnapshot`] from the provided state + pub fn encode( + namespace_id: NamespaceId, + partition: Partition, + files: Vec, + skipped_compaction: Option, + generation: u64, + ) -> Result { + // Iterate in reverse order as schema additions are normally additive and + // so the later files will typically have more columns + let columns = files.iter().rev().fold(ColumnSet::empty(), |mut acc, v| { + acc.union(&v.column_set); + acc + }); + + let files = files + .into_iter() + .map(|file| { + let mut mask = BitMaskBuilder::new(columns.len()); + for (idx, _) in columns.intersect(&file.column_set) { + mask.set_bit(idx); + } + + proto::PartitionFile { + id: file.id.get(), + object_store_uuid: Some(file.object_store_id.get_uuid().into()), + min_time: file.min_time.0, + max_time: file.max_time.0, + file_size_bytes: file.file_size_bytes, + row_count: file.row_count, + compaction_level: file.compaction_level as _, + created_at: file.created_at.0, + max_l0_created_at: file.max_l0_created_at.0, + column_mask: Some(mask.finish().into()), + } + }) + .collect::>(); + + Ok(Self { + generation, + columns, + namespace_id, + partition_id: partition.id, + partition_hash_id: partition.hash_id().cloned(), + key: partition.partition_key.as_bytes().to_vec().into(), + files: MessageList::encode(&files).context(FileEncodeSnafu)?, + sort_key: partition.sort_key_ids().cloned().unwrap_or_default(), + table_id: partition.table_id, + new_file_at: partition.new_file_at, + skipped_compaction: skipped_compaction.map(|sc| sc.into()), + }) + } + + /// Create a new [`PartitionSnapshot`] from a `proto` and generation + pub fn decode(proto: proto::Partition, generation: u64) -> Self { + let table_id = TableId::new(proto.table_id); + let partition_hash_id = proto + .partition_hash_id + .then(|| PartitionHashId::from_raw(table_id, proto.key.as_ref())); + + Self { + generation, + table_id, + partition_hash_id, + key: proto.key, + files: MessageList::from(proto.files.unwrap_or_default()), + namespace_id: NamespaceId::new(proto.namespace_id), + partition_id: PartitionId::new(proto.partition_id), + columns: ColumnSet::new(proto.column_ids.into_iter().map(ColumnId::new)), + sort_key: SortKeyIds::new(proto.sort_key_ids.into_iter().map(ColumnId::new)), + new_file_at: proto.new_file_at.map(Timestamp::new), + skipped_compaction: proto.skipped_compaction, + } + } + + /// Returns the generation of this snapshot + pub fn generation(&self) -> u64 { + self.generation + } + + /// Returns the [`PartitionId`] + pub fn partition_id(&self) -> PartitionId { + self.partition_id + } + + /// Returns the [`PartitionHashId`] if any + pub fn partition_hash_id(&self) -> Option<&PartitionHashId> { + self.partition_hash_id.as_ref() + } + + /// Returns the file at index `idx` + pub fn file(&self, idx: usize) -> Result { + let file = self.files.get(idx).context(FileDecodeSnafu)?; + + let uuid = file.object_store_uuid.context(RequiredFieldSnafu { + field: "object_store_uuid", + })?; + + let column_set = match file.column_mask { + Some(mask) => { + let mask = BitMask::from(mask); + ColumnSet::new(mask.set_indices().map(|idx| self.columns[idx])) + } + None => self.columns.clone(), + }; + + Ok(ParquetFile { + id: ParquetFileId(file.id), + namespace_id: self.namespace_id, + table_id: self.table_id, + partition_id: self.partition_id, + partition_hash_id: self.partition_hash_id.clone(), + object_store_id: ObjectStoreId::from_uuid(uuid.into()), + min_time: Timestamp(file.min_time), + 
max_time: Timestamp(file.max_time), + to_delete: None, + file_size_bytes: file.file_size_bytes, + row_count: file.row_count, + compaction_level: file.compaction_level.try_into()?, + created_at: Timestamp(file.created_at), + column_set, + max_l0_created_at: Timestamp(file.max_l0_created_at), + }) + } + + /// Returns an iterator over the files in this snapshot + pub fn files(&self) -> impl Iterator> + '_ { + (0..self.files.len()).map(|idx| self.file(idx)) + } + + /// Returns the [`Partition`] for this snapshot + pub fn partition(&self) -> Result { + let key = std::str::from_utf8(&self.key).context(PartitionKeySnafu)?; + Ok(Partition::new_catalog_only( + self.partition_id, + self.partition_hash_id.clone(), + self.table_id, + key.into(), + self.sort_key.clone(), + self.new_file_at, + )) + } + + /// Returns the columns IDs + pub fn column_ids(&self) -> &ColumnSet { + &self.columns + } + + /// Return skipped compaction for this partition, if any. + pub fn skipped_compaction(&self) -> Option { + self.skipped_compaction + .as_ref() + .cloned() + .map(|sc| sc.into()) + } +} + +impl From for proto::Partition { + fn from(value: PartitionSnapshot) -> Self { + Self { + key: value.key, + files: Some(value.files.into()), + namespace_id: value.namespace_id.get(), + table_id: value.table_id.get(), + partition_id: value.partition_id.get(), + partition_hash_id: value.partition_hash_id.is_some(), + column_ids: value.columns.iter().map(|x| x.get()).collect(), + sort_key_ids: value.sort_key.iter().map(|x| x.get()).collect(), + new_file_at: value.new_file_at.map(|x| x.get()), + skipped_compaction: value.skipped_compaction, + } + } +} diff --git a/data_types/src/snapshot/table.rs b/data_types/src/snapshot/table.rs new file mode 100644 index 00000000000..08c235d2dff --- /dev/null +++ b/data_types/src/snapshot/table.rs @@ -0,0 +1,197 @@ +//! 
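The `column_mask` handling above avoids repeating column IDs per file: the snapshot stores the union of all files' column sets once, and each file carries only a bitmask over positions in that union. The round trip, sketched with plain integers and a byte mask; the helper names are made up for the example:

// Encode a file's columns as set bits over their positions in the union.
fn encode_mask(union: &[u32], file_columns: &[u32]) -> Vec<u8> {
    let mut mask = vec![0u8; (union.len() + 7) / 8];
    for (idx, id) in union.iter().enumerate() {
        if file_columns.contains(id) {
            mask[idx / 8] |= 1 << (idx % 8);
        }
    }
    mask
}

// Decode by mapping set bits back to the column IDs at those positions.
fn decode_mask(union: &[u32], mask: &[u8]) -> Vec<u32> {
    union
        .iter()
        .enumerate()
        .filter_map(|(idx, id)| (mask[idx / 8] & (1 << (idx % 8)) != 0).then_some(*id))
        .collect()
}

fn main() {
    let union = [1, 2, 5, 7];
    let mask = encode_mask(&union, &[2, 7]);
    assert_eq!(decode_mask(&union, &mask), vec![2, 7]);
}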
Snapshot definition for tables +use crate::snapshot::list::MessageList; +use crate::{ + Column, ColumnId, ColumnTypeProtoError, NamespaceId, Partition, PartitionId, Table, TableId, +}; +use bytes::Bytes; +use generated_types::influxdata::iox::catalog_cache::v1 as proto; +use generated_types::influxdata::iox::column_type::v1::ColumnType; +use generated_types::influxdata::iox::partition_template::v1::PartitionTemplate; +use snafu::{ResultExt, Snafu}; + +/// Error for [`TableSnapshot`] +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum Error { + #[snafu(display("Error decoding TablePartition: {source}"))] + PartitionDecode { + source: crate::snapshot::list::Error, + }, + + #[snafu(display("Error encoding TablePartition: {source}"))] + PartitionEncode { + source: crate::snapshot::list::Error, + }, + + #[snafu(display("Error decoding TableColumn: {source}"))] + ColumnDecode { + source: crate::snapshot::list::Error, + }, + + #[snafu(display("Error encoding TableColumn: {source}"))] + ColumnEncode { + source: crate::snapshot::list::Error, + }, + + #[snafu(display("Invalid column name: {source}"))] + ColumnName { source: std::str::Utf8Error }, + + #[snafu(display("Invalid table name: {source}"))] + TableName { source: std::str::Utf8Error }, + + #[snafu(display("Invalid partition template: {source}"))] + PartitionTemplate { + source: crate::partition_template::ValidationError, + }, + + #[snafu(context(false))] + ColumnType { source: ColumnTypeProtoError }, +} + +/// Result for [`TableSnapshot`] +pub type Result = std::result::Result; + +/// A snapshot of a table +#[derive(Debug, Clone)] +pub struct TableSnapshot { + table_id: TableId, + namespace_id: NamespaceId, + table_name: Bytes, + partitions: MessageList, + columns: MessageList, + partition_template: Option, + generation: u64, +} + +impl TableSnapshot { + /// Create a new [`TableSnapshot`] from the provided state + pub fn encode( + table: Table, + partitions: Vec, + columns: Vec, + generation: u64, + ) -> Result { + let columns: Vec<_> = columns + .into_iter() + .map(|c| proto::TableColumn { + id: c.id.get(), + name: c.name.into(), + column_type: ColumnType::from(c.column_type).into(), + }) + .collect(); + + let partitions: Vec<_> = partitions + .into_iter() + .map(|p| proto::TablePartition { + id: p.id.get(), + key: p.partition_key.as_bytes().to_vec().into(), + }) + .collect(); + + Ok(Self { + table_id: table.id, + namespace_id: table.namespace_id, + table_name: table.name.into(), + partitions: MessageList::encode(&partitions).context(PartitionEncodeSnafu)?, + columns: MessageList::encode(&columns).context(ColumnEncodeSnafu)?, + partition_template: table.partition_template.as_proto().cloned(), + generation, + }) + } + + /// Create a new [`TableSnapshot`] from a `proto` and generation + pub fn decode(proto: proto::Table, generation: u64) -> Self { + Self { + generation, + table_id: TableId::new(proto.table_id), + namespace_id: NamespaceId::new(proto.namespace_id), + table_name: proto.table_name, + partitions: MessageList::from(proto.partitions.unwrap_or_default()), + columns: MessageList::from(proto.columns.unwrap_or_default()), + partition_template: proto.partition_template, + } + } + + /// Returns the [`Table`] for this snapshot + pub fn table(&self) -> Result { + let name = std::str::from_utf8(&self.table_name).context(TableNameSnafu)?; + let template = self + .partition_template + .clone() + .try_into() + .context(PartitionTemplateSnafu)?; + + Ok(Table { + id: self.table_id, + namespace_id: self.namespace_id, + name: name.into(), + 
partition_template: template, + }) + } + + /// Returns the column by index + pub fn column(&self, idx: usize) -> Result { + let column = self.columns.get(idx).context(ColumnDecodeSnafu)?; + let name = std::str::from_utf8(&column.name).context(ColumnNameSnafu)?; + + Ok(Column { + id: ColumnId::new(column.id), + table_id: self.table_id, + name: name.into(), + column_type: (column.column_type as i16).try_into()?, + }) + } + + /// Returns an iterator of the columns in this table + pub fn columns(&self) -> impl Iterator> + '_ { + (0..self.columns.len()).map(|idx| self.column(idx)) + } + + /// Returns an iterator of the [`PartitionId`] in this table + pub fn partitions(&self) -> impl Iterator> + '_ { + (0..self.partitions.len()).map(|idx| { + let p = self.partitions.get(idx).context(PartitionDecodeSnafu)?; + Ok(TableSnapshotPartition { + id: PartitionId::new(p.id), + key: p.key, + }) + }) + } + + /// Returns the generation of this snapshot + pub fn generation(&self) -> u64 { + self.generation + } +} + +/// Partition information stored within [`TableSnapshot`] +#[derive(Debug)] +pub struct TableSnapshotPartition { + id: PartitionId, + key: Bytes, +} + +impl TableSnapshotPartition { + /// Returns the [`PartitionId`] for this partition + pub fn id(&self) -> PartitionId { + self.id + } + + /// Returns the partition key for this partition + pub fn key(&self) -> &[u8] { + &self.key + } +} + +impl From for proto::Table { + fn from(value: TableSnapshot) -> Self { + Self { + partitions: Some(value.partitions.into()), + columns: Some(value.columns.into()), + partition_template: value.partition_template, + namespace_id: value.namespace_id.get(), + table_id: value.table_id.get(), + table_name: value.table_name, + } + } +} diff --git a/datafusion_util/Cargo.toml b/datafusion_util/Cargo.toml index 9c75525084d..1f5f5541153 100644 --- a/datafusion_util/Cargo.toml +++ b/datafusion_util/Cargo.toml @@ -6,6 +6,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] async-trait = "0.1" datafusion = { workspace = true } @@ -13,10 +16,8 @@ futures = "0.3" object_store = { workspace = true } observability_deps = { path = "../observability_deps" } pin-project = "1.1" -tokio = { version = "1.32", features = ["parking_lot", "sync"] } +schema = { path = "../schema" } +tokio = { version = "1.35", features = ["parking_lot", "sync"] } tokio-stream = "0.1" -url = "2.4" +url = "2.5" workspace-hack = { version = "0.1", path = "../workspace-hack" } - -[dev-dependencies] -schema = { path = "../schema" } diff --git a/datafusion_util/src/config.rs b/datafusion_util/src/config.rs index 1c1a975a4ff..ed41b197688 100644 --- a/datafusion_util/src/config.rs +++ b/datafusion_util/src/config.rs @@ -4,6 +4,7 @@ use datafusion::{ config::ConfigOptions, execution::runtime_env::RuntimeEnv, prelude::SessionConfig, }; use object_store::ObjectStore; +use schema::TIME_DATA_TIMEZONE; use url::Url; // The default catalog name - this impacts what SQL queries use if not specified @@ -20,6 +21,7 @@ pub fn iox_session_config() -> SessionConfig { let mut options = ConfigOptions::new(); options.execution.parquet.pushdown_filters = true; options.execution.parquet.reorder_filters = true; + options.execution.time_zone = TIME_DATA_TIMEZONE().map(|s| s.to_string()); options.optimizer.repartition_sorts = true; SessionConfig::from(options) @@ -27,6 +29,12 @@ pub fn iox_session_config() -> SessionConfig { .with_create_default_catalog_and_schema(true) .with_information_schema(true) 
.with_default_catalog_and_schema(DEFAULT_CATALOG, DEFAULT_SCHEMA) + // Tell the datafusion optimizer to avoid repartitioning sorted inputs + .with_prefer_existing_sort(true) + // Avoid repartitioning file scans as it destroys existing sort orders + // see https://github.com/influxdata/influxdb_iox/issues/9450 + // see https://github.com/apache/arrow-datafusion/issues/8451 + .with_repartition_file_scans(false) } /// Register the "IOx" object store provider for URLs of the form "iox://{id} diff --git a/datafusion_util/src/lib.rs b/datafusion_util/src/lib.rs index b62cf3b8007..6323f06278a 100644 --- a/datafusion_util/src/lib.rs +++ b/datafusion_util/src/lib.rs @@ -20,6 +20,7 @@ //! for expression manipulation functions. use datafusion::execution::memory_pool::{MemoryPool, UnboundedMemoryPool}; +use std::collections::HashSet; // Workaround for "unused crate" lint false positives. use workspace_hack as _; @@ -33,10 +34,11 @@ use std::task::{Context, Poll}; use datafusion::arrow::array::BooleanArray; use datafusion::arrow::compute::filter_record_batch; use datafusion::arrow::datatypes::{DataType, Fields}; +use datafusion::common::stats::Precision; use datafusion::common::{DataFusionError, ToDFSchema}; -use datafusion::datasource::MemTable; use datafusion::execution::context::TaskContext; use datafusion::logical_expr::expr::Sort; +use datafusion::logical_expr::utils::inspect_expr_pre; use datafusion::physical_expr::execution_props::ExecutionProps; use datafusion::physical_expr::{create_physical_expr, PhysicalExpr}; use datafusion::physical_optimizer::pruning::PruningPredicate; @@ -51,6 +53,7 @@ use datafusion::{ scalar::ScalarValue, }; use futures::{Stream, StreamExt}; +use schema::TIME_DATA_TIMEZONE; use tokio::sync::mpsc::{Receiver, UnboundedReceiver}; use tokio_stream::wrappers::{ReceiverStream, UnboundedReceiverStream}; use watch::WatchedTask; @@ -113,8 +116,8 @@ pub fn lit_dict(value: &str) -> Expr { pub fn make_range_expr(start: i64, end: i64, time: impl AsRef) -> Expr { // We need to cast the start and end values to timestamps // the equivalent of: - let ts_start = ScalarValue::TimestampNanosecond(Some(start), None); - let ts_end = ScalarValue::TimestampNanosecond(Some(end), None); + let ts_start = timestamptz_nano(start); + let ts_end = timestamptz_nano(end); let time_col = time.as_ref().as_expr(); let ts_low = lit(ts_start).lt_eq(time_col.clone()); @@ -123,6 +126,45 @@ pub fn make_range_expr(start: i64, end: i64, time: impl AsRef) -> Expr { ts_low.and(ts_high) } +/// Ensures all columns referred to in `filters` are in the `projection`, if +/// any, adding them if necessary. +pub fn extend_projection_for_filters( + schema: &Schema, + filters: &[Expr], + projection: Option<&Vec>, +) -> Result>, DataFusionError> { + let Some(mut projection) = projection.cloned() else { + return Ok(None); + }; + + let mut seen_cols: HashSet = projection.iter().cloned().collect(); + for filter in filters { + inspect_expr_pre(filter, |expr| { + if let Expr::Column(c) = expr { + let idx = schema.index_of(&c.name)?; + // if haven't seen this column before, add it to the list + if seen_cols.insert(idx) { + projection.push(idx); + } + } + Ok(()) as Result<(), DataFusionError> + })?; + } + Ok(Some(projection)) +} + +// TODO port this upstream to datafusion (maybe as From
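`extend_projection_for_filters` above only has to do bookkeeping over column indices; the DataFusion-specific part is walking the filter expressions and resolving `Expr::Column` names against the schema. The bookkeeping in isolation, with plain `usize` indices standing in for already-resolved columns (a simplified stand-in, not the real signature):

use std::collections::HashSet;

// Filter columns missing from the projection are appended; `None` means
// "all columns are scanned anyway", so there is nothing to extend.
fn extend_projection(
    projection: Option<Vec<usize>>,
    filter_columns: &[usize],
) -> Option<Vec<usize>> {
    let mut projection = projection?;
    let mut seen: HashSet<usize> = projection.iter().copied().collect();
    for &idx in filter_columns {
        if seen.insert(idx) {
            projection.push(idx);
        }
    }
    Some(projection)
}

fn main() {
    assert_eq!(extend_projection(Some(vec![0, 2]), &[2, 3]), Some(vec![0, 2, 3]));
    assert_eq!(extend_projection(None, &[1]), None);
}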
{ + self.backing + .repositories() + .tables() + .create(name, partition_template, namespace_id) + .await + } + + async fn get_by_id(&mut self, table_id: TableId) -> Result> { + self.backing + .repositories() + .tables() + .get_by_id(table_id) + .await + } + + async fn get_by_namespace_and_name( + &mut self, + namespace_id: NamespaceId, + name: &str, + ) -> Result> { + self.backing + .repositories() + .tables() + .get_by_namespace_and_name(namespace_id, name) + .await + } + + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { + self.backing + .repositories() + .tables() + .list_by_namespace_id(namespace_id) + .await + } + + async fn list(&mut self) -> Result> { + self.backing.repositories().tables().list().await + } + + async fn snapshot(&mut self, table_id: TableId) -> Result { + self.backing + .repositories() + .tables() + .snapshot(table_id) + .await + } +} + +#[async_trait] +impl ColumnRepo for Repos { + async fn create_or_get( + &mut self, + name: &str, + table_id: TableId, + column_type: ColumnType, + ) -> Result { + self.backing + .repositories() + .columns() + .create_or_get(name, table_id, column_type) + .await + } + + async fn create_or_get_many_unchecked( + &mut self, + table_id: TableId, + columns: HashMap<&str, ColumnType>, + ) -> Result> { + self.backing + .repositories() + .columns() + .create_or_get_many_unchecked(table_id, columns) + .await + } + + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { + self.backing + .repositories() + .columns() + .list_by_namespace_id(namespace_id) + .await + } + + async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { + self.backing + .repositories() + .columns() + .list_by_table_id(table_id) + .await + } + + async fn list(&mut self) -> Result> { + self.backing.repositories().columns().list().await + } +} + +#[async_trait] +impl PartitionRepo for Repos { + async fn create_or_get(&mut self, key: PartitionKey, table_id: TableId) -> Result { + // read-through: need to wire up table snapshots to look this up efficiently + self.backing + .repositories() + .partitions() + .create_or_get(key, table_id) + .await + } + + async fn get_by_id_batch(&mut self, partition_ids: &[PartitionId]) -> Result> { + futures::stream::iter(prepare_set(partition_ids.iter().cloned())) + .map(|p_id| { + let this = &self; + async move { + let snapshot = match this.get_partition(p_id).await { + Ok(s) => s, + Err(Error::NotFound { .. 
}) => { + return Ok(futures::stream::empty().boxed()); + } + Err(e) => { + return Err(e); + } + }; + + match snapshot.partition() { + Ok(p) => Ok(futures::stream::once(async move { Ok(p) }).boxed()), + Err(e) => Err(Error::from(e)), + } + } + }) + .buffer_unordered(self.quorum_fanout) + .try_flatten() + .try_collect::>() + .await + } + + async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { + // read-through: need to wire up table snapshots to look this up efficiently + self.backing + .repositories() + .partitions() + .list_by_table_id(table_id) + .await + } + + async fn list_ids(&mut self) -> Result> { + // read-through: only used for testing, we should eventually remove this interface + self.backing.repositories().partitions().list_ids().await + } + + async fn cas_sort_key( + &mut self, + partition_id: PartitionId, + old_sort_key_ids: Option<&SortKeyIds>, + new_sort_key_ids: &SortKeyIds, + ) -> Result> { + let res = self + .backing + .repositories() + .partitions() + .cas_sort_key(partition_id, old_sort_key_ids, new_sort_key_ids) + .await?; + + self.refresh_partition(partition_id) + .await + .map_err(CasFailure::QueryError)?; + + Ok(res) + } + + #[allow(clippy::too_many_arguments)] + async fn record_skipped_compaction( + &mut self, + partition_id: PartitionId, + reason: &str, + num_files: usize, + limit_num_files: usize, + limit_num_files_first_in_partition: usize, + estimated_bytes: u64, + limit_bytes: u64, + ) -> Result<()> { + self.backing + .repositories() + .partitions() + .record_skipped_compaction( + partition_id, + reason, + num_files, + limit_num_files, + limit_num_files_first_in_partition, + estimated_bytes, + limit_bytes, + ) + .await?; + + self.refresh_partition(partition_id).await?; + + Ok(()) + } + + async fn get_in_skipped_compactions( + &mut self, + partition_id: &[PartitionId], + ) -> Result> { + futures::stream::iter(prepare_set(partition_id.iter().cloned())) + .map(|p_id| { + let this = &self; + async move { + let snapshot = match this.get_partition(p_id).await { + Ok(s) => s, + Err(Error::NotFound { .. 
}) => { + return Ok(futures::stream::empty().boxed()); + } + Err(e) => { + return Err(e); + } + }; + + match snapshot.skipped_compaction() { + Some(sc) => Ok(futures::stream::once(async move { Ok(sc) }).boxed()), + None => Ok(futures::stream::empty().boxed()), + } + } + }) + .buffer_unordered(self.quorum_fanout) + .try_flatten() + .try_collect::>() + .await + } + + async fn list_skipped_compactions(&mut self) -> Result> { + // read-through: used for debugging, this should be replaced w/ proper hierarchy-traversal + self.backing + .repositories() + .partitions() + .list_skipped_compactions() + .await + } + + async fn delete_skipped_compactions( + &mut self, + partition_id: PartitionId, + ) -> Result> { + let res = self + .backing + .repositories() + .partitions() + .delete_skipped_compactions(partition_id) + .await?; + + self.refresh_partition(partition_id).await?; + + Ok(res) + } + + async fn most_recent_n(&mut self, n: usize) -> Result> { + // read-through: used for ingester warm-up at the moment + self.backing + .repositories() + .partitions() + .most_recent_n(n) + .await + } + + async fn partitions_new_file_between( + &mut self, + minimum_time: Timestamp, + maximum_time: Option, + ) -> Result> { + // read-through: used by the compactor for scheduling, we should eventually find a better interface + self.backing + .repositories() + .partitions() + .partitions_new_file_between(minimum_time, maximum_time) + .await + } + + async fn list_old_style(&mut self) -> Result> { + // read-through: used by the ingester due to hash-id stuff + self.backing + .repositories() + .partitions() + .list_old_style() + .await + } + + async fn snapshot(&mut self, partition_id: PartitionId) -> Result { + self.get_partition(partition_id).await + } +} + +#[async_trait] +impl ParquetFileRepo for Repos { + async fn flag_for_delete_by_retention(&mut self) -> Result> { + let res = self + .backing + .repositories() + .parquet_files() + .flag_for_delete_by_retention() + .await?; + + let affected_partitions = res + .iter() + .map(|(p_id, _os_id)| *p_id) + .collect::>(); + + // ensure deterministic order + let mut affected_partitions = affected_partitions.into_iter().collect::>(); + affected_partitions.sort_unstable(); + + // refresh ALL partitons that are affected, NOT just only the ones that were cached. This should avoid the + // following "lost update" race condition: + // + // This scenario assumes that the partition in question is NOT cached yet. + // + // | T | Thread 1 | Thread 2 | + // | - | ------------------------------------- | -------------------------------------------------- | + // | 1 | receive `create_update_delete` | | + // | 2 | execute change within backing catalog | | + // | 3 | takes snapshot from backing catalog | | + // | 4 | | receive `flag_for_delete_by_retention` | + // | 5 | | execute change within backing catalog | + // | 6 | | affected partition not cached => no snapshot taken | + // | 7 | | return | + // | 8 | quorum-write snapshot | | + // | 9 | return | | + // + // The partition is now cached by does NOT contain the `flag_for_delete_by_retention` change and will not + // automatically converge. 
+ futures::stream::iter(affected_partitions) + .map(|p_id| { + let this = &self; + async move { + this.refresh_partition(p_id).await?; + Ok::<(), Error>(()) + } + }) + .buffer_unordered(self.quorum_fanout) + .try_collect::<()>() + .await?; + + Ok(res) + } + + async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result> { + // deleted files are NOT part of the snapshot, so this bypasses the cache + self.backing + .repositories() + .parquet_files() + .delete_old_ids_only(older_than) + .await + } + + async fn list_by_partition_not_to_delete_batch( + &mut self, + partition_ids: Vec, + ) -> Result> { + futures::stream::iter(prepare_set(partition_ids)) + .map(|p_id| { + let this = &self; + async move { + let snapshot = match this.get_partition(p_id).await { + Ok(s) => s, + Err(Error::NotFound { .. }) => { + return Ok(futures::stream::empty().boxed()); + } + Err(e) => { + return Err(e); + } + }; + + // Decode files so we can drop the snapshot early. + // + // Need to collect the file results into a vec though because we cannot return borrowed data and + // "owned iterators" aren't a thing. + let files = snapshot + .files() + .map(|res| res.map_err(Error::from)) + .collect::>(); + Ok::<_, Error>(futures::stream::iter(files).boxed()) + } + }) + .buffer_unordered(self.quorum_fanout) + .try_flatten() + .try_collect::>() + .await + } + + async fn get_by_object_store_id( + &mut self, + object_store_id: ObjectStoreId, + ) -> Result> { + // read-through: see https://github.com/influxdata/influxdb_iox/issues/9719 + self.backing + .repositories() + .parquet_files() + .get_by_object_store_id(object_store_id) + .await + } + + async fn exists_by_object_store_id_batch( + &mut self, + object_store_ids: Vec, + ) -> Result> { + // read-through: this is used by the GC, so this is not overall latency-critical + self.backing + .repositories() + .parquet_files() + .exists_by_object_store_id_batch(object_store_ids) + .await + } + + async fn create_upgrade_delete( + &mut self, + partition_id: PartitionId, + delete: &[ObjectStoreId], + upgrade: &[ObjectStoreId], + create: &[ParquetFileParams], + target_level: CompactionLevel, + ) -> Result> { + let res = self + .backing + .repositories() + .parquet_files() + .create_upgrade_delete(partition_id, delete, upgrade, create, target_level) + .await?; + + self.refresh_partition(partition_id).await?; + + Ok(res) + } +} + +/// Prepare set of elements in deterministic order. 
+fn prepare_set(set: S) -> Vec +where + S: IntoIterator, + T: Eq + Ord, +{ + // ensure deterministic order (also required for de-dup) + let mut set = set.into_iter().collect::>(); + set.sort_unstable(); + + // de-dup + set.dedup(); + + set +} + +#[cfg(test)] +mod tests { + use catalog_cache::api::server::test_util::TestCacheServer; + use catalog_cache::local::CatalogCache; + use iox_time::SystemProvider; + + use crate::{interface_tests::TestCatalog, mem::MemCatalog}; + + use super::*; + use std::sync::Arc; + + #[tokio::test] + async fn test_catalog() { + crate::interface_tests::test_catalog(|| async { + let metrics = Arc::new(metric::Registry::default()); + let time_provider = Arc::new(SystemProvider::new()) as _; + let backing = Arc::new(MemCatalog::new(metrics, Arc::clone(&time_provider))); + + let peer0 = TestCacheServer::bind_ephemeral(); + let peer1 = TestCacheServer::bind_ephemeral(); + let cache = Arc::new(QuorumCatalogCache::new( + Arc::new(CatalogCache::default()), + Arc::new([peer0.client(), peer1.client()]), + )); + + // use new metrics registry so the two layers don't double-count + let metrics = Arc::new(metric::Registry::default()); + let caching_catalog = Arc::new(CachingCatalog::new( + cache, + backing, + metrics, + time_provider, + 10, + )); + + let test_catalog = TestCatalog::new(caching_catalog); + test_catalog.hold_onto(peer0); + test_catalog.hold_onto(peer1); + + Arc::new(test_catalog) as _ + }) + .await; + } +} diff --git a/iox_catalog/src/constants.rs b/iox_catalog/src/constants.rs new file mode 100644 index 00000000000..b6b88fbd21d --- /dev/null +++ b/iox_catalog/src/constants.rs @@ -0,0 +1,19 @@ +//! Constants that are hold for all catalog implementations. + +/// Time column. +pub const TIME_COLUMN: &str = "time"; + +/// Default retention period for data in the catalog. +pub const DEFAULT_RETENTION_PERIOD: Option = None; + +/// Maximum number of files touched by [`ParquetFileRepo::flag_for_delete_by_retention`] at a time. +/// +/// +/// [`ParquetFileRepo::flag_for_delete_by_retention`]: crate::interface::ParquetFileRepo::flag_for_delete_by_retention +pub const MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION: i64 = 1_000; + +/// Maximum number of files touched by [`ParquetFileRepo::delete_old_ids_only`] at a time. +/// +/// +/// [`ParquetFileRepo::delete_old_ids_only`]: crate::interface::ParquetFileRepo::delete_old_ids_only +pub const MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE: i64 = 10_000; diff --git a/iox_catalog/src/grpc/client.rs b/iox_catalog/src/grpc/client.rs new file mode 100644 index 00000000000..8edc05dd640 --- /dev/null +++ b/iox_catalog/src/grpc/client.rs @@ -0,0 +1,997 @@ +//! gRPC client implementation. 
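`prepare_set` above relies on `Vec::dedup` only collapsing adjacent duplicates, which is why the sort has to come first; the sort also gives the fan-out over partitions a deterministic order. A quick check of that behaviour:

fn main() {
    let mut ids = vec![7, 3, 7, 1, 3];
    ids.sort_unstable();
    ids.dedup();
    assert_eq!(ids, vec![1, 3, 7]);

    // Without the sort, only consecutive repeats would collapse:
    let mut unsorted = vec![7, 3, 7, 1, 3];
    unsorted.dedup();
    assert_eq!(unsorted, vec![7, 3, 7, 1, 3]);
}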
+use std::future::Future; +use std::ops::ControlFlow; +use std::{collections::HashMap, sync::Arc}; + +use async_trait::async_trait; +use futures::TryStreamExt; +use log::{debug, info, warn}; +use tonic::transport::{Channel, Uri}; + +use crate::{ + interface::{ + CasFailure, Catalog, ColumnRepo, Error, NamespaceRepo, ParquetFileRepo, PartitionRepo, + RepoCollection, Result, SoftDeletedRows, TableRepo, + }, + metrics::MetricDecorator, +}; +use backoff::{Backoff, BackoffError}; +use data_types::snapshot::partition::PartitionSnapshot; +use data_types::{ + partition_template::{NamespacePartitionTemplateOverride, TablePartitionTemplateOverride}, + snapshot::table::TableSnapshot, + Column, ColumnType, CompactionLevel, MaxColumnsPerTable, MaxTables, Namespace, NamespaceId, + NamespaceName, NamespaceServiceProtectionLimitsOverride, ObjectStoreId, ParquetFile, + ParquetFileId, ParquetFileParams, Partition, PartitionId, PartitionKey, SkippedCompaction, + SortKeyIds, Table, TableId, Timestamp, +}; +use generated_types::influxdata::iox::catalog::v2 as proto; +use iox_time::TimeProvider; +use trace_http::metrics::{MetricFamily, RequestMetrics}; +use trace_http::tower::TraceService; + +use super::serialization::{ + convert_status, deserialize_column, deserialize_namespace, deserialize_object_store_id, + deserialize_parquet_file, deserialize_partition, deserialize_skipped_compaction, + deserialize_sort_key_ids, deserialize_table, serialize_column_type, serialize_object_store_id, + serialize_parquet_file_params, serialize_soft_deleted_rows, serialize_sort_key_ids, ContextExt, + RequiredExt, +}; + +type InstrumentedChannel = TraceService; + +/// Catalog that goes through a gRPC interface. +#[derive(Debug)] +pub struct GrpcCatalogClient { + channel: InstrumentedChannel, + metrics: Arc, + time_provider: Arc, +} + +impl GrpcCatalogClient { + /// Create new client. 
+ pub fn new( + uri: Uri, + metrics: Arc, + time_provider: Arc, + ) -> Self { + let channel = TraceService::new_client( + Channel::builder(uri).connect_lazy(), + Arc::new(RequestMetrics::new( + Arc::clone(&metrics), + MetricFamily::GrpcClient, + )), + None, + "catalog", + ); + Self { + channel, + metrics, + time_provider, + } + } +} + +impl std::fmt::Display for GrpcCatalogClient { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "grpc") + } +} + +#[async_trait] +impl Catalog for GrpcCatalogClient { + async fn setup(&self) -> Result<(), Error> { + Ok(()) + } + + fn repositories(&self) -> Box { + Box::new(MetricDecorator::new( + GrpcCatalogClientRepos { + channel: self.channel.clone(), + }, + Arc::clone(&self.metrics), + Arc::clone(&self.time_provider), + )) + } + + #[cfg(test)] + fn metrics(&self) -> Arc { + Arc::clone(&self.metrics) + } + + fn time_provider(&self) -> Arc { + Arc::clone(&self.time_provider) + } +} + +#[derive(Debug)] +struct GrpcCatalogClientRepos { + channel: InstrumentedChannel, +} + +type ServiceClient = proto::catalog_service_client::CatalogServiceClient; + +fn is_upstream_error(e: &tonic::Status) -> bool { + matches!( + e.code(), + tonic::Code::Cancelled + | tonic::Code::DeadlineExceeded + | tonic::Code::FailedPrecondition + | tonic::Code::Aborted + | tonic::Code::Unavailable + ) +} + +impl GrpcCatalogClientRepos { + fn client(&self) -> ServiceClient { + proto::catalog_service_client::CatalogServiceClient::new(self.channel.clone()) + } + + async fn retry( + &self, + operation: &str, + upload: U, + fun_io: FunIo, + ) -> Result + where + U: Clone + std::fmt::Debug + Send + Sync, + FunIo: Fn(U, ServiceClient) -> Fut + Send + Sync, + Fut: Future, tonic::Status>> + Send, + D: std::fmt::Debug, + { + Backoff::new(&Default::default()) + .retry_with_backoff(operation, || async { + let res = fun_io(upload.clone(), self.client()).await; + match res { + Ok(r) => { + let r = r.into_inner(); + debug!("{} successfully received: {:?}", operation, &r); + ControlFlow::Break(Ok(r)) + } + Err(e) if is_upstream_error(&e) => { + info!("{} retriable error encountered: {:?}", operation, &e); + ControlFlow::Continue(e) + } + Err(e) => { + warn!( + "{operation} attempted {:?} and received error: {:?}", + upload, e + ); + ControlFlow::Break(Err(convert_status(e))) + } + } + }) + .await + .map_err(|be| { + let status = match be { + BackoffError::DeadlineExceeded { source, .. } => source, + }; + convert_status(status) + })? 
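The `retry` helper above separates error classification from the retry loop itself: the closure maps a response to `ControlFlow::Break` for terminal outcomes (success, or a non-retriable status) and `ControlFlow::Continue` for retriable ones, while sleeping and giving up are delegated to the `backoff` crate. The same pattern in a synchronous, dependency-free sketch; the names and the attempt-count policy are illustrative only:

use std::ops::ControlFlow;

// A terminal outcome breaks the loop; a retriable error continues it.
fn run_with_retry<T, E>(
    max_attempts: usize,
    mut attempt: impl FnMut() -> ControlFlow<Result<T, E>, E>,
) -> Result<T, E> {
    let mut last_retriable = None;
    for _ in 0..max_attempts {
        match attempt() {
            ControlFlow::Break(outcome) => return outcome,
            ControlFlow::Continue(e) => last_retriable = Some(e),
        }
    }
    Err(last_retriable.expect("max_attempts must be non-zero"))
}

fn main() {
    let mut calls = 0;
    let res = run_with_retry(3, || {
        calls += 1;
        if calls < 3 {
            ControlFlow::Continue("unavailable") // retriable, try again
        } else {
            ControlFlow::Break(Ok::<_, &str>("done")) // terminal success
        }
    });
    assert_eq!(res, Ok("done"));
    assert_eq!(calls, 3);
}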
+ } +} + +impl RepoCollection for GrpcCatalogClientRepos { + fn namespaces(&mut self) -> &mut dyn NamespaceRepo { + self + } + + fn tables(&mut self) -> &mut dyn TableRepo { + self + } + + fn columns(&mut self) -> &mut dyn ColumnRepo { + self + } + + fn partitions(&mut self) -> &mut dyn PartitionRepo { + self + } + + fn parquet_files(&mut self) -> &mut dyn ParquetFileRepo { + self + } +} + +#[async_trait] +impl NamespaceRepo for GrpcCatalogClientRepos { + async fn create( + &mut self, + name: &NamespaceName<'_>, + partition_template: Option, + retention_period_ns: Option, + service_protection_limits: Option, + ) -> Result { + let n = proto::NamespaceCreateRequest { + name: name.to_string(), + partition_template: partition_template.and_then(|t| t.as_proto().cloned()), + retention_period_ns, + service_protection_limits: service_protection_limits.map(|l| { + proto::ServiceProtectionLimits { + max_tables: l.max_tables.map(|x| x.get_i32()), + max_columns_per_table: l.max_columns_per_table.map(|x| x.get_i32()), + } + }), + }; + + let resp = self + .retry("namespace_create", n, |data, mut client| async move { + client.namespace_create(data).await + }) + .await?; + + Ok(deserialize_namespace( + resp.namespace.required().ctx("namespace")?, + )?) + } + + async fn update_retention_period( + &mut self, + name: &str, + retention_period_ns: Option, + ) -> Result { + let n = proto::NamespaceUpdateRetentionPeriodRequest { + name: name.to_owned(), + retention_period_ns, + }; + + let resp = self.retry( + "namespace_update_retention_period", + n, + |data, mut client| async move { client.namespace_update_retention_period(data).await }, + ) + .await?; + + Ok(deserialize_namespace( + resp.namespace.required().ctx("namespace")?, + )?) + } + + async fn list(&mut self, deleted: SoftDeletedRows) -> Result> { + let n = proto::NamespaceListRequest { + deleted: serialize_soft_deleted_rows(deleted), + }; + + self.retry("namespace_list", n, |data, mut client| async move { + client.namespace_list(data).await + }) + .await? + .map_err(convert_status) + .and_then(|res| async move { + deserialize_namespace(res.namespace.required().ctx("namespace")?).map_err(Error::from) + }) + .try_collect() + .await + } + + async fn get_by_id( + &mut self, + id: NamespaceId, + deleted: SoftDeletedRows, + ) -> Result> { + let n = proto::NamespaceGetByIdRequest { + id: id.get(), + deleted: serialize_soft_deleted_rows(deleted), + }; + + let resp = self + .retry("namespace_get_by_id", n, |data, mut client| async move { + client.namespace_get_by_id(data).await + }) + .await?; + Ok(resp.namespace.map(deserialize_namespace).transpose()?) + } + + async fn get_by_name( + &mut self, + name: &str, + deleted: SoftDeletedRows, + ) -> Result> { + let n = proto::NamespaceGetByNameRequest { + name: name.to_owned(), + deleted: serialize_soft_deleted_rows(deleted), + }; + + let resp = self + .retry("namespace_get_by_name", n, |data, mut client| async move { + client.namespace_get_by_name(data).await + }) + .await?; + Ok(resp.namespace.map(deserialize_namespace).transpose()?) 
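The `map(...).transpose()?` chain used for the optional lookups above is the usual way to combine an optional protobuf field with a fallible conversion: a missing record stays `None`, while a present-but-invalid one becomes an error instead of being silently dropped. In miniature:

// Option<raw> plus a fallible parse becomes Result<Option<parsed>>,
// keeping "absent" as Ok(None) and propagating conversion failures.
fn decode_optional(raw: Option<&str>) -> Result<Option<i64>, std::num::ParseIntError> {
    raw.map(str::parse::<i64>).transpose()
}

fn main() {
    assert_eq!(decode_optional(None), Ok(None));
    assert_eq!(decode_optional(Some("42")), Ok(Some(42)));
    assert!(decode_optional(Some("not a number")).is_err());
}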
+    }
+
+    async fn soft_delete(&mut self, name: &str) -> Result<()> {
+        let n = proto::NamespaceSoftDeleteRequest {
+            name: name.to_owned(),
+        };
+
+        self.retry("namespace_soft_delete", n, |data, mut client| async move {
+            client.namespace_soft_delete(data).await
+        })
+        .await?;
+        Ok(())
+    }
+
+    async fn update_table_limit(&mut self, name: &str, new_max: MaxTables) -> Result<Namespace> {
+        let n = proto::NamespaceUpdateTableLimitRequest {
+            name: name.to_owned(),
+            new_max: new_max.get_i32(),
+        };
+
+        let resp = self
+            .retry("namespace_update_table_limit", n, |data, mut client| async move {
+                client.namespace_update_table_limit(data).await
+            })
+            .await?;
+
+        Ok(deserialize_namespace(
+            resp.namespace.required().ctx("namespace")?,
+        )?)
+    }
+
+    async fn update_column_limit(
+        &mut self,
+        name: &str,
+        new_max: MaxColumnsPerTable,
+    ) -> Result<Namespace> {
+        let n = proto::NamespaceUpdateColumnLimitRequest {
+            name: name.to_owned(),
+            new_max: new_max.get_i32(),
+        };
+
+        let resp = self
+            .retry("namespace_update_column_limit", n, |data, mut client| async move {
+                client.namespace_update_column_limit(data).await
+            })
+            .await?;
+
+        Ok(deserialize_namespace(
+            resp.namespace.required().ctx("namespace")?,
+        )?)
+    }
+}
+
+#[async_trait]
+impl TableRepo for GrpcCatalogClientRepos {
+    async fn create(
+        &mut self,
+        name: &str,
+        partition_template: TablePartitionTemplateOverride,
+        namespace_id: NamespaceId,
+    ) -> Result<Table>
{ + let t = proto::TableCreateRequest { + name: name.to_owned(), + partition_template: partition_template.as_proto().cloned(), + namespace_id: namespace_id.get(), + }; + + let resp = self + .retry("table_create", t, |data, mut client| async move { + client.table_create(data).await + }) + .await?; + Ok(deserialize_table(resp.table.required().ctx("table")?)?) + } + + async fn get_by_id(&mut self, table_id: TableId) -> Result> { + let t = proto::TableGetByIdRequest { id: table_id.get() }; + + let resp = self + .retry("table_get_by_id", t, |data, mut client| async move { + client.table_get_by_id(data).await + }) + .await?; + Ok(resp.table.map(deserialize_table).transpose()?) + } + + async fn get_by_namespace_and_name( + &mut self, + namespace_id: NamespaceId, + name: &str, + ) -> Result> { + let t = proto::TableGetByNamespaceAndNameRequest { + namespace_id: namespace_id.get(), + name: name.to_owned(), + }; + + let resp = self.retry( + "table_get_by_namespace_and_name", + t, + |data, mut client| async move { client.table_get_by_namespace_and_name(data).await }, + ) + .await?; + Ok(resp.table.map(deserialize_table).transpose()?) + } + + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { + let t = proto::TableListByNamespaceIdRequest { + namespace_id: namespace_id.get(), + }; + + self.retry( + "table_list_by_namespace_id", + t, + |data, mut client| async move { client.table_list_by_namespace_id(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { Ok(deserialize_table(res.table.required().ctx("table")?)?) }) + .try_collect() + .await + } + + async fn list(&mut self) -> Result> { + let t = proto::TableListRequest {}; + + self.retry("table_list", t, |data, mut client| async move { + client.table_list(data).await + }) + .await? + .map_err(convert_status) + .and_then(|res| async move { Ok(deserialize_table(res.table.required().ctx("table")?)?) }) + .try_collect() + .await + } + + async fn snapshot(&mut self, table_id: TableId) -> Result { + let t = proto::TableSnapshotRequest { + table_id: table_id.get(), + }; + + let resp = self + .retry("table_snapshot", t, |data, mut client| async move { + client.table_snapshot(data).await + }) + .await?; + + let table = resp.table.required().ctx("table")?; + Ok(TableSnapshot::decode(table, resp.generation)) + } +} + +#[async_trait] +impl ColumnRepo for GrpcCatalogClientRepos { + async fn create_or_get( + &mut self, + name: &str, + table_id: TableId, + column_type: ColumnType, + ) -> Result { + let c = proto::ColumnCreateOrGetRequest { + name: name.to_owned(), + table_id: table_id.get(), + column_type: serialize_column_type(column_type), + }; + + let resp = self + .retry("column_create_or_get", c, |data, mut client| async move { + client.column_create_or_get(data).await + }) + .await?; + Ok(deserialize_column(resp.column.required().ctx("column")?)?) + } + + async fn create_or_get_many_unchecked( + &mut self, + table_id: TableId, + columns: HashMap<&str, ColumnType>, + ) -> Result> { + let c = proto::ColumnCreateOrGetManyUncheckedRequest { + table_id: table_id.get(), + columns: columns + .into_iter() + .map(|(name, t)| (name.to_owned(), serialize_column_type(t))) + .collect(), + }; + + self.retry( + "column_create_or_get_many_unchecked", + c, + |data, mut client| async move { client.column_create_or_get_many_unchecked(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_column(res.column.required().ctx("column")?)?) 
+ }) + .try_collect() + .await + } + + async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { + let c = proto::ColumnListByNamespaceIdRequest { + namespace_id: namespace_id.get(), + }; + + self.retry( + "column_list_by_namespace_id", + c, + |data, mut client| async move { client.column_list_by_namespace_id(data).await }, + ) + .await? + .map_err(convert_status) + .and_then( + |res| async move { Ok(deserialize_column(res.column.required().ctx("column")?)?) }, + ) + .try_collect() + .await + } + + async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { + let c = proto::ColumnListByTableIdRequest { + table_id: table_id.get(), + }; + + self.retry( + "column_list_by_table_id", + c, + |data, mut client| async move { client.column_list_by_table_id(data).await }, + ) + .await? + .map_err(convert_status) + .and_then( + |res| async move { Ok(deserialize_column(res.column.required().ctx("column")?)?) }, + ) + .try_collect() + .await + } + + async fn list(&mut self) -> Result> { + let c = proto::ColumnListRequest {}; + + self.retry("column_list", c, |data, mut client| async move { + client.column_list(data).await + }) + .await? + .map_err(convert_status) + .and_then( + |res| async move { Ok(deserialize_column(res.column.required().ctx("column")?)?) }, + ) + .try_collect() + .await + } +} + +#[async_trait] +impl PartitionRepo for GrpcCatalogClientRepos { + async fn create_or_get(&mut self, key: PartitionKey, table_id: TableId) -> Result { + let p = proto::PartitionCreateOrGetRequest { + key: key.inner().to_owned(), + table_id: table_id.get(), + }; + + let resp = self + .retry( + "partition_create_or_get", + p, + |data, mut client| async move { client.partition_create_or_get(data).await }, + ) + .await?; + + Ok(deserialize_partition( + resp.partition.required().ctx("partition")?, + )?) + } + + async fn get_by_id_batch(&mut self, partition_ids: &[PartitionId]) -> Result> { + let p = proto::PartitionGetByIdBatchRequest { + partition_ids: partition_ids.iter().map(|id| id.get()).collect(), + }; + + self.retry( + "partition_get_by_id_batch", + p, + |data, mut client| async move { client.partition_get_by_id_batch(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_partition( + res.partition.required().ctx("partition")?, + )?) + }) + .try_collect() + .await + } + + async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { + let p = proto::PartitionListByTableIdRequest { + table_id: table_id.get(), + }; + + self.retry( + "partition_list_by_table_id", + p, + |data, mut client| async move { client.partition_list_by_table_id(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_partition( + res.partition.required().ctx("partition")?, + )?) + }) + .try_collect() + .await + } + + async fn list_ids(&mut self) -> Result> { + let p = proto::PartitionListIdsRequest {}; + + self.retry("partition_list_ids", p, |data, mut client| async move { + client.partition_list_ids(data).await + }) + .await? + .map_err(convert_status) + .map_ok(|res| PartitionId::new(res.partition_id)) + .try_collect() + .await + } + + async fn cas_sort_key( + &mut self, + partition_id: PartitionId, + old_sort_key_ids: Option<&SortKeyIds>, + new_sort_key_ids: &SortKeyIds, + ) -> Result> { + // This method does not use request/request_streaming_response + // because the error handling (converting to CasFailure) differs + // from how all the other methods handle errors. 
+ + let p = proto::PartitionCasSortKeyRequest { + partition_id: partition_id.get(), + old_sort_key_ids: old_sort_key_ids.map(serialize_sort_key_ids), + new_sort_key_ids: Some(serialize_sort_key_ids(new_sort_key_ids)), + }; + + let res = self + .retry("partition_cas_sort_key", p, |data, mut client| async move { + client.partition_cas_sort_key(data).await + }) + .await + .map_err(CasFailure::QueryError)?; + + let res = res + .res + .required() + .ctx("res") + .map_err(|e| CasFailure::QueryError(e.into()))?; + + match res { + proto::partition_cas_sort_key_response::Res::Partition(p) => { + let p = deserialize_partition(p).map_err(|e| CasFailure::QueryError(e.into()))?; + Ok(p) + } + proto::partition_cas_sort_key_response::Res::CurrentSortKey(k) => { + Err(CasFailure::ValueMismatch(deserialize_sort_key_ids(k))) + } + } + } + + #[allow(clippy::too_many_arguments)] + async fn record_skipped_compaction( + &mut self, + partition_id: PartitionId, + reason: &str, + num_files: usize, + limit_num_files: usize, + limit_num_files_first_in_partition: usize, + estimated_bytes: u64, + limit_bytes: u64, + ) -> Result<()> { + let p = proto::PartitionRecordSkippedCompactionRequest { + partition_id: partition_id.get(), + reason: reason.to_owned(), + num_files: num_files as u64, + limit_num_files: limit_num_files as u64, + limit_num_files_first_in_partition: limit_num_files_first_in_partition as u64, + estimated_bytes, + limit_bytes, + }; + + self.retry( + "partition_record_skipped_compaction", + p, + |data, mut client| async move { client.partition_record_skipped_compaction(data).await }, + ) + .await?; + Ok(()) + } + + async fn get_in_skipped_compactions( + &mut self, + partition_id: &[PartitionId], + ) -> Result> { + let p = proto::PartitionGetInSkippedCompactionsRequest { + partition_ids: partition_id.iter().map(|id| id.get()).collect(), + }; + + self.retry( + "partition_get_in_skipped_compactions", + p, + |data, mut client| async move { client.partition_get_in_skipped_compactions(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_skipped_compaction(res.skipped_compaction.required().ctx("skipped_compaction")?)) + }) + .try_collect() + .await + } + + async fn list_skipped_compactions(&mut self) -> Result> { + let p = proto::PartitionListSkippedCompactionsRequest {}; + + self.retry( + "partition_list_skipped_compactions", + p, + |data, mut client| async move { client.partition_list_skipped_compactions(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_skipped_compaction( + res.skipped_compaction + .required() + .ctx("skipped_compaction")?, + )) + }) + .try_collect() + .await + } + + async fn delete_skipped_compactions( + &mut self, + partition_id: PartitionId, + ) -> Result> { + let p = proto::PartitionDeleteSkippedCompactionsRequest { + partition_id: partition_id.get(), + }; + + let resp = self + .retry( + "partition_delete_skipped_compactions", + p, + |data, mut client| async move { + client.partition_delete_skipped_compactions(data).await + }, + ) + .await?; + + Ok(resp.skipped_compaction.map(deserialize_skipped_compaction)) + } + + async fn most_recent_n(&mut self, n: usize) -> Result> { + let p = proto::PartitionMostRecentNRequest { n: n as u64 }; + + self.retry( + "partition_most_recent_n", + p, + |data, mut client| async move { client.partition_most_recent_n(data).await }, + ) + .await? 
+ .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_partition( + res.partition.required().ctx("partition")?, + )?) + }) + .try_collect() + .await + } + + async fn partitions_new_file_between( + &mut self, + minimum_time: Timestamp, + maximum_time: Option, + ) -> Result> { + let p = proto::PartitionNewFileBetweenRequest { + minimum_time: minimum_time.get(), + maximum_time: maximum_time.map(|ts| ts.get()), + }; + + self.retry( + "partition_new_file_between", + p, + |data, mut client| async move { client.partition_new_file_between(data).await }, + ) + .await? + .map_err(convert_status) + .map_ok(|res| PartitionId::new(res.partition_id)) + .try_collect() + .await + } + + async fn list_old_style(&mut self) -> Result> { + let p = proto::PartitionListOldStyleRequest {}; + + self.retry( + "partition_list_old_style", + p, + |data, mut client| async move { client.partition_list_old_style(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_partition( + res.partition.required().ctx("partition")?, + )?) + }) + .try_collect() + .await + } + + async fn snapshot(&mut self, partition_id: PartitionId) -> Result { + let p = proto::PartitionSnapshotRequest { + partition_id: partition_id.get(), + }; + + let resp = self + .retry("partition_snapshot", p, |data, mut client| async move { + client.partition_snapshot(data).await + }) + .await?; + let partition = resp.partition.required().ctx("partition")?; + Ok(PartitionSnapshot::decode(partition, resp.generation)) + } +} + +#[async_trait] +impl ParquetFileRepo for GrpcCatalogClientRepos { + async fn flag_for_delete_by_retention(&mut self) -> Result> { + let p = proto::ParquetFileFlagForDeleteByRetentionRequest {}; + + self.retry( + "parquet_file_flag_for_delete_by_retention", + p, + |data, mut client| async move { + client.parquet_file_flag_for_delete_by_retention(data).await + }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(( + PartitionId::new(res.partition_id), + deserialize_object_store_id(res.object_store_id.required().ctx("object_store_id")?), + )) + }) + .try_collect() + .await + } + + async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result> { + let p = proto::ParquetFileDeleteOldIdsOnlyRequest { + older_than: older_than.get(), + }; + + self.retry( + "parquet_file_delete_old_ids_only", + p, + |data, mut client| async move { client.parquet_file_delete_old_ids_only(data).await }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_object_store_id( + res.object_store_id.required().ctx("object_store_id")?, + )) + }) + .try_collect() + .await + } + + async fn list_by_partition_not_to_delete_batch( + &mut self, + partition_ids: Vec, + ) -> Result> { + let p = proto::ParquetFileListByPartitionNotToDeleteBatchRequest { + partition_ids: partition_ids.into_iter().map(|p| p.get()).collect(), + }; + + self.retry( + "parquet_file_list_by_partition_not_to_delete_batch", + p, + |data, mut client| async move { + client + .parquet_file_list_by_partition_not_to_delete_batch(data) + .await + }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_parquet_file( + res.parquet_file.required().ctx("parquet_file")?, + )?) 
+ }) + .try_collect() + .await + } + + async fn get_by_object_store_id( + &mut self, + object_store_id: ObjectStoreId, + ) -> Result> { + let p = proto::ParquetFileGetByObjectStoreIdRequest { + object_store_id: Some(serialize_object_store_id(object_store_id)), + }; + + let maybe_file = self.retry( + "parquet_file_get_by_object_store_id", + p, + |data, mut client| async move { client.parquet_file_get_by_object_store_id(data).await }) + .await? + .parquet_file.map(deserialize_parquet_file).transpose()?; + Ok(maybe_file) + } + + async fn exists_by_object_store_id_batch( + &mut self, + object_store_ids: Vec, + ) -> Result> { + let p = futures::stream::iter(object_store_ids.into_iter().map(|id| { + proto::ParquetFileExistsByObjectStoreIdBatchRequest { + object_store_id: Some(serialize_object_store_id(id)), + } + })); + + self.retry( + "parquet_file_exists_by_object_store_id_batch", + p, + |data, mut client: ServiceClient| async move { + client + .parquet_file_exists_by_object_store_id_batch(data) + .await + }, + ) + .await? + .map_err(convert_status) + .and_then(|res| async move { + Ok(deserialize_object_store_id( + res.object_store_id.required().ctx("object_store_id")?, + )) + }) + .try_collect() + .await + } + + async fn create_upgrade_delete( + &mut self, + partition_id: PartitionId, + delete: &[ObjectStoreId], + upgrade: &[ObjectStoreId], + create: &[ParquetFileParams], + target_level: CompactionLevel, + ) -> Result> { + let p = proto::ParquetFileCreateUpgradeDeleteRequest { + partition_id: partition_id.get(), + delete: delete + .iter() + .copied() + .map(serialize_object_store_id) + .collect(), + upgrade: upgrade + .iter() + .copied() + .map(serialize_object_store_id) + .collect(), + create: create.iter().map(serialize_parquet_file_params).collect(), + target_level: target_level as i32, + }; + + let resp = self.retry( + "parquet_file_create_upgrade_delete", + p, + |data, mut client| async move { client.parquet_file_create_upgrade_delete(data).await }, + ) + .await?; + + Ok(resp + .created_parquet_file_ids + .into_iter() + .map(ParquetFileId::new) + .collect()) + } +} diff --git a/iox_catalog/src/grpc/mod.rs b/iox_catalog/src/grpc/mod.rs new file mode 100644 index 00000000000..0374f575b85 --- /dev/null +++ b/iox_catalog/src/grpc/mod.rs @@ -0,0 +1,143 @@ +//! gRPC catalog tunnel. +//! +//! This tunnels catalog requests over gRPC. 
+ +pub mod client; +mod serialization; +pub mod server; + +#[cfg(test)] +mod tests { + use std::{net::SocketAddr, sync::Arc}; + + use data_types::NamespaceName; + use iox_time::SystemProvider; + use metric::{Attributes, Metric, U64Counter}; + use test_helpers::maybe_start_logging; + use tokio::{net::TcpListener, task::JoinSet}; + use tonic::transport::{server::TcpIncoming, Server, Uri}; + + use crate::{interface::Catalog, interface_tests::TestCatalog, mem::MemCatalog}; + + use super::*; + + #[tokio::test] + async fn test_catalog() { + maybe_start_logging(); + + crate::interface_tests::test_catalog(|| async { + let metrics = Arc::new(metric::Registry::default()); + let time_provider = Arc::new(SystemProvider::new()) as _; + let backing_catalog = Arc::new(MemCatalog::new(metrics, Arc::clone(&time_provider))); + let test_server = TestServer::new(backing_catalog).await; + let uri = test_server.uri(); + + // create new metrics for client so that they don't overlap w/ server + let metrics = Arc::new(metric::Registry::default()); + let client = Arc::new(client::GrpcCatalogClient::new( + uri, + metrics, + Arc::clone(&time_provider), + )); + + let test_catalog = TestCatalog::new(client); + test_catalog.hold_onto(test_server); + + Arc::new(test_catalog) as _ + }) + .await; + } + + #[tokio::test] + async fn test_catalog_metrics() { + maybe_start_logging(); + + let time_provider = Arc::new(SystemProvider::new()) as _; + let metrics = Arc::new(metric::Registry::default()); + let backing_catalog = Arc::new(MemCatalog::new(metrics, Arc::clone(&time_provider))); + let test_server = TestServer::new(backing_catalog).await; + let uri = test_server.uri(); + + // create new metrics for client so that they don't overlap w/ server + let metrics = Arc::new(metric::Registry::default()); + let client = Arc::new(client::GrpcCatalogClient::new( + uri, + Arc::clone(&metrics), + Arc::clone(&time_provider), + )); + + let ns = client + .repositories() + .namespaces() + .create(&NamespaceName::new("testns").unwrap(), None, None, None) + .await + .expect("namespace failed to create"); + + let _ = client + .repositories() + .tables() + .list_by_namespace_id(ns.id) + .await + .expect("failed to list namespaces"); + + let metric = metrics + .get_instrument::>("grpc_client_requests") + .expect("failed to get metric"); + + let count = metric + .get_observer(&Attributes::from(&[ + ( + "path", + "/influxdata.iox.catalog.v2.CatalogService/NamespaceCreate", + ), + ("status", "ok"), + ])) + .unwrap() + .fetch(); + + assert_eq!(count, 1); + + let count = metric + .get_observer(&Attributes::from(&[ + ( + "path", + "/influxdata.iox.catalog.v2.CatalogService/TableListByNamespaceId", + ), + ("status", "ok"), + ])) + .unwrap() + .fetch(); + + assert_eq!(count, 1); + } + + struct TestServer { + addr: SocketAddr, + #[allow(dead_code)] + task: JoinSet<()>, + } + + impl TestServer { + async fn new(catalog: Arc) -> Self { + let listener = TcpListener::bind("0.0.0.0:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + let incoming = TcpIncoming::from_listener(listener, true, None).unwrap(); + let mut task = JoinSet::new(); + task.spawn(async move { + Server::builder() + .add_service(server::GrpcCatalogServer::new(catalog).service()) + .serve_with_incoming(incoming) + .await + .unwrap(); + }); + + Self { addr, task } + } + + fn uri(&self) -> Uri { + format!("http://{}:{}", self.addr.ip(), self.addr.port()) + .parse() + .unwrap() + } + } +} diff --git a/iox_catalog/src/grpc/serialization.rs b/iox_catalog/src/grpc/serialization.rs 
new file mode 100644 index 00000000000..2698dc424f9 --- /dev/null +++ b/iox_catalog/src/grpc/serialization.rs @@ -0,0 +1,712 @@ +use data_types::{ + partition_template::NamespacePartitionTemplateOverride, Column, ColumnId, ColumnSet, + ColumnType, Namespace, NamespaceId, ObjectStoreId, ParquetFile, ParquetFileId, + ParquetFileParams, Partition, PartitionId, SkippedCompaction, SortKeyIds, Table, TableId, + Timestamp, +}; +use generated_types::influxdata::iox::catalog::v2 as proto; +use uuid::Uuid; + +use crate::interface::SoftDeletedRows; + +#[derive(Debug)] +pub struct Error { + msg: String, + path: Vec<&'static str>, +} + +impl Error { + fn new(e: E) -> Self + where + E: std::fmt::Display, + { + Self { + msg: e.to_string(), + path: vec![], + } + } + + fn ctx(self, arg: &'static str) -> Self { + let Self { msg, mut path } = self; + path.insert(0, arg); + Self { msg, path } + } +} + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if !self.path.is_empty() { + write!(f, "{}", self.path[0])?; + for p in self.path.iter().skip(1) { + write!(f, ".{}", p)?; + } + write!(f, ": ")?; + } + + write!(f, "{}", self.msg)?; + + Ok(()) + } +} + +impl std::error::Error for Error {} + +impl From for crate::interface::Error { + fn from(e: Error) -> Self { + Self::External { source: e.into() } + } +} + +impl From for tonic::Status { + fn from(e: Error) -> Self { + Self::invalid_argument(e.to_string()) + } +} + +pub(crate) trait ConvertExt { + fn convert(self) -> Result; +} + +impl ConvertExt for T +where + T: TryInto, + T::Error: std::fmt::Display, +{ + fn convert(self) -> Result { + self.try_into().map_err(Error::new) + } +} + +pub(crate) trait ConvertOptExt { + fn convert_opt(self) -> Result; +} + +impl ConvertOptExt> for Option +where + T: TryInto, + T::Error: std::fmt::Display, +{ + fn convert_opt(self) -> Result, Error> { + self.map(|x| x.convert()).transpose() + } +} + +pub(crate) trait RequiredExt { + fn required(self) -> Result; +} + +impl RequiredExt for Option { + fn required(self) -> Result { + self.ok_or_else(|| Error::new("required")) + } +} + +pub(crate) trait ContextExt { + fn ctx(self, path: &'static str) -> Result; +} + +impl ContextExt for Result { + fn ctx(self, path: &'static str) -> Self { + self.map_err(|e| e.ctx(path)) + } +} + +pub(crate) fn catalog_error_to_status(e: crate::interface::Error) -> tonic::Status { + use crate::interface::Error; + + match e { + Error::External { source } => tonic::Status::internal(source.to_string()), + Error::AlreadyExists { descr } => tonic::Status::already_exists(descr), + Error::LimitExceeded { descr } => tonic::Status::resource_exhausted(descr), + Error::NotFound { descr } => tonic::Status::not_found(descr), + } +} + +pub(crate) fn convert_status(status: tonic::Status) -> crate::interface::Error { + use crate::interface::Error; + + match status.code() { + tonic::Code::Internal => Error::External { + source: status.message().to_owned().into(), + }, + tonic::Code::AlreadyExists => Error::AlreadyExists { + descr: status.message().to_owned(), + }, + tonic::Code::ResourceExhausted => Error::LimitExceeded { + descr: status.message().to_owned(), + }, + tonic::Code::NotFound => Error::NotFound { + descr: status.message().to_owned(), + }, + _ => Error::External { + source: Box::new(status), + }, + } +} + +pub(crate) fn serialize_soft_deleted_rows(sdr: SoftDeletedRows) -> i32 { + let sdr = match sdr { + SoftDeletedRows::AllRows => proto::SoftDeletedRows::AllRows, + SoftDeletedRows::ExcludeDeleted => 
proto::SoftDeletedRows::ExcludeDeleted, + SoftDeletedRows::OnlyDeleted => proto::SoftDeletedRows::OnlyDeleted, + }; + + sdr.into() +} + +pub(crate) fn deserialize_soft_deleted_rows(sdr: i32) -> Result { + let sdr: proto::SoftDeletedRows = sdr.convert().ctx("soft deleted rows")?; + let sdr = match sdr { + proto::SoftDeletedRows::Unspecified => { + return Err(Error::new("unspecified soft deleted rows")); + } + proto::SoftDeletedRows::AllRows => SoftDeletedRows::AllRows, + proto::SoftDeletedRows::ExcludeDeleted => SoftDeletedRows::ExcludeDeleted, + proto::SoftDeletedRows::OnlyDeleted => SoftDeletedRows::OnlyDeleted, + }; + Ok(sdr) +} + +pub(crate) fn serialize_namespace(ns: Namespace) -> proto::Namespace { + proto::Namespace { + id: ns.id.get(), + name: ns.name, + retention_period_ns: ns.retention_period_ns, + max_tables: ns.max_tables.get_i32(), + max_columns_per_table: ns.max_columns_per_table.get_i32(), + deleted_at: ns.deleted_at.map(|ts| ts.get()), + partition_template: ns.partition_template.as_proto().cloned(), + } +} + +pub(crate) fn deserialize_namespace(ns: proto::Namespace) -> Result { + Ok(Namespace { + id: NamespaceId::new(ns.id), + name: ns.name, + retention_period_ns: ns.retention_period_ns, + max_tables: ns.max_tables.convert().ctx("max_tables")?, + max_columns_per_table: ns + .max_columns_per_table + .convert() + .ctx("max_columns_per_table")?, + deleted_at: ns.deleted_at.map(Timestamp::new), + partition_template: ns + .partition_template + .convert_opt() + .ctx("partition_template")? + .unwrap_or_else(NamespacePartitionTemplateOverride::const_default), + }) +} + +pub(crate) fn serialize_table(t: Table) -> proto::Table { + proto::Table { + id: t.id.get(), + namespace_id: t.namespace_id.get(), + name: t.name, + partition_template: t.partition_template.as_proto().cloned(), + } +} + +pub(crate) fn deserialize_table(t: proto::Table) -> Result { + Ok(Table { + id: TableId::new(t.id), + namespace_id: NamespaceId::new(t.namespace_id), + name: t.name, + partition_template: t.partition_template.convert().ctx("partition_template")?, + }) +} + +pub(crate) fn serialize_column_type(t: ColumnType) -> i32 { + use generated_types::influxdata::iox::column_type::v1 as proto; + proto::ColumnType::from(t).into() +} + +pub(crate) fn deserialize_column_type(t: i32) -> Result { + use generated_types::influxdata::iox::column_type::v1 as proto; + let t: proto::ColumnType = t.convert()?; + t.convert() +} + +pub(crate) fn serialize_column(column: Column) -> proto::Column { + proto::Column { + id: column.id.get(), + table_id: column.table_id.get(), + name: column.name, + column_type: serialize_column_type(column.column_type), + } +} + +pub(crate) fn deserialize_column(column: proto::Column) -> Result { + Ok(Column { + id: ColumnId::new(column.id), + table_id: TableId::new(column.table_id), + name: column.name, + column_type: deserialize_column_type(column.column_type)?, + }) +} + +pub(crate) fn serialize_sort_key_ids(sort_key_ids: &SortKeyIds) -> proto::SortKeyIds { + proto::SortKeyIds { + column_ids: sort_key_ids.iter().map(|c_id| c_id.get()).collect(), + } +} + +pub(crate) fn deserialize_sort_key_ids(sort_key_ids: proto::SortKeyIds) -> SortKeyIds { + SortKeyIds::new(sort_key_ids.column_ids.into_iter().map(ColumnId::new)) +} + +pub(crate) fn serialize_partition(partition: Partition) -> proto::Partition { + let empty_sk = SortKeyIds::new(std::iter::empty()); + + proto::Partition { + id: partition.id.get(), + hash_id: partition + .hash_id() + .map(|id| id.as_bytes().to_vec()) + .unwrap_or_default(), + 
partition_key: partition.partition_key.inner().to_owned(), + table_id: partition.table_id.get(), + sort_key_ids: Some(serialize_sort_key_ids( + partition.sort_key_ids().unwrap_or(&empty_sk), + )), + new_file_at: partition.new_file_at.map(|ts| ts.get()), + } +} + +pub(crate) fn deserialize_partition(partition: proto::Partition) -> Result { + Ok(Partition::new_catalog_only( + PartitionId::new(partition.id), + (!partition.hash_id.is_empty()) + .then_some(partition.hash_id.as_slice()) + .convert_opt() + .ctx("hash_id")?, + TableId::new(partition.table_id), + partition.partition_key.into(), + deserialize_sort_key_ids(partition.sort_key_ids.required().ctx("sort_key_ids")?), + partition.new_file_at.map(Timestamp::new), + )) +} + +pub(crate) fn serialize_skipped_compaction(sc: SkippedCompaction) -> proto::SkippedCompaction { + proto::SkippedCompaction { + partition_id: sc.partition_id.get(), + reason: sc.reason, + skipped_at: sc.skipped_at.get(), + estimated_bytes: sc.estimated_bytes, + limit_bytes: sc.limit_bytes, + num_files: sc.num_files, + limit_num_files: sc.limit_num_files, + limit_num_files_first_in_partition: sc.limit_num_files_first_in_partition, + } +} + +pub(crate) fn deserialize_skipped_compaction(sc: proto::SkippedCompaction) -> SkippedCompaction { + SkippedCompaction { + partition_id: PartitionId::new(sc.partition_id), + reason: sc.reason, + skipped_at: Timestamp::new(sc.skipped_at), + estimated_bytes: sc.estimated_bytes, + limit_bytes: sc.limit_bytes, + num_files: sc.num_files, + limit_num_files: sc.limit_num_files, + limit_num_files_first_in_partition: sc.limit_num_files_first_in_partition, + } +} + +pub(crate) fn serialize_object_store_id(id: ObjectStoreId) -> proto::ObjectStoreId { + let (high64, low64) = id.get_uuid().as_u64_pair(); + proto::ObjectStoreId { high64, low64 } +} + +pub(crate) fn deserialize_object_store_id(id: proto::ObjectStoreId) -> ObjectStoreId { + ObjectStoreId::from_uuid(Uuid::from_u64_pair(id.high64, id.low64)) +} + +pub(crate) fn serialize_column_set(set: &ColumnSet) -> proto::ColumnSet { + proto::ColumnSet { + column_ids: set.iter().map(|id| id.get()).collect(), + } +} + +pub(crate) fn deserialize_column_set(set: proto::ColumnSet) -> ColumnSet { + ColumnSet::new(set.column_ids.into_iter().map(ColumnId::new)) +} + +pub(crate) fn serialize_parquet_file_params( + params: &ParquetFileParams, +) -> proto::ParquetFileParams { + proto::ParquetFileParams { + namespace_id: params.namespace_id.get(), + table_id: params.table_id.get(), + partition_id: params.partition_id.get(), + partition_hash_id: params + .partition_hash_id + .as_ref() + .map(|id| id.as_bytes().to_vec()), + object_store_id: Some(serialize_object_store_id(params.object_store_id)), + min_time: params.min_time.get(), + max_time: params.max_time.get(), + file_size_bytes: params.file_size_bytes, + row_count: params.row_count, + compaction_level: params.compaction_level as i32, + created_at: params.created_at.get(), + column_set: Some(serialize_column_set(¶ms.column_set)), + max_l0_created_at: params.max_l0_created_at.get(), + } +} + +pub(crate) fn deserialize_parquet_file_params( + params: proto::ParquetFileParams, +) -> Result { + Ok(ParquetFileParams { + namespace_id: NamespaceId::new(params.namespace_id), + table_id: TableId::new(params.table_id), + partition_id: PartitionId::new(params.partition_id), + partition_hash_id: params + .partition_hash_id + .as_deref() + .convert_opt() + .ctx("partition_hash_id")?, + object_store_id: deserialize_object_store_id( + 
params.object_store_id.required().ctx("object_store_id")?, + ), + min_time: Timestamp::new(params.min_time), + max_time: Timestamp::new(params.max_time), + file_size_bytes: params.file_size_bytes, + row_count: params.row_count, + compaction_level: params.compaction_level.convert().ctx("compaction_level")?, + created_at: Timestamp::new(params.created_at), + column_set: deserialize_column_set(params.column_set.required().ctx("column_set")?), + max_l0_created_at: Timestamp::new(params.max_l0_created_at), + }) +} + +pub(crate) fn serialize_parquet_file(file: ParquetFile) -> proto::ParquetFile { + let partition_hash_id = file + .partition_hash_id + .map(|x| x.as_bytes().to_vec()) + .unwrap_or_default(); + + proto::ParquetFile { + id: file.id.get(), + namespace_id: file.namespace_id.get(), + table_id: file.table_id.get(), + partition_id: file.partition_id.get(), + partition_hash_id, + object_store_id: Some(serialize_object_store_id(file.object_store_id)), + min_time: file.min_time.get(), + max_time: file.max_time.get(), + to_delete: file.to_delete.map(|ts| ts.get()), + file_size_bytes: file.file_size_bytes, + row_count: file.row_count, + compaction_level: file.compaction_level as i32, + created_at: file.created_at.get(), + column_set: Some(serialize_column_set(&file.column_set)), + max_l0_created_at: file.max_l0_created_at.get(), + } +} + +pub(crate) fn deserialize_parquet_file(file: proto::ParquetFile) -> Result { + let partition_hash_id = match file.partition_hash_id.as_slice() { + b"" => None, + s => Some(s.convert().ctx("partition_hash_id")?), + }; + + Ok(ParquetFile { + id: ParquetFileId::new(file.id), + namespace_id: NamespaceId::new(file.namespace_id), + table_id: TableId::new(file.table_id), + partition_id: PartitionId::new(file.partition_id), + partition_hash_id, + object_store_id: deserialize_object_store_id( + file.object_store_id.required().ctx("object_store_id")?, + ), + min_time: Timestamp::new(file.min_time), + max_time: Timestamp::new(file.max_time), + to_delete: file.to_delete.map(Timestamp::new), + file_size_bytes: file.file_size_bytes, + row_count: file.row_count, + compaction_level: file.compaction_level.convert().ctx("compaction_level")?, + created_at: Timestamp::new(file.created_at), + column_set: deserialize_column_set(file.column_set.required().ctx("column_set")?), + max_l0_created_at: Timestamp::new(file.max_l0_created_at), + }) +} + +#[cfg(test)] +mod tests { + use data_types::{ + partition_template::TablePartitionTemplateOverride, CompactionLevel, PartitionHashId, + PartitionKey, + }; + + use super::*; + + #[test] + fn test_column_type_roundtrip() { + assert_column_type_roundtrip(ColumnType::Bool); + assert_column_type_roundtrip(ColumnType::I64); + assert_column_type_roundtrip(ColumnType::U64); + assert_column_type_roundtrip(ColumnType::F64); + assert_column_type_roundtrip(ColumnType::String); + assert_column_type_roundtrip(ColumnType::Tag); + assert_column_type_roundtrip(ColumnType::Time); + } + + #[track_caller] + fn assert_column_type_roundtrip(t: ColumnType) { + let protobuf = serialize_column_type(t); + let t2 = deserialize_column_type(protobuf).unwrap(); + assert_eq!(t, t2); + } + + #[test] + fn test_error_roundtrip() { + use crate::interface::Error; + + assert_error_roundtrip(Error::AlreadyExists { + descr: "foo".to_owned(), + }); + assert_error_roundtrip(Error::External { + source: "foo".to_owned().into(), + }); + assert_error_roundtrip(Error::LimitExceeded { + descr: "foo".to_owned(), + }); + assert_error_roundtrip(Error::NotFound { + descr: "foo".to_owned(), 
+ }); + } + + #[track_caller] + fn assert_error_roundtrip(e: crate::interface::Error) { + let msg_orig = e.to_string(); + + let status = catalog_error_to_status(e); + let e = convert_status(status); + let msg = e.to_string(); + assert_eq!(msg, msg_orig); + } + + #[test] + fn test_soft_deleted_rows_roundtrip() { + assert_soft_deleted_rows_roundtrip(SoftDeletedRows::AllRows); + assert_soft_deleted_rows_roundtrip(SoftDeletedRows::ExcludeDeleted); + assert_soft_deleted_rows_roundtrip(SoftDeletedRows::OnlyDeleted); + } + + #[track_caller] + fn assert_soft_deleted_rows_roundtrip(sdr: SoftDeletedRows) { + let protobuf = serialize_soft_deleted_rows(sdr); + let sdr2 = deserialize_soft_deleted_rows(protobuf).unwrap(); + assert_eq!(sdr, sdr2); + } + + #[test] + fn test_namespace_roundtrip() { + use generated_types::influxdata::iox::partition_template::v1 as proto; + + let ns = Namespace { + id: NamespaceId::new(1), + name: "ns".to_owned(), + retention_period_ns: Some(2), + max_tables: 3.try_into().unwrap(), + max_columns_per_table: 4.try_into().unwrap(), + deleted_at: Some(Timestamp::new(5)), + partition_template: NamespacePartitionTemplateOverride::try_from( + proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())), + }], + }, + ) + .unwrap(), + }; + let protobuf = serialize_namespace(ns.clone()); + let ns2 = deserialize_namespace(protobuf).unwrap(); + assert_eq!(ns, ns2); + } + + #[test] + fn test_table_roundtrip() { + use generated_types::influxdata::iox::partition_template::v1 as proto; + + let table = Table { + id: TableId::new(1), + namespace_id: NamespaceId::new(2), + name: "table".to_owned(), + partition_template: TablePartitionTemplateOverride::try_new( + Some(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())), + }], + }), + &NamespacePartitionTemplateOverride::const_default(), + ) + .unwrap(), + }; + let protobuf = serialize_table(table.clone()); + let table2 = deserialize_table(protobuf).unwrap(); + assert_eq!(table, table2); + } + + #[test] + fn test_column_roundtrip() { + let column = Column { + id: ColumnId::new(1), + table_id: TableId::new(2), + name: "col".to_owned(), + column_type: ColumnType::F64, + }; + let protobuf = serialize_column(column.clone()); + let column2 = deserialize_column(protobuf).unwrap(); + assert_eq!(column, column2); + } + + #[test] + fn test_sort_key_ids_roundtrip() { + assert_sort_key_ids_roundtrip(SortKeyIds::new(std::iter::empty())); + assert_sort_key_ids_roundtrip(SortKeyIds::new([ColumnId::new(1)])); + assert_sort_key_ids_roundtrip(SortKeyIds::new([ + ColumnId::new(1), + ColumnId::new(5), + ColumnId::new(20), + ])); + } + + #[track_caller] + fn assert_sort_key_ids_roundtrip(sort_key_ids: SortKeyIds) { + let protobuf = serialize_sort_key_ids(&sort_key_ids); + let sort_key_ids2 = deserialize_sort_key_ids(protobuf); + assert_eq!(sort_key_ids, sort_key_ids2); + } + + #[test] + fn test_partition_roundtrip() { + let table_id = TableId::new(1); + let partition_key = PartitionKey::from("key"); + let hash_id = PartitionHashId::new(table_id, &partition_key); + + assert_partition_roundtrip(Partition::new_catalog_only( + PartitionId::new(2), + Some(hash_id.clone()), + table_id, + partition_key.clone(), + SortKeyIds::new([ColumnId::new(3), ColumnId::new(4)]), + Some(Timestamp::new(5)), + )); + assert_partition_roundtrip(Partition::new_catalog_only( + PartitionId::new(2), + Some(hash_id), + table_id, + 
partition_key, + SortKeyIds::new(std::iter::empty()), + Some(Timestamp::new(5)), + )); + } + + #[track_caller] + fn assert_partition_roundtrip(partition: Partition) { + let protobuf = serialize_partition(partition.clone()); + let partition2 = deserialize_partition(protobuf).unwrap(); + assert_eq!(partition, partition2); + } + + #[test] + fn test_skipped_compaction_roundtrip() { + let sc = SkippedCompaction { + partition_id: PartitionId::new(1), + reason: "foo".to_owned(), + skipped_at: Timestamp::new(2), + estimated_bytes: 3, + limit_bytes: 4, + num_files: 5, + limit_num_files: 6, + limit_num_files_first_in_partition: 7, + }; + let protobuf = serialize_skipped_compaction(sc.clone()); + let sc2 = deserialize_skipped_compaction(protobuf); + assert_eq!(sc, sc2); + } + + #[test] + fn test_object_store_id_roundtrip() { + assert_object_store_id_roundtrip(ObjectStoreId::from_uuid(Uuid::nil())); + assert_object_store_id_roundtrip(ObjectStoreId::from_uuid(Uuid::from_u128(0))); + assert_object_store_id_roundtrip(ObjectStoreId::from_uuid(Uuid::from_u128(u128::MAX))); + assert_object_store_id_roundtrip(ObjectStoreId::from_uuid(Uuid::from_u128(1))); + assert_object_store_id_roundtrip(ObjectStoreId::from_uuid(Uuid::from_u128(u128::MAX - 1))); + } + + #[track_caller] + fn assert_object_store_id_roundtrip(id: ObjectStoreId) { + let protobuf = serialize_object_store_id(id); + let id2 = deserialize_object_store_id(protobuf); + assert_eq!(id, id2); + } + + #[test] + fn test_column_set_roundtrip() { + assert_column_set_roundtrip(ColumnSet::new([])); + assert_column_set_roundtrip(ColumnSet::new([ColumnId::new(1)])); + assert_column_set_roundtrip(ColumnSet::new([ColumnId::new(1), ColumnId::new(10)])); + assert_column_set_roundtrip(ColumnSet::new([ + ColumnId::new(3), + ColumnId::new(4), + ColumnId::new(10), + ])); + } + + #[track_caller] + fn assert_column_set_roundtrip(set: ColumnSet) { + let protobuf = serialize_column_set(&set); + let set2 = deserialize_column_set(protobuf); + assert_eq!(set, set2); + } + + #[test] + fn test_parquet_file_params_roundtrip() { + let params = ParquetFileParams { + namespace_id: NamespaceId::new(1), + table_id: TableId::new(2), + partition_id: PartitionId::new(3), + partition_hash_id: Some(PartitionHashId::arbitrary_for_testing()), + object_store_id: ObjectStoreId::from_uuid(Uuid::from_u128(1337)), + min_time: Timestamp::new(4), + max_time: Timestamp::new(5), + file_size_bytes: 6, + row_count: 7, + compaction_level: CompactionLevel::Final, + created_at: Timestamp::new(8), + column_set: ColumnSet::new([ColumnId::new(9), ColumnId::new(10)]), + max_l0_created_at: Timestamp::new(11), + }; + let protobuf = serialize_parquet_file_params(¶ms); + let params2 = deserialize_parquet_file_params(protobuf).unwrap(); + assert_eq!(params, params2); + } + + #[test] + fn test_parquet_file_roundtrip() { + let file = ParquetFile { + id: ParquetFileId::new(12), + namespace_id: NamespaceId::new(1), + table_id: TableId::new(2), + partition_id: PartitionId::new(3), + partition_hash_id: Some(PartitionHashId::arbitrary_for_testing()), + object_store_id: ObjectStoreId::from_uuid(Uuid::from_u128(1337)), + min_time: Timestamp::new(4), + max_time: Timestamp::new(5), + to_delete: Some(Timestamp::new(13)), + file_size_bytes: 6, + row_count: 7, + compaction_level: CompactionLevel::Final, + created_at: Timestamp::new(8), + column_set: ColumnSet::new([ColumnId::new(9), ColumnId::new(10)]), + max_l0_created_at: Timestamp::new(11), + }; + let protobuf = serialize_parquet_file(file.clone()); + let file2 = 
deserialize_parquet_file(protobuf).unwrap(); + assert_eq!(file, file2); + } +} diff --git a/iox_catalog/src/grpc/server.rs b/iox_catalog/src/grpc/server.rs new file mode 100644 index 00000000000..2105457f470 --- /dev/null +++ b/iox_catalog/src/grpc/server.rs @@ -0,0 +1,1032 @@ +//! gRPC server implementation. + +use std::{pin::Pin, sync::Arc}; + +use crate::{ + grpc::serialization::{ + catalog_error_to_status, deserialize_column_type, deserialize_object_store_id, + deserialize_parquet_file_params, deserialize_soft_deleted_rows, deserialize_sort_key_ids, + serialize_column, serialize_namespace, serialize_object_store_id, serialize_parquet_file, + serialize_partition, serialize_skipped_compaction, serialize_sort_key_ids, serialize_table, + ContextExt, ConvertExt, ConvertOptExt, RequiredExt, + }, + interface::{CasFailure, Catalog}, +}; +use async_trait::async_trait; +use data_types::{ + NamespaceId, NamespaceServiceProtectionLimitsOverride, PartitionId, PartitionKey, TableId, + Timestamp, +}; +use futures::{Stream, StreamExt, TryStreamExt}; +use generated_types::influxdata::iox::catalog::v2 as proto; +use generated_types::influxdata::iox::catalog::v2::{TableSnapshotRequest, TableSnapshotResponse}; +use tonic::{Request, Response, Status}; + +type TonicStream = Pin> + Send + 'static>>; + +/// gRPC server. +#[derive(Debug)] +pub struct GrpcCatalogServer { + catalog: Arc, +} + +impl GrpcCatalogServer { + /// Create a new [`GrpcCatalogServer`]. + pub fn new(catalog: Arc) -> Self { + Self { catalog } + } + + /// Get service for integration w/ tonic. + pub fn service(&self) -> proto::catalog_service_server::CatalogServiceServer { + let this = Self { + catalog: Arc::clone(&self.catalog), + }; + proto::catalog_service_server::CatalogServiceServer::new(this) + } +} + +#[async_trait] +impl proto::catalog_service_server::CatalogService for GrpcCatalogServer { + type NamespaceListStream = TonicStream; + + type TableListByNamespaceIdStream = TonicStream; + type TableListStream = TonicStream; + + type ColumnCreateOrGetManyUncheckedStream = + TonicStream; + type ColumnListByNamespaceIdStream = TonicStream; + type ColumnListByTableIdStream = TonicStream; + type ColumnListStream = TonicStream; + + type PartitionGetByIdBatchStream = TonicStream; + type PartitionListByTableIdStream = TonicStream; + type PartitionListIdsStream = TonicStream; + type PartitionGetInSkippedCompactionsStream = + TonicStream; + type PartitionListSkippedCompactionsStream = + TonicStream; + type PartitionMostRecentNStream = TonicStream; + type PartitionNewFileBetweenStream = TonicStream; + type PartitionListOldStyleStream = TonicStream; + + type ParquetFileFlagForDeleteByRetentionStream = + TonicStream; + type ParquetFileDeleteOldIdsOnlyStream = + TonicStream; + type ParquetFileListByPartitionNotToDeleteBatchStream = + TonicStream; + type ParquetFileExistsByObjectStoreIdBatchStream = + TonicStream; + + async fn namespace_create( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let ns = self + .catalog + .repositories() + .namespaces() + .create( + &req.name.convert().ctx("name")?, + req.partition_template + .convert_opt() + .ctx("partition_template")?, + req.retention_period_ns, + req.service_protection_limits + .map(|l| { + let l = NamespaceServiceProtectionLimitsOverride { + max_tables: l.max_tables.convert_opt().ctx("max_tables")?, + max_columns_per_table: l + .max_columns_per_table + .convert_opt() + .ctx("max_columns_per_table")?, + }; + Ok(l) as Result<_, tonic::Status> + }) + 
.transpose()?, + ) + .await + .map_err(catalog_error_to_status)?; + + let ns = serialize_namespace(ns); + + Ok(Response::new(proto::NamespaceCreateResponse { + namespace: Some(ns), + })) + } + + async fn namespace_update_retention_period( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let ns = self + .catalog + .repositories() + .namespaces() + .update_retention_period(&req.name, req.retention_period_ns) + .await + .map_err(catalog_error_to_status)?; + + let ns = serialize_namespace(ns); + + Ok(Response::new( + proto::NamespaceUpdateRetentionPeriodResponse { + namespace: Some(ns), + }, + )) + } + + async fn namespace_list( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let deleted = deserialize_soft_deleted_rows(req.deleted)?; + + let ns_list = self + .catalog + .repositories() + .namespaces() + .list(deleted) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(ns_list.into_iter().map(|ns| { + let ns = serialize_namespace(ns); + + Ok(proto::NamespaceListResponse { + namespace: Some(ns), + }) + })) + .boxed(), + )) + } + + async fn namespace_get_by_id( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let deleted = deserialize_soft_deleted_rows(req.deleted)?; + + let maybe_ns = self + .catalog + .repositories() + .namespaces() + .get_by_id(NamespaceId::new(req.id), deleted) + .await + .map_err(catalog_error_to_status)?; + + let maybe_ns = maybe_ns.map(serialize_namespace); + + Ok(Response::new(proto::NamespaceGetByIdResponse { + namespace: maybe_ns, + })) + } + + async fn namespace_get_by_name( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let deleted = deserialize_soft_deleted_rows(req.deleted)?; + + let maybe_ns = self + .catalog + .repositories() + .namespaces() + .get_by_name(&req.name, deleted) + .await + .map_err(catalog_error_to_status)?; + + let maybe_ns = maybe_ns.map(serialize_namespace); + + Ok(Response::new(proto::NamespaceGetByNameResponse { + namespace: maybe_ns, + })) + } + + async fn namespace_soft_delete( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + self.catalog + .repositories() + .namespaces() + .soft_delete(&req.name) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new(proto::NamespaceSoftDeleteResponse {})) + } + + async fn namespace_update_table_limit( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let ns = self + .catalog + .repositories() + .namespaces() + .update_table_limit(&req.name, req.new_max.convert().ctx("new_max")?) + .await + .map_err(catalog_error_to_status)?; + + let ns = serialize_namespace(ns); + + Ok(Response::new(proto::NamespaceUpdateTableLimitResponse { + namespace: Some(ns), + })) + } + + async fn namespace_update_column_limit( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let ns = self + .catalog + .repositories() + .namespaces() + .update_column_limit(&req.name, req.new_max.convert().ctx("new_max")?) 
+ .await + .map_err(catalog_error_to_status)?; + + let ns = serialize_namespace(ns); + + Ok(Response::new(proto::NamespaceUpdateColumnLimitResponse { + namespace: Some(ns), + })) + } + + async fn table_create( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let table = self + .catalog + .repositories() + .tables() + .create( + &req.name, + req.partition_template.convert().ctx("partition_template")?, + NamespaceId::new(req.namespace_id), + ) + .await + .map_err(catalog_error_to_status)?; + + let table = serialize_table(table); + + Ok(Response::new(proto::TableCreateResponse { + table: Some(table), + })) + } + + async fn table_get_by_id( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let maybe_table = self + .catalog + .repositories() + .tables() + .get_by_id(TableId::new(req.id)) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new(proto::TableGetByIdResponse { + table: maybe_table.map(serialize_table), + })) + } + + async fn table_get_by_namespace_and_name( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let maybe_table = self + .catalog + .repositories() + .tables() + .get_by_namespace_and_name(NamespaceId::new(req.namespace_id), &req.name) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new(proto::TableGetByNamespaceAndNameResponse { + table: maybe_table.map(serialize_table), + })) + } + + async fn table_list_by_namespace_id( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let table_list = self + .catalog + .repositories() + .tables() + .list_by_namespace_id(NamespaceId::new(req.namespace_id)) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(table_list.into_iter().map(|table| { + let table = serialize_table(table); + Ok(proto::TableListByNamespaceIdResponse { table: Some(table) }) + })) + .boxed(), + )) + } + + async fn table_list( + &self, + _request: Request, + ) -> Result, tonic::Status> { + let table_list = self + .catalog + .repositories() + .tables() + .list() + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(table_list.into_iter().map(|table| { + let table = serialize_table(table); + Ok(proto::TableListResponse { table: Some(table) }) + })) + .boxed(), + )) + } + + async fn table_snapshot( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let snapshot = self + .catalog + .repositories() + .tables() + .snapshot(TableId::new(req.table_id)) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new(TableSnapshotResponse { + generation: snapshot.generation(), + table: Some(snapshot.into()), + })) + } + + async fn column_create_or_get( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let column_type = deserialize_column_type(req.column_type)?; + + let column = self + .catalog + .repositories() + .columns() + .create_or_get(&req.name, TableId::new(req.table_id), column_type) + .await + .map_err(catalog_error_to_status)?; + + let column = serialize_column(column); + + Ok(Response::new(proto::ColumnCreateOrGetResponse { + column: Some(column), + })) + } + + async fn column_create_or_get_many_unchecked( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let columns = req + .columns + .iter() + .map(|(name, t)| { + let t = 
deserialize_column_type(*t)?; + Ok((name.as_str(), t)) + }) + .collect::>()?; + + let column_list = self + .catalog + .repositories() + .columns() + .create_or_get_many_unchecked(TableId::new(req.table_id), columns) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(column_list.into_iter().map(|column| { + let column = serialize_column(column); + Ok(proto::ColumnCreateOrGetManyUncheckedResponse { + column: Some(column), + }) + })) + .boxed(), + )) + } + + async fn column_list_by_namespace_id( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let column_list = self + .catalog + .repositories() + .columns() + .list_by_namespace_id(NamespaceId::new(req.namespace_id)) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(column_list.into_iter().map(|column| { + let column = serialize_column(column); + Ok(proto::ColumnListByNamespaceIdResponse { + column: Some(column), + }) + })) + .boxed(), + )) + } + + async fn column_list_by_table_id( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let column_list = self + .catalog + .repositories() + .columns() + .list_by_table_id(TableId::new(req.table_id)) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(column_list.into_iter().map(|column| { + let column = serialize_column(column); + Ok(proto::ColumnListByTableIdResponse { + column: Some(column), + }) + })) + .boxed(), + )) + } + + async fn column_list( + &self, + _request: Request, + ) -> Result, tonic::Status> { + let column_list = self + .catalog + .repositories() + .columns() + .list() + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(column_list.into_iter().map(|column| { + let column = serialize_column(column); + Ok(proto::ColumnListResponse { + column: Some(column), + }) + })) + .boxed(), + )) + } + + async fn partition_create_or_get( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let partition = self + .catalog + .repositories() + .partitions() + .create_or_get(PartitionKey::from(req.key), TableId::new(req.table_id)) + .await + .map_err(catalog_error_to_status)?; + + let partition = serialize_partition(partition); + + Ok(Response::new(proto::PartitionCreateOrGetResponse { + partition: Some(partition), + })) + } + + async fn partition_get_by_id_batch( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let partition_ids = req + .partition_ids + .into_iter() + .map(PartitionId::new) + .collect::>(); + + let partition_list = self + .catalog + .repositories() + .partitions() + .get_by_id_batch(&partition_ids) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(partition_list.into_iter().map(|partition| { + let partition = serialize_partition(partition); + Ok(proto::PartitionGetByIdBatchResponse { + partition: Some(partition), + }) + })) + .boxed(), + )) + } + + async fn partition_list_by_table_id( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let partition_list = self + .catalog + .repositories() + .partitions() + .list_by_table_id(TableId::new(req.table_id)) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(partition_list.into_iter().map(|partition| { + let partition = serialize_partition(partition); + 
Ok(proto::PartitionListByTableIdResponse { + partition: Some(partition), + }) + })) + .boxed(), + )) + } + + async fn partition_list_ids( + &self, + _request: Request, + ) -> Result, tonic::Status> { + let id_list = self + .catalog + .repositories() + .partitions() + .list_ids() + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(id_list.into_iter().map(|id| { + Ok(proto::PartitionListIdsResponse { + partition_id: id.get(), + }) + })) + .boxed(), + )) + } + + async fn partition_cas_sort_key( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let res = self + .catalog + .repositories() + .partitions() + .cas_sort_key( + PartitionId::new(req.partition_id), + req.old_sort_key_ids.map(deserialize_sort_key_ids).as_ref(), + &deserialize_sort_key_ids(req.new_sort_key_ids.required().ctx("new_sort_key_ids")?), + ) + .await; + + match res { + Ok(partition) => Ok(Response::new(proto::PartitionCasSortKeyResponse { + res: Some(proto::partition_cas_sort_key_response::Res::Partition( + serialize_partition(partition), + )), + })), + Err(CasFailure::ValueMismatch(sort_key_ids)) => { + Ok(Response::new(proto::PartitionCasSortKeyResponse { + res: Some(proto::partition_cas_sort_key_response::Res::CurrentSortKey( + serialize_sort_key_ids(&sort_key_ids), + )), + })) + } + Err(CasFailure::QueryError(e)) => Err(catalog_error_to_status(e)), + } + } + + async fn partition_record_skipped_compaction( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + self.catalog + .repositories() + .partitions() + .record_skipped_compaction( + PartitionId::new(req.partition_id), + &req.reason, + req.num_files as usize, + req.limit_num_files as usize, + req.limit_num_files_first_in_partition as usize, + req.estimated_bytes, + req.limit_bytes, + ) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + proto::PartitionRecordSkippedCompactionResponse {}, + )) + } + + async fn partition_get_in_skipped_compactions( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let partition_ids = req + .partition_ids + .into_iter() + .map(PartitionId::new) + .collect::>(); + + let skipped_compaction_list = self + .catalog + .repositories() + .partitions() + .get_in_skipped_compactions(&partition_ids) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(skipped_compaction_list.into_iter().map(|sc| { + let sc = serialize_skipped_compaction(sc); + Ok(proto::PartitionGetInSkippedCompactionsResponse { + skipped_compaction: Some(sc), + }) + })) + .boxed(), + )) + } + + async fn partition_list_skipped_compactions( + &self, + _request: Request, + ) -> Result, tonic::Status> { + let skipped_compaction_list = self + .catalog + .repositories() + .partitions() + .list_skipped_compactions() + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(skipped_compaction_list.into_iter().map(|sc| { + let sc = serialize_skipped_compaction(sc); + Ok(proto::PartitionListSkippedCompactionsResponse { + skipped_compaction: Some(sc), + }) + })) + .boxed(), + )) + } + + async fn partition_delete_skipped_compactions( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let maybe_skipped_compaction = self + .catalog + .repositories() + .partitions() + .delete_skipped_compactions(PartitionId::new(req.partition_id)) + .await + .map_err(catalog_error_to_status)?; + + 
let maybe_skipped_compaction = maybe_skipped_compaction.map(serialize_skipped_compaction); + + Ok(Response::new( + proto::PartitionDeleteSkippedCompactionsResponse { + skipped_compaction: maybe_skipped_compaction, + }, + )) + } + + async fn partition_most_recent_n( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let partition_list = self + .catalog + .repositories() + .partitions() + .most_recent_n(req.n as usize) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(partition_list.into_iter().map(|partition| { + let partition = serialize_partition(partition); + Ok(proto::PartitionMostRecentNResponse { + partition: Some(partition), + }) + })) + .boxed(), + )) + } + + async fn partition_new_file_between( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let id_list = self + .catalog + .repositories() + .partitions() + .partitions_new_file_between( + Timestamp::new(req.minimum_time), + req.maximum_time.map(Timestamp::new), + ) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(id_list.into_iter().map(|id| { + Ok(proto::PartitionNewFileBetweenResponse { + partition_id: id.get(), + }) + })) + .boxed(), + )) + } + + async fn partition_list_old_style( + &self, + _request: Request, + ) -> Result, tonic::Status> { + let partition_list = self + .catalog + .repositories() + .partitions() + .list_old_style() + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(partition_list.into_iter().map(|partition| { + let partition = serialize_partition(partition); + Ok(proto::PartitionListOldStyleResponse { + partition: Some(partition), + }) + })) + .boxed(), + )) + } + + async fn partition_snapshot( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let snapshot = self + .catalog + .repositories() + .partitions() + .snapshot(PartitionId::new(req.partition_id)) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new(proto::PartitionSnapshotResponse { + generation: snapshot.generation(), + partition: Some(snapshot.into()), + })) + } + + async fn parquet_file_flag_for_delete_by_retention( + &self, + _request: Request, + ) -> Result, tonic::Status> { + let id_list = self + .catalog + .repositories() + .parquet_files() + .flag_for_delete_by_retention() + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(id_list.into_iter().map(|(p_id, os_id)| { + let object_store_id = serialize_object_store_id(os_id); + Ok(proto::ParquetFileFlagForDeleteByRetentionResponse { + partition_id: p_id.get(), + object_store_id: Some(object_store_id), + }) + })) + .boxed(), + )) + } + + async fn parquet_file_delete_old_ids_only( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let id_list = self + .catalog + .repositories() + .parquet_files() + .delete_old_ids_only(Timestamp::new(req.older_than)) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(id_list.into_iter().map(|id| { + let object_store_id = serialize_object_store_id(id); + Ok(proto::ParquetFileDeleteOldIdsOnlyResponse { + object_store_id: Some(object_store_id), + }) + })) + .boxed(), + )) + } + + async fn parquet_file_list_by_partition_not_to_delete_batch( + &self, + request: Request, + ) -> Result, tonic::Status> + { + let req = request.into_inner(); + let partition_ids = req + 
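The `partition_snapshot` handler above returns the snapshot together with its `generation`. One plausible client-side use of that number (an assumption, not something shown in this patch) is stale-entry detection in a cache; a small standalone sketch:

```rust
// Hypothetical cache entry pairing a snapshot with the generation it was taken
// at; how generations are consumed is an assumption, not part of this patch.
struct CachedSnapshot<T> {
    generation: u64,
    snapshot: T,
}

impl<T> CachedSnapshot<T> {
    /// Replace the cached snapshot only if a newer generation is reported.
    fn maybe_replace(&mut self, generation: u64, snapshot: T) -> bool {
        if generation > self.generation {
            self.generation = generation;
            self.snapshot = snapshot;
            true
        } else {
            false
        }
    }
}

fn main() {
    let mut cached = CachedSnapshot { generation: 3, snapshot: "v3" };
    assert!(!cached.maybe_replace(3, "v3-again")); // same generation: keep what we have
    assert!(cached.maybe_replace(4, "v4")); // newer generation: replace
}
```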
.partition_ids + .into_iter() + .map(PartitionId::new) + .collect::>(); + + let file_list = self + .catalog + .repositories() + .parquet_files() + .list_by_partition_not_to_delete_batch(partition_ids) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(file_list.into_iter().map(|file| { + let file = serialize_parquet_file(file); + Ok(proto::ParquetFileListByPartitionNotToDeleteBatchResponse { + parquet_file: Some(file), + }) + })) + .boxed(), + )) + } + + async fn parquet_file_get_by_object_store_id( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + + let maybe_file = self + .catalog + .repositories() + .parquet_files() + .get_by_object_store_id(deserialize_object_store_id( + req.object_store_id.required().ctx("object_store_id")?, + )) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + proto::ParquetFileGetByObjectStoreIdResponse { + parquet_file: maybe_file.map(serialize_parquet_file), + }, + )) + } + + async fn parquet_file_exists_by_object_store_id_batch( + &self, + request: Request>, + ) -> Result, tonic::Status> { + let object_store_ids = request + .into_inner() + .map_err(|e| tonic::Status::invalid_argument(e.to_string())) + .and_then(|req| async move { + Ok(deserialize_object_store_id( + req.object_store_id.required().ctx("object_store_id")?, + )) + }) + .try_collect::>() + .await?; + + let id_list = self + .catalog + .repositories() + .parquet_files() + .exists_by_object_store_id_batch(object_store_ids) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + futures::stream::iter(id_list.into_iter().map(|id| { + let object_store_id = serialize_object_store_id(id); + Ok(proto::ParquetFileExistsByObjectStoreIdBatchResponse { + object_store_id: Some(object_store_id), + }) + })) + .boxed(), + )) + } + + async fn parquet_file_create_upgrade_delete( + &self, + request: Request, + ) -> Result, tonic::Status> { + let req = request.into_inner(); + let delete = req + .delete + .into_iter() + .map(deserialize_object_store_id) + .collect::>(); + let upgrade = req + .upgrade + .into_iter() + .map(deserialize_object_store_id) + .collect::>(); + let create = req + .create + .into_iter() + .map(deserialize_parquet_file_params) + .collect::, _>>()?; + + let id_list = self + .catalog + .repositories() + .parquet_files() + .create_upgrade_delete( + PartitionId::new(req.partition_id), + &delete, + &upgrade, + &create, + req.target_level.convert().ctx("target_level")?, + ) + .await + .map_err(catalog_error_to_status)?; + + Ok(Response::new( + proto::ParquetFileCreateUpgradeDeleteResponse { + created_parquet_file_ids: id_list.into_iter().map(|id| id.get()).collect(), + }, + )) + } +} diff --git a/iox_catalog/src/interface.rs b/iox_catalog/src/interface.rs index d06ef68967b..dae33a26a59 100644 --- a/iox_catalog/src/interface.rs +++ b/iox_catalog/src/interface.rs @@ -1,27 +1,22 @@ //! Traits and data types for the IOx Catalog API. 
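`parquet_file_exists_by_object_store_id_batch` above takes a client-streamed request and drains it into a `Vec` before querying the catalog. A hedged sketch of that collection step, with a hypothetical `IdRequest` message in place of the generated proto type:

```rust
use futures::{Stream, TryStreamExt};
use tonic::Status;

// Hypothetical request message carrying one optional id per streamed item.
struct IdRequest {
    id: Option<i64>,
}

// Collect the client stream up front, turning both transport errors and
// missing fields into `invalid_argument`, before touching the catalog.
async fn collect_ids(
    requests: impl Stream<Item = Result<IdRequest, Status>>,
) -> Result<Vec<i64>, Status> {
    requests
        .map_err(|e| Status::invalid_argument(e.to_string()))
        .and_then(|req| async move {
            req.id
                .ok_or_else(|| Status::invalid_argument("id is required"))
        })
        .try_collect()
        .await
}
```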
use async_trait::async_trait; +use data_types::snapshot::partition::PartitionSnapshot; +use data_types::snapshot::table::TableSnapshot; use data_types::{ partition_template::{NamespacePartitionTemplateOverride, TablePartitionTemplateOverride}, - Column, ColumnType, ColumnsByName, CompactionLevel, MaxColumnsPerTable, MaxTables, Namespace, - NamespaceId, NamespaceName, NamespaceSchema, NamespaceServiceProtectionLimitsOverride, - ParquetFile, ParquetFileId, ParquetFileParams, Partition, PartitionHashId, PartitionId, - PartitionKey, SkippedCompaction, SortedColumnSet, Table, TableId, TableSchema, Timestamp, - TransitionPartitionId, + Column, ColumnType, CompactionLevel, MaxColumnsPerTable, MaxTables, Namespace, NamespaceId, + NamespaceName, NamespaceServiceProtectionLimitsOverride, ObjectStoreId, ParquetFile, + ParquetFileId, ParquetFileParams, Partition, PartitionId, PartitionKey, SkippedCompaction, + SortKeyIds, Table, TableId, Timestamp, }; use iox_time::TimeProvider; -use snafu::{OptionExt, Snafu}; +use snafu::Snafu; use std::{ - collections::{BTreeMap, HashMap, HashSet}, + collections::HashMap, fmt::{Debug, Display}, sync::Arc, }; -use uuid::Uuid; - -/// Maximum number of files touched by [`ParquetFileRepo::flag_for_delete_by_retention`] at a time. -pub const MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION: i64 = 1_000; -/// Maximum number of files touched by [`ParquetFileRepo::delete_old_ids_only`] at a time. -pub const MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE: i64 = 10_000; /// An error wrapper detailing the reason for a compare-and-swap failure. #[derive(Debug)] @@ -36,122 +31,68 @@ pub enum CasFailure { } #[derive(Debug, Snafu)] -#[allow(missing_copy_implementations, missing_docs)] +#[allow(missing_docs)] #[snafu(visibility(pub(crate)))] pub enum Error { - #[snafu(display("invalid name: {}", name))] - InvalidName { name: String }, - - #[snafu(display("name {} already exists", name))] - NameExists { name: String }, - - #[snafu(display("A table named {name} already exists in namespace {namespace_id}"))] - TableNameExists { - name: String, - namespace_id: NamespaceId, - }, - - #[snafu(display("unhandled sqlx error: {}", source))] - SqlxError { source: sqlx::Error }, - - #[snafu(display("foreign key violation: {}", source))] - ForeignKeyViolation { source: sqlx::Error }, - - #[snafu(display("column {} is type {} but write has type {}", name, existing, new))] - ColumnTypeMismatch { - name: String, - existing: ColumnType, - new: ColumnType, - }, - - #[snafu(display( - "column type {} is in the db for column {}, which is unknown", - data_type, - name - ))] - UnknownColumnType { data_type: i16, name: String }, - - #[snafu(display("namespace {} not found", name))] - NamespaceNotFoundByName { name: String }, - - #[snafu(display("namespace {} not found", id))] - NamespaceNotFoundById { id: NamespaceId }, - - #[snafu(display("table {} not found", id))] - TableNotFound { id: TableId }, - - #[snafu(display("table {} not found", name))] - TableNotFoundByName { name: String }, - - #[snafu(display("partition {} not found", id))] - PartitionNotFound { id: TransitionPartitionId }, - - #[snafu(display( - "couldn't create column {} in table {}; limit reached on namespace", - column_name, - table_id, - ))] - ColumnCreateLimitError { - column_name: String, - table_id: TableId, - }, - - #[snafu(display( - "couldn't create table {}; limit reached on namespace {}", - table_name, - namespace_id - ))] - TableCreateLimitError { - table_name: String, - namespace_id: NamespaceId, - }, - - #[snafu(display("parquet 
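The rewritten `Error` above collapses the long list of backend-specific variants into a few broad ones plus a boxed catch-all, with `From` impls so `?` can lift sqlx, prost and quorum errors without a dedicated variant for each. A standalone sketch of the same pattern, using `std::io::Error` as a stand-in source:

```rust
use snafu::Snafu;

// The same shape as the enum above: a few broad variants plus a catch-all
// that boxes whatever backend error actually occurred.
#[derive(Debug, Snafu)]
enum CatalogError {
    #[snafu(display("unhandled external error: {source}"))]
    External {
        source: Box<dyn std::error::Error + Send + Sync>,
    },

    #[snafu(display("not found: {descr}"))]
    NotFound { descr: String },
}

// One `From` impl per backend error type keeps `?` ergonomic at call sites.
impl From<std::io::Error> for CatalogError {
    fn from(e: std::io::Error) -> Self {
        Self::External { source: Box::new(e) }
    }
}

fn main() {
    let io_err = std::io::Error::new(std::io::ErrorKind::Other, "connection reset");
    let err: CatalogError = io_err.into();
    println!("{err}"); // unhandled external error: connection reset
    println!("{}", CatalogError::NotFound { descr: "partition 42".into() });
}
```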
file with object_store_id {} already exists", object_store_id))] - FileExists { object_store_id: Uuid }, - - #[snafu(display("parquet file with id {} does not exist. Foreign key violation", id))] - FileNotFound { id: i64 }, - - #[snafu(display("parquet_file record {} not found", id))] - ParquetRecordNotFound { id: ParquetFileId }, - - #[snafu(display("cannot derive valid column schema from column {}: {}", name, source))] - InvalidColumn { + #[snafu(display("unhandled external error: {source}"))] + External { source: Box, - name: String, }, - #[snafu(display("cannot start a transaction: {}", source))] - StartTransaction { source: sqlx::Error }, + #[snafu(display("already exists: {descr}"))] + AlreadyExists { descr: String }, - #[snafu(display("no transaction provided"))] - NoTransaction, + #[snafu(display("limit exceeded: {descr}"))] + LimitExceeded { descr: String }, - #[snafu(display("transaction failed to commit: {}", source))] - FailedToCommit { source: sqlx::Error }, + #[snafu(display("not found: {descr}"))] + NotFound { descr: String }, +} - #[snafu(display("error while converting usize {} to i64", value))] - InvalidValue { value: usize }, +impl From for Error { + fn from(e: sqlx::Error) -> Self { + Self::External { + source: Box::new(e), + } + } +} - #[snafu(display("database setup error: {}", source))] - Setup { source: sqlx::Error }, +impl From for Error { + fn from(e: sqlx::migrate::MigrateError) -> Self { + Self::from(sqlx::Error::from(e)) + } +} - #[snafu(display( - "could not record a skipped compaction for partition {partition_id}: {source}" - ))] - CouldNotRecordSkippedCompaction { - source: sqlx::Error, - partition_id: PartitionId, - }, +impl From for Error { + fn from(e: data_types::snapshot::partition::Error) -> Self { + Self::External { + source: Box::new(e), + } + } +} - #[snafu(display("could not list skipped compactions: {source}"))] - CouldNotListSkippedCompactions { source: sqlx::Error }, +impl From for Error { + fn from(e: data_types::snapshot::table::Error) -> Self { + Self::External { + source: Box::new(e), + } + } +} - #[snafu(display("could not delete skipped compactions: {source}"))] - CouldNotDeleteSkippedCompactions { source: sqlx::Error }, +impl From for Error { + fn from(e: catalog_cache::api::quorum::Error) -> Self { + Self::External { + source: Box::new(e), + } + } +} - #[snafu(display("could not delete namespace: {source}"))] - CouldNotDeleteNamespace { source: sqlx::Error }, +impl From for Error { + fn from(e: generated_types::prost::DecodeError) -> Self { + Self::External { + source: Box::new(e), + } + } } /// A specialized `Error` for Catalog errors @@ -182,7 +123,7 @@ pub type Result = std::result::Result; /// AllRows /// /// ``` -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum SoftDeletedRows { /// Return all rows. AllRows, @@ -211,7 +152,7 @@ pub trait Catalog: Send + Sync + Debug + Display { async fn setup(&self) -> Result<(), Error>; /// Accesses the repositories without a transaction scope. - async fn repositories(&self) -> Box; + fn repositories(&self) -> Box; /// Gets metric registry associated with this catalog for testing purposes. #[cfg(test)] @@ -233,7 +174,6 @@ pub trait Catalog: Send + Sync + Debug + Display { /// A repository might internally map to a wide range of different storage abstractions, ranging /// from one or more SQL tables over key-value key spaces to simple in-memory vectors. The user /// should and must not care how these are implemented. 
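`Catalog::repositories()` is now synchronous, so call sites drop one `.await` while the per-repo methods stay async. A hedged call-site sketch, assuming the `Catalog`, `SoftDeletedRows` and `Result` types defined in this file (not a standalone program):

```rust
use std::sync::Arc;

async fn count_active_namespaces(catalog: Arc<dyn Catalog>) -> Result<usize> {
    // No `.await` here any more: obtaining the repositories is a plain call.
    let mut repos = catalog.repositories();
    // The repository methods themselves are still async.
    let namespaces = repos
        .namespaces()
        .list(SoftDeletedRows::ExcludeDeleted)
        .await?;
    Ok(namespaces.len())
}
```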
-#[async_trait] pub trait RepoCollection: Send + Sync + Debug { /// Repository for [namespaces](data_types::Namespace). fn namespaces(&mut self) -> &mut dyn NamespaceRepo; @@ -330,6 +270,9 @@ pub trait TableRepo: Send + Sync { /// List all tables. async fn list(&mut self) -> Result>; + + /// Obtain a table snapshot + async fn snapshot(&mut self, table_id: TableId) -> Result; } /// Functions for working with columns in the catalog @@ -370,6 +313,21 @@ pub trait ColumnRepo: Send + Sync { async fn list(&mut self) -> Result>; } +/// Extension trait for [`ParquetFileRepo`] +#[async_trait] +pub trait PartitionRepoExt { + /// create the parquet file + async fn get_by_id(self, partition_id: PartitionId) -> Result>; +} + +#[async_trait] +impl PartitionRepoExt for &mut dyn PartitionRepo { + async fn get_by_id(self, partition_id: PartitionId) -> Result> { + let iter = self.get_by_id_batch(&[partition_id]).await?; + Ok(iter.into_iter().next()) + } +} + /// Functions for working with IOx partitions in the catalog. These are how IOx splits up /// data within a namespace. #[async_trait] @@ -377,27 +335,10 @@ pub trait PartitionRepo: Send + Sync { /// create or get a partition record for the given partition key and table async fn create_or_get(&mut self, key: PartitionKey, table_id: TableId) -> Result; - /// get partition by ID - async fn get_by_id(&mut self, partition_id: PartitionId) -> Result>; - /// get multiple partitions by ID. /// /// the output order is undefined, non-existing partitions are not part of the output. - async fn get_by_id_batch(&mut self, partition_ids: Vec) -> Result>; - - /// get partition by deterministic hash ID - async fn get_by_hash_id( - &mut self, - partition_hash_id: &PartitionHashId, - ) -> Result>; - - /// get partition by deterministic hash ID - /// - /// the output order is undefined, non-existing partitions are not part of the output. - async fn get_by_hash_id_batch( - &mut self, - partition_hash_ids: &[&PartitionHashId], - ) -> Result>; + async fn get_by_id_batch(&mut self, partition_ids: &[PartitionId]) -> Result>; /// return the partitions by table id async fn list_by_table_id(&mut self, table_id: TableId) -> Result>; @@ -405,8 +346,8 @@ pub trait PartitionRepo: Send + Sync { /// return all partitions IDs async fn list_ids(&mut self) -> Result>; - /// Update the sort key for the partition, setting it to `new_sort_key` iff - /// the current value matches `old_sort_key`. + /// Update the sort key for the partition, setting it to `new_sort_key_ids` iff + /// the current value matches `old_sort_key_ids`. /// /// NOTE: it is expected that ONLY the ingesters update sort keys for /// existing partitions. @@ -416,18 +357,12 @@ pub trait PartitionRepo: Send + Sync { /// Implementations are allowed to spuriously return /// [`CasFailure::ValueMismatch`] for performance reasons in the presence of /// concurrent writers. 
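Since single-ID partition lookups moved into the `PartitionRepoExt` extension trait (which delegates to `get_by_id_batch`), callers need that trait in scope. A hedged usage sketch, assuming the types defined in this file:

```rust
// Bringing the extension trait into scope is what makes `get_by_id` available
// on the `&mut dyn PartitionRepo` trait object.
// use iox_catalog::interface::PartitionRepoExt;

async fn partition_exists(repos: &mut dyn RepoCollection, id: PartitionId) -> Result<bool> {
    // Provided by `PartitionRepoExt`; it forwards to `get_by_id_batch(&[id])`
    // and takes the first (and only possible) hit.
    Ok(repos.partitions().get_by_id(id).await?.is_some())
}
```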
- /// - // TODO: After the sort_key_ids field is converetd into NOT NULL, the implementation of this function - // must be changed to compare old_sort_key_ids with the existing sort_key_ids instead of - // comparing old_sort_key with existing sort_key async fn cas_sort_key( &mut self, - partition_id: &TransitionPartitionId, - old_sort_key: Option>, // todo: remove this old_sort_key - old_sort_key_ids: Option, - new_sort_key: &[&str], //todo: remove this new_sort_key - new_sort_key_ids: &SortedColumnSet, - ) -> Result, SortedColumnSet)>>; + partition_id: PartitionId, + old_sort_key_ids: Option<&SortKeyIds>, + new_sort_key_ids: &SortKeyIds, + ) -> Result>; /// Record an instance of a partition being selected for compaction but compaction was not /// completed for the specified reason. @@ -475,34 +410,41 @@ pub trait PartitionRepo: Send + Sync { /// Can be removed when all partitions have hash IDs and support for old-style partitions is no /// longer needed. async fn list_old_style(&mut self) -> Result>; + + /// Obtain a partition snapshot + async fn snapshot(&mut self, partition_id: PartitionId) -> Result; } -/// Functions for working with parquet file pointers in the catalog +/// Extension trait for [`ParquetFileRepo`] #[async_trait] -pub trait ParquetFileRepo: Send + Sync { +pub trait ParquetFileRepoExt { /// create the parquet file - async fn create(&mut self, parquet_file_params: ParquetFileParams) -> Result; + async fn create(self, parquet_file_params: ParquetFileParams) -> Result; +} - /// List all parquet files in implementation-defined, non-deterministic order. - /// - /// This includes files that were marked for deletion. - /// - /// This is mostly useful for testing and will likely not succeed in production. - async fn list_all(&mut self) -> Result>; +#[async_trait] +impl ParquetFileRepoExt for &mut dyn ParquetFileRepo { + /// create the parquet file + async fn create(self, params: ParquetFileParams) -> Result { + let files = self + .create_upgrade_delete( + params.partition_id, + &[], + &[], + &[params.clone()], + CompactionLevel::Initial, + ) + .await?; + let id = files.into_iter().next().unwrap(); + Ok(ParquetFile::from_params(params, id)) + } +} +/// Functions for working with parquet file pointers in the catalog +#[async_trait] +pub trait ParquetFileRepo: Send + Sync { /// Flag all parquet files for deletion that are older than their namespace's retention period. - async fn flag_for_delete_by_retention(&mut self) -> Result>; - - /// List all parquet files within a given namespace that are NOT marked as - /// [`to_delete`](ParquetFile::to_delete). - async fn list_by_namespace_not_to_delete( - &mut self, - namespace_id: NamespaceId, - ) -> Result>; - - /// List all parquet files within a given table that are NOT marked as - /// [`to_delete`](ParquetFile::to_delete). - async fn list_by_table_not_to_delete(&mut self, table_id: TableId) -> Result>; + async fn flag_for_delete_by_retention(&mut self) -> Result>; /// Delete parquet files that were marked to be deleted earlier than the specified time. /// @@ -510,2901 +452,39 @@ pub trait ParquetFileRepo: Send + Sync { /// /// This deletion is limited to a certain (backend-specific) number of files to avoid overlarge /// changes. The caller MAY call this method again if the result was NOT empty. 
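Because `cas_sort_key` may spuriously report `ValueMismatch` under concurrency, callers are expected to re-derive their input from the value handed back and retry. A hedged sketch of such a loop, assuming the signatures above; a real caller would typically recompute the desired sort key from the observed one rather than reuse it unchanged:

```rust
// A caller-side CAS retry sketch; invented for illustration, not part of this patch.
async fn set_sort_key_with_retry(
    partitions: &mut dyn PartitionRepo,
    partition_id: PartitionId,
    new_sort_key_ids: SortKeyIds,
) -> Result<Partition> {
    // Start by assuming the partition has no sort key yet.
    let mut observed: Option<SortKeyIds> = None;
    loop {
        match partitions
            .cas_sort_key(partition_id, observed.as_ref(), &new_sort_key_ids)
            .await
        {
            Ok(partition) => return Ok(partition),
            // Lost the race (or hit a spurious mismatch): adopt the value the
            // catalog reported and try again against that.
            Err(CasFailure::ValueMismatch(current)) => observed = Some(current),
            Err(CasFailure::QueryError(e)) => return Err(e),
        }
    }
}
```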
- async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result>; + async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result>; - /// List parquet files for a given partition that are NOT marked as + /// List parquet files for given partitions that are NOT marked as /// [`to_delete`](ParquetFile::to_delete). - async fn list_by_partition_not_to_delete( + /// + /// The output order is undefined, non-existing partitions are not part of the output. + async fn list_by_partition_not_to_delete_batch( &mut self, - partition_id: &TransitionPartitionId, + partition_ids: Vec, ) -> Result>; /// Return the parquet file with the given object store id // used heavily in tests for verification of catalog state. async fn get_by_object_store_id( &mut self, - object_store_id: Uuid, + object_store_id: ObjectStoreId, ) -> Result>; /// Test a batch of parquet files exist by object store ids async fn exists_by_object_store_id_batch( &mut self, - object_store_ids: Vec, - ) -> Result>; + object_store_ids: Vec, + ) -> Result>; /// Commit deletions, upgrades and creations in a single transaction. /// /// Returns IDs of created files. async fn create_upgrade_delete( &mut self, - delete: &[ParquetFileId], - upgrade: &[ParquetFileId], + partition_id: PartitionId, + delete: &[ObjectStoreId], + upgrade: &[ObjectStoreId], create: &[ParquetFileParams], target_level: CompactionLevel, ) -> Result>; } - -/// Gets the namespace schema including all tables and columns. -pub async fn get_schema_by_id( - id: NamespaceId, - repos: &mut R, - deleted: SoftDeletedRows, -) -> Result -where - R: RepoCollection + ?Sized, -{ - let namespace = repos - .namespaces() - .get_by_id(id, deleted) - .await? - .context(NamespaceNotFoundByIdSnafu { id })?; - - get_schema_internal(namespace, repos).await -} - -/// Gets the namespace schema including all tables and columns. -pub async fn get_schema_by_name( - name: &str, - repos: &mut R, - deleted: SoftDeletedRows, -) -> Result -where - R: RepoCollection + ?Sized, -{ - let namespace = repos - .namespaces() - .get_by_name(name, deleted) - .await? - .context(NamespaceNotFoundByNameSnafu { name })?; - - get_schema_internal(namespace, repos).await -} - -async fn get_schema_internal(namespace: Namespace, repos: &mut R) -> Result -where - R: RepoCollection + ?Sized, -{ - // get the columns first just in case someone else is creating schema while we're doing this. - let columns = repos.columns().list_by_namespace_id(namespace.id).await?; - let tables = repos.tables().list_by_namespace_id(namespace.id).await?; - - let mut namespace = NamespaceSchema::new_empty_from(&namespace); - - let mut table_id_to_schema = BTreeMap::new(); - for t in tables { - let table_schema = TableSchema::new_empty_from(&t); - table_id_to_schema.insert(t.id, (t.name, table_schema)); - } - - for c in columns { - let (_, t) = table_id_to_schema.get_mut(&c.table_id).unwrap(); - t.add_column(c); - } - - for (_, (table_name, schema)) in table_id_to_schema { - namespace.tables.insert(table_name, schema); - } - - Ok(namespace) -} - -/// Gets the schema for one particular table in a namespace. -pub async fn get_schema_by_namespace_and_table( - name: &str, - table_name: &str, - repos: &mut R, - deleted: SoftDeletedRows, -) -> Result -where - R: RepoCollection + ?Sized, -{ - let namespace = repos - .namespaces() - .get_by_name(name, deleted) - .await? - .context(NamespaceNotFoundByNameSnafu { name })?; - - let table = repos - .tables() - .get_by_namespace_and_name(namespace.id, table_name) - .await? 
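`create_upgrade_delete` is now the single write path for parquet file changes (the `ParquetFileRepoExt::create` helper above routes plain creates through it too). A hedged sketch of a compaction-style commit against that signature, with the surrounding plumbing invented for illustration:

```rust
// Deletes, upgrades and creations are committed in one transaction; this sketch
// deletes the compacted inputs and registers the newly written outputs.
async fn commit_compaction(
    files: &mut dyn ParquetFileRepo,
    partition_id: PartitionId,
    compacted_inputs: Vec<ObjectStoreId>,
    new_outputs: Vec<ParquetFileParams>,
    target_level: CompactionLevel,
) -> Result<Vec<ParquetFileId>> {
    files
        .create_upgrade_delete(
            partition_id,
            &compacted_inputs, // delete: files fully replaced by the outputs
            &[],               // upgrade: nothing merely re-levelled in this sketch
            &new_outputs,      // create: the freshly written files
            target_level,
        )
        .await
}
```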
- .context(TableNotFoundByNameSnafu { - name: table_name.to_string(), - })?; - let mut table_schema = TableSchema::new_empty_from(&table); - - let columns = repos.columns().list_by_table_id(table.id).await?; - for c in columns { - table_schema.add_column(c); - } - - let mut namespace = NamespaceSchema::new_empty_from(&namespace); - namespace - .tables - .insert(table_name.to_string(), table_schema); - - Ok(namespace) -} - -/// Gets all the table's columns. -pub async fn get_table_columns_by_id(id: TableId, repos: &mut R) -> Result -where - R: RepoCollection + ?Sized, -{ - let columns = repos.columns().list_by_table_id(id).await?; - - Ok(ColumnsByName::new(columns)) -} - -/// Fetch all [`NamespaceSchema`] in the catalog. -/// -/// This method performs the minimal number of queries needed to build the -/// result set. No table lock is obtained, nor are queries executed within a -/// transaction, but this method does return a point-in-time snapshot of the -/// catalog state. -/// -/// # Soft Deletion -/// -/// No schemas for soft-deleted namespaces are returned. -pub async fn list_schemas( - catalog: &dyn Catalog, -) -> Result> { - let mut repos = catalog.repositories().await; - - // In order to obtain a point-in-time snapshot, first fetch the columns, - // then the tables, and then resolve the namespace IDs to Namespace in order - // to construct the schemas. - // - // The set of columns returned forms the state snapshot, with the subsequent - // queries resolving only what is needed to construct schemas for the - // retrieved columns (ignoring any newly added tables/namespaces since the - // column snapshot was taken). - // - // This approach also tolerates concurrently deleted namespaces, which are - // simply ignored at the end when joining to the namespace query result. - - // First fetch all the columns - this is the state snapshot of the catalog - // schemas. - let columns = repos.columns().list().await?; - - // Construct the set of table IDs these columns belong to. - let retain_table_ids = columns.iter().map(|c| c.table_id).collect::>(); - - // Fetch all tables, and filter for those that are needed to construct - // schemas for "columns" only. - // - // Discard any tables that have no columns or have been created since - // the "columns" snapshot was retrieved, and construct a map of ID->Table. - let tables = repos - .tables() - .list() - .await? - .into_iter() - .filter_map(|t| { - if !retain_table_ids.contains(&t.id) { - return None; - } - - Some((t.id, t)) - }) - .collect::>(); - - // Drop the table ID set as it will not be referenced again. - drop(retain_table_ids); - - // Do all the I/O to fetch the namespaces in the background, while this - // thread constructs the NamespaceId->TableSchema map below. - let namespaces = tokio::spawn(async move { - repos - .namespaces() - .list(SoftDeletedRows::ExcludeDeleted) - .await - }); - - // A set of tables within a single namespace. - type NamespaceTables = BTreeMap; - - let mut joined = HashMap::::default(); - for column in columns { - // Resolve the table this column references - let table = tables.get(&column.table_id).expect("no table for column"); - - let table_schema = joined - // Find or create a record in the joined map - // for this namespace ID. - .entry(table.namespace_id) - .or_default() - // Fetch the schema record for this table, or create an empty one. 
- .entry(table.name.clone()) - .or_insert_with(|| TableSchema::new_empty_from(table)); - - table_schema.add_column(column); - } - - // The table map is no longer needed - immediately reclaim the memory. - drop(tables); - - // Convert the Namespace instances into NamespaceSchema instances. - let iter = namespaces - .await - .expect("namespace list task panicked")? - .into_iter() - // Ignore any namespaces that did not exist when the "columns" snapshot - // was created, or have no tables/columns (and therefore have no entry - // in "joined"). - .filter_map(move |v| { - // The catalog call explicitly asked for no soft deleted records. - assert!(v.deleted_at.is_none()); - - let mut ns = NamespaceSchema::new_empty_from(&v); - - ns.tables = joined.remove(&v.id)?; - Some((v, ns)) - }); - - Ok(iter) -} - -#[cfg(test)] -pub(crate) mod test_helpers { - use crate::{ - test_helpers::{arbitrary_namespace, arbitrary_parquet_file_params, arbitrary_table}, - validate_or_insert_schema, - }; - - use super::*; - use ::test_helpers::assert_error; - use assert_matches::assert_matches; - use data_types::{ColumnId, CompactionLevel, MaxColumnsPerTable, MaxTables}; - use futures::Future; - use generated_types::influxdata::iox::partition_template::v1 as proto; - use metric::{Attributes, DurationHistogram, Metric}; - use std::{collections::BTreeSet, ops::DerefMut, sync::Arc, time::Duration}; - - pub(crate) async fn test_catalog(clean_state: R) - where - R: Fn() -> F + Send + Sync, - F: Future> + Send, - { - test_setup(clean_state().await).await; - test_namespace_soft_deletion(clean_state().await).await; - test_partitions_new_file_between(clean_state().await).await; - test_column(clean_state().await).await; - test_partition(clean_state().await).await; - test_parquet_file(clean_state().await).await; - test_parquet_file_delete_broken(clean_state().await).await; - test_update_to_compaction_level_1(clean_state().await).await; - test_list_by_partiton_not_to_delete(clean_state().await).await; - test_list_schemas(clean_state().await).await; - test_list_schemas_soft_deleted_rows(clean_state().await).await; - test_delete_namespace(clean_state().await).await; - - let catalog = clean_state().await; - test_namespace(Arc::clone(&catalog)).await; - assert_metric_hit(&catalog.metrics(), "namespace_create"); - - let catalog = clean_state().await; - test_table(Arc::clone(&catalog)).await; - assert_metric_hit(&catalog.metrics(), "table_create"); - - let catalog = clean_state().await; - test_column(Arc::clone(&catalog)).await; - assert_metric_hit(&catalog.metrics(), "column_create_or_get"); - - let catalog = clean_state().await; - test_partition(Arc::clone(&catalog)).await; - assert_metric_hit(&catalog.metrics(), "partition_create_or_get"); - - let catalog = clean_state().await; - test_parquet_file(Arc::clone(&catalog)).await; - assert_metric_hit(&catalog.metrics(), "parquet_create"); - } - - async fn test_setup(catalog: Arc) { - catalog.setup().await.expect("first catalog setup"); - catalog.setup().await.expect("second catalog setup"); - } - - async fn test_namespace(catalog: Arc) { - let mut repos = catalog.repositories().await; - let namespace_name = NamespaceName::new("test_namespace").unwrap(); - let namespace = repos - .namespaces() - .create(&namespace_name, None, None, None) - .await - .unwrap(); - assert!(namespace.id > NamespaceId::new(0)); - assert_eq!(namespace.name, namespace_name.as_str()); - assert_eq!( - namespace.partition_template, - NamespacePartitionTemplateOverride::default() - ); - let lookup_namespace = repos - 
.namespaces() - .get_by_name(&namespace_name, SoftDeletedRows::ExcludeDeleted) - .await - .unwrap() - .unwrap(); - assert_eq!(namespace, lookup_namespace); - - // Assert default values for service protection limits. - assert_eq!(namespace.max_tables, MaxTables::default()); - assert_eq!( - namespace.max_columns_per_table, - MaxColumnsPerTable::default() - ); - - let conflict = repos - .namespaces() - .create(&namespace_name, None, None, None) - .await; - assert!(matches!( - conflict.unwrap_err(), - Error::NameExists { name: _ } - )); - - let found = repos - .namespaces() - .get_by_id(namespace.id, SoftDeletedRows::ExcludeDeleted) - .await - .unwrap() - .expect("namespace should be there"); - assert_eq!(namespace, found); - - let not_found = repos - .namespaces() - .get_by_id(NamespaceId::new(i64::MAX), SoftDeletedRows::ExcludeDeleted) - .await - .unwrap(); - assert!(not_found.is_none()); - - let found = repos - .namespaces() - .get_by_name(&namespace_name, SoftDeletedRows::ExcludeDeleted) - .await - .unwrap() - .expect("namespace should be there"); - assert_eq!(namespace, found); - - let not_found = repos - .namespaces() - .get_by_name("does_not_exist", SoftDeletedRows::ExcludeDeleted) - .await - .unwrap(); - assert!(not_found.is_none()); - - let namespace2 = arbitrary_namespace(&mut *repos, "test_namespace2").await; - let mut namespaces = repos - .namespaces() - .list(SoftDeletedRows::ExcludeDeleted) - .await - .unwrap(); - namespaces.sort_by_key(|ns| ns.name.clone()); - assert_eq!(namespaces, vec![namespace, namespace2]); - - let new_table_limit = MaxTables::new(15000); - let modified = repos - .namespaces() - .update_table_limit(namespace_name.as_str(), new_table_limit) - .await - .expect("namespace should be updateable"); - assert_eq!(new_table_limit, modified.max_tables); - - let new_column_limit = MaxColumnsPerTable::new(1500); - let modified = repos - .namespaces() - .update_column_limit(namespace_name.as_str(), new_column_limit) - .await - .expect("namespace should be updateable"); - assert_eq!(new_column_limit, modified.max_columns_per_table); - - const NEW_RETENTION_PERIOD_NS: i64 = 5 * 60 * 60 * 1000 * 1000 * 1000; - let modified = repos - .namespaces() - .update_retention_period(namespace_name.as_str(), Some(NEW_RETENTION_PERIOD_NS)) - .await - .expect("namespace should be updateable"); - assert_eq!( - NEW_RETENTION_PERIOD_NS, - modified.retention_period_ns.unwrap() - ); - - let modified = repos - .namespaces() - .update_retention_period(namespace_name.as_str(), None) - .await - .expect("namespace should be updateable"); - assert!(modified.retention_period_ns.is_none()); - - // create namespace with retention period NULL (the default) - let namespace3 = arbitrary_namespace(&mut *repos, "test_namespace3").await; - assert!(namespace3.retention_period_ns.is_none()); - - // create namespace with retention period - let namespace4_name = NamespaceName::new("test_namespace4").unwrap(); - let namespace4 = repos - .namespaces() - .create(&namespace4_name, None, Some(NEW_RETENTION_PERIOD_NS), None) - .await - .expect("namespace with 5-hour retention should be created"); - assert_eq!( - NEW_RETENTION_PERIOD_NS, - namespace4.retention_period_ns.unwrap() - ); - // reset retention period to NULL to avoid affecting later tests - repos - .namespaces() - .update_retention_period(&namespace4_name, None) - .await - .expect("namespace should be updateable"); - - // create a namespace with a PartitionTemplate other than the default - let tag_partition_template = - 
NamespacePartitionTemplateOverride::try_from(proto::PartitionTemplate { - parts: vec![proto::TemplatePart { - part: Some(proto::template_part::Part::TagValue("tag1".into())), - }], - }) - .unwrap(); - let namespace5_name = NamespaceName::new("test_namespace5").unwrap(); - let namespace5 = repos - .namespaces() - .create( - &namespace5_name, - Some(tag_partition_template.clone()), - None, - None, - ) - .await - .unwrap(); - assert_eq!(namespace5.partition_template, tag_partition_template); - let lookup_namespace5 = repos - .namespaces() - .get_by_name(&namespace5_name, SoftDeletedRows::ExcludeDeleted) - .await - .unwrap() - .unwrap(); - assert_eq!(namespace5, lookup_namespace5); - - // remove namespace to avoid it from affecting later tests - repos - .namespaces() - .soft_delete("test_namespace") - .await - .expect("delete namespace should succeed"); - repos - .namespaces() - .soft_delete("test_namespace2") - .await - .expect("delete namespace should succeed"); - repos - .namespaces() - .soft_delete("test_namespace3") - .await - .expect("delete namespace should succeed"); - repos - .namespaces() - .soft_delete("test_namespace4") - .await - .expect("delete namespace should succeed"); - } - - /// Construct a set of two namespaces: - /// - /// * deleted-ns: marked as soft-deleted - /// * active-ns: not marked as deleted - /// - /// And assert the expected "soft delete" semantics / correctly filter out - /// the expected rows for all three states of [`SoftDeletedRows`]. - async fn test_namespace_soft_deletion(catalog: Arc) { - let mut repos = catalog.repositories().await; - - let deleted_ns = arbitrary_namespace(&mut *repos, "deleted-ns").await; - let active_ns = arbitrary_namespace(&mut *repos, "active-ns").await; - - // Mark "deleted-ns" as soft-deleted. - repos.namespaces().soft_delete("deleted-ns").await.unwrap(); - - // Which should be idempotent (ignoring the timestamp change - when - // changing this to "soft delete" it was idempotent, so I am preserving - // that). - repos.namespaces().soft_delete("deleted-ns").await.unwrap(); - - // Listing should respect soft deletion. 
- let got = repos - .namespaces() - .list(SoftDeletedRows::AllRows) - .await - .unwrap() - .into_iter() - .map(|v| v.name); - assert_string_set_eq(got, ["deleted-ns", "active-ns"]); - - let got = repos - .namespaces() - .list(SoftDeletedRows::OnlyDeleted) - .await - .unwrap() - .into_iter() - .map(|v| v.name); - assert_string_set_eq(got, ["deleted-ns"]); - - let got = repos - .namespaces() - .list(SoftDeletedRows::ExcludeDeleted) - .await - .unwrap() - .into_iter() - .map(|v| v.name); - assert_string_set_eq(got, ["active-ns"]); - - // As should get by ID - let got = repos - .namespaces() - .get_by_id(deleted_ns.id, SoftDeletedRows::AllRows) - .await - .unwrap() - .into_iter() - .map(|v| v.name); - assert_string_set_eq(got, ["deleted-ns"]); - let got = repos - .namespaces() - .get_by_id(deleted_ns.id, SoftDeletedRows::OnlyDeleted) - .await - .unwrap() - .into_iter() - .map(|v| { - assert!(v.deleted_at.is_some()); - v.name - }); - assert_string_set_eq(got, ["deleted-ns"]); - let got = repos - .namespaces() - .get_by_id(deleted_ns.id, SoftDeletedRows::ExcludeDeleted) - .await - .unwrap(); - assert!(got.is_none()); - let got = repos - .namespaces() - .get_by_id(active_ns.id, SoftDeletedRows::AllRows) - .await - .unwrap() - .into_iter() - .map(|v| v.name); - assert_string_set_eq(got, ["active-ns"]); - let got = repos - .namespaces() - .get_by_id(active_ns.id, SoftDeletedRows::OnlyDeleted) - .await - .unwrap(); - assert!(got.is_none()); - let got = repos - .namespaces() - .get_by_id(active_ns.id, SoftDeletedRows::ExcludeDeleted) - .await - .unwrap() - .into_iter() - .map(|v| v.name); - assert_string_set_eq(got, ["active-ns"]); - - // And get by name - let got = repos - .namespaces() - .get_by_name(&deleted_ns.name, SoftDeletedRows::AllRows) - .await - .unwrap() - .into_iter() - .map(|v| v.name); - assert_string_set_eq(got, ["deleted-ns"]); - let got = repos - .namespaces() - .get_by_name(&deleted_ns.name, SoftDeletedRows::OnlyDeleted) - .await - .unwrap() - .into_iter() - .map(|v| { - assert!(v.deleted_at.is_some()); - v.name - }); - assert_string_set_eq(got, ["deleted-ns"]); - let got = repos - .namespaces() - .get_by_name(&deleted_ns.name, SoftDeletedRows::ExcludeDeleted) - .await - .unwrap(); - assert!(got.is_none()); - let got = repos - .namespaces() - .get_by_name(&active_ns.name, SoftDeletedRows::AllRows) - .await - .unwrap() - .into_iter() - .map(|v| v.name); - assert_string_set_eq(got, ["active-ns"]); - let got = repos - .namespaces() - .get_by_name(&active_ns.name, SoftDeletedRows::OnlyDeleted) - .await - .unwrap(); - assert!(got.is_none()); - let got = repos - .namespaces() - .get_by_name(&active_ns.name, SoftDeletedRows::ExcludeDeleted) - .await - .unwrap() - .into_iter() - .map(|v| v.name); - assert_string_set_eq(got, ["active-ns"]); - } - - // Assert the set of strings "a" is equal to the set "b", tolerating - // duplicates. 
- #[track_caller] - fn assert_string_set_eq(a: impl IntoIterator, b: impl IntoIterator) - where - T: Into, - U: Into, - { - let mut a = a.into_iter().map(Into::into).collect::>(); - a.sort_unstable(); - let mut b = b.into_iter().map(Into::into).collect::>(); - b.sort_unstable(); - assert_eq!(a, b); - } - - async fn test_table(catalog: Arc) { - let mut repos = catalog.repositories().await; - let namespace = arbitrary_namespace(&mut *repos, "namespace_table_test").await; - - // test we can create a table - let t = arbitrary_table(&mut *repos, "test_table", &namespace).await; - assert!(t.id > TableId::new(0)); - assert_eq!( - t.partition_template, - TablePartitionTemplateOverride::default() - ); - - // The default template doesn't use any tag values, so no columns need to be created. - let table_columns = repos.columns().list_by_table_id(t.id).await.unwrap(); - assert!(table_columns.is_empty()); - - // test we get an error if we try to create it again - let err = repos - .tables() - .create( - "test_table", - TablePartitionTemplateOverride::try_new(None, &namespace.partition_template) - .unwrap(), - namespace.id, - ) - .await; - assert_error!( - err, - Error::TableNameExists { ref name, namespace_id } - if name == "test_table" && namespace_id == namespace.id - ); - - // get by id - assert_eq!(t, repos.tables().get_by_id(t.id).await.unwrap().unwrap()); - assert!(repos - .tables() - .get_by_id(TableId::new(i64::MAX)) - .await - .unwrap() - .is_none()); - - let tables = repos - .tables() - .list_by_namespace_id(namespace.id) - .await - .unwrap(); - assert_eq!(vec![t.clone()], tables); - - // test we can create a table of the same name in a different namespace - let namespace2 = arbitrary_namespace(&mut *repos, "two").await; - assert_ne!(namespace, namespace2); - let test_table = arbitrary_table(&mut *repos, "test_table", &namespace2).await; - assert_ne!(t.id, test_table.id); - assert_eq!(test_table.namespace_id, namespace2.id); - - // test get by namespace and name - let foo_table = arbitrary_table(&mut *repos, "foo", &namespace2).await; - assert_eq!( - repos - .tables() - .get_by_namespace_and_name(NamespaceId::new(i64::MAX), "test_table") - .await - .unwrap(), - None - ); - assert_eq!( - repos - .tables() - .get_by_namespace_and_name(namespace.id, "not_existing") - .await - .unwrap(), - None - ); - assert_eq!( - repos - .tables() - .get_by_namespace_and_name(namespace.id, "test_table") - .await - .unwrap(), - Some(t.clone()) - ); - assert_eq!( - repos - .tables() - .get_by_namespace_and_name(namespace2.id, "test_table") - .await - .unwrap() - .as_ref(), - Some(&test_table) - ); - assert_eq!( - repos - .tables() - .get_by_namespace_and_name(namespace2.id, "foo") - .await - .unwrap() - .as_ref(), - Some(&foo_table) - ); - - // All tables should be returned by list(), regardless of namespace - let mut list = repos.tables().list().await.unwrap(); - list.sort_by_key(|t| t.id); - let mut expected = [t, test_table, foo_table]; - expected.sort_by_key(|t| t.id); - assert_eq!(&list, &expected); - - // test per-namespace table limits - let latest = repos - .namespaces() - .update_table_limit("namespace_table_test", MaxTables::new(1)) - .await - .expect("namespace should be updateable"); - let err = repos - .tables() - .create( - "definitely_unique", - TablePartitionTemplateOverride::try_new(None, &latest.partition_template).unwrap(), - latest.id, - ) - .await - .expect_err("should error with table create limit error"); - assert!(matches!( - err, - Error::TableCreateLimitError { - table_name: _, - 
namespace_id: _ - } - )); - - // Create a table with a partition template other than the default - let custom_table_template = TablePartitionTemplateOverride::try_new( - Some(proto::PartitionTemplate { - parts: vec![ - proto::TemplatePart { - part: Some(proto::template_part::Part::TagValue("tag1".into())), - }, - proto::TemplatePart { - part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())), - }, - proto::TemplatePart { - part: Some(proto::template_part::Part::TagValue("tag2".into())), - }, - ], - }), - &namespace2.partition_template, - ) - .unwrap(); - let templated = repos - .tables() - .create( - "use_a_template", - custom_table_template.clone(), - namespace2.id, - ) - .await - .unwrap(); - assert_eq!(templated.partition_template, custom_table_template); - - // Tag columns should be created for tags used in the template - let table_columns = repos - .columns() - .list_by_table_id(templated.id) - .await - .unwrap(); - assert_eq!(table_columns.len(), 2); - assert!(table_columns.iter().all(|c| c.is_tag())); - let mut column_names: Vec<_> = table_columns.iter().map(|c| &c.name).collect(); - column_names.sort(); - assert_eq!(column_names, &["tag1", "tag2"]); - - let lookup_templated = repos - .tables() - .get_by_namespace_and_name(namespace2.id, "use_a_template") - .await - .unwrap() - .unwrap(); - assert_eq!(templated, lookup_templated); - - // Create a namespace with a partition template other than the default - let custom_namespace_template = - NamespacePartitionTemplateOverride::try_from(proto::PartitionTemplate { - parts: vec![ - proto::TemplatePart { - part: Some(proto::template_part::Part::TagValue("zzz".into())), - }, - proto::TemplatePart { - part: Some(proto::template_part::Part::TagValue("aaa".into())), - }, - proto::TemplatePart { - part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())), - }, - ], - }) - .unwrap(); - let custom_namespace_name = NamespaceName::new("custom_namespace").unwrap(); - let custom_namespace = repos - .namespaces() - .create( - &custom_namespace_name, - Some(custom_namespace_template.clone()), - None, - None, - ) - .await - .unwrap(); - // Create a table without specifying the partition template - let custom_table_template = - TablePartitionTemplateOverride::try_new(None, &custom_namespace.partition_template) - .unwrap(); - let table_templated_by_namespace = repos - .tables() - .create( - "use_namespace_template", - custom_table_template, - custom_namespace.id, - ) - .await - .unwrap(); - assert_eq!( - table_templated_by_namespace.partition_template, - TablePartitionTemplateOverride::try_new(None, &custom_namespace_template).unwrap() - ); - - // Tag columns should be created for tags used in the template - let table_columns = repos - .columns() - .list_by_table_id(table_templated_by_namespace.id) - .await - .unwrap(); - assert_eq!(table_columns.len(), 2); - assert!(table_columns.iter().all(|c| c.is_tag())); - let mut column_names: Vec<_> = table_columns.iter().map(|c| &c.name).collect(); - column_names.sort(); - assert_eq!(column_names, &["aaa", "zzz"]); - - repos - .namespaces() - .soft_delete("namespace_table_test") - .await - .expect("delete namespace should succeed"); - repos - .namespaces() - .soft_delete("two") - .await - .expect("delete namespace should succeed"); - } - - async fn test_column(catalog: Arc) { - let mut repos = catalog.repositories().await; - let namespace = arbitrary_namespace(&mut *repos, "namespace_column_test").await; - let table = arbitrary_table(&mut *repos, "test_table", &namespace).await; - 
assert_eq!(table.namespace_id, namespace.id); - - // test we can create or get a column - let c = repos - .columns() - .create_or_get("column_test", table.id, ColumnType::Tag) - .await - .unwrap(); - let cc = repos - .columns() - .create_or_get("column_test", table.id, ColumnType::Tag) - .await - .unwrap(); - assert!(c.id > ColumnId::new(0)); - assert_eq!(c, cc); - - // test that attempting to create an already defined column of a different type returns - // error - let err = repos - .columns() - .create_or_get("column_test", table.id, ColumnType::U64) - .await - .expect_err("should error with wrong column type"); - assert!(matches!(err, Error::ColumnTypeMismatch { .. })); - - // test that we can create a column of the same name under a different table - let table2 = arbitrary_table(&mut *repos, "test_table_2", &namespace).await; - let ccc = repos - .columns() - .create_or_get("column_test", table2.id, ColumnType::U64) - .await - .unwrap(); - assert_ne!(c, ccc); - - let columns = repos - .columns() - .list_by_namespace_id(namespace.id) - .await - .unwrap(); - - let mut want = vec![c.clone(), ccc]; - assert_eq!(want, columns); - - let columns = repos.columns().list_by_table_id(table.id).await.unwrap(); - - let want2 = vec![c]; - assert_eq!(want2, columns); - - // Add another tag column into table2 - let c3 = repos - .columns() - .create_or_get("b", table2.id, ColumnType::Tag) - .await - .unwrap(); - - // Listing columns should return all columns in the catalog - let list = repos.columns().list().await.unwrap(); - want.extend([c3]); - assert_eq!(list, want); - - // test create_or_get_many_unchecked, below column limit - let mut columns = HashMap::new(); - columns.insert("column_test", ColumnType::Tag); - columns.insert("new_column", ColumnType::Tag); - let table1_columns = repos - .columns() - .create_or_get_many_unchecked(table.id, columns) - .await - .unwrap(); - let mut table1_column_names: Vec<_> = table1_columns.iter().map(|c| &c.name).collect(); - table1_column_names.sort(); - assert_eq!(table1_column_names, vec!["column_test", "new_column"]); - - // test per-namespace column limits - repos - .namespaces() - .update_column_limit("namespace_column_test", MaxColumnsPerTable::new(1)) - .await - .expect("namespace should be updateable"); - let err = repos - .columns() - .create_or_get("definitely unique", table.id, ColumnType::Tag) - .await - .expect_err("should error with table create limit error"); - assert!(matches!( - err, - Error::ColumnCreateLimitError { - column_name: _, - table_id: _, - } - )); - - // test per-namespace column limits are NOT enforced with create_or_get_many_unchecked - let table3 = arbitrary_table(&mut *repos, "test_table_3", &namespace).await; - let mut columns = HashMap::new(); - columns.insert("apples", ColumnType::Tag); - columns.insert("oranges", ColumnType::Tag); - let table3_columns = repos - .columns() - .create_or_get_many_unchecked(table3.id, columns) - .await - .unwrap(); - let mut table3_column_names: Vec<_> = table3_columns.iter().map(|c| &c.name).collect(); - table3_column_names.sort(); - assert_eq!(table3_column_names, vec!["apples", "oranges"]); - - repos - .namespaces() - .soft_delete("namespace_column_test") - .await - .expect("delete namespace should succeed"); - } - - async fn test_partition(catalog: Arc) { - let mut repos = catalog.repositories().await; - let namespace = arbitrary_namespace(&mut *repos, "namespace_partition_test").await; - let table = arbitrary_table(&mut *repos, "test_table", &namespace).await; - - let mut created = 
BTreeMap::new(); - // partition to use - let partition = repos - .partitions() - .create_or_get("foo".into(), table.id) - .await - .expect("failed to create partition"); - // Test: sort_key_ids from create_or_get - assert!(partition.sort_key_ids().is_empty()); - created.insert(partition.id, partition.clone()); - // partition to use - let partition_bar = repos - .partitions() - .create_or_get("bar".into(), table.id) - .await - .expect("failed to create partition"); - created.insert(partition_bar.id, partition_bar); - // partition to be skipped later - let to_skip_partition = repos - .partitions() - .create_or_get("asdf".into(), table.id) - .await - .unwrap(); - created.insert(to_skip_partition.id, to_skip_partition.clone()); - // partition to be skipped later - let to_skip_partition_too = repos - .partitions() - .create_or_get("asdf too".into(), table.id) - .await - .unwrap(); - created.insert(to_skip_partition_too.id, to_skip_partition_too.clone()); - - // partitions can be retrieved easily - let mut created_sorted = created.values().cloned().collect::>(); - created_sorted.sort_by_key(|p| p.id); - assert_eq!( - to_skip_partition, - repos - .partitions() - .get_by_id(to_skip_partition.id) - .await - .unwrap() - .unwrap() - ); - assert_eq!( - to_skip_partition, - repos - .partitions() - .get_by_hash_id(to_skip_partition.hash_id().unwrap()) - .await - .unwrap() - .unwrap() - ); - let non_existing_partition_id = PartitionId::new(i64::MAX); - let non_existing_partition_hash_id = - PartitionHashId::new(TableId::new(i64::MAX), &PartitionKey::from("arbitrary")); - assert!(repos - .partitions() - .get_by_id(non_existing_partition_id) - .await - .unwrap() - .is_none()); - assert!(repos - .partitions() - .get_by_hash_id(&non_existing_partition_hash_id) - .await - .unwrap() - .is_none()); - let mut batch = repos - .partitions() - .get_by_id_batch( - created - .keys() - .cloned() - .chain([non_existing_partition_id]) - .collect(), - ) - .await - .unwrap(); - batch.sort_by_key(|p| p.id); - assert_eq!(created_sorted, batch); - // Test: sort_key_ids from get_by_id_batch - assert!(batch.iter().all(|p| p.sort_key_ids().is_empty())); - let mut batch = repos - .partitions() - .get_by_hash_id_batch( - &created - .values() - .map(|p| p.hash_id().unwrap()) - .chain([&non_existing_partition_hash_id]) - .collect::>(), - ) - .await - .unwrap(); - batch.sort_by_key(|p| p.id); - // Test: sort_key_ids from get_by_hash_id_batch - assert!(batch.iter().all(|p| p.sort_key_ids().is_empty())); - assert_eq!(created_sorted, batch); - - let listed = repos - .partitions() - .list_by_table_id(table.id) - .await - .expect("failed to list partitions") - .into_iter() - .map(|v| (v.id, v)) - .collect::>(); - // Test: sort_key_ids from list_by_table_id - assert!(listed.values().all(|p| p.sort_key_ids().is_empty())); - - assert_eq!(created, listed); - - let listed = repos - .partitions() - .list_ids() - .await - .expect("failed to list partitions") - .into_iter() - .collect::>(); - - assert_eq!(created.keys().copied().collect::>(), listed); - - // The code no longer supports creating old-style partitions, so this list is always empty - // in these tests. See each catalog implementation for tests that insert old-style - // partitions directly and verify they're returned. 
- let old_style = repos.partitions().list_old_style().await.unwrap(); - assert!( - old_style.is_empty(), - "Expected no old-style partitions, got {old_style:?}" - ); - - // sort_key should be empty on creation - assert!(to_skip_partition.sort_key.is_empty()); - assert!(to_skip_partition.sort_key_ids.as_ref().is_empty()); - - // test that updates sort_key and sort_key_ids from None to Some - let updated_partition = repos - .partitions() - .cas_sort_key( - &to_skip_partition.transition_partition_id(), - None, - None, - &["tag2", "tag1", "time"], - &SortedColumnSet::from([2, 1, 3]), - ) - .await - .unwrap(); - - // verify sort_key and sort_key_ids are updated correctly - assert_eq!(updated_partition.sort_key, &["tag2", "tag1", "time"]); - assert_eq!( - updated_partition.sort_key_ids, - SortedColumnSet::from([2, 1, 3]) - ); - - // test that provides values of both old_sort_key and old_sort_key_ids but they do not match the existing ones - // --> the new sort key will not be updated - let err = repos - .partitions() - .cas_sort_key( - &to_skip_partition.transition_partition_id(), - Some(["bananas".to_string()].to_vec()), - Some(SortedColumnSet::from([1])), - &["tag2", "tag1", "tag3 , with comma", "time"], - &SortedColumnSet::from([1, 2, 3, 4]), - ) - .await - .expect_err("CAS with incorrect value should fail"); - // verify the sort key is not updated - assert_matches!(err, CasFailure::ValueMismatch((old_sort_key, old_sort_key_ids)) => { - assert_eq!(old_sort_key, &["tag2", "tag1", "time"]); - assert_eq!(old_sort_key_ids, SortedColumnSet::from([2, 1, 3])); - }); - - // test that provides matched old_sort_key but not-matched old_sort_key_ids - // --> the new sort key will not be updated - let err = repos - .partitions() - .cas_sort_key( - &to_skip_partition.transition_partition_id(), - Some(["tag2".to_string(), "tag1".to_string(), "time".to_string()].to_vec()), - Some(SortedColumnSet::from([1, 5, 10])), - &["tag2", "tag1", "tag3 , with comma", "time"], - &SortedColumnSet::from([1, 2, 3, 4]), - ) - .await - .expect_err("CAS with incorrect value should fail"); - // verify the sort key is not updated - assert_matches!(err, CasFailure::ValueMismatch((old_sort_key, old_sort_key_ids)) => { - assert_eq!(old_sort_key, &["tag2", "tag1", "time"]); - assert_eq!(old_sort_key_ids, SortedColumnSet::from([2, 1, 3])); - }); - - // test that provide None sort_key and None sort_key_ids that do not match with existing values that are not None - // --> the new sort key will not be updated - let err = repos - .partitions() - .cas_sort_key( - &to_skip_partition.transition_partition_id(), - None, - None, - &["tag2", "tag1", "tag3 , with comma", "time"], - &SortedColumnSet::from([1, 2, 3, 4]), - ) - .await - .expect_err("CAS with incorrect value should fail"); - assert_matches!(err, CasFailure::ValueMismatch((old_sort_key, old_sort_key_ids)) => { - assert_eq!(old_sort_key, &["tag2", "tag1", "time"]); - assert_eq!(old_sort_key_ids, SortedColumnSet::from([2, 1, 3])); - }); - - // test getting partition from partition id and verify values of sort_key and sort_key_ids - let updated_other_partition = repos - .partitions() - .get_by_id(to_skip_partition.id) - .await - .unwrap() - .unwrap(); - // still has the old sort key - assert_eq!( - updated_other_partition.sort_key, - vec!["tag2", "tag1", "time"] - ); - assert_eq!( - updated_other_partition.sort_key_ids, - SortedColumnSet::from([2, 1, 3]) - ); - - // test getting partition from hash_id and verify values of sort_key and sort_key_ids - let updated_other_partition = repos - 
.partitions() - .get_by_hash_id(to_skip_partition.hash_id().unwrap()) - .await - .unwrap() - .unwrap(); - // still has the old sort key - assert_eq!( - updated_other_partition.sort_key, - vec!["tag2", "tag1", "time"] - ); - assert_eq!( - updated_other_partition.sort_key_ids, - SortedColumnSet::from([2, 1, 3]) - ); - - // test that updates sort_key and sort_key_ids from Some matching values to Some other values - let updated_partition = repos - .partitions() - .cas_sort_key( - &to_skip_partition.transition_partition_id(), - Some( - ["tag2", "tag1", "time"] - .into_iter() - .map(ToString::to_string) - .collect(), - ), - Some(SortedColumnSet::from([2, 1, 3])), - &["tag2", "tag1", "tag3 , with comma", "time"], - &SortedColumnSet::from([2, 1, 4, 3]), - ) - .await - .unwrap(); - // verify the new values are updated - assert_eq!( - updated_partition.sort_key, - vec!["tag2", "tag1", "tag3 , with comma", "time"] - ); - assert_eq!( - updated_partition.sort_key_ids, - SortedColumnSet::from([2, 1, 4, 3]) - ); - - // test getting the new sort key from partition id - let updated_partition = repos - .partitions() - .get_by_id(to_skip_partition.id) - .await - .unwrap() - .unwrap(); - assert_eq!( - updated_partition.sort_key, - vec!["tag2", "tag1", "tag3 , with comma", "time"] - ); - assert_eq!( - updated_partition.sort_key_ids, - SortedColumnSet::from([2, 1, 4, 3]) - ); - - // test getting the new sort key from partition hash_id - let updated_partition = repos - .partitions() - .get_by_hash_id(to_skip_partition.hash_id().unwrap()) - .await - .unwrap() - .unwrap(); - assert_eq!( - updated_partition.sort_key, - vec!["tag2", "tag1", "tag3 , with comma", "time"] - ); - assert_eq!( - updated_partition.sort_key_ids, - SortedColumnSet::from([2, 1, 4, 3]) - ); - - // use to_skip_partition_too to update sort key from empty old values - // first make sure the old values are empty - assert!(to_skip_partition_too.sort_key.is_empty()); - assert!(to_skip_partition_too.sort_key_ids.as_ref().is_empty()); - - // test that provides empty old_sort_key and empty old_sort_key_ids - // --> the new sort key will be updated - let updated_to_skip_partition_too = repos - .partitions() - .cas_sort_key( - &to_skip_partition_too.transition_partition_id(), - Some(vec![]), - Some(SortedColumnSet::from([])), - &["tag3", "time"], - &SortedColumnSet::from([3, 4]), - ) - .await - .unwrap(); - // verify the new values are updated - assert_eq!(updated_to_skip_partition_too.sort_key, vec!["tag3", "time"]); - assert_eq!( - updated_to_skip_partition_too.sort_key_ids, - SortedColumnSet::from([3, 4]) - ); - - // The compactor can log why compaction was skipped - let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap(); - assert!( - skipped_compactions.is_empty(), - "Expected no skipped compactions, got: {skipped_compactions:?}" - ); - repos - .partitions() - .record_skipped_compaction(to_skip_partition.id, "I am le tired", 1, 2, 4, 10, 20) - .await - .unwrap(); - let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap(); - assert_eq!(skipped_compactions.len(), 1); - assert_eq!(skipped_compactions[0].partition_id, to_skip_partition.id); - assert_eq!(skipped_compactions[0].reason, "I am le tired"); - assert_eq!(skipped_compactions[0].num_files, 1); - assert_eq!(skipped_compactions[0].limit_num_files, 2); - assert_eq!(skipped_compactions[0].estimated_bytes, 10); - assert_eq!(skipped_compactions[0].limit_bytes, 20); - // - let skipped_partition_records = repos - .partitions() - 
.get_in_skipped_compactions(&[to_skip_partition.id]) - .await - .unwrap(); - assert_eq!( - skipped_partition_records[0].partition_id, - to_skip_partition.id - ); - assert_eq!(skipped_partition_records[0].reason, "I am le tired"); - - // Only save the last reason that any particular partition was skipped (really if the - // partition appears in the skipped compactions, it shouldn't become a compaction candidate - // again, but race conditions and all that) - repos - .partitions() - .record_skipped_compaction(to_skip_partition.id, "I'm on fire", 11, 12, 24, 110, 120) - .await - .unwrap(); - let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap(); - assert_eq!(skipped_compactions.len(), 1); - assert_eq!(skipped_compactions[0].partition_id, to_skip_partition.id); - assert_eq!(skipped_compactions[0].reason, "I'm on fire"); - assert_eq!(skipped_compactions[0].num_files, 11); - assert_eq!(skipped_compactions[0].limit_num_files, 12); - assert_eq!(skipped_compactions[0].estimated_bytes, 110); - assert_eq!(skipped_compactions[0].limit_bytes, 120); - // - let skipped_partition_records = repos - .partitions() - .get_in_skipped_compactions(&[to_skip_partition.id]) - .await - .unwrap(); - assert_eq!( - skipped_partition_records[0].partition_id, - to_skip_partition.id - ); - assert_eq!(skipped_partition_records[0].reason, "I'm on fire"); - - // Can receive multiple skipped compactions for different partitions - repos - .partitions() - .record_skipped_compaction( - to_skip_partition_too.id, - "I am le tired too", - 1, - 2, - 4, - 10, - 20, - ) - .await - .unwrap(); - let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap(); - assert_eq!(skipped_compactions.len(), 2); - assert_eq!(skipped_compactions[0].partition_id, to_skip_partition.id); - assert_eq!( - skipped_compactions[1].partition_id, - to_skip_partition_too.id - ); - // confirm can fetch subset of skipped compactions (a.k.a. 
have two, only fetch 1) - let skipped_partition_records = repos - .partitions() - .get_in_skipped_compactions(&[to_skip_partition.id]) - .await - .unwrap(); - assert_eq!(skipped_partition_records.len(), 1); - assert_eq!(skipped_compactions[0].partition_id, to_skip_partition.id); - let skipped_partition_records = repos - .partitions() - .get_in_skipped_compactions(&[to_skip_partition_too.id]) - .await - .unwrap(); - assert_eq!(skipped_partition_records.len(), 1); - assert_eq!( - skipped_partition_records[0].partition_id, - to_skip_partition_too.id - ); - // confirm can fetch both skipped compactions, and not the unskipped one - // also confirm will not error on non-existing partition - let non_existing_partition_id = PartitionId::new(9999); - let skipped_partition_records = repos - .partitions() - .get_in_skipped_compactions(&[ - partition.id, - to_skip_partition.id, - to_skip_partition_too.id, - non_existing_partition_id, - ]) - .await - .unwrap(); - assert_eq!(skipped_partition_records.len(), 2); - assert_eq!( - skipped_partition_records[0].partition_id, - to_skip_partition.id - ); - assert_eq!( - skipped_partition_records[1].partition_id, - to_skip_partition_too.id - ); - - // Delete the skipped compactions - let deleted_skipped_compaction = repos - .partitions() - .delete_skipped_compactions(to_skip_partition.id) - .await - .unwrap() - .expect("The skipped compaction should have been returned"); - assert_eq!( - deleted_skipped_compaction.partition_id, - to_skip_partition.id - ); - assert_eq!(deleted_skipped_compaction.reason, "I'm on fire"); - assert_eq!(deleted_skipped_compaction.num_files, 11); - assert_eq!(deleted_skipped_compaction.limit_num_files, 12); - assert_eq!(deleted_skipped_compaction.estimated_bytes, 110); - assert_eq!(deleted_skipped_compaction.limit_bytes, 120); - // - let deleted_skipped_compaction = repos - .partitions() - .delete_skipped_compactions(to_skip_partition_too.id) - .await - .unwrap() - .expect("The skipped compaction should have been returned"); - assert_eq!( - deleted_skipped_compaction.partition_id, - to_skip_partition_too.id - ); - assert_eq!(deleted_skipped_compaction.reason, "I am le tired too"); - // - let skipped_partition_records = repos - .partitions() - .get_in_skipped_compactions(&[to_skip_partition.id]) - .await - .unwrap(); - assert!(skipped_partition_records.is_empty()); - - let not_deleted_skipped_compaction = repos - .partitions() - .delete_skipped_compactions(to_skip_partition.id) - .await - .unwrap(); - - assert!( - not_deleted_skipped_compaction.is_none(), - "There should be no skipped compation", - ); - - let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap(); - assert!( - skipped_compactions.is_empty(), - "Expected no skipped compactions, got: {skipped_compactions:?}" - ); - - let recent = repos - .partitions() - .most_recent_n(10) - .await - .expect("should list most recent"); - assert_eq!(recent.len(), 4); - - // Test: sort_key_ids from most_recent_n - // Only the first two partitions (represent to_skip_partition_too and to_skip_partition) have vallues, the others are empty - let empty_vec_string: Vec = vec![]; - - assert_eq!( - recent[0].sort_key, - vec!["tag3".to_string(), "time".to_string(),] - ); - assert_eq!(recent[0].sort_key_ids, SortedColumnSet::from(vec![3, 4])); - - assert_eq!( - recent[1].sort_key, - vec![ - "tag2".to_string(), - "tag1".to_string(), - "tag3 , with comma".to_string(), - "time".to_string() - ] - ); - assert_eq!( - recent[1].sort_key_ids, - SortedColumnSet::from(vec![2, 1, 4, 
3]) - ); - - assert_eq!(recent[2].sort_key, empty_vec_string); - assert_eq!(recent[2].sort_key_ids, SortedColumnSet::from(vec![])); - - assert_eq!(recent[3].sort_key, empty_vec_string); - assert_eq!(recent[3].sort_key_ids, SortedColumnSet::from(vec![])); - - let recent = repos - .partitions() - .most_recent_n(4) - .await - .expect("should list most recent"); - assert_eq!(recent.len(), 4); // no off by one error - - let recent = repos - .partitions() - .most_recent_n(2) - .await - .expect("should list most recent"); - assert_eq!(recent.len(), 2); - - repos - .namespaces() - .soft_delete("namespace_partition_test") - .await - .expect("delete namespace should succeed"); - } - - /// tests many interactions with the catalog and parquet files. See the individual conditions - /// herein - async fn test_parquet_file(catalog: Arc) { - let mut repos = catalog.repositories().await; - let namespace = arbitrary_namespace(&mut *repos, "namespace_parquet_file_test").await; - let table = arbitrary_table(&mut *repos, "test_table", &namespace).await; - let other_table = arbitrary_table(&mut *repos, "other", &namespace).await; - let partition = repos - .partitions() - .create_or_get("one".into(), table.id) - .await - .unwrap(); - let other_partition = repos - .partitions() - .create_or_get("one".into(), other_table.id) - .await - .unwrap(); - - let parquet_file_params = arbitrary_parquet_file_params(&namespace, &table, &partition); - let parquet_file = repos - .parquet_files() - .create(parquet_file_params.clone()) - .await - .unwrap(); - - // verify we can get it by its object store id - let pfg = repos - .parquet_files() - .get_by_object_store_id(parquet_file.object_store_id) - .await - .unwrap(); - assert_eq!(parquet_file, pfg.unwrap()); - - // verify that trying to create a file with the same UUID throws an error - let err = repos - .parquet_files() - .create(parquet_file_params.clone()) - .await - .unwrap_err(); - assert!(matches!(err, Error::FileExists { object_store_id: _ })); - - let other_params = ParquetFileParams { - table_id: other_partition.table_id, - partition_id: other_partition.transition_partition_id(), - object_store_id: Uuid::new_v4(), - min_time: Timestamp::new(50), - max_time: Timestamp::new(60), - ..parquet_file_params.clone() - }; - let other_file = repos.parquet_files().create(other_params).await.unwrap(); - - let exist_id = parquet_file.id; - let non_exist_id = ParquetFileId::new(other_file.id.get() + 10); - // make sure exists_id != non_exist_id - assert_ne!(exist_id, non_exist_id); - - // verify that to_delete is initially set to null and the file does not get deleted - assert!(parquet_file.to_delete.is_none()); - let older_than = Timestamp::new( - (catalog.time_provider().now() + Duration::from_secs(100)).timestamp_nanos(), - ); - let deleted = repos - .parquet_files() - .delete_old_ids_only(older_than) - .await - .unwrap(); - assert!(deleted.is_empty()); - - // test list_all that includes soft-deleted file - // at this time the file is not soft-deleted yet and will be included in the returned list - let files = repos.parquet_files().list_all().await.unwrap(); - assert_eq!(files.len(), 2); - - // verify to_delete can be updated to a timestamp - repos - .parquet_files() - .create_upgrade_delete(&[parquet_file.id], &[], &[], CompactionLevel::Initial) - .await - .unwrap(); - - // test list_all that includes soft-deleted file - // at this time the file is soft-deleted and will be included in the returned list - let files = repos.parquet_files().list_all().await.unwrap(); - 
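A minimal sketch of the two-step parquet file removal that the assertions above exercise, written against the pre-change interface used in these tests; the wrapper name, the `older_than` parameter, and the exact `Result` shape are assumptions for illustration only.

use data_types::{CompactionLevel, ParquetFileId, Timestamp};

use crate::interface::{Error, RepoCollection};

/// Sketch: soft-delete one file, then hard-delete every row whose `to_delete`
/// timestamp is older than `older_than`, returning the removed ids.
async fn retire_file<R>(
    repos: &mut R,
    id: ParquetFileId,
    older_than: Timestamp,
) -> Result<Vec<ParquetFileId>, Error>
where
    R: RepoCollection + ?Sized,
{
    // Step 1: mark the file as deleted (sets `to_delete`) without creating or
    // upgrading anything else; the level argument only matters for upgrades.
    repos
        .parquet_files()
        .create_upgrade_delete(&[id], &[], &[], CompactionLevel::Initial)
        .await?;

    // Step 2: hard-delete rows marked before the cutoff. Rows with
    // `to_delete = NULL` are never removed by this call.
    repos.parquet_files().delete_old_ids_only(older_than).await
}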
assert_eq!(files.len(), 2); - let marked_deleted = files - .iter() - .find(|f| f.to_delete.is_some()) - .cloned() - .unwrap(); - - // File is not deleted if it was marked to be deleted after the specified time - let before_deleted = Timestamp::new( - (catalog.time_provider().now() - Duration::from_secs(100)).timestamp_nanos(), - ); - let deleted = repos - .parquet_files() - .delete_old_ids_only(before_deleted) - .await - .unwrap(); - assert!(deleted.is_empty()); - - // test list_all that includes soft-deleted file - // at this time the file is not actually hard deleted yet and stay as soft deleted - // and will be returned in the list - let files = repos.parquet_files().list_all().await.unwrap(); - assert_eq!(files.len(), 2); - - // File is deleted if it was marked to be deleted before the specified time - let deleted = repos - .parquet_files() - .delete_old_ids_only(older_than) - .await - .unwrap(); - assert_eq!(deleted.len(), 1); - assert_eq!(marked_deleted.id, deleted[0]); - - // test list_all that includes soft-deleted file - // at this time the file is hard deleted -> the returned list is empty - let files = repos.parquet_files().list_all().await.unwrap(); - assert_eq!(files.len(), 1); - - // test list_by_table_not_to_delete - let files = repos - .parquet_files() - .list_by_table_not_to_delete(table.id) - .await - .unwrap(); - assert_eq!(files, vec![]); - let files = repos - .parquet_files() - .list_by_table_not_to_delete(other_table.id) - .await - .unwrap(); - assert_eq!(files, vec![other_file.clone()]); - - // test list_all - let files = repos.parquet_files().list_all().await.unwrap(); - assert_eq!(vec![other_file.clone()], files); - - // test list_by_namespace_not_to_delete - let namespace2 = arbitrary_namespace(&mut *repos, "namespace_parquet_file_test1").await; - let table2 = arbitrary_table(&mut *repos, "test_table2", &namespace2).await; - let partition2 = repos - .partitions() - .create_or_get("foo".into(), table2.id) - .await - .unwrap(); - let files = repos - .parquet_files() - .list_by_namespace_not_to_delete(namespace2.id) - .await - .unwrap(); - assert!(files.is_empty()); - - let f1_params = ParquetFileParams { - table_id: partition2.table_id, - partition_id: partition2.transition_partition_id(), - object_store_id: Uuid::new_v4(), - min_time: Timestamp::new(1), - max_time: Timestamp::new(10), - ..parquet_file_params - }; - let f1 = repos - .parquet_files() - .create(f1_params.clone()) - .await - .unwrap(); - - let f2_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - min_time: Timestamp::new(50), - max_time: Timestamp::new(60), - ..f1_params.clone() - }; - let f2 = repos - .parquet_files() - .create(f2_params.clone()) - .await - .unwrap(); - let files = repos - .parquet_files() - .list_by_namespace_not_to_delete(namespace2.id) - .await - .unwrap(); - assert_eq!(vec![f1.clone(), f2.clone()], files); - - let f3_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - min_time: Timestamp::new(50), - max_time: Timestamp::new(60), - ..f2_params - }; - let f3 = repos - .parquet_files() - .create(f3_params.clone()) - .await - .unwrap(); - let files = repos - .parquet_files() - .list_by_namespace_not_to_delete(namespace2.id) - .await - .unwrap(); - assert_eq!(vec![f1.clone(), f2.clone(), f3.clone()], files); - - repos - .parquet_files() - .create_upgrade_delete(&[f2.id], &[], &[], CompactionLevel::Initial) - .await - .unwrap(); - let files = repos - .parquet_files() - .list_by_namespace_not_to_delete(namespace2.id) - .await - .unwrap(); - 
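The `create_upgrade_delete` call used above (and again below for the f5/f1/f6 case) is effectively the compactor's commit point: it soft-deletes the inputs, bumps the level of kept files, and registers the new outputs in one shot. A hedged sketch of that usage, following the argument order shown in these tests; the wrapper name and the `Result<(), Error>` return are illustrative.

use data_types::{CompactionLevel, ParquetFileId, ParquetFileParams};

use crate::interface::{Error, RepoCollection};

/// Sketch: commit one compaction round in a single catalog call.
async fn commit_compaction<R>(
    repos: &mut R,
    delete: &[ParquetFileId],
    upgrade: &[ParquetFileId],
    create: &[ParquetFileParams],
    target_level: CompactionLevel,
) -> Result<(), Error>
where
    R: RepoCollection + ?Sized,
{
    // All three lists are applied together; if a new file collides on its object
    // store id the call fails (surfaced as a file-exists error in this interface)
    // and none of the deletes or upgrades take effect.
    let _created = repos
        .parquet_files()
        .create_upgrade_delete(delete, upgrade, create, target_level)
        .await?;
    Ok(())
}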
assert_eq!(vec![f1.clone(), f3.clone()], files); - - let files = repos - .parquet_files() - .list_by_namespace_not_to_delete(NamespaceId::new(i64::MAX)) - .await - .unwrap(); - assert!(files.is_empty()); - - // test delete_old_ids_only - let older_than = Timestamp::new( - (catalog.time_provider().now() + Duration::from_secs(100)).timestamp_nanos(), - ); - let ids = repos - .parquet_files() - .delete_old_ids_only(older_than) - .await - .unwrap(); - assert_eq!(ids.len(), 1); - - // test retention-based flagging for deletion - // Since mem catalog has default retention 1 hour, let us first set it to 0 means infinite - let namespaces = repos - .namespaces() - .list(SoftDeletedRows::AllRows) - .await - .expect("listing namespaces"); - for namespace in namespaces { - repos - .namespaces() - .update_retention_period(&namespace.name, None) // infinite - .await - .unwrap(); - } - - // 1. with no retention period set on the ns, nothing should get flagged - let ids = repos - .parquet_files() - .flag_for_delete_by_retention() - .await - .unwrap(); - assert!(ids.is_empty()); - // 2. set ns retention period to one hour then create some files before and after and - // ensure correct files get deleted - repos - .namespaces() - .update_retention_period(&namespace.name, Some(60 * 60 * 1_000_000_000)) // 1 hour - .await - .unwrap(); - let f4_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - max_time: Timestamp::new( - // a bit over an hour ago - (catalog.time_provider().now() - Duration::from_secs(60 * 65)).timestamp_nanos(), - ), - ..f3_params - }; - let f4 = repos - .parquet_files() - .create(f4_params.clone()) - .await - .unwrap(); - let f5_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - max_time: Timestamp::new( - // a bit under an hour ago - (catalog.time_provider().now() - Duration::from_secs(60 * 55)).timestamp_nanos(), - ), - ..f4_params - }; - let f5 = repos - .parquet_files() - .create(f5_params.clone()) - .await - .unwrap(); - let ids = repos - .parquet_files() - .flag_for_delete_by_retention() - .await - .unwrap(); - assert!(ids.len() > 1); // it's also going to flag f1, f2 & f3 because they have low max - // timestamps but i don't want this test to be brittle if those - // values change so i'm not asserting len == 4 - let f4 = repos - .parquet_files() - .get_by_object_store_id(f4.object_store_id) - .await - .unwrap() - .unwrap(); - assert_matches!(f4.to_delete, Some(_)); // f4 is > 1hr old - let f5 = repos - .parquet_files() - .get_by_object_store_id(f5.object_store_id) - .await - .unwrap() - .unwrap(); - assert_matches!(f5.to_delete, None); // f5 is < 1hr old - - // call flag_for_delete_by_retention() again and nothing should be flagged because they've - // already been flagged - let ids = repos - .parquet_files() - .flag_for_delete_by_retention() - .await - .unwrap(); - assert!(ids.is_empty()); - - // test that flag_for_delete_by_retention respects UPDATE LIMIT - // create limit + the meaning of life parquet files that are all older than the retention (>1hr) - const LIMIT: usize = 1000; - const MOL: usize = 42; - for _ in 0..LIMIT + MOL { - let params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - max_time: Timestamp::new( - // a bit over an hour ago - (catalog.time_provider().now() - Duration::from_secs(60 * 65)) - .timestamp_nanos(), - ), - ..f1_params.clone() - }; - repos.parquet_files().create(params.clone()).await.unwrap(); - } - let ids = repos - .parquet_files() - .flag_for_delete_by_retention() - .await - .unwrap(); - assert_eq!(ids.len(), 
LIMIT); - let ids = repos - .parquet_files() - .flag_for_delete_by_retention() - .await - .unwrap(); - assert_eq!(ids.len(), MOL); // second call took remainder - let ids = repos - .parquet_files() - .flag_for_delete_by_retention() - .await - .unwrap(); - assert_eq!(ids.len(), 0); // none left - - // test create_update_delete - let f6_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - ..f5_params - }; - let f1_uuid = f1.object_store_id; - let f5_uuid = f5.object_store_id; - let cud = repos - .parquet_files() - .create_upgrade_delete( - &[f5.id], - &[f1.id], - &[f6_params.clone()], - CompactionLevel::Final, - ) - .await - .unwrap(); - - assert_eq!(cud.len(), 1); - let f5_delete = repos - .parquet_files() - .get_by_object_store_id(f5_uuid) - .await - .unwrap() - .unwrap(); - assert_matches!(f5_delete.to_delete, Some(_)); - - let f1_compaction_level = repos - .parquet_files() - .get_by_object_store_id(f1_uuid) - .await - .unwrap() - .unwrap(); - assert_matches!(f1_compaction_level.compaction_level, CompactionLevel::Final); - - let f6 = repos - .parquet_files() - .get_by_object_store_id(f6_params.object_store_id) - .await - .unwrap() - .unwrap(); - - let f6_uuid = f6.object_store_id; - - // test create_update_delete transaction (rollsback because f6 already exists) - let cud = repos - .parquet_files() - .create_upgrade_delete( - &[f5.id], - &[f2.id], - &[f6_params.clone()], - CompactionLevel::Final, - ) - .await; - - assert_matches!( - cud, - Err(Error::FileExists { - object_store_id - }) if object_store_id == f6_params.object_store_id - ); - - let f6_not_delete = repos - .parquet_files() - .get_by_object_store_id(f6_uuid) - .await - .unwrap() - .unwrap(); - assert_matches!(f6_not_delete.to_delete, None); - - // test exists_by_object_store_id_batch returns parquet files by object store id - let does_not_exist = Uuid::new_v4(); - let mut present = repos - .parquet_files() - .exists_by_object_store_id_batch(vec![f6_uuid, f1_uuid, does_not_exist]) - .await - .unwrap(); - assert_eq!(present.len(), 2); - let mut expected = vec![f6_uuid, f1_uuid]; - present.sort(); - expected.sort(); - assert_eq!(present, expected); - } - - async fn test_parquet_file_delete_broken(catalog: Arc) { - let mut repos = catalog.repositories().await; - let namespace_1 = arbitrary_namespace(&mut *repos, "retention_broken_1").await; - let namespace_2 = repos - .namespaces() - .create( - &NamespaceName::new("retention_broken_2").unwrap(), - None, - Some(1), - None, - ) - .await - .unwrap(); - let table_1 = arbitrary_table(&mut *repos, "test_table", &namespace_1).await; - let table_2 = arbitrary_table(&mut *repos, "test_table", &namespace_2).await; - let partition_1 = repos - .partitions() - .create_or_get("one".into(), table_1.id) - .await - .unwrap(); - let partition_2 = repos - .partitions() - .create_or_get("one".into(), table_2.id) - .await - .unwrap(); - - let parquet_file_params_1 = - arbitrary_parquet_file_params(&namespace_1, &table_1, &partition_1); - let parquet_file_params_2 = - arbitrary_parquet_file_params(&namespace_2, &table_2, &partition_2); - let _parquet_file_1 = repos - .parquet_files() - .create(parquet_file_params_1) - .await - .unwrap(); - let parquet_file_2 = repos - .parquet_files() - .create(parquet_file_params_2) - .await - .unwrap(); - - let ids = repos - .parquet_files() - .flag_for_delete_by_retention() - .await - .unwrap(); - assert_eq!(ids, vec![parquet_file_2.id]); - } - - async fn test_partitions_new_file_between(catalog: Arc) { - let mut repos = catalog.repositories().await; - 
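Before the partition-window tests below, a note on the batching behaviour exercised just above: `flag_for_delete_by_retention` flags at most one batch per call (1000 rows in the test), so a caller that wants everything flagged loops until a call returns nothing. A sketch of that drain loop, with the function name and return shape assumed.

use data_types::ParquetFileId;

use crate::interface::{Error, RepoCollection};

/// Sketch: repeatedly flag retention-expired files until no more are eligible.
async fn flag_all_expired<R>(repos: &mut R) -> Result<Vec<ParquetFileId>, Error>
where
    R: RepoCollection + ?Sized,
{
    let mut all = Vec::new();
    loop {
        // Each call flags at most one batch and returns the ids it touched.
        let batch = repos.parquet_files().flag_for_delete_by_retention().await?;
        if batch.is_empty() {
            break;
        }
        all.extend(batch);
    }
    Ok(all)
}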
let namespace = arbitrary_namespace(&mut *repos, "test_partitions_new_file_between").await; - let table = - arbitrary_table(&mut *repos, "test_table_for_new_file_between", &namespace).await; - - // param for the tests - let time_now = Timestamp::from(catalog.time_provider().now()); - let time_one_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(1)); - let time_two_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(2)); - let time_three_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(3)); - let time_five_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(5)); - let time_six_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(6)); - - // Db has no partitions - let partitions = repos - .partitions() - .partitions_new_file_between(time_two_hour_ago, None) - .await - .unwrap(); - assert!(partitions.is_empty()); - - // ----------------- - // PARTITION one - // The DB has 1 partition but it does not have any file - let partition1 = repos - .partitions() - .create_or_get("one".into(), table.id) - .await - .unwrap(); - let partitions = repos - .partitions() - .partitions_new_file_between(time_two_hour_ago, None) - .await - .unwrap(); - assert!(partitions.is_empty()); - - // create files for partition one - let parquet_file_params = arbitrary_parquet_file_params(&namespace, &table, &partition1); - - // create a deleted L0 file that was created 3 hours ago - let delete_l0_file = repos - .parquet_files() - .create(parquet_file_params.clone()) - .await - .unwrap(); - repos - .parquet_files() - .create_upgrade_delete(&[delete_l0_file.id], &[], &[], CompactionLevel::Initial) - .await - .unwrap(); - let partitions = repos - .partitions() - .partitions_new_file_between(time_two_hour_ago, None) - .await - .unwrap(); - assert!(partitions.is_empty()); - let partitions = repos - .partitions() - .partitions_new_file_between(time_two_hour_ago, Some(time_one_hour_ago)) - .await - .unwrap(); - assert!(partitions.is_empty()); - let partitions = repos - .partitions() - .partitions_new_file_between(time_three_hour_ago, Some(time_one_hour_ago)) - .await - .unwrap(); - assert!(partitions.is_empty()); - - // create a deleted L0 file that was created 1 hour ago - let l0_one_hour_ago_file_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - created_at: time_one_hour_ago, - ..parquet_file_params.clone() - }; - repos - .parquet_files() - .create(l0_one_hour_ago_file_params.clone()) - .await - .unwrap(); - // partition one should be returned - let partitions = repos - .partitions() - .partitions_new_file_between(time_two_hour_ago, None) - .await - .unwrap(); - assert_eq!(partitions.len(), 1); - assert_eq!(partitions[0], partition1.id); - let partitions = repos - .partitions() - .partitions_new_file_between(time_two_hour_ago, Some(time_now)) - .await - .unwrap(); - assert_eq!(partitions.len(), 1); - assert_eq!(partitions[0], partition1.id); - let partitions = repos - .partitions() - .partitions_new_file_between(time_three_hour_ago, Some(time_now)) - .await - .unwrap(); - assert_eq!(partitions.len(), 1); - assert_eq!(partitions[0], partition1.id); - let partitions = repos - .partitions() - .partitions_new_file_between(time_three_hour_ago, Some(time_two_hour_ago)) - .await - .unwrap(); - assert!(partitions.is_empty()); - - // ----------------- - // PARTITION two - // Partition two without any file - let partition2 = repos - .partitions() - .create_or_get("two".into(), table.id) - .await - .unwrap(); - // should return partition one only - let partitions = repos 
- .partitions() - .partitions_new_file_between(time_two_hour_ago, None) - .await - .unwrap(); - assert_eq!(partitions.len(), 1); - assert_eq!(partitions[0], partition1.id); - let partitions = repos - .partitions() - .partitions_new_file_between(time_three_hour_ago, Some(time_now)) - .await - .unwrap(); - assert_eq!(partitions.len(), 1); - assert_eq!(partitions[0], partition1.id); - - // Add a L0 file created 5 hours ago - let l0_five_hour_ago_file_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - created_at: time_five_hour_ago, - partition_id: partition2.transition_partition_id(), - ..parquet_file_params.clone() - }; - repos - .parquet_files() - .create(l0_five_hour_ago_file_params.clone()) - .await - .unwrap(); - // still return partition one only - let partitions = repos - .partitions() - .partitions_new_file_between(time_two_hour_ago, None) - .await - .unwrap(); - assert_eq!(partitions.len(), 1); - assert_eq!(partitions[0], partition1.id); - let partitions = repos - .partitions() - .partitions_new_file_between(time_three_hour_ago, Some(time_now)) - .await - .unwrap(); - assert_eq!(partitions.len(), 1); - assert_eq!(partitions[0], partition1.id); - let partitions = repos - .partitions() - .partitions_new_file_between(time_three_hour_ago, Some(time_now)) - .await - .unwrap(); - assert_eq!(partitions.len(), 1); - assert_eq!(partitions[0], partition1.id); - // Between six and three hours ago, return only partition 2 - let partitions = repos - .partitions() - .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) - .await - .unwrap(); - assert_eq!(partitions.len(), 1); - assert_eq!(partitions[0], partition2.id); - - // Add an L1 file created just now - let l1_file_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - created_at: time_now, - partition_id: partition2.transition_partition_id(), - compaction_level: CompactionLevel::FileNonOverlapped, - ..parquet_file_params.clone() - }; - repos - .parquet_files() - .create(l1_file_params.clone()) - .await - .unwrap(); - // should return both partitions - let mut partitions = repos - .partitions() - .partitions_new_file_between(time_two_hour_ago, None) - .await - .unwrap(); - assert_eq!(partitions.len(), 2); - partitions.sort(); - assert_eq!(partitions[0], partition1.id); - assert_eq!(partitions[1], partition2.id); - // Only return partition1: the creation time must be strictly less than the maximum time, - // not equal - let mut partitions = repos - .partitions() - .partitions_new_file_between(time_three_hour_ago, Some(time_now)) - .await - .unwrap(); - assert_eq!(partitions.len(), 1); - partitions.sort(); - assert_eq!(partitions[0], partition1.id); - // Between six and three hours ago, return none - let partitions = repos - .partitions() - .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) - .await - .unwrap(); - assert!(partitions.is_empty()); - - // ----------------- - // PARTITION three - // Partition three without any file - let partition3 = repos - .partitions() - .create_or_get("three".into(), table.id) - .await - .unwrap(); - // should return partition one and two only - let mut partitions = repos - .partitions() - .partitions_new_file_between(time_two_hour_ago, None) - .await - .unwrap(); - assert_eq!(partitions.len(), 2); - partitions.sort(); - assert_eq!(partitions[0], partition1.id); - assert_eq!(partitions[1], partition2.id); - // Only return partition1: the creation time must be strictly less than the maximum time, - // not equal - let partitions = repos - 
.partitions() - .partitions_new_file_between(time_three_hour_ago, Some(time_now)) - .await - .unwrap(); - assert_eq!(partitions.len(), 1); - assert_eq!(partitions[0], partition1.id); - // When the maximum time is greater than the creation time of partition2, return it - let mut partitions = repos - .partitions() - .partitions_new_file_between(time_three_hour_ago, Some(time_now + 1)) - .await - .unwrap(); - assert_eq!(partitions.len(), 2); - partitions.sort(); - assert_eq!(partitions[0], partition1.id); - assert_eq!(partitions[1], partition2.id); - // Between six and three hours ago, return none - let partitions = repos - .partitions() - .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) - .await - .unwrap(); - assert!(partitions.is_empty()); - - // Add an L2 file created just now for partition three - let l2_file_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - created_at: time_now, - partition_id: partition3.transition_partition_id(), - compaction_level: CompactionLevel::Final, - ..parquet_file_params.clone() - }; - repos - .parquet_files() - .create(l2_file_params.clone()) - .await - .unwrap(); - // now should return partition one two and three - let mut partitions = repos - .partitions() - .partitions_new_file_between(time_two_hour_ago, None) - .await - .unwrap(); - assert_eq!(partitions.len(), 3); - partitions.sort(); - assert_eq!(partitions[0], partition1.id); - assert_eq!(partitions[1], partition2.id); - assert_eq!(partitions[2], partition3.id); - // Only return partition1: the creation time must be strictly less than the maximum time, - // not equal - let partitions = repos - .partitions() - .partitions_new_file_between(time_three_hour_ago, Some(time_now)) - .await - .unwrap(); - assert_eq!(partitions.len(), 1); - assert_eq!(partitions[0], partition1.id); - // Between six and three hours ago, return none - let partitions = repos - .partitions() - .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) - .await - .unwrap(); - assert!(partitions.is_empty()); - - // add an L0 file created one hour ago for partition three - let l0_one_hour_ago_file_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - created_at: time_one_hour_ago, - partition_id: partition3.transition_partition_id(), - ..parquet_file_params.clone() - }; - repos - .parquet_files() - .create(l0_one_hour_ago_file_params.clone()) - .await - .unwrap(); - // should return all partitions - let mut partitions = repos - .partitions() - .partitions_new_file_between(time_two_hour_ago, None) - .await - .unwrap(); - assert_eq!(partitions.len(), 3); - partitions.sort(); - assert_eq!(partitions[0], partition1.id); - assert_eq!(partitions[1], partition2.id); - assert_eq!(partitions[2], partition3.id); - // Only return partitions 1 and 3; 2 was created just now - let mut partitions = repos - .partitions() - .partitions_new_file_between(time_three_hour_ago, Some(time_now)) - .await - .unwrap(); - assert_eq!(partitions.len(), 2); - partitions.sort(); - assert_eq!(partitions[0], partition1.id); - assert_eq!(partitions[1], partition3.id); - // Between six and three hours ago, return none - let partitions = repos - .partitions() - .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) - .await - .unwrap(); - assert!(partitions.is_empty()); - } - - async fn test_list_by_partiton_not_to_delete(catalog: Arc) { - let mut repos = catalog.repositories().await; - let namespace = arbitrary_namespace( - &mut *repos, - 
"namespace_parquet_file_test_list_by_partiton_not_to_delete", - ) - .await; - let table = arbitrary_table(&mut *repos, "test_table", &namespace).await; - - let partition = repos - .partitions() - .create_or_get("test_list_by_partiton_not_to_delete_one".into(), table.id) - .await - .unwrap(); - let partition2 = repos - .partitions() - .create_or_get("test_list_by_partiton_not_to_delete_two".into(), table.id) - .await - .unwrap(); - - let parquet_file_params = arbitrary_parquet_file_params(&namespace, &table, &partition); - - let parquet_file = repos - .parquet_files() - .create(parquet_file_params.clone()) - .await - .unwrap(); - let delete_file_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - ..parquet_file_params.clone() - }; - let delete_file = repos - .parquet_files() - .create(delete_file_params) - .await - .unwrap(); - repos - .parquet_files() - .create_upgrade_delete(&[delete_file.id], &[], &[], CompactionLevel::Initial) - .await - .unwrap(); - let level1_file_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - ..parquet_file_params.clone() - }; - let mut level1_file = repos - .parquet_files() - .create(level1_file_params) - .await - .unwrap(); - repos - .parquet_files() - .create_upgrade_delete( - &[], - &[level1_file.id], - &[], - CompactionLevel::FileNonOverlapped, - ) - .await - .unwrap(); - level1_file.compaction_level = CompactionLevel::FileNonOverlapped; - - let other_partition_params = ParquetFileParams { - partition_id: partition2.transition_partition_id(), - object_store_id: Uuid::new_v4(), - ..parquet_file_params.clone() - }; - let _partition2_file = repos - .parquet_files() - .create(other_partition_params) - .await - .unwrap(); - - let files = repos - .parquet_files() - .list_by_partition_not_to_delete(&partition.transition_partition_id()) - .await - .unwrap(); - assert_eq!(files.len(), 2); - - let mut file_ids: Vec<_> = files.into_iter().map(|f| f.id).collect(); - file_ids.sort(); - let mut expected_ids = vec![parquet_file.id, level1_file.id]; - expected_ids.sort(); - assert_eq!(file_ids, expected_ids); - - // Using the catalog partition ID should return the same files, even if the Parquet file - // records don't have the partition ID on them (which is the default now) - let files = repos - .parquet_files() - .list_by_partition_not_to_delete(&TransitionPartitionId::Deprecated(partition.id)) - .await - .unwrap(); - assert_eq!(files.len(), 2); - - let mut file_ids: Vec<_> = files.into_iter().map(|f| f.id).collect(); - file_ids.sort(); - let mut expected_ids = vec![parquet_file.id, level1_file.id]; - expected_ids.sort(); - assert_eq!(file_ids, expected_ids); - } - - async fn test_update_to_compaction_level_1(catalog: Arc) { - let mut repos = catalog.repositories().await; - let namespace = - arbitrary_namespace(&mut *repos, "namespace_update_to_compaction_level_1_test").await; - let table = arbitrary_table(&mut *repos, "update_table", &namespace).await; - let partition = repos - .partitions() - .create_or_get("test_update_to_compaction_level_1_one".into(), table.id) - .await - .unwrap(); - - // Set up the window of times we're interested in level 1 files for - let query_min_time = Timestamp::new(5); - let query_max_time = Timestamp::new(10); - - // Create a file with times entirely within the window - let mut parquet_file_params = arbitrary_parquet_file_params(&namespace, &table, &partition); - parquet_file_params.min_time = query_min_time + 1; - parquet_file_params.max_time = query_max_time - 1; - let parquet_file = repos - .parquet_files() - 
.create(parquet_file_params.clone()) - .await - .unwrap(); - - // Create a file that will remain as level 0 - let level_0_params = ParquetFileParams { - object_store_id: Uuid::new_v4(), - ..parquet_file_params.clone() - }; - let level_0_file = repos.parquet_files().create(level_0_params).await.unwrap(); - - // Create a ParquetFileId that doesn't actually exist in the catalog - let nonexistent_parquet_file_id = ParquetFileId::new(level_0_file.id.get() + 1); - - // Make parquet_file compaction level 1, attempt to mark the nonexistent file; operation - // should succeed - let created = repos - .parquet_files() - .create_upgrade_delete( - &[], - &[parquet_file.id, nonexistent_parquet_file_id], - &[], - CompactionLevel::FileNonOverlapped, - ) - .await - .unwrap(); - assert_eq!(created, vec![]); - - // remove namespace to avoid it from affecting later tests - repos - .namespaces() - .soft_delete("namespace_update_to_compaction_level_1_test") - .await - .expect("delete namespace should succeed"); - } - - /// Assert that a namespace deletion does NOT cascade to the tables/schema - /// items/parquet files/etc. - /// - /// Removal of this entities breaks the invariant that once created, a row - /// always exists for the lifetime of an IOx process, and causes the system - /// to panic in multiple components. It's also ineffective, because most - /// components maintain a cache of at least one of these entities. - /// - /// Instead soft deleted namespaces should have their files GC'd like a - /// normal parquet file deletion, removing the rows once they're no longer - /// being actively used by the system. This is done by waiting a long time - /// before deleting records, and whilst isn't perfect, it is largely - /// effective. - async fn test_delete_namespace(catalog: Arc) { - let mut repos = catalog.repositories().await; - let namespace_1 = - arbitrary_namespace(&mut *repos, "namespace_test_delete_namespace_1").await; - let table_1 = arbitrary_table(&mut *repos, "test_table_1", &namespace_1).await; - let _c = repos - .columns() - .create_or_get("column_test_1", table_1.id, ColumnType::Tag) - .await - .unwrap(); - let partition_1 = repos - .partitions() - .create_or_get("test_delete_namespace_one".into(), table_1.id) - .await - .unwrap(); - - // parquet files - let parquet_file_params = - arbitrary_parquet_file_params(&namespace_1, &table_1, &partition_1); - repos - .parquet_files() - .create(parquet_file_params.clone()) - .await - .unwrap(); - let parquet_file_params_2 = ParquetFileParams { - object_store_id: Uuid::new_v4(), - min_time: Timestamp::new(200), - max_time: Timestamp::new(300), - ..parquet_file_params - }; - repos - .parquet_files() - .create(parquet_file_params_2.clone()) - .await - .unwrap(); - - // we've now created a namespace with a table and parquet files. before we test deleting - // it, let's create another so we can ensure that doesn't get deleted. 
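As the doc comment above spells out, and as the rest of this test verifies, namespace deletion is a soft delete that must not cascade to tables, columns, partitions, or parquet files. A small sketch of those semantics against the same repo traits; only the helper name is an assumption.

use crate::interface::{Error, RepoCollection, SoftDeletedRows};

/// Sketch: soft-deleting a namespace only sets `deleted_at`; the row (and all of
/// its children) remains in the catalog and stays readable via `AllRows`.
async fn soft_delete_namespace<R>(repos: &mut R, name: &str) -> Result<(), Error>
where
    R: RepoCollection + ?Sized,
{
    repos.namespaces().soft_delete(name).await?;

    // Hidden from the default view...
    let hidden = repos
        .namespaces()
        .get_by_name(name, SoftDeletedRows::ExcludeDeleted)
        .await?;
    assert!(hidden.is_none());

    // ...but still present, with the deletion timestamp set.
    let still_there = repos
        .namespaces()
        .get_by_name(name, SoftDeletedRows::AllRows)
        .await?;
    assert!(still_there.is_some_and(|ns| ns.deleted_at.is_some()));
    Ok(())
}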
- let namespace_2 = - arbitrary_namespace(&mut *repos, "namespace_test_delete_namespace_2").await; - let table_2 = arbitrary_table(&mut *repos, "test_table_2", &namespace_2).await; - let _c = repos - .columns() - .create_or_get("column_test_2", table_2.id, ColumnType::Tag) - .await - .unwrap(); - let partition_2 = repos - .partitions() - .create_or_get("test_delete_namespace_two".into(), table_2.id) - .await - .unwrap(); - - // parquet files - let parquet_file_params = - arbitrary_parquet_file_params(&namespace_2, &table_2, &partition_2); - repos - .parquet_files() - .create(parquet_file_params.clone()) - .await - .unwrap(); - let parquet_file_params_2 = ParquetFileParams { - object_store_id: Uuid::new_v4(), - min_time: Timestamp::new(200), - max_time: Timestamp::new(300), - ..parquet_file_params - }; - repos - .parquet_files() - .create(parquet_file_params_2.clone()) - .await - .unwrap(); - - // now delete namespace_1 and assert it's all gone and none of - // namespace_2 is gone - repos - .namespaces() - .soft_delete("namespace_test_delete_namespace_1") - .await - .expect("delete namespace should succeed"); - // assert that namespace is soft-deleted, but the table, column, and parquet files are all - // still there. - assert!(repos - .namespaces() - .get_by_id(namespace_1.id, SoftDeletedRows::ExcludeDeleted) - .await - .expect("get namespace should succeed") - .is_none()); - assert_eq!( - repos - .namespaces() - .get_by_id(namespace_1.id, SoftDeletedRows::AllRows) - .await - .expect("get namespace should succeed") - .map(|mut v| { - // The only change after soft-deletion should be the deleted_at - // field being set - this block normalises that field, so that - // the before/after can be asserted as equal. - v.deleted_at = None; - v - }) - .expect("should see soft-deleted row"), - namespace_1 - ); - assert_eq!( - repos - .tables() - .get_by_id(table_1.id) - .await - .expect("get table should succeed") - .expect("should return row"), - table_1 - ); - assert_eq!( - repos - .columns() - .list_by_namespace_id(namespace_1.id) - .await - .expect("listing columns should succeed") - .len(), - 1 - ); - assert_eq!( - repos - .columns() - .list_by_table_id(table_1.id) - .await - .expect("listing columns should succeed") - .len(), - 1 - ); - - // partition's get_by_id should succeed - repos - .partitions() - .get_by_id(partition_1.id) - .await - .unwrap() - .unwrap(); - - // assert that the namespace, table, column, and parquet files for namespace_2 are still - // there - assert!(repos - .namespaces() - .get_by_id(namespace_2.id, SoftDeletedRows::ExcludeDeleted) - .await - .expect("get namespace should succeed") - .is_some()); - - assert!(repos - .tables() - .get_by_id(table_2.id) - .await - .expect("get table should succeed") - .is_some()); - assert_eq!( - repos - .columns() - .list_by_namespace_id(namespace_2.id) - .await - .expect("listing columns should succeed") - .len(), - 1 - ); - assert_eq!( - repos - .columns() - .list_by_table_id(table_2.id) - .await - .expect("listing columns should succeed") - .len(), - 1 - ); - - // partition's get_by_id should succeed - repos - .partitions() - .get_by_id(partition_2.id) - .await - .unwrap() - .unwrap(); - } - - /// Upsert a namespace called `namespace_name` and write `lines` to it. 
- async fn populate_namespace( - repos: &mut R, - namespace_name: &str, - lines: &str, - ) -> (Namespace, NamespaceSchema) - where - R: RepoCollection + ?Sized, - { - let namespace = repos - .namespaces() - .create( - &NamespaceName::new(namespace_name).unwrap(), - None, - None, - None, - ) - .await; - - let namespace = match namespace { - Ok(v) => v, - Err(Error::NameExists { .. }) => repos - .namespaces() - .get_by_name(namespace_name, SoftDeletedRows::AllRows) - .await - .unwrap() - .unwrap(), - e @ Err(_) => e.unwrap(), - }; - - let batches = mutable_batch_lp::lines_to_batches(lines, 42).unwrap(); - let batches = batches.iter().map(|(table, batch)| (table.as_str(), batch)); - let ns = NamespaceSchema::new_empty_from(&namespace); - - let schema = validate_or_insert_schema(batches, &ns, repos) - .await - .expect("validate schema failed") - .unwrap_or(ns); - - (namespace, schema) - } - - async fn test_list_schemas(catalog: Arc) { - let mut repos = catalog.repositories().await; - - let ns1 = populate_namespace( - repos.deref_mut(), - "ns1", - "cpu,tag=1 field=1i\nanother,tag=1 field=1.0", - ) - .await; - let ns2 = populate_namespace( - repos.deref_mut(), - "ns2", - "cpu,tag=1 field=1i\nsomethingelse field=1u", - ) - .await; - - // Otherwise the in-mem catalog deadlocks.... (but not postgres) - drop(repos); - - let got = list_schemas(&*catalog) - .await - .expect("should be able to list the schemas") - .collect::>(); - - assert!(got.contains(&ns1), "{:#?}\n\nwant{:#?}", got, &ns1); - assert!(got.contains(&ns2), "{:#?}\n\nwant{:#?}", got, &ns2); - } - - async fn test_list_schemas_soft_deleted_rows(catalog: Arc) { - let mut repos = catalog.repositories().await; - - let ns1 = populate_namespace( - repos.deref_mut(), - "ns1", - "cpu,tag=1 field=1i\nanother,tag=1 field=1.0", - ) - .await; - let ns2 = populate_namespace( - repos.deref_mut(), - "ns2", - "cpu,tag=1 field=1i\nsomethingelse field=1u", - ) - .await; - - repos - .namespaces() - .soft_delete(&ns2.0.name) - .await - .expect("failed to soft delete namespace"); - - // Otherwise the in-mem catalog deadlocks.... (but not postgres) - drop(repos); - - let got = list_schemas(&*catalog) - .await - .expect("should be able to list the schemas") - .collect::>(); - - assert!(got.contains(&ns1), "{:#?}\n\nwant{:#?}", got, &ns1); - assert!(!got.contains(&ns2), "{:#?}\n\n do not want{:#?}", got, &ns2); - } - - fn assert_metric_hit(metrics: &metric::Registry, name: &'static str) { - let histogram = metrics - .get_instrument::>("catalog_op_duration") - .expect("failed to read metric") - .get_observer(&Attributes::from(&[("op", name), ("result", "success")])) - .expect("failed to get observer") - .fetch(); - - let hit_count = histogram.sample_count(); - assert!(hit_count > 1, "metric did not record any calls"); - } -} diff --git a/iox_catalog/src/interface_tests.rs b/iox_catalog/src/interface_tests.rs new file mode 100644 index 00000000000..4635483c37a --- /dev/null +++ b/iox_catalog/src/interface_tests.rs @@ -0,0 +1,3203 @@ +//! Abstract tests of the catalog interface w/o relying on the actual implementation. 
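For reference, the metric assertion these tests lean on boils down to reading the "success" histogram for one catalog operation out of the registry. A self-contained sketch of that lookup, assuming the `Metric<DurationHistogram>` instrument type that the new module imports from the `metric` crate.

use metric::{Attributes, DurationHistogram, Metric};

/// Sketch: assert that the success histogram for a catalog op recorded calls.
fn assert_catalog_op_recorded(metrics: &metric::Registry, op: &'static str) {
    let histogram = metrics
        .get_instrument::<Metric<DurationHistogram>>("catalog_op_duration")
        .expect("failed to read metric")
        .get_observer(&Attributes::from(&[("op", op), ("result", "success")]))
        .expect("failed to get observer")
        .fetch();

    // The suite drives each op more than once, so expect multiple samples.
    let hit_count = histogram.sample_count();
    assert!(hit_count > 1, "expected more than one recorded call");
}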
+use crate::{ + interface::{ + CasFailure, Catalog, Error, ParquetFileRepoExt, PartitionRepoExt, RepoCollection, + SoftDeletedRows, + }, + test_helpers::{arbitrary_namespace, arbitrary_parquet_file_params, arbitrary_table}, + util::{list_schemas, validate_or_insert_schema}, +}; + +use ::test_helpers::assert_error; +use assert_matches::assert_matches; +use async_trait::async_trait; +use data_types::snapshot::table::TableSnapshot; +use data_types::{ + partition_template::{NamespacePartitionTemplateOverride, TablePartitionTemplateOverride}, + ColumnId, ColumnType, CompactionLevel, MaxColumnsPerTable, MaxTables, Namespace, NamespaceId, + NamespaceName, NamespaceSchema, ObjectStoreId, ParquetFile, ParquetFileId, ParquetFileParams, + PartitionId, SortKeyIds, TableId, Timestamp, +}; +use data_types::{snapshot::partition::PartitionSnapshot, Column, PartitionHashId, PartitionKey}; +use futures::{Future, StreamExt}; +use generated_types::influxdata::iox::partition_template::v1 as proto; +use iox_time::TimeProvider; +use metric::{Attributes, DurationHistogram, Metric}; +use parking_lot::Mutex; +use std::{any::Any, fmt::Display}; +use std::{ + collections::{BTreeMap, BTreeSet, HashMap}, + ops::DerefMut, + sync::Arc, + time::Duration, +}; + +pub(crate) async fn test_catalog(clean_state: R) +where + R: Fn() -> F + Send + Sync, + F: Future> + Send, +{ + test_setup(clean_state().await).await; + test_namespace_soft_deletion(clean_state().await).await; + test_partitions_new_file_between(clean_state().await).await; + test_column(clean_state().await).await; + test_partition(clean_state().await).await; + test_parquet_file(clean_state().await).await; + test_parquet_file_delete_broken(clean_state().await).await; + test_update_to_compaction_level_1(clean_state().await).await; + test_list_by_partiton_not_to_delete(clean_state().await).await; + test_list_schemas(clean_state().await).await; + test_list_schemas_soft_deleted_rows(clean_state().await).await; + test_delete_namespace(clean_state().await).await; + + let catalog = clean_state().await; + test_namespace(Arc::clone(&catalog)).await; + assert_metric_hit(&catalog.metrics(), "namespace_create"); + + let catalog = clean_state().await; + test_table(Arc::clone(&catalog)).await; + assert_metric_hit(&catalog.metrics(), "table_create"); + + let catalog = clean_state().await; + test_column(Arc::clone(&catalog)).await; + assert_metric_hit(&catalog.metrics(), "column_create_or_get"); + + let catalog = clean_state().await; + test_partition(Arc::clone(&catalog)).await; + assert_metric_hit(&catalog.metrics(), "partition_create_or_get"); + + let catalog = clean_state().await; + test_parquet_file(Arc::clone(&catalog)).await; + assert_metric_hit(&catalog.metrics(), "parquet_create_upgrade_delete"); + + test_two_repos(clean_state().await).await; + test_partition_create_or_get_idempotent(clean_state().await).await; + test_column_create_or_get_many_unchecked(clean_state).await; +} + +async fn test_setup(catalog: Arc) { + catalog.setup().await.expect("first catalog setup"); + catalog.setup().await.expect("second catalog setup"); +} + +async fn test_namespace(catalog: Arc) { + let mut repos = catalog.repositories(); + let namespace_name = NamespaceName::new("test_namespace").unwrap(); + let namespace = repos + .namespaces() + .create(&namespace_name, None, None, None) + .await + .unwrap(); + assert!(namespace.id > NamespaceId::new(0)); + assert_eq!(namespace.name, namespace_name.as_str()); + assert_eq!( + namespace.partition_template, + NamespacePartitionTemplateOverride::default() 
+ ); + let lookup_namespace = repos + .namespaces() + .get_by_name(&namespace_name, SoftDeletedRows::ExcludeDeleted) + .await + .unwrap() + .unwrap(); + assert_eq!(namespace, lookup_namespace); + + // Assert default values for service protection limits. + assert_eq!(namespace.max_tables, MaxTables::default()); + assert_eq!( + namespace.max_columns_per_table, + MaxColumnsPerTable::default() + ); + + let conflict = repos + .namespaces() + .create(&namespace_name, None, None, None) + .await; + assert!(matches!(conflict.unwrap_err(), Error::AlreadyExists { .. })); + + let found = repos + .namespaces() + .get_by_id(namespace.id, SoftDeletedRows::ExcludeDeleted) + .await + .unwrap() + .expect("namespace should be there"); + assert_eq!(namespace, found); + + let not_found = repos + .namespaces() + .get_by_id(NamespaceId::new(i64::MAX), SoftDeletedRows::ExcludeDeleted) + .await + .unwrap(); + assert!(not_found.is_none()); + + let found = repos + .namespaces() + .get_by_name(&namespace_name, SoftDeletedRows::ExcludeDeleted) + .await + .unwrap() + .expect("namespace should be there"); + assert_eq!(namespace, found); + + let not_found = repos + .namespaces() + .get_by_name("does_not_exist", SoftDeletedRows::ExcludeDeleted) + .await + .unwrap(); + assert!(not_found.is_none()); + + let namespace2 = arbitrary_namespace(&mut *repos, "test_namespace2").await; + let mut namespaces = repos + .namespaces() + .list(SoftDeletedRows::ExcludeDeleted) + .await + .unwrap(); + namespaces.sort_by_key(|ns| ns.name.clone()); + assert_eq!(namespaces, vec![namespace, namespace2]); + + let new_table_limit = MaxTables::try_from(15_000).unwrap(); + let modified = repos + .namespaces() + .update_table_limit(namespace_name.as_str(), new_table_limit) + .await + .expect("namespace should be updateable"); + assert_eq!(new_table_limit, modified.max_tables); + + let new_column_limit = MaxColumnsPerTable::try_from(1_500).unwrap(); + let modified = repos + .namespaces() + .update_column_limit(namespace_name.as_str(), new_column_limit) + .await + .expect("namespace should be updateable"); + assert_eq!(new_column_limit, modified.max_columns_per_table); + + const NEW_RETENTION_PERIOD_NS: i64 = 5 * 60 * 60 * 1000 * 1000 * 1000; + let modified = repos + .namespaces() + .update_retention_period(namespace_name.as_str(), Some(NEW_RETENTION_PERIOD_NS)) + .await + .expect("namespace should be updateable"); + assert_eq!( + NEW_RETENTION_PERIOD_NS, + modified.retention_period_ns.unwrap() + ); + + let modified = repos + .namespaces() + .update_retention_period(namespace_name.as_str(), None) + .await + .expect("namespace should be updateable"); + assert!(modified.retention_period_ns.is_none()); + + // create namespace with retention period NULL (the default) + let namespace3 = arbitrary_namespace(&mut *repos, "test_namespace3").await; + assert!(namespace3.retention_period_ns.is_none()); + + // create namespace with retention period + let namespace4_name = NamespaceName::new("test_namespace4").unwrap(); + let namespace4 = repos + .namespaces() + .create(&namespace4_name, None, Some(NEW_RETENTION_PERIOD_NS), None) + .await + .expect("namespace with 5-hour retention should be created"); + assert_eq!( + NEW_RETENTION_PERIOD_NS, + namespace4.retention_period_ns.unwrap() + ); + // reset retention period to NULL to avoid affecting later tests + repos + .namespaces() + .update_retention_period(&namespace4_name, None) + .await + .expect("namespace should be updateable"); + + // create a namespace with a PartitionTemplate other than the default + let 
tag_partition_template = + NamespacePartitionTemplateOverride::try_from(proto::PartitionTemplate { + parts: vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("tag1".into())), + }], + }) + .unwrap(); + let namespace5_name = NamespaceName::new("test_namespace5").unwrap(); + let namespace5 = repos + .namespaces() + .create( + &namespace5_name, + Some(tag_partition_template.clone()), + None, + None, + ) + .await + .unwrap(); + assert_eq!(namespace5.partition_template, tag_partition_template); + let lookup_namespace5 = repos + .namespaces() + .get_by_name(&namespace5_name, SoftDeletedRows::ExcludeDeleted) + .await + .unwrap() + .unwrap(); + assert_eq!(namespace5, lookup_namespace5); + + // remove namespace to avoid it from affecting later tests + repos + .namespaces() + .soft_delete("test_namespace") + .await + .expect("delete namespace should succeed"); + repos + .namespaces() + .soft_delete("test_namespace2") + .await + .expect("delete namespace should succeed"); + repos + .namespaces() + .soft_delete("test_namespace3") + .await + .expect("delete namespace should succeed"); + repos + .namespaces() + .soft_delete("test_namespace4") + .await + .expect("delete namespace should succeed"); +} + +/// Construct a set of two namespaces: +/// +/// * deleted-ns: marked as soft-deleted +/// * active-ns: not marked as deleted +/// +/// And assert the expected "soft delete" semantics / correctly filter out +/// the expected rows for all three states of [`SoftDeletedRows`]. +async fn test_namespace_soft_deletion(catalog: Arc) { + let mut repos = catalog.repositories(); + + let deleted_ns = arbitrary_namespace(&mut *repos, "deleted-ns").await; + let active_ns = arbitrary_namespace(&mut *repos, "active-ns").await; + + // Mark "deleted-ns" as soft-deleted. + repos.namespaces().soft_delete("deleted-ns").await.unwrap(); + + // Which should be idempotent (ignoring the timestamp change - when + // changing this to "soft delete" it was idempotent, so I am preserving + // that). + repos.namespaces().soft_delete("deleted-ns").await.unwrap(); + + // Listing should respect soft deletion. 
+ let got = repos + .namespaces() + .list(SoftDeletedRows::AllRows) + .await + .unwrap() + .into_iter() + .map(|v| v.name); + assert_string_set_eq(got, ["deleted-ns", "active-ns"]); + + let got = repos + .namespaces() + .list(SoftDeletedRows::OnlyDeleted) + .await + .unwrap() + .into_iter() + .map(|v| v.name); + assert_string_set_eq(got, ["deleted-ns"]); + + let got = repos + .namespaces() + .list(SoftDeletedRows::ExcludeDeleted) + .await + .unwrap() + .into_iter() + .map(|v| v.name); + assert_string_set_eq(got, ["active-ns"]); + + // As should get by ID + let got = repos + .namespaces() + .get_by_id(deleted_ns.id, SoftDeletedRows::AllRows) + .await + .unwrap() + .into_iter() + .map(|v| v.name); + assert_string_set_eq(got, ["deleted-ns"]); + let got = repos + .namespaces() + .get_by_id(deleted_ns.id, SoftDeletedRows::OnlyDeleted) + .await + .unwrap() + .into_iter() + .map(|v| { + assert!(v.deleted_at.is_some()); + v.name + }); + assert_string_set_eq(got, ["deleted-ns"]); + let got = repos + .namespaces() + .get_by_id(deleted_ns.id, SoftDeletedRows::ExcludeDeleted) + .await + .unwrap(); + assert!(got.is_none()); + let got = repos + .namespaces() + .get_by_id(active_ns.id, SoftDeletedRows::AllRows) + .await + .unwrap() + .into_iter() + .map(|v| v.name); + assert_string_set_eq(got, ["active-ns"]); + let got = repos + .namespaces() + .get_by_id(active_ns.id, SoftDeletedRows::OnlyDeleted) + .await + .unwrap(); + assert!(got.is_none()); + let got = repos + .namespaces() + .get_by_id(active_ns.id, SoftDeletedRows::ExcludeDeleted) + .await + .unwrap() + .into_iter() + .map(|v| v.name); + assert_string_set_eq(got, ["active-ns"]); + + // And get by name + let got = repos + .namespaces() + .get_by_name(&deleted_ns.name, SoftDeletedRows::AllRows) + .await + .unwrap() + .into_iter() + .map(|v| v.name); + assert_string_set_eq(got, ["deleted-ns"]); + let got = repos + .namespaces() + .get_by_name(&deleted_ns.name, SoftDeletedRows::OnlyDeleted) + .await + .unwrap() + .into_iter() + .map(|v| { + assert!(v.deleted_at.is_some()); + v.name + }); + assert_string_set_eq(got, ["deleted-ns"]); + let got = repos + .namespaces() + .get_by_name(&deleted_ns.name, SoftDeletedRows::ExcludeDeleted) + .await + .unwrap(); + assert!(got.is_none()); + let got = repos + .namespaces() + .get_by_name(&active_ns.name, SoftDeletedRows::AllRows) + .await + .unwrap() + .into_iter() + .map(|v| v.name); + assert_string_set_eq(got, ["active-ns"]); + let got = repos + .namespaces() + .get_by_name(&active_ns.name, SoftDeletedRows::OnlyDeleted) + .await + .unwrap(); + assert!(got.is_none()); + let got = repos + .namespaces() + .get_by_name(&active_ns.name, SoftDeletedRows::ExcludeDeleted) + .await + .unwrap() + .into_iter() + .map(|v| v.name); + assert_string_set_eq(got, ["active-ns"]); +} + +// Assert the set of strings "a" is equal to the set "b", tolerating +// duplicates. 
+#[track_caller] +fn assert_string_set_eq(a: impl IntoIterator, b: impl IntoIterator) +where + T: Into, + U: Into, +{ + let mut a = a.into_iter().map(Into::into).collect::>(); + a.sort_unstable(); + let mut b = b.into_iter().map(Into::into).collect::>(); + b.sort_unstable(); + assert_eq!(a, b); +} + +async fn test_table(catalog: Arc) { + let mut repos = catalog.repositories(); + let namespace = arbitrary_namespace(&mut *repos, "namespace_table_test").await; + + // test we can create a table + let t = arbitrary_table(&mut *repos, "test_table", &namespace).await; + assert!(t.id > TableId::new(0)); + assert_eq!( + t.partition_template, + TablePartitionTemplateOverride::default() + ); + + // The default template doesn't use any tag values, so no columns need to be created. + let table_columns = repos.columns().list_by_table_id(t.id).await.unwrap(); + assert!(table_columns.is_empty()); + + // test we get an error if we try to create it again + let err = repos + .tables() + .create( + "test_table", + TablePartitionTemplateOverride::try_new(None, &namespace.partition_template).unwrap(), + namespace.id, + ) + .await; + assert_error!( + err, + Error::AlreadyExists { ref descr } + if descr == &format!("table 'test_table' in namespace {}", namespace.id) + ); + + // get by id + assert_eq!(t, repos.tables().get_by_id(t.id).await.unwrap().unwrap()); + assert!(repos + .tables() + .get_by_id(TableId::new(i64::MAX)) + .await + .unwrap() + .is_none()); + + let tables = repos + .tables() + .list_by_namespace_id(namespace.id) + .await + .unwrap(); + assert_eq!(vec![t.clone()], tables); + + // test we can create a table of the same name in a different namespace + let namespace2 = arbitrary_namespace(&mut *repos, "two").await; + assert_ne!(namespace, namespace2); + let test_table = arbitrary_table(&mut *repos, "test_table", &namespace2).await; + assert_ne!(t.id, test_table.id); + assert_eq!(test_table.namespace_id, namespace2.id); + + // test get by namespace and name + let foo_table = arbitrary_table(&mut *repos, "foo", &namespace2).await; + assert_eq!( + repos + .tables() + .get_by_namespace_and_name(NamespaceId::new(i64::MAX), "test_table") + .await + .unwrap(), + None + ); + assert_eq!( + repos + .tables() + .get_by_namespace_and_name(namespace.id, "not_existing") + .await + .unwrap(), + None + ); + assert_eq!( + repos + .tables() + .get_by_namespace_and_name(namespace.id, "test_table") + .await + .unwrap(), + Some(t.clone()) + ); + assert_eq!( + repos + .tables() + .get_by_namespace_and_name(namespace2.id, "test_table") + .await + .unwrap() + .as_ref(), + Some(&test_table) + ); + assert_eq!( + repos + .tables() + .get_by_namespace_and_name(namespace2.id, "foo") + .await + .unwrap() + .as_ref(), + Some(&foo_table) + ); + + // All tables should be returned by list(), regardless of namespace + let mut list = repos.tables().list().await.unwrap(); + list.sort_by_key(|t| t.id); + let mut expected = [t, test_table, foo_table]; + expected.sort_by_key(|t| t.id); + assert_eq!(&list, &expected); + + // test per-namespace table limits + let latest = repos + .namespaces() + .update_table_limit("namespace_table_test", MaxTables::try_from(1).unwrap()) + .await + .expect("namespace should be updateable"); + let err = repos + .tables() + .create( + "definitely_unique", + TablePartitionTemplateOverride::try_new(None, &latest.partition_template).unwrap(), + latest.id, + ) + .await + .expect_err("should error with table create limit error"); + assert!(matches!(err, Error::LimitExceeded { .. 
})); + + // Create a table with a partition template other than the default + let custom_table_template = TablePartitionTemplateOverride::try_new( + Some(proto::PartitionTemplate { + parts: vec![ + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("tag1".into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("tag2".into())), + }, + ], + }), + &namespace2.partition_template, + ) + .unwrap(); + let templated = repos + .tables() + .create( + "use_a_template", + custom_table_template.clone(), + namespace2.id, + ) + .await + .unwrap(); + assert_eq!(templated.partition_template, custom_table_template); + + // Tag columns should be created for tags used in the template + let table_columns = repos + .columns() + .list_by_table_id(templated.id) + .await + .unwrap(); + assert_eq!(table_columns.len(), 2); + assert!(table_columns.iter().all(|c| c.is_tag())); + let mut column_names: Vec<_> = table_columns.iter().map(|c| &c.name).collect(); + column_names.sort(); + assert_eq!(column_names, &["tag1", "tag2"]); + + let lookup_templated = repos + .tables() + .get_by_namespace_and_name(namespace2.id, "use_a_template") + .await + .unwrap() + .unwrap(); + assert_eq!(templated, lookup_templated); + + // Create a namespace with a partition template other than the default + let custom_namespace_template = + NamespacePartitionTemplateOverride::try_from(proto::PartitionTemplate { + parts: vec![ + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("zzz".into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("aaa".into())), + }, + proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("year-%Y".into())), + }, + ], + }) + .unwrap(); + let custom_namespace_name = NamespaceName::new("custom_namespace").unwrap(); + let custom_namespace = repos + .namespaces() + .create( + &custom_namespace_name, + Some(custom_namespace_template.clone()), + None, + None, + ) + .await + .unwrap(); + // Create a table without specifying the partition template + let custom_table_template = + TablePartitionTemplateOverride::try_new(None, &custom_namespace.partition_template) + .unwrap(); + let table_templated_by_namespace = repos + .tables() + .create( + "use_namespace_template", + custom_table_template, + custom_namespace.id, + ) + .await + .unwrap(); + assert_eq!( + table_templated_by_namespace.partition_template, + TablePartitionTemplateOverride::try_new(None, &custom_namespace_template).unwrap() + ); + + // Tag columns should be created for tags used in the template + let table_columns = repos + .columns() + .list_by_table_id(table_templated_by_namespace.id) + .await + .unwrap(); + assert_eq!(table_columns.len(), 2); + assert!(table_columns.iter().all(|c| c.is_tag())); + let mut column_names: Vec<_> = table_columns.iter().map(|c| &c.name).collect(); + column_names.sort(); + assert_eq!(column_names, &["aaa", "zzz"]); + + repos + .namespaces() + .soft_delete("namespace_table_test") + .await + .expect("delete namespace should succeed"); + repos + .namespaces() + .soft_delete("two") + .await + .expect("delete namespace should succeed"); +} + +async fn test_column(catalog: Arc) { + let mut repos = catalog.repositories(); + let namespace = arbitrary_namespace(&mut *repos, "namespace_column_test").await; + let table = arbitrary_table(&mut *repos, "test_table", &namespace).await; + assert_eq!(table.namespace_id, 
namespace.id); + + // test we can create or get a column + let c = repos + .columns() + .create_or_get("column_test", table.id, ColumnType::Tag) + .await + .unwrap(); + + let ts1 = repos.tables().snapshot(table.id).await.unwrap(); + validate_table_snapshot(repos.as_mut(), &ts1).await; + + let cc = repos + .columns() + .create_or_get("column_test", table.id, ColumnType::Tag) + .await + .unwrap(); + assert!(c.id > ColumnId::new(0)); + assert_eq!(c, cc); + + let ts2 = repos.tables().snapshot(table.id).await.unwrap(); + validate_table_snapshot(repos.as_mut(), &ts2).await; + + assert_gt(ts2.generation(), ts1.generation()); + + // test that attempting to create an already defined column of a different type returns + // error + let err = repos + .columns() + .create_or_get("column_test", table.id, ColumnType::U64) + .await + .expect_err("should error with wrong column type"); + assert!(matches!(err, Error::AlreadyExists { .. })); + + // test that we can create a column of the same name under a different table + let table2 = arbitrary_table(&mut *repos, "test_table_2", &namespace).await; + let ccc = repos + .columns() + .create_or_get("column_test", table2.id, ColumnType::U64) + .await + .unwrap(); + assert_ne!(c, ccc); + + let columns = repos + .columns() + .list_by_namespace_id(namespace.id) + .await + .unwrap(); + + let ts3 = repos.tables().snapshot(table2.id).await.unwrap(); + validate_table_snapshot(repos.as_mut(), &ts3).await; + + let mut want = vec![c.clone(), ccc]; + assert_eq!(want, columns); + + let columns = repos.columns().list_by_table_id(table.id).await.unwrap(); + + let want2 = vec![c]; + assert_eq!(want2, columns); + + // Add another tag column into table2 + let c3 = repos + .columns() + .create_or_get("b", table2.id, ColumnType::Tag) + .await + .unwrap(); + + let ts4 = repos.tables().snapshot(table2.id).await.unwrap(); + validate_table_snapshot(repos.as_mut(), &ts4).await; + + assert_gt(ts4.generation(), ts3.generation()); + + // Listing columns should return all columns in the catalog + let list = repos.columns().list().await.unwrap(); + want.extend([c3]); + assert_eq!(list, want); + + // test create_or_get_many_unchecked, below column limit + let mut columns = HashMap::new(); + columns.insert("column_test", ColumnType::Tag); + columns.insert("new_column", ColumnType::Tag); + let table1_columns = repos + .columns() + .create_or_get_many_unchecked(table.id, columns) + .await + .unwrap(); + let mut table1_column_names: Vec<_> = table1_columns.iter().map(|c| &c.name).collect(); + table1_column_names.sort(); + assert_eq!(table1_column_names, vec!["column_test", "new_column"]); + + // test per-namespace column limits + repos + .namespaces() + .update_column_limit( + "namespace_column_test", + MaxColumnsPerTable::try_from(1).unwrap(), + ) + .await + .expect("namespace should be updateable"); + let err = repos + .columns() + .create_or_get("definitely unique", table.id, ColumnType::Tag) + .await + .expect_err("should error with table create limit error"); + assert!(matches!(err, Error::LimitExceeded { .. 
}));
+
+    // test per-namespace column limits are NOT enforced with create_or_get_many_unchecked
+    let table3 = arbitrary_table(&mut *repos, "test_table_3", &namespace).await;
+    let mut columns = HashMap::new();
+    columns.insert("apples", ColumnType::Tag);
+    columns.insert("oranges", ColumnType::Tag);
+    let table3_columns = repos
+        .columns()
+        .create_or_get_many_unchecked(table3.id, columns)
+        .await
+        .unwrap();
+    let mut table3_column_names: Vec<_> = table3_columns.iter().map(|c| &c.name).collect();
+    table3_column_names.sort();
+    assert_eq!(table3_column_names, vec!["apples", "oranges"]);
+
+    repos
+        .namespaces()
+        .soft_delete("namespace_column_test")
+        .await
+        .expect("delete namespace should succeed");
+}
+
+async fn test_partition(catalog: Arc<dyn Catalog>) {
+    let mut repos = catalog.repositories();
+    let namespace = arbitrary_namespace(&mut *repos, "namespace_partition_test").await;
+    let table = arbitrary_table(&mut *repos, "test_table", &namespace).await;
+
+    let mut created = BTreeMap::new();
+    // partition to use
+    let partition = repos
+        .partitions()
+        .create_or_get("foo".into(), table.id)
+        .await
+        .expect("failed to create partition");
+    // Test: sort_key_ids from create_or_get
+    assert!(partition.sort_key_ids().is_none());
+    created.insert(partition.id, partition.clone());
+    // partition to use
+    let partition_bar = repos
+        .partitions()
+        .create_or_get("bar".into(), table.id)
+        .await
+        .expect("failed to create partition");
+    created.insert(partition_bar.id, partition_bar);
+    // partition to be skipped later
+    let to_skip_partition = repos
+        .partitions()
+        .create_or_get("asdf".into(), table.id)
+        .await
+        .unwrap();
+    created.insert(to_skip_partition.id, to_skip_partition.clone());
+    // partition to be skipped later
+    let to_skip_partition_too = repos
+        .partitions()
+        .create_or_get("asdf too".into(), table.id)
+        .await
+        .unwrap();
+    created.insert(to_skip_partition_too.id, to_skip_partition_too.clone());
+
+    // partitions can be retrieved easily
+    let mut created_sorted = created.values().cloned().collect::<Vec<_>>();
+    created_sorted.sort_by_key(|p| p.id);
+    assert_eq!(
+        to_skip_partition,
+        repos
+            .partitions()
+            .get_by_id_batch(&[to_skip_partition.id])
+            .await
+            .unwrap()
+            .into_iter()
+            .next()
+            .unwrap()
+    );
+    let non_existing_partition_id = PartitionId::new(i64::MAX);
+    assert!(repos
+        .partitions()
+        .get_by_id_batch(&[non_existing_partition_id])
+        .await
+        .unwrap()
+        .is_empty());
+    let mut batch = repos
+        .partitions()
+        .get_by_id_batch(
+            &created
+                .keys()
+                .cloned()
+                // non-existing entries are ignored
+                .chain([non_existing_partition_id])
+                // duplicates are ignored
+                .chain(created.keys().cloned())
+                .collect::<Vec<_>>(),
+        )
+        .await
+        .unwrap();
+    batch.sort_by_key(|p| p.id);
+    assert_eq!(created_sorted, batch);
+    // Test: sort_key_ids from get_by_id_batch
+    assert!(batch.iter().all(|p| p.sort_key_ids().is_none()));
+
+    assert_eq!(created_sorted, batch);
+
+    let s1 = repos.tables().snapshot(table.id).await.unwrap();
+    validate_table_snapshot(repos.as_mut(), &s1).await;
+
+    let listed = repos
+        .partitions()
+        .list_by_table_id(table.id)
+        .await
+        .expect("failed to list partitions")
+        .into_iter()
+        .map(|v| (v.id, v))
+        .collect::<BTreeMap<_, _>>();
+    // Test: sort_key_ids from list_by_table_id
+    assert!(listed.values().all(|p| p.sort_key_ids().is_none()));
+
+    assert_eq!(created, listed);
+
+    let listed = repos
+        .partitions()
+        .list_ids()
+        .await
+        .expect("failed to list partitions")
+        .into_iter()
+        .collect::<BTreeSet<_>>();
+
+    assert_eq!(created.keys().copied().collect::<BTreeSet<_>>(),
listed); + + // The code no longer supports creating old-style partitions, so this list is always empty + // in these tests. See each catalog implementation for tests that insert old-style + // partitions directly and verify they're returned. + let old_style = repos.partitions().list_old_style().await.unwrap(); + assert!( + old_style.is_empty(), + "Expected no old-style partitions, got {old_style:?}" + ); + + // sort key should be unset on creation + assert!(to_skip_partition.sort_key_ids().is_none()); + + let s1 = repos + .partitions() + .snapshot(to_skip_partition.id) + .await + .unwrap(); + validate_partition_snapshot(repos.as_mut(), &s1).await; + + // test that updates sort key from None to Some + let updated_partition = repos + .partitions() + .cas_sort_key(to_skip_partition.id, None, &SortKeyIds::from([2, 1, 3])) + .await + .unwrap(); + + // verify sort key is updated correctly + assert_eq!( + updated_partition.sort_key_ids().unwrap(), + &SortKeyIds::from([2, 1, 3]) + ); + + let s2 = repos + .partitions() + .snapshot(to_skip_partition.id) + .await + .unwrap(); + assert_gt(s2.generation(), s1.generation()); + validate_partition_snapshot(repos.as_mut(), &s2).await; + + // test that provides value of old_sort_key_ids but it do not match the existing one + // --> the new sort key will not be updated + let err = repos + .partitions() + .cas_sort_key( + to_skip_partition.id, + Some(&SortKeyIds::from([1])), + &SortKeyIds::from([1, 2, 3, 4]), + ) + .await + .expect_err("CAS with incorrect value should fail"); + // verify the sort key is not updated + assert_matches!(err, CasFailure::ValueMismatch(old_sort_key_ids) => { + assert_eq!(old_sort_key_ids, SortKeyIds::from([2, 1, 3])); + }); + + // test that provides same length but not-matched old_sort_key_ids + // --> the new sort key will not be updated + let err = repos + .partitions() + .cas_sort_key( + to_skip_partition.id, + Some(&SortKeyIds::from([1, 5, 10])), + &SortKeyIds::from([1, 2, 3, 4]), + ) + .await + .expect_err("CAS with incorrect value should fail"); + // verify the sort key is not updated + assert_matches!(err, CasFailure::ValueMismatch(old_sort_key_ids) => { + assert_eq!(old_sort_key_ids, SortKeyIds::from([2, 1, 3])); + }); + + // test that provide None sort_key_ids that do not match with existing values that are not None + // --> the new sort key will not be updated + let err = repos + .partitions() + .cas_sort_key(to_skip_partition.id, None, &SortKeyIds::from([1, 2, 3, 4])) + .await + .expect_err("CAS with incorrect value should fail"); + assert_matches!(err, CasFailure::ValueMismatch(old_sort_key_ids) => { + assert_eq!(old_sort_key_ids, SortKeyIds::from([2, 1, 3])); + }); + + // test getting partition from partition id and verify values of sort_key and sort_key_ids + let updated_other_partition = repos + .partitions() + .get_by_id_batch(&[to_skip_partition.id]) + .await + .unwrap() + .into_iter() + .next() + .unwrap(); + // still has the old sort key + assert_eq!( + updated_other_partition.sort_key_ids().unwrap(), + &SortKeyIds::from([2, 1, 3]) + ); + + // test that updates sort_key_ids from Some matching value to Some other value + let updated_partition = repos + .partitions() + .cas_sort_key( + to_skip_partition.id, + Some(&SortKeyIds::from([2, 1, 3])), + &SortKeyIds::from([2, 1, 4, 3]), + ) + .await + .unwrap(); + // verify the new values are updated + assert_eq!( + updated_partition.sort_key_ids().unwrap(), + &SortKeyIds::from([2, 1, 4, 3]) + ); + + // test getting the new sort key from partition id + let 
updated_partition = repos + .partitions() + .get_by_id_batch(&[to_skip_partition.id]) + .await + .unwrap() + .into_iter() + .next() + .unwrap(); + assert_eq!( + updated_partition.sort_key_ids().unwrap(), + &SortKeyIds::from([2, 1, 4, 3]) + ); + + // use to_skip_partition_too to update sort key from empty old values + // first make sure the old sort key is unset + assert!(to_skip_partition_too.sort_key_ids().is_none()); + + // test that provides empty old_sort_key_ids + // --> the new sort key will be updated + let updated_to_skip_partition_too = repos + .partitions() + .cas_sort_key(to_skip_partition_too.id, None, &SortKeyIds::from([3, 4])) + .await + .unwrap(); + // verify the new values are updated + assert_eq!( + updated_to_skip_partition_too.sort_key_ids().unwrap(), + &SortKeyIds::from([3, 4]) + ); + + let s3 = repos + .partitions() + .snapshot(to_skip_partition.id) + .await + .unwrap(); + assert_gt(s3.generation(), s2.generation()); + validate_partition_snapshot(repos.as_mut(), &s3).await; + + // The compactor can log why compaction was skipped + let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap(); + assert!( + skipped_compactions.is_empty(), + "Expected no skipped compactions, got: {skipped_compactions:?}" + ); + repos + .partitions() + .record_skipped_compaction(to_skip_partition.id, "I am le tired", 1, 2, 4, 10, 20) + .await + .unwrap(); + let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap(); + assert_eq!(skipped_compactions.len(), 1); + assert_eq!(skipped_compactions[0].partition_id, to_skip_partition.id); + assert_eq!(skipped_compactions[0].reason, "I am le tired"); + assert_eq!(skipped_compactions[0].num_files, 1); + assert_eq!(skipped_compactions[0].limit_num_files, 2); + assert_eq!(skipped_compactions[0].estimated_bytes, 10); + assert_eq!(skipped_compactions[0].limit_bytes, 20); + // + let skipped_partition_records = repos + .partitions() + .get_in_skipped_compactions(&[ + to_skip_partition.id, + PartitionId::new(i64::MAX), + to_skip_partition.id, + ]) + .await + .unwrap(); + assert_eq!( + skipped_partition_records[0].partition_id, + to_skip_partition.id + ); + assert_eq!(skipped_partition_records[0].reason, "I am le tired"); + + let s4 = repos + .partitions() + .snapshot(to_skip_partition.id) + .await + .unwrap(); + assert_gt(s4.generation(), s3.generation()); + validate_partition_snapshot(repos.as_mut(), &s4).await; + + // Only save the last reason that any particular partition was skipped (really if the + // partition appears in the skipped compactions, it shouldn't become a compaction candidate + // again, but race conditions and all that) + repos + .partitions() + .record_skipped_compaction(to_skip_partition.id, "I'm on fire", 11, 12, 24, 110, 120) + .await + .unwrap(); + let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap(); + assert_eq!(skipped_compactions.len(), 1); + assert_eq!(skipped_compactions[0].partition_id, to_skip_partition.id); + assert_eq!(skipped_compactions[0].reason, "I'm on fire"); + assert_eq!(skipped_compactions[0].num_files, 11); + assert_eq!(skipped_compactions[0].limit_num_files, 12); + assert_eq!(skipped_compactions[0].estimated_bytes, 110); + assert_eq!(skipped_compactions[0].limit_bytes, 120); + // + let skipped_partition_records = repos + .partitions() + .get_in_skipped_compactions(&[to_skip_partition.id]) + .await + .unwrap(); + assert_eq!( + skipped_partition_records[0].partition_id, + to_skip_partition.id + ); + 
assert_eq!(skipped_partition_records[0].reason, "I'm on fire"); + + // Can receive multiple skipped compactions for different partitions + repos + .partitions() + .record_skipped_compaction( + to_skip_partition_too.id, + "I am le tired too", + 1, + 2, + 4, + 10, + 20, + ) + .await + .unwrap(); + let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap(); + assert_eq!(skipped_compactions.len(), 2); + assert_eq!(skipped_compactions[0].partition_id, to_skip_partition.id); + assert_eq!( + skipped_compactions[1].partition_id, + to_skip_partition_too.id + ); + // confirm can fetch subset of skipped compactions (a.k.a. have two, only fetch 1) + let skipped_partition_records = repos + .partitions() + .get_in_skipped_compactions(&[to_skip_partition.id]) + .await + .unwrap(); + assert_eq!(skipped_partition_records.len(), 1); + assert_eq!(skipped_compactions[0].partition_id, to_skip_partition.id); + let skipped_partition_records = repos + .partitions() + .get_in_skipped_compactions(&[to_skip_partition_too.id]) + .await + .unwrap(); + assert_eq!(skipped_partition_records.len(), 1); + assert_eq!( + skipped_partition_records[0].partition_id, + to_skip_partition_too.id + ); + // confirm can fetch both skipped compactions, and not the unskipped one + // also confirm will not error on non-existing partition + let non_existing_partition_id = PartitionId::new(9999); + let skipped_partition_records = repos + .partitions() + .get_in_skipped_compactions(&[ + partition.id, + to_skip_partition.id, + to_skip_partition_too.id, + non_existing_partition_id, + ]) + .await + .unwrap(); + assert_eq!(skipped_partition_records.len(), 2); + assert_eq!( + skipped_partition_records[0].partition_id, + to_skip_partition.id + ); + assert_eq!( + skipped_partition_records[1].partition_id, + to_skip_partition_too.id + ); + + // Delete the skipped compactions + let deleted_skipped_compaction = repos + .partitions() + .delete_skipped_compactions(to_skip_partition.id) + .await + .unwrap() + .expect("The skipped compaction should have been returned"); + assert_eq!( + deleted_skipped_compaction.partition_id, + to_skip_partition.id + ); + assert_eq!(deleted_skipped_compaction.reason, "I'm on fire"); + assert_eq!(deleted_skipped_compaction.num_files, 11); + assert_eq!(deleted_skipped_compaction.limit_num_files, 12); + assert_eq!(deleted_skipped_compaction.estimated_bytes, 110); + assert_eq!(deleted_skipped_compaction.limit_bytes, 120); + // + let deleted_skipped_compaction = repos + .partitions() + .delete_skipped_compactions(to_skip_partition_too.id) + .await + .unwrap() + .expect("The skipped compaction should have been returned"); + assert_eq!( + deleted_skipped_compaction.partition_id, + to_skip_partition_too.id + ); + assert_eq!(deleted_skipped_compaction.reason, "I am le tired too"); + // + let skipped_partition_records = repos + .partitions() + .get_in_skipped_compactions(&[to_skip_partition.id]) + .await + .unwrap(); + assert!(skipped_partition_records.is_empty()); + + let not_deleted_skipped_compaction = repos + .partitions() + .delete_skipped_compactions(to_skip_partition.id) + .await + .unwrap(); + + assert!( + not_deleted_skipped_compaction.is_none(), + "There should be no skipped compation", + ); + + let skipped_compactions = repos.partitions().list_skipped_compactions().await.unwrap(); + assert!( + skipped_compactions.is_empty(), + "Expected no skipped compactions, got: {skipped_compactions:?}" + ); + + let recent = repos + .partitions() + .most_recent_n(10) + .await + .expect("should list most 
recent");
+    assert_eq!(recent.len(), 4);
+
+    // Test: sort_key_ids from most_recent_n
+    // Only the first two partitions (to_skip_partition_too and to_skip_partition) have values; the others are unset
+    assert_eq!(
+        recent[0].sort_key_ids().unwrap(),
+        &SortKeyIds::from(vec![3, 4])
+    );
+    assert_eq!(
+        recent[1].sort_key_ids().unwrap(),
+        &SortKeyIds::from(vec![2, 1, 4, 3])
+    );
+    assert!(recent[2].sort_key_ids().is_none());
+    assert!(recent[3].sort_key_ids().is_none());
+
+    let recent = repos
+        .partitions()
+        .most_recent_n(4)
+        .await
+        .expect("should list most recent");
+    assert_eq!(recent.len(), 4); // no off by one error
+
+    let recent = repos
+        .partitions()
+        .most_recent_n(2)
+        .await
+        .expect("should list most recent");
+    assert_eq!(recent.len(), 2);
+
+    repos
+        .namespaces()
+        .soft_delete("namespace_partition_test")
+        .await
+        .expect("delete namespace should succeed");
+}
+
+async fn validate_partition_snapshot(repos: &mut dyn RepoCollection, snapshot: &PartitionSnapshot) {
+    // compare files
+    let mut expected = repos
+        .parquet_files()
+        .list_by_partition_not_to_delete_batch(vec![snapshot.partition_id()])
+        .await
+        .unwrap();
+    expected.sort_unstable_by_key(|x| x.id);
+    let mut actual = snapshot.files().collect::<Result<Vec<_>, _>>().unwrap();
+    actual.sort_unstable_by_key(|x| x.id);
+    assert_eq!(expected, actual);
+
+    // compare skipped compaction record
+    let expected = repos
+        .partitions()
+        .get_in_skipped_compactions(&[snapshot.partition_id()])
+        .await
+        .unwrap()
+        .into_iter()
+        .next();
+    let actual = snapshot.skipped_compaction();
+    assert_eq!(actual, expected);
+
+    // compare partition itself
+    let actual = snapshot.partition().unwrap();
+    let expected = repos
+        .partitions()
+        .get_by_id(snapshot.partition_id())
+        .await
+        .unwrap()
+        .unwrap();
+    assert_eq!(actual, expected);
+}
+
+async fn validate_table_snapshot(repos: &mut dyn RepoCollection, snapshot: &TableSnapshot) {
+    let table = snapshot.table().unwrap();
+
+    let expected = repos.tables().get_by_id(table.id).await.unwrap().unwrap();
+    assert_eq!(table, expected);
+
+    // compare columns
+    let mut expected = repos.columns().list_by_table_id(table.id).await.unwrap();
+    expected.sort_unstable_by_key(|x| x.id);
+    let mut actual = snapshot.columns().collect::<Result<Vec<_>, _>>().unwrap();
+    actual.sort_unstable_by_key(|x| x.id);
+    assert_eq!(expected, actual);
+
+    // compare partitions
+    let mut expected = repos.partitions().list_by_table_id(table.id).await.unwrap();
+    expected.sort_unstable_by_key(|x| x.id);
+    let mut actual = snapshot
+        .partitions()
+        .collect::<Result<Vec<_>, _>>()
+        .unwrap();
+    actual.sort_unstable_by_key(|x| x.id());
+    assert_eq!(expected.len(), actual.len());
+
+    let eq = expected
+        .iter()
+        .zip(&actual)
+        .all(|(l, r)| l.id == r.id() && l.partition_key.as_bytes() == r.key());
+    assert!(eq, "expected {expected:?} got {actual:?}");
+}
+
+/// List all parquet files in the given namespace.
+async fn list_parquet_files_by_namespace_not_to_delete(
+    catalog: Arc<dyn Catalog>,
+    namespace_id: NamespaceId,
+) -> Vec<ParquetFile> {
+    let partitions = futures::stream::iter(
+        catalog
+            .repositories()
+            .tables()
+            .list_by_namespace_id(namespace_id)
+            .await
+            .unwrap(),
+    )
+    .then(|t| {
+        let catalog = Arc::clone(&catalog);
+        async move {
+            futures::stream::iter(
+                catalog
+                    .repositories()
+                    .partitions()
+                    .list_by_table_id(t.id)
+                    .await
+                    .unwrap(),
+            )
+        }
+    })
+    .flatten()
+    .map(|p| p.id)
+    .collect::<Vec<_>>()
+    .await;
+
+    catalog
+        .repositories()
+        .parquet_files()
+        .list_by_partition_not_to_delete_batch(partitions)
+        .await
+        .unwrap()
+}
+
+/// Tests many interactions with the catalog and parquet files. See the individual conditions
+/// herein.
+async fn test_parquet_file(catalog: Arc<dyn Catalog>) {
+    let mut repos = catalog.repositories();
+    let namespace = arbitrary_namespace(&mut *repos, "namespace_parquet_file_test").await;
+    let table = arbitrary_table(&mut *repos, "test_table", &namespace).await;
+    let other_table = arbitrary_table(&mut *repos, "other", &namespace).await;
+    let partition = repos
+        .partitions()
+        .create_or_get("one".into(), table.id)
+        .await
+        .unwrap();
+    let other_partition = repos
+        .partitions()
+        .create_or_get("one".into(), other_table.id)
+        .await
+        .unwrap();
+
+    let ts1 = repos.tables().snapshot(table.id).await.unwrap();
+    validate_table_snapshot(repos.as_mut(), &ts1).await;
+
+    let ts2 = repos.tables().snapshot(other_table.id).await.unwrap();
+    validate_table_snapshot(repos.as_mut(), &ts2).await;
+
+    let parquet_file_params = arbitrary_parquet_file_params(&namespace, &table, &partition);
+    let parquet_file = repos
+        .parquet_files()
+        .create(parquet_file_params.clone())
+        .await
+        .unwrap();
+
+    // verify we can get it by its object store id
+    let pfg = repos
+        .parquet_files()
+        .get_by_object_store_id(parquet_file.object_store_id)
+        .await
+        .unwrap();
+    assert_eq!(parquet_file, pfg.unwrap());
+
+    // verify that trying to create a file with the same UUID throws an error
+    let err = repos
+        .parquet_files()
+        .create(parquet_file_params.clone())
+        .await
+        .unwrap_err();
+    assert!(matches!(err, Error::AlreadyExists { ..
})); + + let other_params = ParquetFileParams { + table_id: other_partition.table_id, + partition_id: other_partition.id, + partition_hash_id: other_partition.hash_id().cloned(), + object_store_id: ObjectStoreId::new(), + min_time: Timestamp::new(50), + max_time: Timestamp::new(60), + ..parquet_file_params.clone() + }; + let other_file = repos.parquet_files().create(other_params).await.unwrap(); + + let exist_id = parquet_file.id; + let non_exist_id = ParquetFileId::new(other_file.id.get() + 10); + // make sure exists_id != non_exist_id + assert_ne!(exist_id, non_exist_id); + + // verify that to_delete is initially set to null and the file does not get deleted + assert!(parquet_file.to_delete.is_none()); + let older_than = Timestamp::new( + (catalog.time_provider().now() + Duration::from_secs(100)).timestamp_nanos(), + ); + let deleted = repos + .parquet_files() + .delete_old_ids_only(older_than) + .await + .unwrap(); + assert!(deleted.is_empty()); + + // test list_all that includes soft-deleted file + // at this time the file is not soft-deleted yet and will be included in the returned list + let files = + list_parquet_files_by_namespace_not_to_delete(Arc::clone(&catalog), namespace.id).await; + assert_eq!(files.len(), 2); + + // verify to_delete can be updated to a timestamp + repos + .parquet_files() + .create_upgrade_delete( + parquet_file.partition_id, + &[parquet_file.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) + .await + .unwrap(); + + // test list_all that includes soft-deleted file + // at this time the file is soft-deleted and will be NOT included in the returned list + let files = + list_parquet_files_by_namespace_not_to_delete(Arc::clone(&catalog), namespace.id).await; + assert_eq!(files.len(), 1); + + // the deleted file can still be retrieved by UUID though + repos + .parquet_files() + .get_by_object_store_id(parquet_file.object_store_id) + .await + .unwrap() + .unwrap(); + + // File is not deleted if it was marked to be deleted after the specified time + let before_deleted = Timestamp::new( + (catalog.time_provider().now() - Duration::from_secs(100)).timestamp_nanos(), + ); + let deleted = repos + .parquet_files() + .delete_old_ids_only(before_deleted) + .await + .unwrap(); + assert!(deleted.is_empty()); + + // not hard-deleted yet + repos + .parquet_files() + .get_by_object_store_id(parquet_file.object_store_id) + .await + .unwrap() + .unwrap(); + + // File is deleted if it was marked to be deleted before the specified time + let deleted = repos + .parquet_files() + .delete_old_ids_only(older_than) + .await + .unwrap(); + assert_eq!(deleted.len(), 1); + assert_eq!(parquet_file.object_store_id, deleted[0]); + + // test list_all that includes soft-deleted file + // at this time the file is hard deleted -> the returned list is empty + assert!(repos + .parquet_files() + .get_by_object_store_id(parquet_file.object_store_id) + .await + .unwrap() + .is_none()); + + // test list + let files = + list_parquet_files_by_namespace_not_to_delete(Arc::clone(&catalog), namespace.id).await; + assert_eq!(vec![other_file.clone()], files); + + // test list_by_namespace_not_to_delete + let namespace2 = arbitrary_namespace(&mut *repos, "namespace_parquet_file_test1").await; + let table2 = arbitrary_table(&mut *repos, "test_table2", &namespace2).await; + let partition2 = repos + .partitions() + .create_or_get("foo".into(), table2.id) + .await + .unwrap(); + let files = + list_parquet_files_by_namespace_not_to_delete(Arc::clone(&catalog), namespace2.id).await; + 
assert!(files.is_empty()); + + let ts3 = repos.tables().snapshot(table2.id).await.unwrap(); + validate_table_snapshot(repos.as_mut(), &ts3).await; + + let f1_params = ParquetFileParams { + table_id: partition2.table_id, + partition_id: partition2.id, + partition_hash_id: partition2.hash_id().cloned(), + namespace_id: namespace2.id, + object_store_id: ObjectStoreId::new(), + min_time: Timestamp::new(1), + max_time: Timestamp::new(10), + ..parquet_file_params + }; + let f1 = repos + .parquet_files() + .create(f1_params.clone()) + .await + .unwrap(); + + let f2_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + min_time: Timestamp::new(50), + max_time: Timestamp::new(60), + ..f1_params.clone() + }; + let f2 = repos + .parquet_files() + .create(f2_params.clone()) + .await + .unwrap(); + let files = + list_parquet_files_by_namespace_not_to_delete(Arc::clone(&catalog), namespace2.id).await; + assert_eq!(vec![f1.clone(), f2.clone()], files); + + let f3_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + min_time: Timestamp::new(50), + max_time: Timestamp::new(60), + ..f2_params + }; + let f3 = repos + .parquet_files() + .create(f3_params.clone()) + .await + .unwrap(); + let files = + list_parquet_files_by_namespace_not_to_delete(Arc::clone(&catalog), namespace2.id).await; + assert_eq!(vec![f1.clone(), f2.clone(), f3.clone()], files); + + let s1 = repos.partitions().snapshot(partition2.id).await.unwrap(); + validate_partition_snapshot(repos.as_mut(), &s1).await; + + repos + .parquet_files() + .create_upgrade_delete( + f2.partition_id, + &[f2.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) + .await + .unwrap(); + let files = + list_parquet_files_by_namespace_not_to_delete(Arc::clone(&catalog), namespace2.id).await; + assert_eq!(vec![f1.clone(), f3.clone()], files); + + // Cannot delete file twice + let err = repos + .parquet_files() + .create_upgrade_delete( + partition2.id, + &[f2.object_store_id, f3.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) + .await + .unwrap_err(); + assert_matches!(err, Error::NotFound { .. }); + + let err = repos + .parquet_files() + .create_upgrade_delete( + partition2.id, + &[f2.object_store_id], + &[f3.object_store_id], + &[], + CompactionLevel::Initial, + ) + .await + .unwrap_err(); + assert_matches!(err, Error::NotFound { .. }); + + // Cannot upgrade deleted file + let err = repos + .parquet_files() + .create_upgrade_delete( + partition2.id, + &[f3.object_store_id], + &[f2.object_store_id], + &[], + CompactionLevel::Initial, + ) + .await + .unwrap_err(); + assert_matches!(err, Error::NotFound { .. 
}); + + // Failed transactions don't modify + let files = + list_parquet_files_by_namespace_not_to_delete(Arc::clone(&catalog), namespace2.id).await; + assert_eq!(vec![f1.clone(), f3.clone()], files); + + let s2 = repos.partitions().snapshot(partition2.id).await.unwrap(); + assert_gt(s2.generation(), s1.generation()); + validate_partition_snapshot(repos.as_mut(), &s2).await; + + let files = list_parquet_files_by_namespace_not_to_delete( + Arc::clone(&catalog), + NamespaceId::new(i64::MAX), + ) + .await; + assert!(files.is_empty()); + + // test delete_old_ids_only + let older_than = Timestamp::new( + (catalog.time_provider().now() + Duration::from_secs(100)).timestamp_nanos(), + ); + let ids = repos + .parquet_files() + .delete_old_ids_only(older_than) + .await + .unwrap(); + assert_eq!(ids.len(), 1); + + let s3 = repos.partitions().snapshot(partition2.id).await.unwrap(); + assert_ge(s3.generation(), s2.generation()); // no new snapshot required, but some backends will generate a new one + validate_partition_snapshot(repos.as_mut(), &s3).await; + + // test retention-based flagging for deletion + // Since mem catalog has default retention 1 hour, let us first set it to 0 means infinite + let namespaces = repos + .namespaces() + .list(SoftDeletedRows::AllRows) + .await + .expect("listing namespaces"); + for namespace in namespaces { + repos + .namespaces() + .update_retention_period(&namespace.name, None) // infinite + .await + .unwrap(); + } + + // 1. with no retention period set on the ns, nothing should get flagged + let ids = repos + .parquet_files() + .flag_for_delete_by_retention() + .await + .unwrap(); + assert!(ids.is_empty()); + // 2. set ns retention period to one hour then create some files before and after and + // ensure correct files get deleted + repos + .namespaces() + .update_retention_period(&namespace2.name, Some(60 * 60 * 1_000_000_000)) // 1 hour + .await + .unwrap(); + let f4_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + max_time: Timestamp::new( + // a bit over an hour ago + (catalog.time_provider().now() - Duration::from_secs(60 * 65)).timestamp_nanos(), + ), + ..f3_params + }; + let f4 = repos + .parquet_files() + .create(f4_params.clone()) + .await + .unwrap(); + let f5_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + max_time: Timestamp::new( + // a bit under an hour ago + (catalog.time_provider().now() - Duration::from_secs(60 * 55)).timestamp_nanos(), + ), + ..f4_params + }; + let f5 = repos + .parquet_files() + .create(f5_params.clone()) + .await + .unwrap(); + let ids = repos + .parquet_files() + .flag_for_delete_by_retention() + .await + .unwrap(); + assert!(ids.len() > 1); // it's also going to flag f1, f2 & f3 because they have low max + // timestamps but i don't want this test to be brittle if those + // values change so i'm not asserting len == 4 + let f4 = repos + .parquet_files() + .get_by_object_store_id(f4.object_store_id) + .await + .unwrap() + .unwrap(); + assert_matches!(f4.to_delete, Some(_)); // f4 is > 1hr old + let f5 = repos + .parquet_files() + .get_by_object_store_id(f5.object_store_id) + .await + .unwrap() + .unwrap(); + assert_matches!(f5.to_delete, None); // f5 is < 1hr old + + let s4 = repos.partitions().snapshot(partition2.id).await.unwrap(); + assert_gt(s4.generation(), s3.generation()); + validate_partition_snapshot(repos.as_mut(), &s4).await; + + // call flag_for_delete_by_retention() again and nothing should be flagged because they've + // already been flagged + let ids = repos + 
.parquet_files() + .flag_for_delete_by_retention() + .await + .unwrap(); + assert!(ids.is_empty()); + + // test that flag_for_delete_by_retention respects UPDATE LIMIT + // create limit + the meaning of life parquet files that are all older than the retention (>1hr) + const LIMIT: usize = 1000; + const MOL: usize = 42; + let now = catalog.time_provider().now(); + let params = (0..LIMIT + MOL) + .map(|_| { + ParquetFileParams { + object_store_id: ObjectStoreId::new(), + max_time: Timestamp::new( + // a bit over an hour ago + (now - Duration::from_secs(60 * 65)).timestamp_nanos(), + ), + ..f1_params.clone() + } + }) + .collect::>(); + repos + .parquet_files() + .create_upgrade_delete( + f1_params.partition_id, + &[], + &[], + ¶ms, + CompactionLevel::Initial, + ) + .await + .unwrap(); + let ids = repos + .parquet_files() + .flag_for_delete_by_retention() + .await + .unwrap(); + assert_eq!(ids.len(), LIMIT); + let ids = repos + .parquet_files() + .flag_for_delete_by_retention() + .await + .unwrap(); + assert_eq!(ids.len(), MOL); // second call took remainder + let ids = repos + .parquet_files() + .flag_for_delete_by_retention() + .await + .unwrap(); + assert_eq!(ids.len(), 0); // none left + + // test create_update_delete + let f6_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + ..f5_params + }; + let f6 = repos + .parquet_files() + .create(f6_params.clone()) + .await + .unwrap(); + + let f7_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + ..f6_params + }; + let f1_uuid = f1.object_store_id; + let f6_uuid = f6.object_store_id; + let f5_uuid = f5.object_store_id; + let cud = repos + .parquet_files() + .create_upgrade_delete( + f5.partition_id, + &[f5.object_store_id], + &[f6.object_store_id], + &[f7_params.clone()], + CompactionLevel::Final, + ) + .await + .unwrap(); + + assert_eq!(cud.len(), 1); + let f5_delete = repos + .parquet_files() + .get_by_object_store_id(f5_uuid) + .await + .unwrap() + .unwrap(); + assert_matches!(f5_delete.to_delete, Some(_)); + + let f6_compaction_level = repos + .parquet_files() + .get_by_object_store_id(f6_uuid) + .await + .unwrap() + .unwrap(); + + assert_matches!(f6_compaction_level.compaction_level, CompactionLevel::Final); + + let f7 = repos + .parquet_files() + .get_by_object_store_id(f7_params.object_store_id) + .await + .unwrap() + .unwrap(); + + let f7_uuid = f7.object_store_id; + + // test create_update_delete transaction (rollback because f7 already exists) + let cud = repos + .parquet_files() + .create_upgrade_delete( + partition2.id, + &[], + &[], + &[f7_params.clone()], + CompactionLevel::Final, + ) + .await; + + assert_matches!( + cud, + Err(Error::AlreadyExists { + descr + }) if descr == f7_params.object_store_id.to_string() + ); + + let f1_to_delete = repos + .parquet_files() + .get_by_object_store_id(f1_uuid) + .await + .unwrap() + .unwrap(); + assert_matches!(f1_to_delete.to_delete, Some(_)); + + let f7_not_delete = repos + .parquet_files() + .get_by_object_store_id(f7_uuid) + .await + .unwrap() + .unwrap(); + assert_matches!(f7_not_delete.to_delete, None); + + // test exists_by_object_store_id_batch returns parquet files by object store id + let does_not_exist = ObjectStoreId::new(); + let mut present = repos + .parquet_files() + .exists_by_object_store_id_batch(vec![f1_uuid, f7_uuid, does_not_exist]) + .await + .unwrap(); + let mut expected = vec![f1_uuid, f7_uuid]; + present.sort(); + expected.sort(); + assert_eq!(present, expected); + + let s5 = 
repos.partitions().snapshot(partition2.id).await.unwrap(); + assert_gt(s5.generation(), s4.generation()); + validate_partition_snapshot(repos.as_mut(), &s5).await; + + // Cannot mix partition IDs + let partition3 = repos + .partitions() + .create_or_get("three".into(), table.id) + .await + .unwrap(); + + let ts4 = repos.tables().snapshot(table.id).await.unwrap(); + validate_table_snapshot(repos.as_mut(), &ts4).await; + assert_gt(ts4.generation(), ts1.generation()); + + let f8_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + partition_id: partition3.id, + ..f7_params + }; + let err = repos + .parquet_files() + .create_upgrade_delete( + partition2.id, + &[f7_uuid], + &[], + &[f8_params.clone()], + CompactionLevel::Final, + ) + .await + .unwrap_err() + .to_string(); + + assert!( + err.contains("Inconsistent ParquetFileParams, expected PartitionId"), + "{err}" + ); + + let list = repos + .parquet_files() + .list_by_partition_not_to_delete_batch(vec![partition2.id]) + .await + .unwrap(); + assert_eq!(list.len(), 2); + + repos + .parquet_files() + .create_upgrade_delete( + partition3.id, + &[], + &[], + &[f8_params.clone()], + CompactionLevel::Final, + ) + .await + .unwrap(); + + let files = repos + .parquet_files() + .list_by_partition_not_to_delete_batch(vec![partition3.id]) + .await + .unwrap(); + assert_eq!(files.len(), 1); + let f8_uuid = files[0].object_store_id; + + let files = repos + .parquet_files() + .list_by_partition_not_to_delete_batch(vec![]) + .await + .unwrap(); + assert_eq!(files.len(), 0); + let files = repos + .parquet_files() + .list_by_partition_not_to_delete_batch(vec![partition2.id, partition3.id]) + .await + .unwrap(); + assert_eq!(files.len(), 3); + let files = repos + .parquet_files() + .list_by_partition_not_to_delete_batch(vec![ + partition2.id, + PartitionId::new(i64::MAX), + partition3.id, + partition2.id, + ]) + .await + .unwrap(); + assert_eq!(files.len(), 3); + + let err = repos + .parquet_files() + .create_upgrade_delete(partition2.id, &[f8_uuid], &[], &[], CompactionLevel::Final) + .await + .unwrap_err(); + + assert_matches!(err, Error::NotFound { .. }); + + let err = repos + .parquet_files() + .create_upgrade_delete(partition2.id, &[], &[f8_uuid], &[], CompactionLevel::Final) + .await + .unwrap_err(); + + assert_matches!(err, Error::NotFound { .. }); + + repos + .parquet_files() + .create_upgrade_delete(partition3.id, &[f8_uuid], &[], &[], CompactionLevel::Final) + .await + .unwrap(); + + // take snapshot of unknown partition + let err = repos + .partitions() + .snapshot(PartitionId::new(i64::MAX)) + .await + .unwrap_err(); + assert_matches!(err, Error::NotFound { .. 
}); +} + +async fn test_parquet_file_delete_broken(catalog: Arc) { + let mut repos = catalog.repositories(); + let namespace_1 = arbitrary_namespace(&mut *repos, "retention_broken_1").await; + let namespace_2 = repos + .namespaces() + .create( + &NamespaceName::new("retention_broken_2").unwrap(), + None, + Some(1), + None, + ) + .await + .unwrap(); + let table_1 = arbitrary_table(&mut *repos, "test_table", &namespace_1).await; + let table_2 = arbitrary_table(&mut *repos, "test_table", &namespace_2).await; + let partition_1 = repos + .partitions() + .create_or_get("one".into(), table_1.id) + .await + .unwrap(); + let partition_2 = repos + .partitions() + .create_or_get("one".into(), table_2.id) + .await + .unwrap(); + + let parquet_file_params_1 = arbitrary_parquet_file_params(&namespace_1, &table_1, &partition_1); + let parquet_file_params_2 = arbitrary_parquet_file_params(&namespace_2, &table_2, &partition_2); + let _parquet_file_1 = repos + .parquet_files() + .create(parquet_file_params_1) + .await + .unwrap(); + let parquet_file_2 = repos + .parquet_files() + .create(parquet_file_params_2) + .await + .unwrap(); + + let ids = repos + .parquet_files() + .flag_for_delete_by_retention() + .await + .unwrap(); + assert_eq!( + ids, + vec![(parquet_file_2.partition_id, parquet_file_2.object_store_id)] + ); +} + +async fn test_partitions_new_file_between(catalog: Arc) { + let mut repos = catalog.repositories(); + let namespace = arbitrary_namespace(&mut *repos, "test_partitions_new_file_between").await; + let table = arbitrary_table(&mut *repos, "test_table_for_new_file_between", &namespace).await; + + // param for the tests + let time_now = Timestamp::from(catalog.time_provider().now()); + let time_one_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(1)); + let time_two_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(2)); + let time_three_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(3)); + let time_five_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(5)); + let time_six_hour_ago = Timestamp::from(catalog.time_provider().hours_ago(6)); + + // Db has no partitions + let partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert!(partitions.is_empty()); + + // ----------------- + // PARTITION one + // The DB has 1 partition but it does not have any file + let partition1 = repos + .partitions() + .create_or_get("one".into(), table.id) + .await + .unwrap(); + let partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert!(partitions.is_empty()); + + // create files for partition one + let parquet_file_params = arbitrary_parquet_file_params(&namespace, &table, &partition1); + + // create a deleted L0 file that was created 3 hours ago + let delete_l0_file = repos + .parquet_files() + .create(parquet_file_params.clone()) + .await + .unwrap(); + repos + .parquet_files() + .create_upgrade_delete( + delete_l0_file.partition_id, + &[delete_l0_file.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) + .await + .unwrap(); + let partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert!(partitions.is_empty()); + let partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, Some(time_one_hour_ago)) + .await + .unwrap(); + assert!(partitions.is_empty()); + let partitions = repos + .partitions() + 
.partitions_new_file_between(time_three_hour_ago, Some(time_one_hour_ago)) + .await + .unwrap(); + assert!(partitions.is_empty()); + + // create a deleted L0 file that was created 1 hour ago + let l0_one_hour_ago_file_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + created_at: time_one_hour_ago, + ..parquet_file_params.clone() + }; + repos + .parquet_files() + .create(l0_one_hour_ago_file_params.clone()) + .await + .unwrap(); + // partition one should be returned + let partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + let partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + let partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + let partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_two_hour_ago)) + .await + .unwrap(); + assert!(partitions.is_empty()); + + // ----------------- + // PARTITION two + // Partition two without any file + let partition2 = repos + .partitions() + .create_or_get("two".into(), table.id) + .await + .unwrap(); + // should return partition one only + let partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + let partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + + // Add a L0 file created 5 hours ago + let l0_five_hour_ago_file_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + created_at: time_five_hour_ago, + partition_id: partition2.id, + partition_hash_id: partition2.hash_id().cloned(), + ..parquet_file_params.clone() + }; + repos + .parquet_files() + .create(l0_five_hour_ago_file_params.clone()) + .await + .unwrap(); + // still return partition one only + let partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + let partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + let partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + // Between six and three hours ago, return only partition 2 + let partitions = repos + .partitions() + .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition2.id); + + // Add an L1 file created just now + let l1_file_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + created_at: time_now, + partition_id: partition2.id, + partition_hash_id: partition2.hash_id().cloned(), + compaction_level: CompactionLevel::FileNonOverlapped, + 
..parquet_file_params.clone() + }; + repos + .parquet_files() + .create(l1_file_params.clone()) + .await + .unwrap(); + // should return both partitions + let mut partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert_eq!(partitions.len(), 2); + partitions.sort(); + assert_eq!(partitions[0], partition1.id); + assert_eq!(partitions[1], partition2.id); + // Only return partition1: the creation time must be strictly less than the maximum time, + // not equal + let mut partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + partitions.sort(); + assert_eq!(partitions[0], partition1.id); + // Between six and three hours ago, return none + let partitions = repos + .partitions() + .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) + .await + .unwrap(); + assert!(partitions.is_empty()); + + // ----------------- + // PARTITION three + // Partition three without any file + let partition3 = repos + .partitions() + .create_or_get("three".into(), table.id) + .await + .unwrap(); + // should return partition one and two only + let mut partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert_eq!(partitions.len(), 2); + partitions.sort(); + assert_eq!(partitions[0], partition1.id); + assert_eq!(partitions[1], partition2.id); + // Only return partition1: the creation time must be strictly less than the maximum time, + // not equal + let partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + // When the maximum time is greater than the creation time of partition2, return it + let mut partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now + 1)) + .await + .unwrap(); + assert_eq!(partitions.len(), 2); + partitions.sort(); + assert_eq!(partitions[0], partition1.id); + assert_eq!(partitions[1], partition2.id); + // Between six and three hours ago, return none + let partitions = repos + .partitions() + .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) + .await + .unwrap(); + assert!(partitions.is_empty()); + + // Add an L2 file created just now for partition three + let l2_file_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + created_at: time_now, + partition_id: partition3.id, + partition_hash_id: partition3.hash_id().cloned(), + compaction_level: CompactionLevel::Final, + ..parquet_file_params.clone() + }; + repos + .parquet_files() + .create(l2_file_params.clone()) + .await + .unwrap(); + // now should return partition one two and three + let mut partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert_eq!(partitions.len(), 3); + partitions.sort(); + assert_eq!(partitions[0], partition1.id); + assert_eq!(partitions[1], partition2.id); + assert_eq!(partitions[2], partition3.id); + // Only return partition1: the creation time must be strictly less than the maximum time, + // not equal + let partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 1); + assert_eq!(partitions[0], partition1.id); + // Between six and three hours ago, return none + let 
partitions = repos + .partitions() + .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) + .await + .unwrap(); + assert!(partitions.is_empty()); + + // add an L0 file created one hour ago for partition three + let l0_one_hour_ago_file_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + created_at: time_one_hour_ago, + partition_id: partition3.id, + partition_hash_id: partition3.hash_id().cloned(), + ..parquet_file_params.clone() + }; + repos + .parquet_files() + .create(l0_one_hour_ago_file_params.clone()) + .await + .unwrap(); + // should return all partitions + let mut partitions = repos + .partitions() + .partitions_new_file_between(time_two_hour_ago, None) + .await + .unwrap(); + assert_eq!(partitions.len(), 3); + partitions.sort(); + assert_eq!(partitions[0], partition1.id); + assert_eq!(partitions[1], partition2.id); + assert_eq!(partitions[2], partition3.id); + // Only return partitions 1 and 3; 2 was created just now + let mut partitions = repos + .partitions() + .partitions_new_file_between(time_three_hour_ago, Some(time_now)) + .await + .unwrap(); + assert_eq!(partitions.len(), 2); + partitions.sort(); + assert_eq!(partitions[0], partition1.id); + assert_eq!(partitions[1], partition3.id); + // Between six and three hours ago, return none + let partitions = repos + .partitions() + .partitions_new_file_between(time_six_hour_ago, Some(time_three_hour_ago)) + .await + .unwrap(); + assert!(partitions.is_empty()); +} + +async fn test_list_by_partiton_not_to_delete(catalog: Arc) { + let mut repos = catalog.repositories(); + let namespace = arbitrary_namespace( + &mut *repos, + "namespace_parquet_file_test_list_by_partiton_not_to_delete", + ) + .await; + let table = arbitrary_table(&mut *repos, "test_table", &namespace).await; + + let partition = repos + .partitions() + .create_or_get("test_list_by_partiton_not_to_delete_one".into(), table.id) + .await + .unwrap(); + let partition2 = repos + .partitions() + .create_or_get("test_list_by_partiton_not_to_delete_two".into(), table.id) + .await + .unwrap(); + + let parquet_file_params = arbitrary_parquet_file_params(&namespace, &table, &partition); + + let parquet_file = repos + .parquet_files() + .create(parquet_file_params.clone()) + .await + .unwrap(); + let delete_file_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + ..parquet_file_params.clone() + }; + let delete_file = repos + .parquet_files() + .create(delete_file_params) + .await + .unwrap(); + repos + .parquet_files() + .create_upgrade_delete( + partition.id, + &[delete_file.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) + .await + .unwrap(); + let level1_file_params = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + ..parquet_file_params.clone() + }; + let mut level1_file = repos + .parquet_files() + .create(level1_file_params) + .await + .unwrap(); + repos + .parquet_files() + .create_upgrade_delete( + partition.id, + &[], + &[level1_file.object_store_id], + &[], + CompactionLevel::FileNonOverlapped, + ) + .await + .unwrap(); + level1_file.compaction_level = CompactionLevel::FileNonOverlapped; + + let other_partition_params = ParquetFileParams { + partition_id: partition2.id, + partition_hash_id: partition2.hash_id().cloned(), + object_store_id: ObjectStoreId::new(), + ..parquet_file_params.clone() + }; + let _partition2_file = repos + .parquet_files() + .create(other_partition_params) + .await + .unwrap(); + + let files = repos + .parquet_files() + 
.list_by_partition_not_to_delete_batch(vec![partition.id])
+        .await
+        .unwrap();
+    assert_eq!(files.len(), 2);
+
+    let mut file_ids: Vec<_> = files.into_iter().map(|f| f.id).collect();
+    file_ids.sort();
+    let mut expected_ids = vec![parquet_file.id, level1_file.id];
+    expected_ids.sort();
+    assert_eq!(file_ids, expected_ids);
+
+    // Using the catalog partition ID should return the same files, even if the Parquet file
+    // records don't have the partition ID on them (which is the default now)
+    let files = repos
+        .parquet_files()
+        .list_by_partition_not_to_delete_batch(vec![partition.id])
+        .await
+        .unwrap();
+    assert_eq!(files.len(), 2);
+
+    let mut file_ids: Vec<_> = files.into_iter().map(|f| f.id).collect();
+    file_ids.sort();
+    let mut expected_ids = vec![parquet_file.id, level1_file.id];
+    expected_ids.sort();
+    assert_eq!(file_ids, expected_ids);
+}
+
+async fn test_update_to_compaction_level_1(catalog: Arc<dyn Catalog>) {
+    let mut repos = catalog.repositories();
+    let namespace =
+        arbitrary_namespace(&mut *repos, "namespace_update_to_compaction_level_1_test").await;
+    let table = arbitrary_table(&mut *repos, "update_table", &namespace).await;
+    let partition = repos
+        .partitions()
+        .create_or_get("test_update_to_compaction_level_1_one".into(), table.id)
+        .await
+        .unwrap();
+
+    // Set up the window of times we're interested in level 1 files for
+    let query_min_time = Timestamp::new(5);
+    let query_max_time = Timestamp::new(10);
+
+    // Create a file with times entirely within the window
+    let mut parquet_file_params = arbitrary_parquet_file_params(&namespace, &table, &partition);
+    parquet_file_params.min_time = query_min_time + 1;
+    parquet_file_params.max_time = query_max_time - 1;
+    let parquet_file = repos
+        .parquet_files()
+        .create(parquet_file_params.clone())
+        .await
+        .unwrap();
+
+    // Create a file that will remain as level 0
+    let level_0_params = ParquetFileParams {
+        object_store_id: ObjectStoreId::new(),
+        ..parquet_file_params.clone()
+    };
+    repos.parquet_files().create(level_0_params).await.unwrap();
+
+    // Make parquet_file compaction level 1
+    let created = repos
+        .parquet_files()
+        .create_upgrade_delete(
+            parquet_file.partition_id,
+            &[],
+            &[parquet_file.object_store_id],
+            &[],
+            CompactionLevel::FileNonOverlapped,
+        )
+        .await
+        .unwrap();
+    assert_eq!(created, vec![]);
+
+    // remove namespace to avoid it affecting later tests
+    repos
+        .namespaces()
+        .soft_delete("namespace_update_to_compaction_level_1_test")
+        .await
+        .expect("delete namespace should succeed");
+}
+
+/// Assert that a namespace deletion does NOT cascade to the tables/schema
+/// items/parquet files/etc.
+///
+/// Removal of these entities breaks the invariant that, once created, a row
+/// always exists for the lifetime of an IOx process, and causes the system
+/// to panic in multiple components. It's also ineffective, because most
+/// components maintain a cache of at least one of these entities.
+///
+/// Instead, soft-deleted namespaces should have their files GC'd like a
+/// normal parquet file deletion, removing the rows once they're no longer
+/// being actively used by the system. This is done by waiting a long time
+/// before deleting records, and whilst it isn't perfect, it is largely
+/// effective.
+async fn test_delete_namespace(catalog: Arc) { + let mut repos = catalog.repositories(); + let namespace_1 = arbitrary_namespace(&mut *repos, "namespace_test_delete_namespace_1").await; + let table_1 = arbitrary_table(&mut *repos, "test_table_1", &namespace_1).await; + let _c = repos + .columns() + .create_or_get("column_test_1", table_1.id, ColumnType::Tag) + .await + .unwrap(); + let partition_1 = repos + .partitions() + .create_or_get("test_delete_namespace_one".into(), table_1.id) + .await + .unwrap(); + + // parquet files + let parquet_file_params = arbitrary_parquet_file_params(&namespace_1, &table_1, &partition_1); + repos + .parquet_files() + .create(parquet_file_params.clone()) + .await + .unwrap(); + let parquet_file_params_2 = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + min_time: Timestamp::new(200), + max_time: Timestamp::new(300), + ..parquet_file_params + }; + repos + .parquet_files() + .create(parquet_file_params_2.clone()) + .await + .unwrap(); + + // we've now created a namespace with a table and parquet files. before we test deleting + // it, let's create another so we can ensure that doesn't get deleted. + let namespace_2 = arbitrary_namespace(&mut *repos, "namespace_test_delete_namespace_2").await; + let table_2 = arbitrary_table(&mut *repos, "test_table_2", &namespace_2).await; + let _c = repos + .columns() + .create_or_get("column_test_2", table_2.id, ColumnType::Tag) + .await + .unwrap(); + let partition_2 = repos + .partitions() + .create_or_get("test_delete_namespace_two".into(), table_2.id) + .await + .unwrap(); + + // parquet files + let parquet_file_params = arbitrary_parquet_file_params(&namespace_2, &table_2, &partition_2); + repos + .parquet_files() + .create(parquet_file_params.clone()) + .await + .unwrap(); + let parquet_file_params_2 = ParquetFileParams { + object_store_id: ObjectStoreId::new(), + min_time: Timestamp::new(200), + max_time: Timestamp::new(300), + ..parquet_file_params + }; + repos + .parquet_files() + .create(parquet_file_params_2.clone()) + .await + .unwrap(); + + // now delete namespace_1 and assert it's all gone and none of + // namespace_2 is gone + repos + .namespaces() + .soft_delete("namespace_test_delete_namespace_1") + .await + .expect("delete namespace should succeed"); + // assert that namespace is soft-deleted, but the table, column, and parquet files are all + // still there. + assert!(repos + .namespaces() + .get_by_id(namespace_1.id, SoftDeletedRows::ExcludeDeleted) + .await + .expect("get namespace should succeed") + .is_none()); + assert_eq!( + repos + .namespaces() + .get_by_id(namespace_1.id, SoftDeletedRows::AllRows) + .await + .expect("get namespace should succeed") + .map(|mut v| { + // The only change after soft-deletion should be the deleted_at + // field being set - this block normalises that field, so that + // the before/after can be asserted as equal. 
+ v.deleted_at = None; + v + }) + .expect("should see soft-deleted row"), + namespace_1 + ); + assert_eq!( + repos + .tables() + .get_by_id(table_1.id) + .await + .expect("get table should succeed") + .expect("should return row"), + table_1 + ); + assert_eq!( + repos + .columns() + .list_by_namespace_id(namespace_1.id) + .await + .expect("listing columns should succeed") + .len(), + 1 + ); + assert_eq!( + repos + .columns() + .list_by_table_id(table_1.id) + .await + .expect("listing columns should succeed") + .len(), + 1 + ); + + // partition's get_by_id should succeed + repos + .partitions() + .get_by_id_batch(&[partition_1.id]) + .await + .unwrap() + .into_iter() + .next() + .unwrap(); + + // assert that the namespace, table, column, and parquet files for namespace_2 are still + // there + assert!(repos + .namespaces() + .get_by_id(namespace_2.id, SoftDeletedRows::ExcludeDeleted) + .await + .expect("get namespace should succeed") + .is_some()); + + assert!(repos + .tables() + .get_by_id(table_2.id) + .await + .expect("get table should succeed") + .is_some()); + assert_eq!( + repos + .columns() + .list_by_namespace_id(namespace_2.id) + .await + .expect("listing columns should succeed") + .len(), + 1 + ); + assert_eq!( + repos + .columns() + .list_by_table_id(table_2.id) + .await + .expect("listing columns should succeed") + .len(), + 1 + ); + + // partition's get_by_id should succeed + repos + .partitions() + .get_by_id_batch(&[partition_2.id]) + .await + .unwrap() + .into_iter() + .next() + .unwrap(); +} + +/// Upsert a namespace called `namespace_name` and write `lines` to it. +async fn populate_namespace( + repos: &mut R, + namespace_name: &str, + lines: &str, +) -> (Namespace, NamespaceSchema) +where + R: RepoCollection + ?Sized, +{ + let namespace = repos + .namespaces() + .create( + &NamespaceName::new(namespace_name).unwrap(), + None, + None, + None, + ) + .await; + + let namespace = match namespace { + Ok(v) => v, + Err(Error::AlreadyExists { .. }) => repos + .namespaces() + .get_by_name(namespace_name, SoftDeletedRows::AllRows) + .await + .unwrap() + .unwrap(), + e @ Err(_) => e.unwrap(), + }; + + let batches = mutable_batch_lp::lines_to_batches(lines, 42).unwrap(); + let batches = batches.iter().map(|(table, batch)| (table.as_str(), batch)); + let ns = NamespaceSchema::new_empty_from(&namespace); + + let schema = validate_or_insert_schema(batches, &ns, repos) + .await + .expect("validate schema failed") + .unwrap_or(ns); + + (namespace, schema) +} + +async fn test_list_schemas(catalog: Arc) { + let mut repos = catalog.repositories(); + + let ns1 = populate_namespace( + repos.deref_mut(), + "ns1", + "cpu,tag=1 field=1i\nanother,tag=1 field=1.0", + ) + .await; + let ns2 = populate_namespace( + repos.deref_mut(), + "ns2", + "cpu,tag=1 field=1i\nsomethingelse field=1u", + ) + .await; + + // Otherwise the in-mem catalog deadlocks.... 
(but not postgres) + drop(repos); + + let got = list_schemas(&*catalog) + .await + .expect("should be able to list the schemas") + .collect::>(); + + assert!(got.contains(&ns1), "{:#?}\n\nwant{:#?}", got, &ns1); + assert!(got.contains(&ns2), "{:#?}\n\nwant{:#?}", got, &ns2); +} + +async fn test_list_schemas_soft_deleted_rows(catalog: Arc) { + let mut repos = catalog.repositories(); + + let ns1 = populate_namespace( + repos.deref_mut(), + "ns1", + "cpu,tag=1 field=1i\nanother,tag=1 field=1.0", + ) + .await; + let ns2 = populate_namespace( + repos.deref_mut(), + "ns2", + "cpu,tag=1 field=1i\nsomethingelse field=1u", + ) + .await; + + repos + .namespaces() + .soft_delete(&ns2.0.name) + .await + .expect("failed to soft delete namespace"); + + // Otherwise the in-mem catalog deadlocks.... (but not postgres) + drop(repos); + + let got = list_schemas(&*catalog) + .await + .expect("should be able to list the schemas") + .collect::>(); + + assert!(got.contains(&ns1), "{:#?}\n\nwant{:#?}", got, &ns1); + assert!(!got.contains(&ns2), "{:#?}\n\n do not want{:#?}", got, &ns2); +} + +/// Ensure that we can create two repo objects and that they instantly share their state. +/// +/// This is a regression test for . +async fn test_two_repos(catalog: Arc) { + let mut repos_1 = catalog.repositories(); + let mut repos_2 = catalog.repositories(); + let repo_1 = repos_1.namespaces(); + let repo_2 = repos_2.namespaces(); + + let namespace_name = NamespaceName::new("test_namespace").unwrap(); + repo_1 + .create(&namespace_name, None, None, None) + .await + .unwrap(); + + repo_2 + .get_by_name(&namespace_name, SoftDeletedRows::AllRows) + .await + .unwrap() + .unwrap(); +} + +async fn test_partition_create_or_get_idempotent(catalog: Arc) { + let mut repos = catalog.repositories(); + + let namespace = arbitrary_namespace(&mut *repos, "ns4").await; + let table_id = arbitrary_table(&mut *repos, "table", &namespace).await.id; + + let key = PartitionKey::from("bananas"); + + let hash_id = PartitionHashId::new(table_id, &key); + + let a = repos + .partitions() + .create_or_get(key.clone(), table_id) + .await + .expect("should create OK"); + + assert_eq!(a.hash_id().unwrap(), &hash_id); + // Test: sort_key_ids from partition_create_or_get_idempotent + assert!(a.sort_key_ids().is_none()); + + // Call create_or_get for the same (key, table_id) pair, to ensure the write is idempotent. + let b = repos + .partitions() + .create_or_get(key.clone(), table_id) + .await + .expect("idempotent write should succeed"); + + assert_eq!(a, b); + + // Check that the hash_id is saved in the database and is returned when queried. 
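// The assertion just below confirms the hash ID round-trips through the
// database. As a sketch of the upsert-with-deterministic-ID pattern this test
// exercises: `create_or_get` is keyed on `(table_id, partition_key)`, so a
// second call returns the already-stored row, and the ID is derived purely
// from that pair. `DefaultHasher` keeps the example dependency-free; the real
// `PartitionHashId` is presumably built from a different, stable hash.

use std::collections::hash_map::{DefaultHasher, Entry, HashMap};
use std::hash::{Hash, Hasher};

#[derive(Debug, Clone, PartialEq, Eq)]
struct PartitionRow {
    hash_id: u64,
    table_id: i64,
    key: String,
}

#[derive(Default)]
struct PartitionStore {
    rows: HashMap<(i64, String), PartitionRow>,
}

impl PartitionStore {
    /// Idempotent upsert: calling twice with the same pair returns equal rows.
    fn create_or_get(&mut self, key: &str, table_id: i64) -> PartitionRow {
        match self.rows.entry((table_id, key.to_string())) {
            Entry::Occupied(e) => e.get().clone(),
            Entry::Vacant(v) => {
                let mut h = DefaultHasher::new();
                (table_id, key).hash(&mut h);
                v.insert(PartitionRow {
                    hash_id: h.finish(),
                    table_id,
                    key: key.to_string(),
                })
                .clone()
            }
        }
    }
}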
+ let table_partitions = repos.partitions().list_by_table_id(table_id).await.unwrap(); + assert_eq!(table_partitions.len(), 1); + assert_eq!(table_partitions[0].hash_id().unwrap(), &hash_id); + + // Test: sort_key_ids from partition_create_or_get_idempotent + assert!(table_partitions[0].sort_key_ids().is_none()); +} + +#[track_caller] +fn assert_metric_hit(metrics: &metric::Registry, name: &'static str) { + let histogram = metrics + .get_instrument::>("catalog_op_duration") + .expect("failed to read metric") + .get_observer(&Attributes::from(&[("op", name), ("result", "success")])) + .expect("failed to get observer") + .fetch(); + + let hit_count = histogram.sample_count(); + assert!(hit_count > 0, "metric did not record any calls"); +} + +async fn test_column_create_or_get_many_unchecked(clean_state: R) +where + R: Fn() -> F + Send + Sync, + F: Future> + Send, +{ + // Issue a few calls to create_or_get_many that contain distinct columns and + // covers the full set of column types. + test_column_create_or_get_many_unchecked_sub( + clean_state().await, + &[ + &[ + ("test1", ColumnType::I64), + ("test2", ColumnType::U64), + ("test3", ColumnType::F64), + ("test4", ColumnType::Bool), + ("test5", ColumnType::String), + ("test6", ColumnType::Time), + ("test7", ColumnType::Tag), + ], + &[("test8", ColumnType::String), ("test9", ColumnType::Bool)], + ], + |res| assert_matches!(res, Ok(_)), + ) + .await; + + // Issue two calls with overlapping columns - request should succeed (upsert + // semantics). + test_column_create_or_get_many_unchecked_sub( + clean_state().await, + &[ + &[ + ("test1", ColumnType::I64), + ("test2", ColumnType::U64), + ("test3", ColumnType::F64), + ("test4", ColumnType::Bool), + ], + &[ + ("test1", ColumnType::I64), + ("test2", ColumnType::U64), + ("test3", ColumnType::F64), + ("test4", ColumnType::Bool), + ("test5", ColumnType::String), + ("test6", ColumnType::Time), + ("test7", ColumnType::Tag), + ("test8", ColumnType::String), + ], + ], + |res| assert_matches!(res, Ok(_)), + ) + .await; + + // Issue two calls with the same columns and types. + test_column_create_or_get_many_unchecked_sub( + clean_state().await, + &[ + &[ + ("test1", ColumnType::I64), + ("test2", ColumnType::U64), + ("test3", ColumnType::F64), + ("test4", ColumnType::Bool), + ], + &[ + ("test1", ColumnType::I64), + ("test2", ColumnType::U64), + ("test3", ColumnType::F64), + ("test4", ColumnType::Bool), + ], + ], + |res| assert_matches!(res, Ok(_)), + ) + .await; + + // Issue two calls with overlapping columns with conflicting types and + // observe a correctly populated ColumnTypeMismatch error. + test_column_create_or_get_many_unchecked_sub( + clean_state().await, + &[ + &[ + ("test1", ColumnType::String), + ("test2", ColumnType::String), + ("test3", ColumnType::String), + ("test4", ColumnType::String), + ], + &[ + ("test1", ColumnType::String), + ("test2", ColumnType::Bool), // This one differs + ("test3", ColumnType::String), + // 4 is missing. 
+ ("test5", ColumnType::String), + ("test6", ColumnType::Time), + ("test7", ColumnType::Tag), + ("test8", ColumnType::String), + ], + ], + |res| assert_matches!(res, Err(e) => { + assert_matches!(e, Error::AlreadyExists { descr } => { + assert_eq!(descr, "column test2 is type string but schema update has type bool"); + }) + }), + ).await; +} + +async fn test_column_create_or_get_many_unchecked_sub( + catalog: Arc, + calls: &[&[(&'static str, ColumnType)]], + want: F, +) where + F: FnOnce(Result, Error>) + Send, +{ + let mut repos = catalog.repositories(); + + let namespace = arbitrary_namespace(&mut *repos, "ns4").await; + let table_id = arbitrary_table(&mut *repos, "table", &namespace).await.id; + + let mut last_got = None; + for insert in calls { + let insert = insert + .iter() + .map(|(n, t)| (*n, *t)) + .collect::>(); + + let got = repos + .columns() + .create_or_get_many_unchecked(table_id, insert.clone()) + .await; + + // The returned columns MUST always match the requested + // column values if successful. + if let Ok(got) = &got { + assert_eq!(insert.len(), got.len()); + + for got in got { + assert_eq!(table_id, got.table_id); + let requested_column_type = insert + .get(got.name.as_str()) + .expect("Should have gotten back a column that was inserted"); + assert_eq!(*requested_column_type, got.column_type,); + } + + assert_metric_hit(&catalog.metrics(), "column_create_or_get_many_unchecked"); + } + + last_got = Some(got); + } + + want(last_got.unwrap()); +} + +/// [`Catalog`] wrapper that is helpful for testing. +#[derive(Debug)] +pub(crate) struct TestCatalog { + hold_onto: Mutex>>, + inner: Arc, +} + +impl TestCatalog { + /// Create new test catalog. + pub(crate) fn new(inner: Arc) -> Self { + Self { + hold_onto: Mutex::new(vec![]), + inner, + } + } + + /// Hold onto given value til dropped. + pub(crate) fn hold_onto(&self, o: T) + where + T: Send + 'static, + { + self.hold_onto.lock().push(Box::new(o) as _) + } +} + +impl Display for TestCatalog { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "test({})", self.inner) + } +} + +#[async_trait] +impl Catalog for TestCatalog { + async fn setup(&self) -> Result<(), Error> { + self.inner.setup().await + } + + fn repositories(&self) -> Box { + self.inner.repositories() + } + + fn metrics(&self) -> Arc { + self.inner.metrics() + } + + fn time_provider(&self) -> Arc { + self.inner.time_provider() + } +} + +#[track_caller] +fn assert_gt(a: T, b: T) +where + T: Display + PartialOrd, +{ + assert!(a > b, "failed: {a} > {b}",); +} + +#[track_caller] +fn assert_ge(a: T, b: T) +where + T: Display + PartialOrd, +{ + assert!(a >= b, "failed: {a} >= {b}",); +} diff --git a/iox_catalog/src/lib.rs b/iox_catalog/src/lib.rs index d7d56113b9d..17fa14f836e 100644 --- a/iox_catalog/src/lib.rs +++ b/iox_catalog/src/lib.rs @@ -19,700 +19,17 @@ // Workaround for "unused crate" lint false positives. use workspace_hack as _; -use crate::interface::{ColumnTypeMismatchSnafu, Error, RepoCollection, Result}; -use data_types::{ - partition_template::{NamespacePartitionTemplateOverride, TablePartitionTemplateOverride}, - ColumnType, NamespaceId, NamespaceSchema, Partition, TableSchema, TransitionPartitionId, -}; -use mutable_batch::MutableBatch; -use std::{borrow::Cow, collections::HashMap}; -use thiserror::Error; - -/// Column name for built in time column on every table. -pub const TIME_COLUMN: &str = "time"; - -/// Default retention period for data in the catalog. 
-pub const DEFAULT_RETENTION_PERIOD: Option = None; - +pub mod cache; +pub mod constants; +pub mod grpc; pub mod interface; -pub(crate) mod kafkaless_transition; pub mod mem; pub mod metrics; pub mod migrate; pub mod postgres; pub mod sqlite; - -/// An [`crate::interface::Error`] scoped to a single table for schema validation errors. -#[derive(Debug, Error)] -#[error("table {}, {}", .0, .1)] -pub struct TableScopedError(String, Error); - -impl TableScopedError { - /// Return the table name for this error. - pub fn table(&self) -> &str { - &self.0 - } - - /// Return a reference to the error. - pub fn err(&self) -> &Error { - &self.1 - } - - /// Return ownership of the error, discarding the table name. - pub fn into_err(self) -> Error { - self.1 - } -} - -/// Look up a partition in the catalog by either database-assigned ID or deterministic hash ID. -/// -/// The existence of this function should be temporary; it can be removed once all partition lookup -/// is happening with only the deterministic hash ID. -pub async fn partition_lookup( - repos: &mut R, - id: &TransitionPartitionId, -) -> Result, Error> -where - R: RepoCollection + ?Sized, -{ - match id { - TransitionPartitionId::Deprecated(partition_id) => { - repos.partitions().get_by_id(*partition_id).await - } - TransitionPartitionId::Deterministic(partition_hash_id) => { - repos.partitions().get_by_hash_id(partition_hash_id).await - } - } -} - -/// Look up multiple partitions in the catalog by either database-assigned ID or deterministic hash ID. -/// -/// The output only contains existing partitions, the order is undefined. -/// -/// The existence of this function should be temporary; it can be removed once all partition lookup -/// is happening with only the deterministic hash ID. -pub async fn partition_lookup_batch( - repos: &mut R, - ids: &[&TransitionPartitionId], -) -> Result, Error> -where - R: RepoCollection + ?Sized, -{ - let mut partition_ids = Vec::with_capacity(ids.len()); - let mut partition_hash_ids = Vec::with_capacity(ids.len()); - - for id in ids { - match id { - TransitionPartitionId::Deprecated(partition_id) => { - partition_ids.push(*partition_id); - } - TransitionPartitionId::Deterministic(partition_hash_id) => { - partition_hash_ids.push(partition_hash_id); - } - } - } - - let mut out = Vec::with_capacity(partition_ids.len() + partition_hash_ids.len()); - if !partition_ids.is_empty() { - let mut partitions = repos.partitions().get_by_id_batch(partition_ids).await?; - out.append(&mut partitions); - } - if !partition_hash_ids.is_empty() { - let mut partitions = repos - .partitions() - .get_by_hash_id_batch(&partition_hash_ids) - .await?; - out.append(&mut partitions); - } - Ok(out) -} - -/// Given an iterator of `(table_name, batch)` to validate, this function -/// ensures all the columns within `batch` match the existing schema for -/// `table_name` in `schema`. If the column does not already exist in `schema`, -/// it is created and an updated [`NamespaceSchema`] is returned. -/// -/// This function pushes schema additions through to the backend catalog, and -/// relies on the catalog to serialize concurrent additions of a given column, -/// ensuring only one type is ever accepted per column. 
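// The `validate_or_insert_schema` implementation that follows is built around
// this copy-on-write idea: the cached schema is held in a `Cow`, `to_mut()` is
// called only when something is genuinely missing, and the caller receives
// `Some(updated)` only if the schema actually changed. A minimal sketch with
// an illustrative `Schema` type (type-conflict handling omitted):

use std::borrow::Cow;
use std::collections::BTreeMap;

#[derive(Debug, Clone, PartialEq)]
struct Schema {
    // column name -> column type
    columns: BTreeMap<String, String>,
}

/// Ensure every requested column exists; return `Some(updated)` only if the
/// cached schema had to be extended.
fn validate_or_insert(requested: &[(&str, &str)], cached: &Schema) -> Option<Schema> {
    let mut schema = Cow::Borrowed(cached);

    for (name, ty) in requested {
        if !schema.columns.contains_key(*name) {
            // The first mutation clones the cached schema; later ones reuse it.
            schema
                .to_mut()
                .columns
                .insert(name.to_string(), ty.to_string());
        }
    }

    match schema {
        Cow::Owned(updated) => Some(updated),
        Cow::Borrowed(_) => None,
    }
}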
-pub async fn validate_or_insert_schema<'a, T, U, R>( - tables: T, - schema: &NamespaceSchema, - repos: &mut R, -) -> Result, TableScopedError> -where - T: IntoIterator + Send + Sync, - U: Iterator + Send, - R: RepoCollection + ?Sized, -{ - let tables = tables.into_iter(); - - // The (potentially updated) NamespaceSchema to return to the caller. - let mut schema = Cow::Borrowed(schema); - - for (table_name, batch) in tables { - validate_mutable_batch(batch, table_name, &mut schema, repos) - .await - .map_err(|e| TableScopedError(table_name.to_string(), e))?; - } - - match schema { - Cow::Owned(v) => Ok(Some(v)), - Cow::Borrowed(_) => Ok(None), - } -} - -// &mut Cow is used to avoid a copy, so allow it -#[allow(clippy::ptr_arg)] -async fn validate_mutable_batch( - mb: &MutableBatch, - table_name: &str, - schema: &mut Cow<'_, NamespaceSchema>, - repos: &mut R, -) -> Result<()> -where - R: RepoCollection + ?Sized, -{ - // Check if the table exists in the schema. - // - // Because the entry API requires &mut it is not used to avoid a premature - // clone of the Cow. - let mut table = match schema.tables.get(table_name) { - Some(t) => Cow::Borrowed(t), - None => { - // The table does not exist in the cached schema. - // - // Attempt to load an existing table from the catalog or create a new table in the - // catalog to populate the cache. - let table = - table_load_or_create(repos, schema.id, &schema.partition_template, table_name) - .await?; - - assert!(schema - .to_mut() - .tables - .insert(table_name.to_string(), table) - .is_none()); - - Cow::Borrowed(schema.tables.get(table_name).unwrap()) - } - }; - - // The table is now in the schema (either by virtue of it already existing, - // or through adding it above). - // - // If the table itself needs to be updated during column validation it - // becomes a Cow::owned() copy and the modified copy should be inserted into - // the schema before returning. - let mut column_batch: HashMap<&str, ColumnType> = HashMap::new(); - - for (name, col) in mb.columns() { - // Check if the column exists in the cached schema. - // - // If it does, validate it. If it does not exist, create it and insert - // it into the cached schema. - - match table.columns.get(name.as_str()) { - Some(existing) if existing.matches_type(col.influx_type()) => { - // No action is needed as the column matches the existing column - // schema. - } - Some(existing) => { - // The column schema, and the column in the mutable batch are of - // different types. - return ColumnTypeMismatchSnafu { - name, - existing: existing.column_type, - new: col.influx_type(), - } - .fail(); - } - None => { - // The column does not exist in the cache, add it to the column - // batch to be bulk inserted later. - let old = column_batch.insert(name.as_str(), ColumnType::from(col.influx_type())); - assert!( - old.is_none(), - "duplicate column name `{name}` in new column batch shouldn't be possible" - ); - } - } - } - - if !column_batch.is_empty() { - repos - .columns() - .create_or_get_many_unchecked(table.id, column_batch) - .await? - .into_iter() - .for_each(|c| table.to_mut().add_column(c)); - } - - if let Cow::Owned(table) = table { - // The table schema was mutated and needs inserting into the namespace - // schema to make the changes visible to the caller. 
- assert!(schema - .to_mut() - .tables - .insert(table_name.to_string(), table) - .is_some()); - } - - Ok(()) -} - -/// load the table or create a new one -pub async fn table_load_or_create( - repos: &mut R, - namespace_id: NamespaceId, - namespace_partition_template: &NamespacePartitionTemplateOverride, - table_name: &str, -) -> Result -where - R: RepoCollection + ?Sized, -{ - let table = match repos - .tables() - .get_by_namespace_and_name(namespace_id, table_name) - .await? - { - Some(table) => table, - None => { - // There is a possibility of a race condition here, if another request has also - // created this table after the `get_by_namespace_and_name` call but before - // this `create` call. In that (hopefully) rare case, do an additional fetch - // from the catalog for the record that should now exist. - let create_result = repos - .tables() - .create( - table_name, - // This table is being created implicitly by this write, so there's no - // possibility of a user-supplied partition template here, which is why there's - // a hardcoded `None`. If there is a namespace template, it must be valid because - // validity was checked during its creation, so that's why there's an `expect`. - TablePartitionTemplateOverride::try_new(None, namespace_partition_template) - .expect("no table partition template; namespace partition template has been validated"), - namespace_id, - ) - .await; - if let Err(Error::TableNameExists { .. }) = create_result { - repos - .tables() - .get_by_namespace_and_name(namespace_id, table_name) - // Propagate any `Err` returned by the catalog - .await? - // Getting `Ok(None)` should be impossible if we're in this code path because - // the `create` request just said the table exists - .expect( - "Table creation failed because the table exists, so looking up the table \ - should return `Some(table)`, but it returned `None`", - ) - } else { - create_result? - } - } - }; - - let mut table = TableSchema::new_empty_from(&table); - - // Always add a time column to all new tables. - let time_col = repos - .columns() - .create_or_get(TIME_COLUMN, table.id, ColumnType::Time) - .await?; - - table.add_column(time_col); - - Ok(table) -} - -/// Catalog helper functions for creation of catalog objects -pub mod test_helpers { - use crate::RepoCollection; - use data_types::{ - partition_template::TablePartitionTemplateOverride, ColumnId, ColumnSet, CompactionLevel, - Namespace, NamespaceName, ParquetFileParams, Partition, Table, Timestamp, - }; - use uuid::Uuid; - - /// When the details of the namespace don't matter; the test just needs *a* catalog namespace - /// with a particular name. - /// - /// Use [`NamespaceRepo::create`] directly if: - /// - /// - The values of the parameters to `create` need to be different than what's here - /// - The values of the parameters to `create` are relevant to the behavior under test - /// - You expect namespace creation to fail in the test - /// - /// [`NamespaceRepo::create`]: crate::interface::NamespaceRepo::create - pub async fn arbitrary_namespace( - repos: &mut R, - name: &str, - ) -> Namespace { - let namespace_name = NamespaceName::new(name).unwrap(); - repos - .namespaces() - .create(&namespace_name, None, None, None) - .await - .unwrap() - } - - /// When the details of the table don't matter; the test just needs *a* catalog table - /// with a particular name in a particular namespace. 
- /// - /// Use [`TableRepo::create`] directly if: - /// - /// - The values of the parameters to `create_or_get` need to be different than what's here - /// - The values of the parameters to `create_or_get` are relevant to the behavior under test - /// - You expect table creation to fail in the test - /// - /// [`TableRepo::create`]: crate::interface::TableRepo::create - pub async fn arbitrary_table( - repos: &mut R, - name: &str, - namespace: &Namespace, - ) -> Table { - repos - .tables() - .create( - name, - TablePartitionTemplateOverride::try_new(None, &namespace.partition_template) - .unwrap(), - namespace.id, - ) - .await - .unwrap() - } - - /// When the details of a Parquet file record don't matter, the test just needs *a* Parquet - /// file record in a particular namespace+table+partition. - pub fn arbitrary_parquet_file_params( - namespace: &Namespace, - table: &Table, - partition: &Partition, - ) -> ParquetFileParams { - ParquetFileParams { - namespace_id: namespace.id, - table_id: table.id, - partition_id: partition.transition_partition_id(), - object_store_id: Uuid::new_v4(), - min_time: Timestamp::new(1), - max_time: Timestamp::new(10), - file_size_bytes: 1337, - row_count: 0, - compaction_level: CompactionLevel::Initial, - created_at: Timestamp::new(1), - column_set: ColumnSet::new([ColumnId::new(1), ColumnId::new(2)]), - max_l0_created_at: Timestamp::new(1), - } - } -} +pub mod test_helpers; +pub mod util; #[cfg(test)] -mod tests { - use std::{collections::BTreeMap, sync::Arc}; - - use super::*; - use crate::{ - interface::{get_schema_by_name, SoftDeletedRows}, - mem::MemCatalog, - }; - - // Generate a test that simulates multiple, sequential writes in `lp` and - // asserts the resulting schema. - // - // This test asserts the cached schema and the database entry are always in - // sync. - macro_rules! test_validate_schema { - ( - $name:ident, - lp = [$($lp:literal,)+], // An array of multi-line LP writes - want_observe_conflict = $want_observe_conflict:literal, // true if a schema validation error should be observed at some point - want_schema = {$($want_schema:tt) +} // The expected resulting schema after all writes complete. - ) => { - paste::paste! { - #[allow(clippy::bool_assert_comparison)] - #[tokio::test] - async fn []() { - use crate::{interface::Catalog, test_helpers::arbitrary_namespace}; - use std::ops::DerefMut; - use pretty_assertions::assert_eq; - const NAMESPACE_NAME: &str = "bananas"; - - let metrics = Arc::new(metric::Registry::default()); - let repo = MemCatalog::new(metrics); - let mut txn = repo.repositories().await; - - let namespace = arbitrary_namespace(&mut *txn, NAMESPACE_NAME) - .await; - let schema = NamespaceSchema::new_empty_from(&namespace); - - // Apply all the lp literals as individual writes, feeding - // the result of one validation into the next to drive - // incremental construction of the schemas. - let mut observed_conflict = false; - $( - let schema = { - let lp: String = $lp.to_string(); - - let writes = mutable_batch_lp::lines_to_batches(lp.as_str(), 42) - .expect("failed to build test writes from LP"); - - let got = validate_or_insert_schema(writes.iter().map(|(k, v)| (k.as_str(), v)), &schema, txn.deref_mut()) - .await; - - match got { - Err(TableScopedError(_, Error::ColumnTypeMismatch{ .. 
})) => { - observed_conflict = true; - schema - }, - Err(e) => panic!("unexpected error: {}", e), - Ok(Some(new_schema)) => new_schema, - Ok(None) => schema, - } - }; - )+ - - assert_eq!($want_observe_conflict, observed_conflict, "should error mismatch"); - - // Invariant: in absence of concurrency, the schema within - // the database must always match the incrementally built - // cached schema. - let db_schema = get_schema_by_name(NAMESPACE_NAME, txn.deref_mut(), SoftDeletedRows::ExcludeDeleted) - .await - .expect("database failed to query for namespace schema"); - assert_eq!(schema, db_schema, "schema in DB and cached schema differ"); - - // Generate the map of tables => desired column types - let want_tables: BTreeMap> = test_validate_schema!(@table, $($want_schema)+); - - // Generate a similarly structured map from the actual - // schema - let actual_tables: BTreeMap> = schema - .tables - .iter() - .map(|(table, table_schema)| { - let desired_cols = table_schema - .columns - .iter() - .map(|(column, column_schema)| (column.clone(), column_schema.column_type)) - .collect::>(); - - (table.clone(), desired_cols) - }) - .collect(); - - // Assert the actual namespace contents matches the desired - // table schemas in the test args. - assert_eq!(want_tables, actual_tables, "cached schema and desired schema differ"); - } - } - }; - // Generate a map of table names => column map (below) - // - // out: BTreeMap> - (@table, $($table_name:literal: [$($columns:tt) +],)*) => {{ - let mut tables = BTreeMap::new(); - $( - let want_cols = test_validate_schema!(@column, $($columns)+); - assert!(tables.insert($table_name.to_string(), want_cols).is_none()); - )* - tables - }}; - // Generate a map of column names => ColumnType - // - // out: BTreeMap - (@column, $($col_name:literal => $col_type:expr,)+) => {{ - let mut cols = BTreeMap::new(); - $( - assert!(cols.insert($col_name.to_string(), $col_type).is_none()); - )* - cols - }}; - } - - test_validate_schema!( - one_write_multiple_tables, - lp = [ - " - m1,t1=a,t2=b f1=2i,f2=2.0 1\n\ - m1,t1=a f1=3i 2\n\ - m2,t3=b f1=true 1\n\ - ", - ], - want_observe_conflict = false, - want_schema = { - "m1": [ - "t1" => ColumnType::Tag, - "t2" => ColumnType::Tag, - "f1" => ColumnType::I64, - "f2" => ColumnType::F64, - "time" => ColumnType::Time, - ], - "m2": [ - "f1" => ColumnType::Bool, - "t3" => ColumnType::Tag, - "time" => ColumnType::Time, - ], - } - ); - - // test that a new table will be created - test_validate_schema!( - two_writes_incremental_new_table, - lp = [ - " - m1,t1=a,t2=b f1=2i,f2=2.0 1\n\ - m1,t1=a f1=3i 2\n\ - m2,t3=b f1=true 1\n\ - ", - " - m1,t1=c f1=1i 2\n\ - new_measurement,t9=a f10=true 1\n\ - ", - ], - want_observe_conflict = false, - want_schema = { - "m1": [ - "t1" => ColumnType::Tag, - "t2" => ColumnType::Tag, - "f1" => ColumnType::I64, - "f2" => ColumnType::F64, - "time" => ColumnType::Time, - ], - "m2": [ - "f1" => ColumnType::Bool, - "t3" => ColumnType::Tag, - "time" => ColumnType::Time, - ], - "new_measurement": [ - "t9" => ColumnType::Tag, - "f10" => ColumnType::Bool, - "time" => ColumnType::Time, - ], - } - ); - - // test that a new column for an existing table will be created - test_validate_schema!( - two_writes_incremental_new_column, - lp = [ - " - m1,t1=a,t2=b f1=2i,f2=2.0 1\n\ - m1,t1=a f1=3i 2\n\ - m2,t3=b f1=true 1\n\ - ", - "m1,new_tag=c new_field=1i 2", - ], - want_observe_conflict = false, - want_schema = { - "m1": [ - "t1" => ColumnType::Tag, - "t2" => ColumnType::Tag, - "f1" => ColumnType::I64, - "f2" => ColumnType::F64, 
- "time" => ColumnType::Time, - // These are the incremental additions: - "new_tag" => ColumnType::Tag, - "new_field" => ColumnType::I64, - ], - "m2": [ - "f1" => ColumnType::Bool, - "t3" => ColumnType::Tag, - "time" => ColumnType::Time, - ], - } - ); - - test_validate_schema!( - table_always_has_time_column, - lp = [ - "m1,t1=a f1=2i", - ], - want_observe_conflict = false, - want_schema = { - "m1": [ - "t1" => ColumnType::Tag, - "f1" => ColumnType::I64, - "time" => ColumnType::Time, - ], - } - ); - - test_validate_schema!( - two_writes_conflicting_column_types, - lp = [ - "m1,t1=a f1=2i", - // Second write has conflicting type for f1. - "m1,t1=a f1=2.0", - ], - want_observe_conflict = true, - want_schema = { - "m1": [ - "t1" => ColumnType::Tag, - "f1" => ColumnType::I64, - "time" => ColumnType::Time, - ], - } - ); - - test_validate_schema!( - two_writes_tag_field_transposition, - lp = [ - // x is a tag - "m1,t1=a,x=t f1=2i", - // x is a field - "m1,t1=a x=t,f1=2i", - ], - want_observe_conflict = true, - want_schema = { - "m1": [ - "t1" => ColumnType::Tag, - "x" => ColumnType::Tag, - "f1" => ColumnType::I64, - "time" => ColumnType::Time, - ], - } - ); - - #[tokio::test] - async fn validate_table_create_race_doesnt_get_all_columns() { - use crate::{interface::Catalog, test_helpers::arbitrary_namespace}; - use std::{collections::BTreeSet, ops::DerefMut}; - const NAMESPACE_NAME: &str = "bananas"; - - let repo = MemCatalog::new(Default::default()); - let mut txn = repo.repositories().await; - let namespace = arbitrary_namespace(&mut *txn, NAMESPACE_NAME).await; - - // One cached schema has no tables. - let empty_schema = NamespaceSchema::new_empty_from(&namespace); - - // Another cached schema gets a write that creates a table with some columns. - let schema_with_table = empty_schema.clone(); - let writes = mutable_batch_lp::lines_to_batches("m1,t1=a f1=2i", 42).unwrap(); - validate_or_insert_schema( - writes.iter().map(|(k, v)| (k.as_str(), v)), - &schema_with_table, - txn.deref_mut(), - ) - .await - .unwrap(); - - // then the empty schema adds the same table with some different columns - let other_writes = mutable_batch_lp::lines_to_batches("m1,t2=a f2=2i", 43).unwrap(); - let formerly_empty_schema = validate_or_insert_schema( - other_writes.iter().map(|(k, v)| (k.as_str(), v)), - &empty_schema, - txn.deref_mut(), - ) - .await - .unwrap() - .unwrap(); - - // the formerly-empty schema should NOT have all the columns; schema convergence is handled - // at a higher level by the namespace cache/gossip system - let table = formerly_empty_schema.tables.get("m1").unwrap(); - assert_eq!(table.columns.names(), BTreeSet::from(["t2", "f2", "time"])); - } -} +pub(crate) mod interface_tests; diff --git a/iox_catalog/src/mem.rs b/iox_catalog/src/mem.rs index a779a5567e6..0d810fd0ca8 100644 --- a/iox_catalog/src/mem.rs +++ b/iox_catalog/src/mem.rs @@ -1,35 +1,38 @@ //! This module implements an in-memory implementation of the iox_catalog interface. It can be //! used for testing or for an IOx designed to run without catalog persistence. 
-use crate::interface::MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE; use crate::{ + constants::{ + MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE, MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION, + }, interface::{ - CasFailure, Catalog, ColumnRepo, ColumnTypeMismatchSnafu, Error, NamespaceRepo, - ParquetFileRepo, PartitionRepo, RepoCollection, Result, SoftDeletedRows, TableRepo, - MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION, + AlreadyExistsSnafu, CasFailure, Catalog, ColumnRepo, Error, NamespaceRepo, ParquetFileRepo, + PartitionRepo, RepoCollection, Result, SoftDeletedRows, TableRepo, }, metrics::MetricDecorator, }; use async_trait::async_trait; -use data_types::SortedColumnSet; +use data_types::snapshot::partition::PartitionSnapshot; +use data_types::snapshot::table::TableSnapshot; use data_types::{ partition_template::{ NamespacePartitionTemplateOverride, TablePartitionTemplateOverride, TemplatePart, }, Column, ColumnId, ColumnType, CompactionLevel, MaxColumnsPerTable, MaxTables, Namespace, - NamespaceId, NamespaceName, NamespaceServiceProtectionLimitsOverride, ParquetFile, - ParquetFileId, ParquetFileParams, Partition, PartitionHashId, PartitionId, PartitionKey, - SkippedCompaction, Table, TableId, Timestamp, TransitionPartitionId, + NamespaceId, NamespaceName, NamespaceServiceProtectionLimitsOverride, ObjectStoreId, + ParquetFile, ParquetFileId, ParquetFileParams, Partition, PartitionHashId, PartitionId, + PartitionKey, SkippedCompaction, SortKeyIds, Table, TableId, Timestamp, }; -use iox_time::{SystemProvider, TimeProvider}; +use iox_time::TimeProvider; +use parking_lot::Mutex; use snafu::ensure; -use sqlx::types::Uuid; +use std::ops::Deref; use std::{ collections::{HashMap, HashSet}, fmt::{Display, Formatter}, + ops::DerefMut, sync::Arc, }; -use tokio::sync::{Mutex, OwnedMutexGuard}; /// In-memory catalog that implements the `RepoCollection` and individual repo traits from /// the catalog interface. @@ -40,20 +43,20 @@ pub struct MemCatalog { } impl MemCatalog { - /// return new initialized `MemCatalog` - pub fn new(metrics: Arc) -> Self { + /// return new initialized [`MemCatalog`] + pub fn new(metrics: Arc, time_provider: Arc) -> Self { Self { metrics, collections: Default::default(), - time_provider: Arc::new(SystemProvider::new()), + time_provider, } } /// Add partition directly, for testing purposes only as it does not do any consistency or /// uniqueness checks - pub async fn add_partition(&self, partition: Partition) { - let mut collections = Arc::clone(&self.collections).lock_owned().await; - collections.partitions.push(partition); + pub fn add_partition(&self, partition: Partition) { + let mut stage = self.collections.lock(); + stage.partitions.push(partition.into()); } } @@ -63,12 +66,42 @@ impl std::fmt::Debug for MemCatalog { } } +/// A wrapper around `T` adding a generation number +#[derive(Debug, Clone)] +struct Versioned { + generation: u64, + value: T, +} + +impl Deref for Versioned { + type Target = T; + + fn deref(&self) -> &Self::Target { + &self.value + } +} + +impl DerefMut for Versioned { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.value + } +} + +impl From for Versioned { + fn from(value: T) -> Self { + Self { + generation: 0, + value, + } + } +} + #[derive(Default, Debug, Clone)] struct MemCollections { namespaces: Vec, - tables: Vec
, + tables: Vec>, columns: Vec, - partitions: Vec, + partitions: Vec>, skipped_compactions: Vec, parquet_files: Vec, } @@ -76,16 +109,10 @@ struct MemCollections { /// transaction bound to an in-memory catalog. #[derive(Debug)] pub struct MemTxn { - inner: OwnedMutexGuard, + collections: Arc>, time_provider: Arc, } -impl MemTxn { - fn stage(&mut self) -> &mut MemCollections { - &mut self.inner - } -} - impl Display for MemCatalog { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { write!(f, "Memory") @@ -98,14 +125,15 @@ impl Catalog for MemCatalog { Ok(()) } - async fn repositories(&self) -> Box { - let collections = Arc::clone(&self.collections).lock_owned().await; + fn repositories(&self) -> Box { + let collections = Arc::clone(&self.collections); Box::new(MetricDecorator::new( MemTxn { - inner: collections, + collections, time_provider: self.time_provider(), }, Arc::clone(&self.metrics), + self.time_provider(), )) } @@ -119,7 +147,6 @@ impl Catalog for MemCatalog { } } -#[async_trait] impl RepoCollection for MemTxn { fn namespaces(&mut self) -> &mut dyn NamespaceRepo { self @@ -151,11 +178,11 @@ impl NamespaceRepo for MemTxn { retention_period_ns: Option, service_protection_limits: Option, ) -> Result { - let stage = self.stage(); + let mut stage = self.collections.lock(); if stage.namespaces.iter().any(|n| n.name == name.as_str()) { - return Err(Error::NameExists { - name: name.to_string(), + return Err(Error::AlreadyExists { + descr: name.to_string(), }); } @@ -180,7 +207,7 @@ impl NamespaceRepo for MemTxn { } async fn list(&mut self, deleted: SoftDeletedRows) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); Ok(filter_namespace_soft_delete(&stage.namespaces, deleted) .cloned() @@ -192,11 +219,13 @@ impl NamespaceRepo for MemTxn { id: NamespaceId, deleted: SoftDeletedRows, ) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); - Ok(filter_namespace_soft_delete(&stage.namespaces, deleted) + let res = filter_namespace_soft_delete(&stage.namespaces, deleted) .find(|n| n.id == id) - .cloned()) + .cloned(); + + Ok(res) } async fn get_by_name( @@ -204,39 +233,41 @@ impl NamespaceRepo for MemTxn { name: &str, deleted: SoftDeletedRows, ) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); - Ok(filter_namespace_soft_delete(&stage.namespaces, deleted) + let res = filter_namespace_soft_delete(&stage.namespaces, deleted) .find(|n| n.name == name) - .cloned()) + .cloned(); + + Ok(res) } // performs a cascading delete of all things attached to the namespace, then deletes the // namespace async fn soft_delete(&mut self, name: &str) -> Result<()> { + let mut stage = self.collections.lock(); let timestamp = self.time_provider.now(); - let stage = self.stage(); // get namespace by name match stage.namespaces.iter_mut().find(|n| n.name == name) { Some(n) => { n.deleted_at = Some(Timestamp::from(timestamp)); Ok(()) } - None => Err(Error::NamespaceNotFoundByName { - name: name.to_string(), + None => Err(Error::NotFound { + descr: name.to_string(), }), } } async fn update_table_limit(&mut self, name: &str, new_max: MaxTables) -> Result { - let stage = self.stage(); + let mut stage = self.collections.lock(); match stage.namespaces.iter_mut().find(|n| n.name == name) { Some(n) => { n.max_tables = new_max; Ok(n.clone()) } - None => Err(Error::NamespaceNotFoundByName { - name: name.to_string(), + None => Err(Error::NotFound { + descr: name.to_string(), }), } } @@ -246,14 +277,14 @@ impl NamespaceRepo for MemTxn { 
name: &str, new_max: MaxColumnsPerTable, ) -> Result { - let stage = self.stage(); + let mut stage = self.collections.lock(); match stage.namespaces.iter_mut().find(|n| n.name == name) { Some(n) => { n.max_columns_per_table = new_max; Ok(n.clone()) } - None => Err(Error::NamespaceNotFoundByName { - name: name.to_string(), + None => Err(Error::NotFound { + descr: name.to_string(), }), } } @@ -263,14 +294,14 @@ impl NamespaceRepo for MemTxn { name: &str, retention_period_ns: Option, ) -> Result { - let stage = self.stage(); + let mut stage = self.collections.lock(); match stage.namespaces.iter_mut().find(|n| n.name == name) { Some(n) => { n.retention_period_ns = retention_period_ns; Ok(n.clone()) } - None => Err(Error::NamespaceNotFoundByName { - name: name.to_string(), + None => Err(Error::NotFound { + descr: name.to_string(), }), } } @@ -284,9 +315,9 @@ impl TableRepo for MemTxn { partition_template: TablePartitionTemplateOverride, namespace_id: NamespaceId, ) -> Result
{ - let table = { - let stage = self.stage(); + let mut stage = self.collections.lock(); + let table = { // this block is just to ensure the mem impl correctly creates TableCreateLimitError in // tests, we don't care about any of the errors it is discarding stage @@ -294,10 +325,10 @@ impl TableRepo for MemTxn { .iter() .find(|n| n.id == namespace_id) .cloned() - .ok_or_else(|| Error::NamespaceNotFoundByName { + .ok_or_else(|| Error::NotFound { // we're never going to use this error, this is just for flow control, // so it doesn't matter that we only have the ID, not the name - name: "".to_string(), + descr: "".to_string(), }) .and_then(|n| { let max_tables = n.max_tables; @@ -306,10 +337,12 @@ impl TableRepo for MemTxn { .iter() .filter(|t| t.namespace_id == namespace_id) .count(); - if tables_count >= max_tables.get().try_into().unwrap() { - return Err(Error::TableCreateLimitError { - table_name: name.to_string(), - namespace_id, + if tables_count >= max_tables.get() { + return Err(Error::LimitExceeded { + descr: format!( + "couldn't create table {}; limit reached on namespace {}", + name, namespace_id + ), }); } Ok(()) @@ -321,9 +354,8 @@ impl TableRepo for MemTxn { .find(|t| t.name == name && t.namespace_id == namespace_id) { Some(_t) => { - return Err(Error::TableNameExists { - name: name.to_string(), - namespace_id, + return Err(Error::AlreadyExists { + descr: format!("table '{name}' in namespace {namespace_id}"), }) } None => { @@ -333,23 +365,19 @@ impl TableRepo for MemTxn { name: name.to_string(), partition_template, }; - stage.tables.push(table); - stage.tables.last().unwrap() + stage.tables.push(table.into()); + stage.tables.last().unwrap().value.clone() } } }; - let table = table.clone(); - // Partitioning is only supported for tags, so create tag columns for all `TagValue` // partition template parts. It's important this happens within the table creation // transaction so that there isn't a possibility of a concurrent write creating these // columns with an unsupported type. 
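// The loop that follows performs this step: for every tag referenced by the
// table's partition template, make sure a Tag column exists, inside the same
// logical transaction as the table creation. A compact sketch with an
// illustrative `TemplatePart` enum and column representation:

#[derive(Debug, Clone)]
enum TemplatePart {
    TimeFormat(String),
    TagValue(String),
}

// (column name, column type) pairs stand in for the catalog's column rows.
fn ensure_tag_column(columns: &mut Vec<(String, &'static str)>, name: &str) {
    if !columns.iter().any(|(n, _)| n == name) {
        columns.push((name.to_string(), "tag"));
    }
}

fn create_template_tag_columns(
    template: &[TemplatePart],
    columns: &mut Vec<(String, &'static str)>,
) {
    for part in template {
        if let TemplatePart::TagValue(tag_name) = part {
            ensure_tag_column(columns, tag_name);
        }
    }
}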
for template_part in table.partition_template.parts() { if let TemplatePart::TagValue(tag_name) = template_part { - self.columns() - .create_or_get(tag_name, table.id, ColumnType::Tag) - .await?; + create_or_get_column(&mut stage, tag_name, table.id, ColumnType::Tag)?; } } @@ -357,9 +385,10 @@ impl TableRepo for MemTxn { } async fn get_by_id(&mut self, table_id: TableId) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); - Ok(stage.tables.iter().find(|t| t.id == table_id).cloned()) + let mut tables = stage.tables.iter(); + Ok(tables.find(|t| t.id == table_id).map(|v| v.value.clone())) } async fn get_by_namespace_and_name( @@ -367,30 +396,59 @@ impl TableRepo for MemTxn { namespace_id: NamespaceId, name: &str, ) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); - Ok(stage - .tables - .iter() - .find(|t| t.namespace_id == namespace_id && t.name == name) - .cloned()) + let mut tables = stage.tables.iter(); + let search = tables.find(|t| t.namespace_id == namespace_id && t.name == name); + Ok(search.map(|v| v.value.clone())) } async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); - let tables: Vec<_> = stage - .tables - .iter() - .filter(|t| t.namespace_id == namespace_id) - .cloned() - .collect(); + let tables = stage.tables.iter(); + let filtered = tables.filter(|t| t.namespace_id == namespace_id); + let tables: Vec<_> = filtered.map(|v| v.value.clone()).collect(); Ok(tables) } async fn list(&mut self) -> Result> { - let stage = self.stage(); - Ok(stage.tables.clone()) + let stage = self.collections.lock(); + Ok(stage.tables.iter().map(|v| v.value.clone()).collect()) + } + + async fn snapshot(&mut self, table_id: TableId) -> Result { + let mut guard = self.collections.lock(); + + let (table, generation) = { + let mut tables = guard.tables.iter_mut(); + let search = tables.find(|x| x.id == table_id); + let table = search.ok_or_else(|| Error::NotFound { + descr: table_id.to_string(), + })?; + + let generation = table.generation; + table.generation += 1; + (table.value.clone(), generation) + }; + + let columns = guard + .columns + .iter() + .filter(|x| x.table_id == table_id) + .cloned() + .collect(); + + let partitions = guard + .partitions + .iter() + .filter(|x| x.table_id == table_id) + .map(|v| v.value.clone()) + .collect(); + + Ok(TableSnapshot::encode( + table, partitions, columns, generation, + )?) 
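// The table `snapshot()` above follows this shape: while one lock is held,
// read and bump the table's generation, then gather the child rows (columns,
// partitions) that reference it, so the returned view is internally
// consistent. A simplified sketch; the real code encodes the result into a
// `TableSnapshot` rather than a plain struct:

use std::sync::Mutex;

#[derive(Debug, Clone)]
struct TableRow {
    id: i64,
    generation: u64,
}

#[derive(Debug, Clone)]
struct ColumnRow {
    table_id: i64,
    name: String,
}

#[derive(Default)]
struct State {
    tables: Vec<TableRow>,
    columns: Vec<ColumnRow>,
}

#[derive(Debug)]
struct TableView {
    table: TableRow,
    columns: Vec<ColumnRow>,
    generation: u64,
}

fn snapshot_table(state: &Mutex<State>, table_id: i64) -> Option<TableView> {
    // One lock guards the whole read-modify-collect sequence.
    let mut guard = state.lock().unwrap();

    let (table, generation) = {
        let t = guard.tables.iter_mut().find(|t| t.id == table_id)?;
        let generation = t.generation;
        t.generation += 1;
        (t.clone(), generation)
    };

    let columns = guard
        .columns
        .iter()
        .filter(|c| c.table_id == table_id)
        .cloned()
        .collect();

    Some(TableView { table, columns, generation })
}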
} } @@ -402,74 +460,8 @@ impl ColumnRepo for MemTxn { table_id: TableId, column_type: ColumnType, ) -> Result { - let stage = self.stage(); - - // this block is just to ensure the mem impl correctly creates ColumnCreateLimitError in - // tests, we don't care about any of the errors it is discarding - stage - .tables - .iter() - .find(|t| t.id == table_id) - .cloned() - .ok_or(Error::TableNotFound { id: table_id }) // error never used, this is just for flow control - .and_then(|t| { - stage - .namespaces - .iter() - .find(|n| n.id == t.namespace_id) - .cloned() - .ok_or_else(|| Error::NamespaceNotFoundByName { - // we're never going to use this error, this is just for flow control, - // so it doesn't matter that we only have the ID, not the name - name: "".to_string(), - }) - .and_then(|n| { - let max_columns_per_table = n.max_columns_per_table; - let columns_count = stage - .columns - .iter() - .filter(|t| t.table_id == table_id) - .count(); - if columns_count >= max_columns_per_table.get().try_into().unwrap() { - return Err(Error::ColumnCreateLimitError { - column_name: name.to_string(), - table_id, - }); - } - Ok(()) - })?; - Ok(()) - })?; - - let column = match stage - .columns - .iter() - .find(|t| t.name == name && t.table_id == table_id) - { - Some(c) => { - ensure!( - column_type == c.column_type, - ColumnTypeMismatchSnafu { - name, - existing: c.column_type, - new: column_type - } - ); - c - } - None => { - let column = Column { - id: ColumnId::new(stage.columns.len() as i64 + 1), - table_id, - name: name.to_string(), - column_type, - }; - stage.columns.push(column); - stage.columns.last().unwrap() - } - }; - - Ok(column.clone()) + let mut stage = self.collections.lock(); + create_or_get_column(&mut stage, name, table_id, column_type) } async fn create_or_get_many_unchecked( @@ -481,7 +473,7 @@ impl ColumnRepo for MemTxn { // check column limits when inserting many columns because it's complicated and expensive, // and for testing purposes the in-memory catalog needs to match its functionality. 
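// The bulk upsert below follows this logic: for each requested (name, type)
// pair, return the existing column when the type matches, fail with a
// descriptive message when it conflicts (similar in spirit to the message the
// tests above assert), and insert the column otherwise. A dependency-free
// sketch with illustrative types; the real repo reports a structured
// `AlreadyExists` error rather than a `String`:

use std::collections::HashMap;

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ColumnType {
    I64,
    F64,
    Bool,
    String,
    Time,
    Tag,
}

#[derive(Debug, Clone)]
struct ColumnRow {
    table_id: i64,
    name: String,
    column_type: ColumnType,
}

fn create_or_get_many_unchecked(
    existing: &mut Vec<ColumnRow>,
    table_id: i64,
    request: &HashMap<&str, ColumnType>,
) -> Result<Vec<ColumnRow>, String> {
    request
        .iter()
        .map(|(name, want)| {
            match existing
                .iter()
                .find(|c| c.table_id == table_id && c.name == *name)
            {
                Some(c) if c.column_type == *want => Ok(c.clone()),
                Some(c) => Err(format!(
                    "column {} is type {:?} but schema update has type {:?}",
                    name, c.column_type, want
                )),
                None => {
                    let col = ColumnRow {
                        table_id,
                        name: name.to_string(),
                        column_type: *want,
                    };
                    existing.push(col.clone());
                    Ok(col)
                }
            }
        })
        .collect()
}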
- let stage = self.stage(); + let mut stage = self.collections.lock(); let out: Vec<_> = columns .iter() @@ -494,10 +486,11 @@ impl ColumnRepo for MemTxn { Some(c) => { ensure!( column_type == c.column_type, - ColumnTypeMismatchSnafu { - name: column_name, - existing: c.column_type, - new: column_type + AlreadyExistsSnafu { + descr: format!( + "column {} is type {} but schema update has type {}", + column_name, c.column_type, column_type + ), } ); Ok(c.clone()) @@ -520,7 +513,7 @@ impl ColumnRepo for MemTxn { } async fn list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); let table_ids: Vec<_> = stage .tables @@ -539,7 +532,7 @@ impl ColumnRepo for MemTxn { } async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); let columns: Vec<_> = stage .columns @@ -552,7 +545,7 @@ impl ColumnRepo for MemTxn { } async fn list(&mut self) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); Ok(stage.columns.clone()) } } @@ -560,7 +553,7 @@ impl ColumnRepo for MemTxn { #[async_trait] impl PartitionRepo for MemTxn { async fn create_or_get(&mut self, key: PartitionKey, table_id: TableId) -> Result { - let stage = self.stage(); + let mut stage = self.collections.lock(); let partition = match stage .partitions @@ -569,96 +562,50 @@ impl PartitionRepo for MemTxn { { Some(p) => p, None => { - let p = Partition::new_in_memory_only( + let hash_id = PartitionHashId::new(table_id, &key); + let p = Partition::new_catalog_only( PartitionId::new(stage.partitions.len() as i64 + 1), + Some(hash_id), table_id, key, - vec![], - SortedColumnSet::new(vec![]), + SortKeyIds::default(), None, ); - stage.partitions.push(p); + stage.partitions.push(p.into()); stage.partitions.last().unwrap() } }; - Ok(partition.clone()) + Ok(partition.value.clone()) } - async fn get_by_id(&mut self, partition_id: PartitionId) -> Result> { - let stage = self.stage(); + async fn get_by_id_batch(&mut self, partition_ids: &[PartitionId]) -> Result> { + let lookup = partition_ids.iter().collect::>(); - Ok(stage - .partitions - .iter() - .find(|p| p.id == partition_id) - .cloned()) - } - - async fn get_by_id_batch(&mut self, partition_ids: Vec) -> Result> { - let lookup = partition_ids.into_iter().collect::>(); - - let stage = self.stage(); + let stage = self.collections.lock(); Ok(stage .partitions .iter() .filter(|p| lookup.contains(&p.id)) - .cloned() - .collect()) - } - - async fn get_by_hash_id( - &mut self, - partition_hash_id: &PartitionHashId, - ) -> Result> { - let stage = self.stage(); - - Ok(stage - .partitions - .iter() - .find(|p| { - p.hash_id() - .map(|hash_id| hash_id == partition_hash_id) - .unwrap_or_default() - }) - .cloned()) - } - - async fn get_by_hash_id_batch( - &mut self, - partition_hash_ids: &[&PartitionHashId], - ) -> Result> { - let lookup = partition_hash_ids.iter().copied().collect::>(); - - let stage = self.stage(); - - Ok(stage - .partitions - .iter() - .filter(|p| { - p.hash_id() - .map(|hash_id| lookup.contains(hash_id)) - .unwrap_or_default() - }) - .cloned() + .map(|x| x.value.clone()) .collect()) } async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); let partitions: Vec<_> = stage .partitions .iter() .filter(|p| p.table_id == table_id) - .cloned() + .map(|x| x.value.clone()) .collect(); Ok(partitions) } async fn list_ids(&mut self) -> Result> 
{ - let stage = self.stage(); + let stage = self.collections.lock(); let partitions: Vec<_> = stage.partitions.iter().map(|p| p.id).collect(); @@ -667,45 +614,24 @@ impl PartitionRepo for MemTxn { async fn cas_sort_key( &mut self, - partition_id: &TransitionPartitionId, - old_sort_key: Option>, - old_sort_key_ids: Option, - new_sort_key: &[&str], - new_sort_key_ids: &SortedColumnSet, - ) -> Result, SortedColumnSet)>> { - // These asserts are here to cacth bugs. They will be removed when we remove the sort_key - // field from the Partition - assert_eq!( - old_sort_key.as_ref().map(|v| v.len()), - old_sort_key_ids.as_ref().map(|v| v.len()) - ); - assert_eq!(new_sort_key.len(), new_sort_key_ids.len()); - - let stage = self.stage(); - let old_sort_key = old_sort_key.unwrap_or_default(); - let old_sort_key_ids = old_sort_key_ids.unwrap_or_default(); - - match stage.partitions.iter_mut().find(|p| match partition_id { - TransitionPartitionId::Deterministic(hash_id) => { - p.hash_id().map_or(false, |h| h == hash_id) - } - TransitionPartitionId::Deprecated(id) => p.id == *id, - }) { - Some(p) if p.sort_key_ids == old_sort_key_ids => { - // This is here to catch bugs. It will be removed when we remove the sort_key - assert_eq!(p.sort_key, old_sort_key); - p.sort_key = new_sort_key.iter().map(|s| s.to_string()).collect(); - p.sort_key_ids = new_sort_key_ids.clone(); - Ok(p.clone()) + partition_id: PartitionId, + old_sort_key_ids: Option<&SortKeyIds>, + new_sort_key_ids: &SortKeyIds, + ) -> Result> { + let mut stage = self.collections.lock(); + + match stage.partitions.iter_mut().find(|p| p.id == partition_id) { + Some(p) if p.sort_key_ids() == old_sort_key_ids => { + p.set_sort_key_ids(new_sort_key_ids); + Ok(p.value.clone()) } Some(p) => { - return Err(CasFailure::ValueMismatch(( - p.sort_key.clone(), - p.sort_key_ids.clone(), - ))); + return Err(CasFailure::ValueMismatch( + p.sort_key_ids().cloned().unwrap_or_default(), + )); } - None => Err(CasFailure::QueryError(Error::PartitionNotFound { - id: partition_id.clone(), + None => Err(CasFailure::QueryError(Error::NotFound { + descr: partition_id.to_string(), })), } } @@ -720,34 +646,31 @@ impl PartitionRepo for MemTxn { estimated_bytes: u64, limit_bytes: u64, ) -> Result<()> { + let mut stage = self.collections.lock(); + let reason = reason.to_string(); let skipped_at = Timestamp::from(self.time_provider.now()); - let stage = self.stage(); + let sc = SkippedCompaction { + partition_id, + reason, + skipped_at, + num_files: num_files as i64, + limit_num_files: limit_num_files as i64, + limit_num_files_first_in_partition: limit_num_files_first_in_partition as i64, + estimated_bytes: estimated_bytes as i64, + limit_bytes: limit_bytes as i64, + }; + match stage .skipped_compactions .iter_mut() .find(|s| s.partition_id == partition_id) { Some(s) => { - s.reason = reason; - s.skipped_at = skipped_at; - s.num_files = num_files as i64; - s.limit_num_files = limit_num_files as i64; - s.limit_num_files_first_in_partition = limit_num_files_first_in_partition as i64; - s.estimated_bytes = estimated_bytes as i64; - s.limit_bytes = limit_bytes as i64; + *s = sc; } - None => stage.skipped_compactions.push(SkippedCompaction { - partition_id, - reason, - skipped_at, - num_files: num_files as i64, - limit_num_files: limit_num_files as i64, - limit_num_files_first_in_partition: limit_num_files_first_in_partition as i64, - estimated_bytes: estimated_bytes as i64, - limit_bytes: limit_bytes as i64, - }), + None => stage.skipped_compactions.push(sc), } Ok(()) } @@ -756,7 
+679,7 @@ impl PartitionRepo for MemTxn { &mut self, partition_ids: &[PartitionId], ) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); let find: HashSet<&PartitionId> = partition_ids.iter().collect(); Ok(stage .skipped_compactions @@ -767,7 +690,7 @@ impl PartitionRepo for MemTxn { } async fn list_skipped_compactions(&mut self) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); Ok(stage.skipped_compactions.clone()) } @@ -777,7 +700,7 @@ impl PartitionRepo for MemTxn { ) -> Result> { use std::mem; - let stage = self.stage(); + let mut stage = self.collections.lock(); let skipped_compactions = mem::take(&mut stage.skipped_compactions); let (mut removed, remaining) = skipped_compactions .into_iter() @@ -792,8 +715,9 @@ impl PartitionRepo for MemTxn { } async fn most_recent_n(&mut self, n: usize) -> Result> { - let stage = self.stage(); - Ok(stage.partitions.iter().rev().take(n).cloned().collect()) + let stage = self.collections.lock(); + let iter = stage.partitions.iter().rev().take(n); + Ok(iter.map(|x| x.value.clone()).collect()) } async fn partitions_new_file_between( @@ -801,7 +725,7 @@ impl PartitionRepo for MemTxn { minimum_time: Timestamp, maximum_time: Option, ) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); let partitions: Vec<_> = stage .partitions @@ -819,34 +743,65 @@ impl PartitionRepo for MemTxn { } async fn list_old_style(&mut self) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); let old_style: Vec<_> = stage .partitions .iter() .filter(|p| p.hash_id().is_none()) - .cloned() + .map(|x| x.value.clone()) .collect(); Ok(old_style) } -} -#[async_trait] -impl ParquetFileRepo for MemTxn { - async fn create(&mut self, parquet_file_params: ParquetFileParams) -> Result { - create_parquet_file(self.stage(), parquet_file_params).await - } + async fn snapshot(&mut self, partition_id: PartitionId) -> Result { + let mut guard = self.collections.lock(); + let (partition, generation) = { + let search = guard.partitions.iter_mut().find(|x| x.id == partition_id); + let partition = search.ok_or_else(|| Error::NotFound { + descr: format!("Partition {partition_id} not found"), + })?; + + let generation = partition.generation; + partition.generation += 1; + (partition.value.clone(), generation) + }; - async fn list_all(&mut self) -> Result> { - let stage = self.stage(); + let files = guard + .parquet_files + .iter() + .filter(|x| x.partition_id == partition_id && x.to_delete.is_none()) + .cloned() + .collect(); + + let search = guard.tables.iter().find(|x| x.id == partition.table_id); + let table = search.ok_or_else(|| Error::NotFound { + descr: format!("Table {} not found", partition.table_id), + })?; - Ok(stage.parquet_files.clone()) + let sc = guard + .skipped_compactions + .iter() + .find(|sc| sc.partition_id == partition_id) + .cloned(); + + Ok(PartitionSnapshot::encode( + table.namespace_id, + partition, + files, + sc, + generation, + )?) 
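// `flag_for_delete_by_retention` just below applies this rule: for every live
// file whose namespace has a retention period, mark the file as soft-deleted
// once its `max_time` falls outside the retention window, and report which
// files were flagged. A simplified sketch; `FileRow` and the plain i64
// nanosecond timestamps are illustrative stand-ins:

#[derive(Debug, Clone)]
struct FileRow {
    object_store_id: u64,
    namespace_retention_ns: Option<i64>,
    max_time: i64,
    to_delete: Option<i64>,
}

/// Returns the object store IDs of the files flagged in this pass.
fn flag_for_delete_by_retention(files: &mut [FileRow], now_ns: i64) -> Vec<u64> {
    files
        .iter_mut()
        .filter(|f| f.to_delete.is_none())
        .filter_map(|f| match f.namespace_retention_ns {
            Some(retention) if f.max_time < now_ns - retention => {
                f.to_delete = Some(now_ns);
                Some(f.object_store_id)
            }
            _ => None,
        })
        .collect()
}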
} +} - async fn flag_for_delete_by_retention(&mut self) -> Result> { +#[async_trait] +impl ParquetFileRepo for MemTxn { + async fn flag_for_delete_by_retention(&mut self) -> Result> { + let mut stage = self.collections.lock(); let now = Timestamp::from(self.time_provider.now()); - let stage = self.stage(); + let stage = stage.deref_mut(); Ok(stage .parquet_files @@ -864,7 +819,7 @@ impl ParquetFileRepo for MemTxn { ns.retention_period_ns.and_then(|rp| { if f.max_time < now - rp { f.to_delete = Some(now); - Some(f.id) + Some((f.partition_id, f.object_store_id)) } else { None } @@ -875,40 +830,8 @@ impl ParquetFileRepo for MemTxn { .collect()) } - async fn list_by_namespace_not_to_delete( - &mut self, - namespace_id: NamespaceId, - ) -> Result> { - let stage = self.stage(); - - let table_ids: HashSet<_> = stage - .tables - .iter() - .filter_map(|table| (table.namespace_id == namespace_id).then_some(table.id)) - .collect(); - let parquet_files: Vec<_> = stage - .parquet_files - .iter() - .filter(|f| table_ids.contains(&f.table_id) && f.to_delete.is_none()) - .cloned() - .collect(); - Ok(parquet_files) - } - - async fn list_by_table_not_to_delete(&mut self, table_id: TableId) -> Result> { - let stage = self.stage(); - - let parquet_files: Vec<_> = stage - .parquet_files - .iter() - .filter(|f| table_id == f.table_id && f.to_delete.is_none()) - .cloned() - .collect(); - Ok(parquet_files) - } - - async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result> { - let stage = self.stage(); + async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result> { + let mut stage = self.collections.lock(); let (delete, keep): (Vec<_>, Vec<_>) = stage.parquet_files.iter().cloned().partition( |f| matches!(f.to_delete, Some(marked_deleted) if marked_deleted < older_than), @@ -919,50 +842,31 @@ impl ParquetFileRepo for MemTxn { let delete = delete .into_iter() .take(MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE as usize) - .map(|f| f.id) + .map(|f| f.object_store_id) .collect(); Ok(delete) } - async fn list_by_partition_not_to_delete( + async fn list_by_partition_not_to_delete_batch( &mut self, - partition_id: &TransitionPartitionId, + partition_ids: Vec, ) -> Result> { - let stage = self.stage(); - - let partition = stage - .partitions - .iter() - .find(|p| match partition_id { - TransitionPartitionId::Deterministic(hash_id) => p - .hash_id() - .map(|p_hash_id| p_hash_id == hash_id) - .unwrap_or(false), - TransitionPartitionId::Deprecated(id) => id == &p.id, - }) - .unwrap() - .clone(); + let partition_ids = partition_ids.into_iter().collect::>(); + let stage = self.collections.lock(); Ok(stage .parquet_files .iter() - .filter(|f| match &f.partition_id { - TransitionPartitionId::Deterministic(hash_id) => partition - .hash_id() - .map(|p_hash_id| p_hash_id == hash_id) - .unwrap_or(false), - TransitionPartitionId::Deprecated(id) => id == &partition.id, - }) - .filter(|f| f.to_delete.is_none()) + .filter(|f| partition_ids.contains(&f.partition_id) && f.to_delete.is_none()) .cloned() .collect()) } async fn get_by_object_store_id( &mut self, - object_store_id: Uuid, + object_store_id: ObjectStoreId, ) -> Result> { - let stage = self.stage(); + let stage = self.collections.lock(); Ok(stage .parquet_files @@ -973,9 +877,9 @@ impl ParquetFileRepo for MemTxn { async fn exists_by_object_store_id_batch( &mut self, - object_store_ids: Vec, - ) -> Result> { - let stage = self.stage(); + object_store_ids: Vec, + ) -> Result> { + let stage = self.collections.lock(); Ok(stage .parquet_files @@ -987,8 
+891,9 @@ impl ParquetFileRepo for MemTxn { async fn create_upgrade_delete( &mut self, - delete: &[ParquetFileId], - upgrade: &[ParquetFileId], + partition_id: PartitionId, + delete: &[ObjectStoreId], + upgrade: &[ObjectStoreId], create: &[ParquetFileParams], target_level: CompactionLevel, ) -> Result> { @@ -1000,22 +905,28 @@ impl ParquetFileRepo for MemTxn { "attempted to upgrade a file scheduled for delete" ); - let mut stage = self.inner.clone(); + let mut collections = self.collections.lock(); + let mut stage = collections.clone(); for id in delete { let marked_at = Timestamp::from(self.time_provider.now()); - flag_for_delete(&mut stage, *id, marked_at).await?; + flag_for_delete(&mut stage, partition_id, *id, marked_at)?; } - update_compaction_level(&mut stage, upgrade, target_level).await?; + update_compaction_level(&mut stage, partition_id, upgrade, target_level)?; let mut ids = Vec::with_capacity(create.len()); for file in create { - let res = create_parquet_file(&mut stage, file.clone()).await?; + if file.partition_id != partition_id { + return Err(Error::External { + source: format!("Inconsistent ParquetFileParams, expected PartitionId({partition_id}) got PartitionId({})", file.partition_id).into(), + }); + } + let res = create_parquet_file(&mut stage, file.clone())?; ids.push(res.id); } - *self.inner = stage; + *collections = stage; Ok(ids) } @@ -1032,9 +943,88 @@ fn filter_namespace_soft_delete<'a>( }) } +fn create_or_get_column( + stage: &mut MemCollections, + name: &str, + table_id: TableId, + column_type: ColumnType, +) -> Result { + // this block is just to ensure the mem impl correctly creates ColumnCreateLimitError in + // tests, we don't care about any of the errors it is discarding + stage + .tables + .iter() + .find(|t| t.id == table_id) + .cloned() + .ok_or(Error::NotFound { + descr: format!("table: {}", table_id), + }) // error never used, this is just for flow control + .and_then(|t| { + stage + .namespaces + .iter() + .find(|n| n.id == t.namespace_id) + .cloned() + .ok_or_else(|| Error::NotFound { + // we're never going to use this error, this is just for flow control, + // so it doesn't matter that we only have the ID, not the name + descr: "".to_string(), + }) + .and_then(|n| { + let max_columns_per_table = n.max_columns_per_table; + let columns_count = stage + .columns + .iter() + .filter(|t| t.table_id == table_id) + .count(); + if columns_count >= max_columns_per_table.get() { + return Err(Error::LimitExceeded { + descr: format!( + "couldn't create column {} in table {}; limit reached on namespace", + name, table_id + ), + }); + } + Ok(()) + })?; + Ok(()) + })?; + + let column = match stage + .columns + .iter() + .find(|t| t.name == name && t.table_id == table_id) + { + Some(c) => { + ensure!( + column_type == c.column_type, + AlreadyExistsSnafu { + descr: format!( + "column {} is type {} but schema update has type {}", + name, c.column_type, column_type + ), + } + ); + c + } + None => { + let column = Column { + id: ColumnId::new(stage.columns.len() as i64 + 1), + table_id, + name: name.to_string(), + column_type, + }; + stage.columns.push(column); + stage.columns.last().unwrap() + } + }; + + Ok(column.clone()) +} + // The following three functions are helpers to the create_upgrade_delete method. // They are also used by the respective create/flag_for_delete/update_compaction_level methods. 
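(Editorial sketch, not part of the diff.) `create_upgrade_delete` in the mem catalog gets its all-or-nothing behaviour from a copy-on-success pattern: clone the collections under the lock, apply every delete/upgrade/create to the clone, and only write the clone back once nothing has failed, so the partition-consistency check or any helper error leaves the original state untouched. Reduced to that pattern alone, with hypothetical names:

use std::sync::Mutex;

/// Apply `mutate` to a clone of the shared state and commit it only on success.
fn with_pseudo_txn<S, T, E>(
    state: &Mutex<S>,
    mutate: impl FnOnce(&mut S) -> Result<T, E>,
) -> Result<T, E>
where
    S: Clone,
{
    let mut guard = state.lock().unwrap();
    let mut staged = (*guard).clone(); // work on a private copy
    let out = mutate(&mut staged)?;    // any error discards `staged`
    *guard = staged;                   // commit the staged changes in one step
    Ok(out)
}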
-async fn create_parquet_file( +fn create_parquet_file( stage: &mut MemCollections, parquet_file_params: ParquetFileParams, ) -> Result { @@ -1043,8 +1033,8 @@ async fn create_parquet_file( .iter() .any(|f| f.object_store_id == parquet_file_params.object_store_id) { - return Err(Error::FileExists { - object_store_id: parquet_file_params.object_store_id, + return Err(Error::AlreadyExists { + descr: parquet_file_params.object_store_id.to_string(), }); } @@ -1053,47 +1043,73 @@ async fn create_parquet_file( ParquetFileId::new(stage.parquet_files.len() as i64 + 1), ); let created_at = parquet_file.created_at; - let partition_id = parquet_file.partition_id.clone(); + let partition_id = parquet_file.partition_id; stage.parquet_files.push(parquet_file); // Update the new_file_at field its partition to the time of created_at let partition = stage .partitions .iter_mut() - .find(|p| p.transition_partition_id() == partition_id) - .ok_or(Error::PartitionNotFound { id: partition_id })?; + .find(|p| p.id == partition_id) + .ok_or(Error::NotFound { + descr: partition_id.to_string(), + })?; partition.new_file_at = Some(created_at); Ok(stage.parquet_files.last().unwrap().clone()) } -async fn flag_for_delete( +fn flag_for_delete( stage: &mut MemCollections, - id: ParquetFileId, + partition_id: PartitionId, + id: ObjectStoreId, marked_at: Timestamp, ) -> Result<()> { - match stage.parquet_files.iter_mut().find(|p| p.id == id) { - Some(f) => f.to_delete = Some(marked_at), - None => return Err(Error::ParquetRecordNotFound { id }), + match stage + .parquet_files + .iter_mut() + .find(|p| p.object_store_id == id && p.partition_id == partition_id) + { + Some(f) if f.to_delete.is_none() => f.to_delete = Some(marked_at), + _ => { + return Err(Error::NotFound { + descr: format!("parquet file {id} not found for delete"), + }) + } } Ok(()) } -async fn update_compaction_level( +fn update_compaction_level( stage: &mut MemCollections, - parquet_file_ids: &[ParquetFileId], + partition_id: PartitionId, + object_store_ids: &[ObjectStoreId], compaction_level: CompactionLevel, -) -> Result> { - let mut updated = Vec::with_capacity(parquet_file_ids.len()); +) -> Result> { + let all_ids = stage + .parquet_files + .iter() + .filter(|f| f.partition_id == partition_id && f.to_delete.is_none()) + .map(|f| f.object_store_id) + .collect::>(); + for id in object_store_ids { + if !all_ids.contains(id) { + return Err(Error::NotFound { + descr: format!("parquet file {id} not found for upgrade"), + }); + } + } + let update_ids = object_store_ids.iter().copied().collect::>(); + let mut updated = Vec::with_capacity(object_store_ids.len()); for f in stage .parquet_files .iter_mut() - .filter(|p| parquet_file_ids.contains(&p.id)) + .filter(|p| update_ids.contains(&p.object_store_id) && p.partition_id == partition_id) { f.compaction_level = compaction_level; - updated.push(f.id); + updated.push(f.object_store_id); } Ok(updated) @@ -1101,14 +1117,17 @@ async fn update_compaction_level( #[cfg(test)] mod tests { + use iox_time::SystemProvider; + use super::*; use std::sync::Arc; #[tokio::test] async fn test_catalog() { - crate::interface::test_helpers::test_catalog(|| async { + crate::interface_tests::test_catalog(|| async { let metrics = Arc::new(metric::Registry::default()); - let x: Arc = Arc::new(MemCatalog::new(metrics)); + let time_provider = Arc::new(SystemProvider::new()); + let x: Arc = Arc::new(MemCatalog::new(metrics, time_provider)); x }) .await; diff --git a/iox_catalog/src/metrics.rs b/iox_catalog/src/metrics.rs index 
e02646705fa..b179fd3cb24 100644 --- a/iox_catalog/src/metrics.rs +++ b/iox_catalog/src/metrics.rs @@ -5,17 +5,18 @@ use crate::interface::{ SoftDeletedRows, TableRepo, }; use async_trait::async_trait; +use data_types::snapshot::table::TableSnapshot; use data_types::{ partition_template::{NamespacePartitionTemplateOverride, TablePartitionTemplateOverride}, + snapshot::partition::PartitionSnapshot, Column, ColumnType, CompactionLevel, MaxColumnsPerTable, MaxTables, Namespace, NamespaceId, - NamespaceName, NamespaceServiceProtectionLimitsOverride, ParquetFile, ParquetFileId, - ParquetFileParams, Partition, PartitionHashId, PartitionId, PartitionKey, SkippedCompaction, - SortedColumnSet, Table, TableId, Timestamp, TransitionPartitionId, + NamespaceName, NamespaceServiceProtectionLimitsOverride, ObjectStoreId, ParquetFile, + ParquetFileId, ParquetFileParams, Partition, PartitionId, PartitionKey, SkippedCompaction, + SortKeyIds, Table, TableId, Timestamp, }; -use iox_time::{SystemProvider, TimeProvider}; +use iox_time::TimeProvider; use metric::{DurationHistogram, Metric}; use std::{collections::HashMap, fmt::Debug, sync::Arc}; -use uuid::Uuid; /// Decorates a implementation of the catalog's [`RepoCollection`] (and the /// transactional variant) with instrumentation that emits latency histograms @@ -24,27 +25,30 @@ use uuid::Uuid; /// Values are recorded under the `catalog_op_duration` metric, labelled by /// operation name and result (success/error). #[derive(Debug)] -pub struct MetricDecorator { +pub struct MetricDecorator { inner: T, - time_provider: P, + time_provider: Arc, metrics: Arc, } impl MetricDecorator { /// Wrap `T` with instrumentation recording operation latency in `metrics`. - pub fn new(inner: T, metrics: Arc) -> Self { + pub fn new( + inner: T, + metrics: Arc, + time_provider: Arc, + ) -> Self { Self { inner, - time_provider: Default::default(), + time_provider, metrics, } } } -impl RepoCollection for MetricDecorator +impl RepoCollection for MetricDecorator where T: NamespaceRepo + TableRepo + ColumnRepo + PartitionRepo + ParquetFileRepo + Debug, - P: TimeProvider, { fn namespaces(&mut self) -> &mut dyn NamespaceRepo { self @@ -97,7 +101,7 @@ macro_rules! 
decorate { )+] ) => { #[async_trait] - impl $trait for MetricDecorator { + impl $trait for MetricDecorator { /// NOTE: if you're seeing an error here about "not all trait items /// implemented" or something similar, one or more methods are /// missing from / incorrectly defined in the decorate!() blocks @@ -152,6 +156,7 @@ decorate!( "table_get_by_namespace_and_name" = get_by_namespace_and_name(&mut self, namespace_id: NamespaceId, name: &str) -> Result>; "table_list_by_namespace_id" = list_by_namespace_id(&mut self, namespace_id: NamespaceId) -> Result>; "table_list" = list(&mut self) -> Result>; + "table_snapshot" = snapshot(&mut self, table_id: TableId) -> Result; ] ); @@ -170,13 +175,10 @@ decorate!( impl_trait = PartitionRepo, methods = [ "partition_create_or_get" = create_or_get(&mut self, key: PartitionKey, table_id: TableId) -> Result; - "partition_get_by_id" = get_by_id(&mut self, partition_id: PartitionId) -> Result>; - "partition_get_by_id_batch" = get_by_id_batch(&mut self, partition_ids: Vec) -> Result>; - "partition_get_by_hash_id" = get_by_hash_id(&mut self, partition_hash_id: &PartitionHashId) -> Result>; - "partition_get_by_hash_id_batch" = get_by_hash_id_batch(&mut self, partition_hash_ids: &[&PartitionHashId]) -> Result>; + "partition_get_by_id_batch" = get_by_id_batch(&mut self, partition_ids: &[PartitionId]) -> Result>; "partition_list_by_table_id" = list_by_table_id(&mut self, table_id: TableId) -> Result>; "partition_list_ids" = list_ids(&mut self) -> Result>; - "partition_update_sort_key" = cas_sort_key(&mut self, partition_id: &TransitionPartitionId, old_sort_key: Option>, old_sort_key_ids: Option, new_sort_key: &[&str], new_sort_key_ids: &SortedColumnSet) -> Result, SortedColumnSet)>>; + "partition_update_sort_key" = cas_sort_key(&mut self, partition_id: PartitionId, old_sort_key_ids: Option<&SortKeyIds>, new_sort_key_ids: &SortKeyIds) -> Result>; "partition_record_skipped_compaction" = record_skipped_compaction(&mut self, partition_id: PartitionId, reason: &str, num_files: usize, limit_num_files: usize, limit_num_files_first_in_partition: usize, estimated_bytes: u64, limit_bytes: u64) -> Result<()>; "partition_list_skipped_compactions" = list_skipped_compactions(&mut self) -> Result>; "partition_delete_skipped_compactions" = delete_skipped_compactions(&mut self, partition_id: PartitionId) -> Result>; @@ -184,21 +186,18 @@ decorate!( "partition_partitions_new_file_between" = partitions_new_file_between(&mut self, minimum_time: Timestamp, maximum_time: Option) -> Result>; "partition_get_in_skipped_compactions" = get_in_skipped_compactions(&mut self, partition_ids: &[PartitionId]) -> Result>; "partition_list_old_style" = list_old_style(&mut self) -> Result>; + "partition_snapshot" = snapshot(&mut self, partition_id: PartitionId) -> Result; ] ); decorate!( impl_trait = ParquetFileRepo, methods = [ - "parquet_create" = create(&mut self, parquet_file_params: ParquetFileParams) -> Result; - "parquet_list_all" = list_all(&mut self) -> Result>; - "parquet_flag_for_delete_by_retention" = flag_for_delete_by_retention(&mut self) -> Result>; - "parquet_list_by_namespace_not_to_delete" = list_by_namespace_not_to_delete(&mut self, namespace_id: NamespaceId) -> Result>; - "parquet_list_by_table_not_to_delete" = list_by_table_not_to_delete(&mut self, table_id: TableId) -> Result>; - "parquet_delete_old_ids_only" = delete_old_ids_only(&mut self, older_than: Timestamp) -> Result>; - "parquet_list_by_partition_not_to_delete" = list_by_partition_not_to_delete(&mut self, partition_id: 
&TransitionPartitionId) -> Result>; - "parquet_get_by_object_store_id" = get_by_object_store_id(&mut self, object_store_id: Uuid) -> Result>; - "parquet_exists_by_object_store_id_batch" = exists_by_object_store_id_batch(&mut self, object_store_ids: Vec) -> Result>; - "parquet_create_upgrade_delete" = create_upgrade_delete(&mut self, delete: &[ParquetFileId], upgrade: &[ParquetFileId], create: &[ParquetFileParams], target_level: CompactionLevel) -> Result>; + "parquet_flag_for_delete_by_retention" = flag_for_delete_by_retention(&mut self) -> Result>; + "parquet_delete_old_ids_only" = delete_old_ids_only(&mut self, older_than: Timestamp) -> Result>; + "parquet_list_by_partition_not_to_delete_batch" = list_by_partition_not_to_delete_batch(&mut self, partition_ids: Vec) -> Result>; + "parquet_get_by_object_store_id" = get_by_object_store_id(&mut self, object_store_id: ObjectStoreId) -> Result>; + "parquet_exists_by_object_store_id_batch" = exists_by_object_store_id_batch(&mut self, object_store_ids: Vec) -> Result>; + "parquet_create_upgrade_delete" = create_upgrade_delete(&mut self, partition_id: PartitionId, delete: &[ObjectStoreId], upgrade: &[ObjectStoreId], create: &[ParquetFileParams], target_level: CompactionLevel) -> Result>; ] ); diff --git a/iox_catalog/src/migrate.rs b/iox_catalog/src/migrate.rs index ac38fb6dbdf..5bbf9635206 100644 --- a/iox_catalog/src/migrate.rs +++ b/iox_catalog/src/migrate.rs @@ -537,11 +537,11 @@ impl TryFrom<&Migrator> for IOxMigrator { } } -/// Validate an already-applied migration. +/// Validate already-applied migrations /// /// Checks that: /// -/// - applied migration is known +/// - all applied migrations are known or all known migrations are applied /// - checksum of applied migration and known migration match /// - new migrations are newer than both the successfully applied and the dirty version /// - there is at most one dirty migration (bug check) @@ -552,9 +552,18 @@ fn validate_applied_migrations( ) -> Result<(), MigrateError> { let migrations: HashMap<_, _> = migrator.migrations.iter().map(|m| (m.version, m)).collect(); - for applied_migration in applied_migrations { + let mut dirty_version = None; + for (idx, applied_migration) in applied_migrations.iter().enumerate() { match migrations.get(&applied_migration.version) { None => { + if idx == migrations.len() && dirty_version.is_none() { + // All migrations in `migrator` have been applied + // We therefore continue as this should not prevent startup + // if there are no local migrations to apply + warn!("found applied migrations not present locally, but all local migrations applied - continuing"); + return Ok(()); + } + return Err(MigrateError::VersionMissing(applied_migration.version)); } Some(migration) => { @@ -564,7 +573,15 @@ fn validate_applied_migrations( { return Err(MigrateError::VersionMismatch(migration.version)); } + if applied_migration.dirty { + if let Some(first) = dirty_version { + return Err(MigrateError::Source(format!( + "there are multiple dirty versions, this should not happen and is considered a bug: {:?}", + &[first, migration.version], + ).into())); + } + dirty_version = Some(migration.version); warn!( version = migration.version, "found dirty migration, trying to recover" @@ -574,19 +591,6 @@ fn validate_applied_migrations( } } - let dirty_versions = applied_migrations - .iter() - .filter(|m| m.dirty) - .map(|m| m.version) - .collect::>(); - if dirty_versions.len() > 1 { - return Err(MigrateError::Source(format!( - "there are multiple dirty versions, this should not 
happen and is considered a bug: {:?}", - dirty_versions, - ).into())); - } - let dirty_version = dirty_versions.into_iter().next(); - let applied_last = applied_migrations .iter() .filter(|m| Some(m.version) != dirty_version) @@ -2273,6 +2277,42 @@ mod tests { ); } + #[tokio::test] + async fn test_migrator_allows_unknown_migrations_if_they_are_clean() { + maybe_skip_integration!(); + let mut conn = setup().await; + let conn = &mut *conn; + + let migrator_1 = IOxMigrator::try_new([ + IOxMigration { + version: 1, + description: "".into(), + steps: [].into(), + checksum: [1, 2, 3].into(), + other_compatible_checksums: [].into(), + }, + IOxMigration { + version: 2, + description: "".into(), + steps: [].into(), + checksum: [4, 5, 6].into(), + other_compatible_checksums: [].into(), + }, + ]) + .unwrap(); + let migrator_2 = IOxMigrator::try_new([IOxMigration { + version: 1, + description: "".into(), + steps: [].into(), + checksum: [1, 2, 3].into(), + other_compatible_checksums: [].into(), + }]) + .unwrap(); + + migrator_1.run_direct(conn).await.unwrap(); + migrator_2.run_direct(conn).await.unwrap(); + } + #[tokio::test] async fn test_tester_finds_invalid_migration() { maybe_skip_integration!(); diff --git a/iox_catalog/src/postgres.rs b/iox_catalog/src/postgres.rs index 1744a532ffe..ef9c5d28070 100644 --- a/iox_catalog/src/postgres.rs +++ b/iox_catalog/src/postgres.rs @@ -1,29 +1,28 @@ //! A Postgres backed implementation of the Catalog -use crate::interface::MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE; +use crate::interface::PartitionRepoExt; use crate::{ - interface::{ - self, CasFailure, Catalog, ColumnRepo, ColumnTypeMismatchSnafu, Error, NamespaceRepo, - ParquetFileRepo, PartitionRepo, RepoCollection, Result, SoftDeletedRows, TableRepo, - MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION, + constants::{ + MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE, MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION, }, - kafkaless_transition::{ - SHARED_QUERY_POOL, SHARED_QUERY_POOL_ID, SHARED_TOPIC_ID, SHARED_TOPIC_NAME, - TRANSITION_SHARD_ID, TRANSITION_SHARD_INDEX, + interface::{ + AlreadyExistsSnafu, CasFailure, Catalog, ColumnRepo, Error, NamespaceRepo, ParquetFileRepo, + PartitionRepo, RepoCollection, Result, SoftDeletedRows, TableRepo, }, metrics::MetricDecorator, migrate::IOxMigrator, }; use async_trait::async_trait; -use data_types::SortedColumnSet; +use data_types::snapshot::partition::PartitionSnapshot; +use data_types::snapshot::table::TableSnapshot; use data_types::{ partition_template::{ NamespacePartitionTemplateOverride, TablePartitionTemplateOverride, TemplatePart, }, Column, ColumnType, CompactionLevel, MaxColumnsPerTable, MaxTables, Namespace, NamespaceId, - NamespaceName, NamespaceServiceProtectionLimitsOverride, ParquetFile, ParquetFileId, - ParquetFileParams, Partition, PartitionHashId, PartitionId, PartitionKey, SkippedCompaction, - Table, TableId, Timestamp, TransitionPartitionId, + NamespaceName, NamespaceServiceProtectionLimitsOverride, ObjectStoreId, ParquetFile, + ParquetFileId, ParquetFileParams, Partition, PartitionHashId, PartitionId, PartitionKey, + SkippedCompaction, SortKeyIds, Table, TableId, Timestamp, }; use iox_time::{SystemProvider, TimeProvider}; use metric::{Attributes, Instrument, MetricKind}; @@ -33,14 +32,21 @@ use parking_lot::{RwLock, RwLockWriteGuard}; use snafu::prelude::*; use sqlx::{ postgres::{PgConnectOptions, PgPoolOptions}, - types::Uuid, Acquire, ConnectOptions, Executor, Postgres, Row, }; use sqlx_hotswap_pool::HotSwapPool; -use std::borrow::Cow; -use 
std::collections::HashSet; -use std::sync::atomic::{AtomicU64, Ordering}; -use std::{collections::HashMap, fmt::Display, str::FromStr, sync::Arc, time::Duration}; +use std::{ + borrow::Cow, + collections::{HashMap, HashSet}, + env, + fmt::Display, + str::FromStr, + sync::{ + atomic::{AtomicU64, Ordering}, + Arc, + }, + time::Duration, +}; static MIGRATOR: Lazy = Lazy::new(|| IOxMigrator::try_from(&sqlx::migrate!()).expect("valid migration")); @@ -122,9 +128,7 @@ impl PostgresCatalog { options: PostgresConnectionOptions, metrics: Arc, ) -> Result { - let pool = new_pool(&options, Arc::clone(&metrics)) - .await - .map_err(|e| Error::SqlxError { source: e })?; + let pool = new_pool(&options, Arc::clone(&metrics)).await?; Ok(Self { pool, @@ -243,67 +247,14 @@ impl Catalog for PostgresCatalog { // This makes the migrations/20210217134322_create_schema.sql step unnecessary; we need to // keep that file because migration files are immutable. let create_schema_query = format!("CREATE SCHEMA IF NOT EXISTS {};", self.schema_name()); - self.pool - .execute(sqlx::query(&create_schema_query)) - .await - .map_err(|e| Error::Setup { source: e })?; - - MIGRATOR - .run(&self.pool) - .await - .map_err(|e| Error::Setup { source: e.into() })?; - - // We need to manually insert the topic here so that we can create the transition shard - // below. - sqlx::query( - r#" -INSERT INTO topic (name) -VALUES ($1) -ON CONFLICT ON CONSTRAINT topic_name_unique -DO NOTHING; - "#, - ) - .bind(SHARED_TOPIC_NAME) - .execute(&self.pool) - .await - .map_err(|e| Error::Setup { source: e })?; - - // The transition shard must exist and must have magic ID and INDEX. - sqlx::query( - r#" -INSERT INTO shard (id, topic_id, shard_index, min_unpersisted_sequence_number) -OVERRIDING SYSTEM VALUE -VALUES ($1, $2, $3, 0) -ON CONFLICT ON CONSTRAINT shard_unique -DO NOTHING; - "#, - ) - .bind(TRANSITION_SHARD_ID) - .bind(SHARED_TOPIC_ID) - .bind(TRANSITION_SHARD_INDEX) - .execute(&self.pool) - .await - .map_err(|e| Error::Setup { source: e })?; + self.pool.execute(sqlx::query(&create_schema_query)).await?; - // We need to manually insert the query pool here so that we can create namespaces that - // reference it. - sqlx::query( - r#" -INSERT INTO query_pool (name) -VALUES ($1) -ON CONFLICT ON CONSTRAINT query_pool_name_unique -DO NOTHING; - "#, - ) - .bind(SHARED_QUERY_POOL) - .execute(&self.pool) - .await - .map_err(|e| Error::Setup { source: e })?; + MIGRATOR.run(&self.pool).await?; Ok(()) } - async fn repositories(&self) -> Box { + fn repositories(&self) -> Box { Box::new(MetricDecorator::new( PostgresTxn { inner: PostgresTxnInner { @@ -312,6 +263,7 @@ DO NOTHING; time_provider: Arc::clone(&self.time_provider), }, Arc::clone(&self.metrics), + Arc::clone(&self.time_provider), )) } @@ -453,10 +405,17 @@ async fn new_raw_pool( metrics: PoolMetrics, ) -> Result, sqlx::Error> { // sqlx exposes some options as pool options, while other options are available as connection options. - let connect_options = PgConnectOptions::from_str(parsed_dsn)? + let mut connect_options = PgConnectOptions::from_str(parsed_dsn)? // the default is INFO, which is frankly surprising. .log_statements(log::LevelFilter::Trace); + // Workaround sqlx ignoring the SSL_CERT_FILE environment variable. + // Remove workaround when upstream sqlx handles SSL_CERT_FILE properly (#8994). 
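(Editorial sketch, not part of the diff.) The many `.map_err(|e| Error::SqlxError { source: e })` calls deleted above are replaced with bare `?` and `Error::from`, which relies on a blanket `From<sqlx::Error>` conversion on the reworked error type. That impl lives outside this section; as an assumption, it presumably looks roughly like the following, with row-not-found cases still handled explicitly at the call sites:

// Hypothetical stand-in for the catalog error type; the real definition is in
// iox_catalog's interface module and is not shown in this diff.
#[derive(Debug)]
enum Error {
    External { source: Box<dyn std::error::Error + Send + Sync> },
}

impl From<sqlx::Error> for Error {
    fn from(e: sqlx::Error) -> Self {
        // The call sites above map sqlx::Error::RowNotFound themselves (to
        // Ok(None) or a not-found error), so the blanket conversion only needs
        // to wrap everything else as an external error.
        Self::External { source: Box::new(e) }
    }
}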
+ let cert_file = env::var("SSL_CERT_FILE").unwrap_or_default(); + if !cert_file.is_empty() { + connect_options = connect_options.ssl_root_cert(cert_file); + } + let app_name = options.app_name.clone(); let app_name2 = options.app_name.clone(); // just to log below let schema_name = options.schema_name.clone(); @@ -610,7 +569,6 @@ fn get_dsn_file_path(dsn: &str) -> Option { .then(|| dsn[DSN_SCHEME.len()..].to_owned()) } -#[async_trait] impl RepoCollection for PostgresTxn { fn namespaces(&mut self) -> &mut dyn NamespaceRepo { self @@ -663,24 +621,24 @@ RETURNING *; .fetch_one(executor) .await .map_err(|e| match e { - sqlx::Error::RowNotFound => Error::ColumnCreateLimitError { - column_name: name.to_string(), - table_id, + sqlx::Error::RowNotFound => Error::LimitExceeded { + descr: format!("couldn't create column {} in table {}; limit reached on namespace", name, table_id) }, _ => { if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } + Error::NotFound { descr: e.to_string() } } else { - Error::SqlxError { source: e } + Error::External { source: Box::new(e) } } }})?; ensure!( rec.column_type == column_type, - ColumnTypeMismatchSnafu { - name, - existing: rec.column_type, - new: column_type, + AlreadyExistsSnafu { + descr: format!( + "column {} is type {} but schema update has type {}", + name, rec.column_type, column_type + ), } ); @@ -706,30 +664,32 @@ impl NamespaceRepo for PostgresTxn { let rec = sqlx::query_as::<_, Namespace>( r#" INSERT INTO namespace ( - name, topic_id, query_pool_id, retention_period_ns, max_tables, max_columns_per_table, partition_template + name, retention_period_ns, max_tables, max_columns_per_table, partition_template ) -VALUES ( $1, $2, $3, $4, $5, $6, $7 ) +VALUES ( $1, $2, $3, $4, $5 ) RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, deleted_at, partition_template; "#, ) .bind(name.as_str()) // $1 - .bind(SHARED_TOPIC_ID) // $2 - .bind(SHARED_QUERY_POOL_ID) // $3 - .bind(retention_period_ns) // $4 - .bind(max_tables) // $5 - .bind(max_columns_per_table) // $6 - .bind(partition_template); // $7 + .bind(retention_period_ns) // $2 + .bind(max_tables) // $3 + .bind(max_columns_per_table) // $4 + .bind(partition_template); // $5 let rec = rec.fetch_one(&mut self.inner).await.map_err(|e| { if is_unique_violation(&e) { - Error::NameExists { - name: name.to_string(), + Error::AlreadyExists { + descr: name.to_string(), } } else if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } + Error::NotFound { + descr: e.to_string(), + } } else { - Error::SqlxError { source: e } + Error::External { + source: Box::new(e), + } } })?; @@ -750,8 +710,7 @@ WHERE {v}; .as_str(), ) .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; Ok(rec) } @@ -781,7 +740,7 @@ WHERE id=$1 AND {v}; return Ok(None); } - let namespace = rec.map_err(|e| Error::SqlxError { source: e })?; + let namespace = rec?; Ok(Some(namespace)) } @@ -811,7 +770,7 @@ WHERE name=$1 AND {v}; return Ok(None); } - let namespace = rec.map_err(|e| Error::SqlxError { source: e })?; + let namespace = rec?; Ok(Some(namespace)) } @@ -825,7 +784,7 @@ WHERE name=$1 AND {v}; .bind(name) // $2 .execute(&mut self.inner) .await - .context(interface::CouldNotDeleteNamespaceSnafu) + .map_err(Error::from) .map(|_| ()) } @@ -845,10 +804,12 @@ RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, dele .await; let namespace = rec.map_err(|e| match e { - sqlx::Error::RowNotFound => Error::NamespaceNotFoundByName { - name: 
name.to_string(), + sqlx::Error::RowNotFound => Error::NotFound { + descr: name.to_string(), + }, + _ => Error::External { + source: Box::new(e), }, - _ => Error::SqlxError { source: e }, })?; Ok(namespace) @@ -874,10 +835,12 @@ RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, dele .await; let namespace = rec.map_err(|e| match e { - sqlx::Error::RowNotFound => Error::NamespaceNotFoundByName { - name: name.to_string(), + sqlx::Error::RowNotFound => Error::NotFound { + descr: name.to_string(), + }, + _ => Error::External { + source: Box::new(e), }, - _ => Error::SqlxError { source: e }, })?; Ok(namespace) @@ -903,10 +866,12 @@ RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, dele .await; let namespace = rec.map_err(|e| match e { - sqlx::Error::RowNotFound => Error::NamespaceNotFoundByName { - name: name.to_string(), + sqlx::Error::RowNotFound => Error::NotFound { + descr: name.to_string(), + }, + _ => Error::External { + source: Box::new(e), }, - _ => Error::SqlxError { source: e }, })?; Ok(namespace) @@ -921,12 +886,7 @@ impl TableRepo for PostgresTxn { partition_template: TablePartitionTemplateOverride, namespace_id: NamespaceId, ) -> Result
{ - let mut tx = self - .inner - .pool - .begin() - .await - .map_err(|e| Error::StartTransaction { source: e })?; + let mut tx = self.inner.pool.begin().await?; // A simple insert statement becomes quite complicated in order to avoid checking the table // limits in a select and then conditionally inserting (which would be racey). @@ -955,20 +915,25 @@ RETURNING *; .fetch_one(&mut *tx) .await .map_err(|e| match e { - sqlx::Error::RowNotFound => Error::TableCreateLimitError { - table_name: name.to_string(), - namespace_id, + sqlx::Error::RowNotFound => Error::LimitExceeded { + descr: format!( + "couldn't create table {}; limit reached on namespace {}", + name, namespace_id + ), }, _ => { if is_unique_violation(&e) { - Error::TableNameExists { - name: name.to_string(), - namespace_id, + Error::AlreadyExists { + descr: format!("table '{name}' in namespace {namespace_id}"), } } else if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } + Error::NotFound { + descr: e.to_string(), + } } else { - Error::SqlxError { source: e } + Error::External { + source: Box::new(e), + } } } })?; @@ -984,9 +949,7 @@ RETURNING *; } } - tx.commit() - .await - .map_err(|source| Error::FailedToCommit { source })?; + tx.commit().await?; Ok(table) } @@ -1007,7 +970,7 @@ WHERE id = $1; return Ok(None); } - let table = rec.map_err(|e| Error::SqlxError { source: e })?; + let table = rec?; Ok(Some(table)) } @@ -1033,7 +996,7 @@ WHERE namespace_id = $1 AND name = $2; return Ok(None); } - let table = rec.map_err(|e| Error::SqlxError { source: e })?; + let table = rec?; Ok(Some(table)) } @@ -1048,8 +1011,7 @@ WHERE namespace_id = $1; ) .bind(namespace_id) .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; Ok(rec) } @@ -1057,11 +1019,52 @@ WHERE namespace_id = $1; async fn list(&mut self) -> Result> { let rec = sqlx::query_as::<_, Table>("SELECT * FROM table_name;") .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; Ok(rec) } + + async fn snapshot(&mut self, table_id: TableId) -> Result { + let mut tx = self.inner.pool.begin().await?; + let rec = sqlx::query_as::<_, Table>("SELECT * from table_name WHERE id = $1 FOR UPDATE;") + .bind(table_id) // $1 + .fetch_one(&mut *tx) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return Err(Error::NotFound { + descr: format!("table: {table_id}"), + }); + } + let table = rec?; + + let columns = sqlx::query_as::<_, Column>("SELECT * from column_name where table_id = $1;") + .bind(table_id) // $1 + .fetch_all(&mut *tx) + .await?; + + let partitions = + sqlx::query_as::<_, Partition>(r#"SELECT * FROM partition WHERE table_id = $1;"#) + .bind(table_id) // $1 + .fetch_all(&mut *tx) + .await?; + + let (generation,): (i64,) = sqlx::query_as( + "UPDATE table_name SET generation = generation + 1 where id = $1 RETURNING generation;", + ) + .bind(table_id) // $1 + .fetch_one(&mut *tx) + .await?; + + tx.commit().await?; + + Ok(TableSnapshot::encode( + table, + partitions, + columns, + generation as _, + )?) 
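(Editorial sketch, not part of the diff.) The Postgres table `snapshot` above serialises concurrent snapshots by locking the row with `SELECT ... FOR UPDATE`, reading the dependent rows under that lock, then bumping a `generation` column and committing, so returned generations are totally ordered. Stripped of the catalog types (only the table and column names come from the query above, the rest is illustrative):

use sqlx::{PgPool, Postgres, Transaction};

/// Read one table row consistently and return the post-bump generation.
async fn snapshot_generation(pool: &PgPool, table_id: i64) -> Result<i64, sqlx::Error> {
    let mut tx: Transaction<'_, Postgres> = pool.begin().await?;

    // Lock the row so no concurrent snapshot or writer interleaves with us.
    sqlx::query("SELECT id FROM table_name WHERE id = $1 FOR UPDATE;")
        .bind(table_id)
        .fetch_one(&mut *tx)
        .await?;

    // ... read columns / partitions under the same lock here ...

    // Bump the generation and return it together with the data read above.
    let (generation,): (i64,) = sqlx::query_as(
        "UPDATE table_name SET generation = generation + 1 WHERE id = $1 RETURNING generation;",
    )
    .bind(table_id)
    .fetch_one(&mut *tx)
    .await?;

    tx.commit().await?;
    Ok(generation)
}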
+ } } #[async_trait] @@ -1085,8 +1088,7 @@ WHERE table_name.namespace_id = $1; ) .bind(namespace_id) .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; Ok(rec) } @@ -1100,8 +1102,7 @@ WHERE table_id = $1; ) .bind(table_id) .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; Ok(rec) } @@ -1109,8 +1110,7 @@ WHERE table_id = $1; async fn list(&mut self) -> Result> { let rec = sqlx::query_as::<_, Column>("SELECT * FROM column_name;") .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; Ok(rec) } @@ -1150,9 +1150,13 @@ RETURNING *; .await .map_err(|e| { if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } + Error::NotFound { + descr: e.to_string(), + } } else { - Error::SqlxError { source: e } + Error::External { + source: Box::new(e), + } } })?; @@ -1162,10 +1166,11 @@ RETURNING *; let want = columns.get(existing.name.as_str()).unwrap(); ensure!( existing.column_type == *want, - ColumnTypeMismatchSnafu { - name: &existing.name, - existing: existing.column_type, - new: *want, + AlreadyExistsSnafu { + descr: format!( + "column {} is type {} but schema update has type {}", + existing.name, existing.column_type, want + ), } ); } @@ -1182,58 +1187,52 @@ impl PartitionRepo for PostgresTxn { let v = sqlx::query_as::<_, Partition>( r#" INSERT INTO partition - (partition_key, shard_id, table_id, hash_id, sort_key, sort_key_ids) + (partition_key, table_id, hash_id, sort_key_ids) VALUES - ( $1, $2, $3, $4, '{}', '{}') + ( $1, $2, $3, '{}') ON CONFLICT ON CONSTRAINT partition_key_unique DO UPDATE SET partition_key = partition.partition_key -RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at; +RETURNING id, hash_id, table_id, partition_key, sort_key_ids, new_file_at; "#, ) - .bind(key) // $1 - .bind(TRANSITION_SHARD_ID) // $2 - .bind(table_id) // $3 - .bind(&hash_id) // $4 + .bind(&key) // $1 + .bind(table_id) // $2 + .bind(&hash_id) // $3 .fetch_one(&mut self.inner) .await .map_err(|e| { if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } + Error::NotFound { + descr: e.to_string(), + } + } else if is_unique_violation(&e) { + // Logging more information to diagnose a production issue maybe + warn!( + error=?e, + %table_id, + %key, + %hash_id, + "possible duplicate partition_hash_id?" 
+ ); + Error::External { + source: Box::new(e), + } } else { - Error::SqlxError { source: e } + Error::External { + source: Box::new(e), + } } })?; Ok(v) } - async fn get_by_id(&mut self, partition_id: PartitionId) -> Result> { - let rec = sqlx::query_as::<_, Partition>( - r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at -FROM partition -WHERE id = $1; - "#, - ) - .bind(partition_id) // $1 - .fetch_one(&mut self.inner) - .await; - - if let Err(sqlx::Error::RowNotFound) = rec { - return Ok(None); - } - - let partition = rec.map_err(|e| Error::SqlxError { source: e })?; - - Ok(Some(partition)) - } - - async fn get_by_id_batch(&mut self, partition_ids: Vec) -> Result> { + async fn get_by_id_batch(&mut self, partition_ids: &[PartitionId]) -> Result> { let ids: Vec<_> = partition_ids.iter().map(|p| p.get()).collect(); sqlx::query_as::<_, Partition>( r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at +SELECT id, hash_id, table_id, partition_key, sort_key_ids, new_file_at FROM partition WHERE id = ANY($1); "#, @@ -1241,56 +1240,13 @@ WHERE id = ANY($1); .bind(&ids[..]) // $1 .fetch_all(&mut self.inner) .await - .map_err(|e| Error::SqlxError { source: e }) - } - - async fn get_by_hash_id( - &mut self, - partition_hash_id: &PartitionHashId, - ) -> Result> { - let rec = sqlx::query_as::<_, Partition>( - r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at -FROM partition -WHERE hash_id = $1; - "#, - ) - .bind(partition_hash_id) // $1 - .fetch_one(&mut self.inner) - .await; - - if let Err(sqlx::Error::RowNotFound) = rec { - return Ok(None); - } - - let partition = rec.map_err(|e| Error::SqlxError { source: e })?; - - Ok(Some(partition)) - } - - async fn get_by_hash_id_batch( - &mut self, - partition_ids: &[&PartitionHashId], - ) -> Result> { - let ids: Vec<_> = partition_ids.iter().map(|p| p.as_bytes()).collect(); - - sqlx::query_as::<_, Partition>( - r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at -FROM partition -WHERE hash_id = ANY($1); - "#, - ) - .bind(&ids[..]) // $1 - .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e }) + .map_err(Error::from) } async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { sqlx::query_as::<_, Partition>( r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at +SELECT id, hash_id, table_id, partition_key, sort_key_ids, new_file_at FROM partition WHERE table_id = $1; "#, @@ -1298,7 +1254,7 @@ WHERE table_id = $1; .bind(table_id) // $1 .fetch_all(&mut self.inner) .await - .map_err(|e| Error::SqlxError { source: e }) + .map_err(Error::from) } async fn list_ids(&mut self) -> Result> { @@ -1310,7 +1266,7 @@ WHERE table_id = $1; ) .fetch_all(&mut self.inner) .await - .map_err(|e| Error::SqlxError { source: e }) + .map_err(Error::from) } /// Update the sort key for `partition_id` if and only if `old_sort_key` @@ -1321,52 +1277,26 @@ WHERE table_id = $1; /// round trips to service a transaction in the happy path). async fn cas_sort_key( &mut self, - partition_id: &TransitionPartitionId, - old_sort_key: Option>, - old_sort_key_ids: Option, - new_sort_key: &[&str], - new_sort_key_ids: &SortedColumnSet, - ) -> Result, SortedColumnSet)>> { - // These asserts are here to cacth bugs. 
They will be removed when we remove the sort_key - // field from the Partition - assert_eq!( - old_sort_key.as_ref().map(|v| v.len()), - old_sort_key_ids.as_ref().map(|v| v.len()) - ); - assert_eq!(new_sort_key.len(), new_sort_key_ids.len()); - - let old_sort_key = old_sort_key.unwrap_or_default(); - let old_sort_key_ids = old_sort_key_ids.unwrap_or_default(); + partition_id: PartitionId, + old_sort_key_ids: Option<&SortKeyIds>, + new_sort_key_ids: &SortKeyIds, + ) -> Result> { + let old_sort_key_ids = old_sort_key_ids + .map(std::ops::Deref::deref) + .unwrap_or_default(); // This `match` will go away when all partitions have hash IDs in the database. - let query = match partition_id { - TransitionPartitionId::Deterministic(hash_id) => sqlx::query_as::<_, Partition>( - r#" -UPDATE partition -SET sort_key = $1, sort_key_ids = $4 -WHERE hash_id = $2 AND sort_key = $3 AND sort_key_ids = $5 -RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at; - "#, - ) - .bind(new_sort_key) // $1 - .bind(hash_id) // $2 - .bind(&old_sort_key) // $3 - .bind(new_sort_key_ids) // $4 - .bind(old_sort_key_ids), // $5 - TransitionPartitionId::Deprecated(id) => sqlx::query_as::<_, Partition>( - r#" + let query = sqlx::query_as::<_, Partition>( + r#" UPDATE partition -SET sort_key = $1, sort_key_ids = $4 -WHERE id = $2 AND sort_key = $3 AND sort_key_ids = $5 -RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at; +SET sort_key_ids = $1 +WHERE id = $2 AND sort_key_ids = $3 +RETURNING id, hash_id, table_id, partition_key, sort_key_ids, new_file_at; "#, - ) - .bind(new_sort_key) // $1 - .bind(id) // $2 - .bind(&old_sort_key) // $3 - .bind(new_sort_key_ids) // $4 - .bind(old_sort_key_ids), // $5 - }; + ) + .bind(new_sort_key_ids) // $1 + .bind(partition_id) // $2 + .bind(old_sort_key_ids); // $3; let res = query.fetch_one(&mut self.inner).await; @@ -1384,24 +1314,26 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file // // NOTE: this is racy, but documented - this might return "Sort // key differs! Old key: " - let partition = crate::partition_lookup(self, partition_id) + let partition = (self as &mut dyn PartitionRepo) + .get_by_id(partition_id) .await .map_err(CasFailure::QueryError)? 
- .ok_or(CasFailure::QueryError(Error::PartitionNotFound { - id: partition_id.clone(), + .ok_or(CasFailure::QueryError(Error::NotFound { + descr: partition_id.to_string(), }))?; - return Err(CasFailure::ValueMismatch(( - partition.sort_key, - partition.sort_key_ids, - ))); + return Err(CasFailure::ValueMismatch( + partition.sort_key_ids().cloned().unwrap_or_default(), + )); + } + Err(e) => { + return Err(CasFailure::QueryError(Error::External { + source: Box::new(e), + })) } - Err(e) => return Err(CasFailure::QueryError(Error::SqlxError { source: e })), }; debug!( ?partition_id, - ?old_sort_key, - ?new_sort_key, ?new_sort_key_ids, "partition sort key cas successful" ); @@ -1445,8 +1377,7 @@ skipped_at = EXCLUDED.skipped_at; .bind(estimated_bytes as i64) .bind(limit_bytes as i64) .execute(&mut self.inner) - .await - .context(interface::CouldNotRecordSkippedCompactionSnafu { partition_id })?; + .await?; Ok(()) } @@ -1465,7 +1396,7 @@ skipped_at = EXCLUDED.skipped_at; return Ok(Vec::new()); } - let skipped_partition_records = rec.map_err(|e| Error::SqlxError { source: e })?; + let skipped_partition_records = rec?; Ok(skipped_partition_records) } @@ -1478,7 +1409,7 @@ SELECT * FROM skipped_compactions ) .fetch_all(&mut self.inner) .await - .context(interface::CouldNotListSkippedCompactionsSnafu) + .map_err(Error::from) } async fn delete_skipped_compactions( @@ -1495,15 +1426,13 @@ RETURNING * .bind(partition_id) .fetch_optional(&mut self.inner) .await - .context(interface::CouldNotDeleteSkippedCompactionsSnafu) + .map_err(Error::from) } async fn most_recent_n(&mut self, n: usize) -> Result> { sqlx::query_as( - // TODO: Carol has confirmed the persisted_sequence_number is not needed anywhere so let us remove it - // but in a seperate PR to ensure we don't break anything r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, persisted_sequence_number, new_file_at +SELECT id, hash_id, table_id, partition_key, sort_key_ids, new_file_at FROM partition ORDER BY id DESC LIMIT $1;"#, @@ -1511,7 +1440,7 @@ LIMIT $1;"#, .bind(n as i64) // $1 .fetch_all(&mut self.inner) .await - .map_err(|e| Error::SqlxError { source: e }) + .map_err(Error::from) } async fn partitions_new_file_between( @@ -1536,7 +1465,7 @@ LIMIT $1;"#, .bind(maximum_time) // $2 .fetch_all(&mut self.inner) .await - .map_err(|e| Error::SqlxError { source: e }) + .map_err(Error::from) } async fn list_old_style(&mut self) -> Result> { @@ -1549,49 +1478,72 @@ LIMIT $1;"#, // The load this query saves vastly outsizes the load this query causes. 
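(Editorial sketch, not part of the diff.) `cas_sort_key` above is a single-statement compare-and-swap: the `UPDATE ... WHERE id = $2 AND sort_key_ids = $3` only matches when the stored value equals the caller's expectation, and a RowNotFound result is disambiguated by a follow-up read into either `CasFailure::ValueMismatch` (carrying the observed sort key IDs) or a genuine not-found error. A hypothetical caller would typically loop, merging the observed value into its desired one, which is why the mismatch variant returns the stored value at all:

// Stand-in types for illustration only; the real CasFailure and sort-key
// types live in the catalog interface and data_types crates.
enum CasFailure<T> {
    ValueMismatch(T),
    QueryError(String),
}

fn cas_loop<T>(
    mut observed: Option<T>,
    desired: T,
    mut cas: impl FnMut(Option<&T>, &T) -> Result<(), CasFailure<T>>,
    merge: impl Fn(&T, &T) -> T,
) -> Result<(), String> {
    let mut want = desired;
    loop {
        match cas(observed.as_ref(), &want) {
            Ok(()) => return Ok(()),
            Err(CasFailure::ValueMismatch(actual)) => {
                // Another writer won the race: recompute against what is stored.
                want = merge(&actual, &want);
                observed = Some(actual);
            }
            Err(CasFailure::QueryError(e)) => return Err(e),
        }
    }
}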
sqlx::query_as( r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, persisted_sequence_number, - new_file_at +SELECT id, hash_id, table_id, partition_key, sort_key_ids, new_file_at FROM partition WHERE hash_id IS NULL ORDER BY id DESC;"#, ) .fetch_all(&mut self.inner) .await - .map_err(|e| Error::SqlxError { source: e }) + .map_err(Error::from) } -} -#[async_trait] -impl ParquetFileRepo for PostgresTxn { - async fn create(&mut self, parquet_file_params: ParquetFileParams) -> Result { - let executor = &mut self.inner; - let id = create_parquet_file(executor, &parquet_file_params).await?; - Ok(ParquetFile::from_params(parquet_file_params, id)) - } + async fn snapshot(&mut self, partition_id: PartitionId) -> Result { + let mut tx = self.inner.pool.begin().await?; - async fn list_all(&mut self) -> Result> { - sqlx::query_as::<_, ParquetFile>( - r#" -SELECT parquet_file.id, parquet_file.namespace_id, parquet_file.table_id, - parquet_file.partition_id, parquet_file.partition_hash_id, parquet_file.object_store_id, - parquet_file.min_time, parquet_file.max_time, parquet_file.to_delete, - parquet_file.file_size_bytes, parquet_file.row_count, parquet_file.compaction_level, - parquet_file.created_at, parquet_file.column_set, parquet_file.max_l0_created_at -FROM parquet_file; - "#, + let rec = + sqlx::query_as::<_, Partition>("SELECT * from partition WHERE id = $1 FOR UPDATE;") + .bind(partition_id) // $1 + .fetch_one(&mut *tx) + .await; + if let Err(sqlx::Error::RowNotFound) = rec { + return Err(Error::NotFound { + descr: format!("partition: {partition_id}"), + }); + } + let partition = rec?; + + let files = + sqlx::query_as::<_, ParquetFile>("SELECT * from parquet_file where partition_id = $1 AND parquet_file.to_delete IS NULL;") + .bind(partition_id) // $1 + .fetch_all(&mut *tx) + .await?; + + let sc = sqlx::query_as::<_, SkippedCompaction>( + r#"SELECT * FROM skipped_compactions WHERE partition_id = $1;"#, ) - .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e }) + .bind(partition_id) // $1 + .fetch_optional(&mut *tx) + .await?; + + let (generation, namespace_id): (i64,NamespaceId) = sqlx::query_as( + "UPDATE partition SET generation = partition.generation + 1 from table_name where partition.id = $1 and table_name.id = partition.table_id RETURNING partition.generation, table_name.namespace_id;", + ) + .bind(partition_id) // $1 + .fetch_one(&mut *tx) + .await?; + + tx.commit().await?; + + Ok(PartitionSnapshot::encode( + namespace_id, + partition, + files, + sc, + generation as _, + )?) 
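(Editorial sketch, not part of the diff.) The partition snapshot above folds the generation bump and the namespace lookup into a single `UPDATE ... FROM ... RETURNING` statement, saving one round trip compared to a separate `SELECT` on `table_name`. In isolation the statement can be exercised like this (table and column names as in the query above, i64 stand-ins for the typed IDs):

use sqlx::PgPool;

/// Bump a partition's generation and learn its namespace in one statement.
async fn bump_and_lookup(pool: &PgPool, partition_id: i64) -> Result<(i64, i64), sqlx::Error> {
    sqlx::query_as::<_, (i64, i64)>(
        "UPDATE partition SET generation = partition.generation + 1 \
         FROM table_name \
         WHERE partition.id = $1 AND table_name.id = partition.table_id \
         RETURNING partition.generation, table_name.namespace_id;",
    )
    .bind(partition_id)
    .fetch_one(pool)
    .await
}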
} +} - async fn flag_for_delete_by_retention(&mut self) -> Result> { +#[async_trait] +impl ParquetFileRepo for PostgresTxn { + async fn flag_for_delete_by_retention(&mut self) -> Result> { let flagged_at = Timestamp::from(self.time_provider.now()); // TODO - include check of table retention period once implemented let flagged = sqlx::query( r#" WITH parquet_file_ids as ( - SELECT parquet_file.id + SELECT parquet_file.object_store_id FROM namespace, parquet_file WHERE namespace.retention_period_ns IS NOT NULL AND parquet_file.to_delete IS NULL @@ -1601,127 +1553,72 @@ WITH parquet_file_ids as ( ) UPDATE parquet_file SET to_delete = $1 -WHERE id IN (SELECT id FROM parquet_file_ids) -RETURNING id; +WHERE object_store_id IN (SELECT object_store_id FROM parquet_file_ids) +RETURNING partition_id, object_store_id; "#, ) .bind(flagged_at) // $1 .bind(MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION) // $2 .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; - let flagged = flagged.into_iter().map(|row| row.get("id")).collect(); + let flagged = flagged + .into_iter() + .map(|row| (row.get("partition_id"), row.get("object_store_id"))) + .collect(); Ok(flagged) } - async fn list_by_namespace_not_to_delete( - &mut self, - namespace_id: NamespaceId, - ) -> Result> { - sqlx::query_as::<_, ParquetFile>( - r#" -SELECT parquet_file.id, parquet_file.namespace_id, parquet_file.table_id, - parquet_file.partition_id, parquet_file.partition_hash_id, parquet_file.object_store_id, - parquet_file.min_time, parquet_file.max_time, parquet_file.to_delete, - parquet_file.file_size_bytes, parquet_file.row_count, parquet_file.compaction_level, - parquet_file.created_at, parquet_file.column_set, parquet_file.max_l0_created_at -FROM parquet_file -INNER JOIN table_name on table_name.id = parquet_file.table_id -WHERE table_name.namespace_id = $1 - AND parquet_file.to_delete IS NULL; - "#, - ) - .bind(namespace_id) // $1 - .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e }) - } - - async fn list_by_table_not_to_delete(&mut self, table_id: TableId) -> Result> { - sqlx::query_as::<_, ParquetFile>( - r#" -SELECT id, namespace_id, table_id, partition_id, partition_hash_id, object_store_id, - min_time, max_time, to_delete, file_size_bytes, row_count, compaction_level, created_at, - column_set, max_l0_created_at -FROM parquet_file -WHERE table_id = $1 AND to_delete IS NULL; - "#, - ) - .bind(table_id) // $1 - .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e }) - } - - async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result> { + async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result> { // see https://www.crunchydata.com/blog/simulating-update-or-delete-with-limit-in-postgres-ctes-to-the-rescue let deleted = sqlx::query( r#" WITH parquet_file_ids as ( - SELECT id + SELECT object_store_id FROM parquet_file WHERE to_delete < $1 LIMIT $2 ) DELETE FROM parquet_file -WHERE id IN (SELECT id FROM parquet_file_ids) -RETURNING id; +WHERE object_store_id IN (SELECT object_store_id FROM parquet_file_ids) +RETURNING object_store_id; "#, ) .bind(older_than) // $1 .bind(MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE) // $2 .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; - let deleted = deleted.into_iter().map(|row| row.get("id")).collect(); + let deleted = deleted + .into_iter() + .map(|row| row.get("object_store_id")) + .collect(); Ok(deleted) } - async fn 
list_by_partition_not_to_delete( + async fn list_by_partition_not_to_delete_batch( &mut self, - partition_id: &TransitionPartitionId, + partition_ids: Vec, ) -> Result> { - // This `match` will go away when all partitions have hash IDs in the database. - let query = match partition_id { - TransitionPartitionId::Deterministic(hash_id) => sqlx::query_as::<_, ParquetFile>( - r#" -SELECT parquet_file.id, namespace_id, parquet_file.table_id, partition_id, partition_hash_id, - object_store_id, min_time, max_time, parquet_file.to_delete, file_size_bytes, row_count, - compaction_level, created_at, column_set, max_l0_created_at -FROM parquet_file -INNER JOIN partition -ON partition.id = parquet_file.partition_id OR partition.hash_id = parquet_file.partition_hash_id -WHERE partition.hash_id = $1 - AND parquet_file.to_delete IS NULL; - "#, - ) - .bind(hash_id), // $1 - TransitionPartitionId::Deprecated(id) => sqlx::query_as::<_, ParquetFile>( - r#" + sqlx::query_as::<_, ParquetFile>( + r#" SELECT parquet_file.id, namespace_id, parquet_file.table_id, partition_id, partition_hash_id, object_store_id, min_time, max_time, parquet_file.to_delete, file_size_bytes, row_count, compaction_level, created_at, column_set, max_l0_created_at FROM parquet_file -INNER JOIN partition -ON partition.id = parquet_file.partition_id OR partition.hash_id = parquet_file.partition_hash_id -WHERE partition.id = $1 +WHERE parquet_file.partition_id = ANY($1) AND parquet_file.to_delete IS NULL; "#, - ) - .bind(id), // $1 - }; - - query - .fetch_all(&mut self.inner) - .await - .map_err(|e| Error::SqlxError { source: e }) + ) + .bind(partition_ids) // $1 + .fetch_all(&mut self.inner) + .await + .map_err(Error::from) } async fn get_by_object_store_id( &mut self, - object_store_id: Uuid, + object_store_id: ObjectStoreId, ) -> Result> { let rec = sqlx::query_as::<_, ParquetFile>( r#" @@ -1740,15 +1637,15 @@ WHERE object_store_id = $1; return Ok(None); } - let parquet_file = rec.map_err(|e| Error::SqlxError { source: e })?; + let parquet_file = rec?; Ok(Some(parquet_file)) } async fn exists_by_object_store_id_batch( &mut self, - object_store_ids: Vec, - ) -> Result> { + object_store_ids: Vec, + ) -> Result> { sqlx::query( // sqlx's readme suggests using PG's ANY operator instead of IN; see link below. 
// https://github.com/launchbadge/sqlx/blob/main/FAQ.md#how-can-i-do-a-select--where-foo-in--query @@ -1759,48 +1656,48 @@ WHERE object_store_id = ANY($1); "#, ) .bind(object_store_ids) // $1 - .map(|pgr| pgr.get::("object_store_id")) + .map(|pgr| pgr.get::("object_store_id")) .fetch_all(&mut self.inner) .await - .map_err(|e| Error::SqlxError { source: e }) + .map_err(Error::from) } async fn create_upgrade_delete( &mut self, - delete: &[ParquetFileId], - upgrade: &[ParquetFileId], + partition_id: PartitionId, + delete: &[ObjectStoreId], + upgrade: &[ObjectStoreId], create: &[ParquetFileParams], target_level: CompactionLevel, ) -> Result> { - let delete_set: HashSet<_> = delete.iter().map(|d| d.get()).collect(); - let upgrade_set: HashSet<_> = upgrade.iter().map(|u| u.get()).collect(); + let delete_set: HashSet<_> = delete.iter().map(|d| d.get_uuid()).collect(); + let upgrade_set: HashSet<_> = upgrade.iter().map(|u| u.get_uuid()).collect(); assert!( delete_set.is_disjoint(&upgrade_set), "attempted to upgrade a file scheduled for delete" ); - let mut tx = self - .inner - .pool - .begin() - .await - .map_err(|e| Error::StartTransaction { source: e })?; + let mut tx = self.inner.pool.begin().await?; let marked_at = Timestamp::from(self.time_provider.now()); - flag_for_delete(&mut *tx, delete, marked_at).await?; + flag_for_delete(&mut *tx, partition_id, delete, marked_at).await?; - update_compaction_level(&mut *tx, upgrade, target_level).await?; + update_compaction_level(&mut *tx, partition_id, upgrade, target_level).await?; let mut ids = Vec::with_capacity(create.len()); for file in create { - let id = create_parquet_file(&mut *tx, file).await?; + if file.partition_id != partition_id { + return Err(Error::External { + source: format!("Inconsistent ParquetFileParams, expected PartitionId({partition_id}) got PartitionId({})", file.partition_id).into(), + }); + } + let id = create_parquet_file(&mut *tx, partition_id, file).await?; ids.push(id); } - tx.commit() - .await - .map_err(|source| Error::FailedToCommit { source })?; + tx.commit().await?; + Ok(ids) } } @@ -1809,6 +1706,7 @@ WHERE object_store_id = ANY($1); // They are also used by the respective create/flag_for_delete/update_compaction_level methods. 
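(Editorial sketch, not part of the diff.) The batch lookups above all bind a single array and filter with Postgres's `ANY($1)`, as the sqlx FAQ linked in the comment suggests, instead of interpolating an `IN (...)` list whose shape changes with every batch size. Reduced to its essentials (the table and column come from the queries above, the rest is illustrative):

use sqlx::PgPool;
use uuid::Uuid;

/// Return which of the requested object store IDs exist, in one array-bound query.
async fn existing_ids(pool: &PgPool, ids: Vec<Uuid>) -> Result<Vec<Uuid>, sqlx::Error> {
    sqlx::query_scalar::<_, Uuid>(
        "SELECT object_store_id FROM parquet_file WHERE object_store_id = ANY($1);",
    )
    .bind(ids) // a Vec<Uuid> binds as a Postgres uuid[] array
    .fetch_all(pool)
    .await
}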
async fn create_parquet_file<'q, E>( executor: E, + partition_id: PartitionId, parquet_file_params: &ParquetFileParams, ) -> Result where @@ -1817,7 +1715,8 @@ where let ParquetFileParams { namespace_id, table_id, - partition_id, + partition_id: _, + partition_hash_id, object_store_id, min_time, max_time, @@ -1829,46 +1728,43 @@ where max_l0_created_at, } = parquet_file_params; - let (partition_id, partition_hash_id) = match partition_id { - TransitionPartitionId::Deterministic(hash_id) => (None, Some(hash_id)), - TransitionPartitionId::Deprecated(id) => (Some(id), None), - }; - - let partition_hash_id_ref = &partition_hash_id.as_ref(); let query = sqlx::query_scalar::<_, ParquetFileId>( r#" INSERT INTO parquet_file ( - shard_id, table_id, partition_id, partition_hash_id, object_store_id, + table_id, partition_id, partition_hash_id, object_store_id, min_time, max_time, file_size_bytes, row_count, compaction_level, created_at, namespace_id, column_set, max_l0_created_at ) -VALUES ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14 ) +VALUES ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13 ) RETURNING id; "#, ) - .bind(TRANSITION_SHARD_ID) // $1 - .bind(table_id) // $2 - .bind(partition_id) // $3 - .bind(partition_hash_id_ref) // $4 - .bind(object_store_id) // $5 - .bind(min_time) // $6 - .bind(max_time) // $7 - .bind(file_size_bytes) // $8 - .bind(row_count) // $9 - .bind(compaction_level) // $10 - .bind(created_at) // $11 - .bind(namespace_id) // $12 - .bind(column_set) // $13 - .bind(max_l0_created_at); // $14 + .bind(table_id) // $1 + .bind(partition_id) // $2 + .bind(partition_hash_id.as_ref()) // $3 + .bind(object_store_id) // $4 + .bind(min_time) // $5 + .bind(max_time) // $6 + .bind(file_size_bytes) // $7 + .bind(row_count) // $8 + .bind(compaction_level) // $9 + .bind(created_at) // $10 + .bind(namespace_id) // $11 + .bind(column_set) // $12 + .bind(max_l0_created_at); // $13 let parquet_file_id = query.fetch_one(executor).await.map_err(|e| { if is_unique_violation(&e) { - Error::FileExists { - object_store_id: *object_store_id, + Error::AlreadyExists { + descr: object_store_id.to_string(), } } else if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } + Error::NotFound { + descr: e.to_string(), + } } else { - Error::SqlxError { source: e } + Error::External { + source: Box::new(e), + } } })?; @@ -1877,44 +1773,57 @@ RETURNING id; async fn flag_for_delete<'q, E>( executor: E, - ids: &[ParquetFileId], + partition_id: PartitionId, + ids: &[ObjectStoreId], marked_at: Timestamp, ) -> Result<()> where E: Executor<'q, Database = Postgres>, { - let query = sqlx::query(r#"UPDATE parquet_file SET to_delete = $1 WHERE id = ANY($2);"#) - .bind(marked_at) // $1 - .bind(ids); // $2 - query - .execute(executor) - .await - .map_err(|e| Error::SqlxError { source: e })?; + let updated = + sqlx::query_as::<_, (i64,)>(r#"UPDATE parquet_file SET to_delete = $1 WHERE object_store_id = ANY($2) AND partition_id = $3 AND to_delete is NULL RETURNING id;"#) + .bind(marked_at) // $1 + .bind(ids) // $2 + .bind(partition_id) // $3 + .fetch_all(executor) + .await?; + + if updated.len() != ids.len() { + return Err(Error::NotFound { + descr: "parquet file(s) not found for delete".to_string(), + }); + } Ok(()) } async fn update_compaction_level<'q, E>( executor: E, - parquet_file_ids: &[ParquetFileId], + partition_id: PartitionId, + parquet_file_ids: &[ObjectStoreId], compaction_level: CompactionLevel, ) -> Result<()> where E: Executor<'q, Database = Postgres>, { - let query = 
sqlx::query( + let updated = sqlx::query_as::<_, (i64,)>( r#" UPDATE parquet_file SET compaction_level = $1 -WHERE id = ANY($2); +WHERE object_store_id = ANY($2) AND partition_id = $3 AND to_delete is NULL RETURNING id; "#, ) .bind(compaction_level) // $1 - .bind(parquet_file_ids); // $2 - query - .execute(executor) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .bind(parquet_file_ids) // $2 + .bind(partition_id) // $3 + .fetch_all(executor) + .await?; + + if updated.len() != parquet_file_ids.len() { + return Err(Error::NotFound { + descr: "parquet file(s) not found for upgrade".to_string(), + }); + } Ok(()) } @@ -1959,7 +1868,7 @@ pub(crate) mod test_utils { use rand::Rng; use sqlx::migrate::MigrateDatabase; - pub const TEST_DSN_ENV: &str = "TEST_INFLUXDB_IOX_CATALOG_DSN"; + pub(crate) const TEST_DSN_ENV: &str = "TEST_INFLUXDB_IOX_CATALOG_DSN"; /// Helper macro to skip tests if TEST_INTEGRATION and TEST_INFLUXDB_IOX_CATALOG_DSN environment /// variables are not set. @@ -2010,7 +1919,7 @@ pub(crate) mod test_utils { pub(crate) use maybe_skip_integration; - pub async fn create_db(dsn: &str) { + pub(crate) async fn create_db(dsn: &str) { // Create the catalog database if it doesn't exist if !Postgres::database_exists(dsn).await.unwrap() { // Ignore failure if another test has already created the database @@ -2018,7 +1927,7 @@ pub(crate) mod test_utils { } } - pub async fn setup_db_no_migration() -> PostgresCatalog { + pub(crate) async fn setup_db_no_migration() -> PostgresCatalog { // create a random schema for this particular pool let schema_name = { // use scope to make it clear to clippy / rust that `rng` is @@ -2030,7 +1939,9 @@ pub(crate) mod test_utils { .take(20) .map(char::from) .collect::() + .to_ascii_lowercase() }; + info!(schema_name, "test schema"); let metrics = Arc::new(metric::Registry::default()); let dsn = std::env::var("TEST_INFLUXDB_IOX_CATALOG_DSN").unwrap(); @@ -2068,7 +1979,7 @@ pub(crate) mod test_utils { pg } - pub async fn setup_db() -> PostgresCatalog { + pub(crate) async fn setup_db() -> PostgresCatalog { let pg = setup_db_no_migration().await; // Run the migrations against this random schema. pg.setup().await.expect("failed to initialise database"); @@ -2079,6 +1990,7 @@ pub(crate) mod test_utils { #[cfg(test)] mod tests { use super::*; + use crate::interface::ParquetFileRepoExt; use crate::{ postgres::test_utils::{ create_db, maybe_skip_integration, setup_db, setup_db_no_migration, @@ -2088,23 +2000,11 @@ mod tests { use assert_matches::assert_matches; use data_types::partition_template::TemplatePart; use generated_types::influxdata::iox::partition_template::v1 as proto; - use metric::{Attributes, DurationHistogram, Metric, Observation, RawReporter}; + use metric::{Observation, RawReporter}; use std::{io::Write, ops::Deref, sync::Arc, time::Instant}; use tempfile::NamedTempFile; use test_helpers::maybe_start_logging; - fn assert_metric_hit(metrics: &metric::Registry, name: &'static str) { - let histogram = metrics - .get_instrument::>("catalog_op_duration") - .expect("failed to read metric") - .get_observer(&Attributes::from(&[("op", name), ("result", "success")])) - .expect("failed to get observer") - .fetch(); - - let hit_count = histogram.sample_count(); - assert!(hit_count > 0, "metric did not record any calls"); - } - /// Small no-op test just to print out the migrations. /// /// This is helpful to look up migration checksums and debug parsing of the migration files. 
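The reworked `flag_for_delete` / `update_compaction_level` above no longer fire blind `UPDATE`s: they read the touched rows back with `RETURNING id` and treat a shorter-than-requested result as a missing file. A stripped-down sketch of that update-and-verify pattern, using a plain `String` error instead of the catalog's `Error` type; the table, the raw nanosecond timestamp, and the omitted partition scoping are simplifying assumptions:

use sqlx::{Executor, Postgres};
use uuid::Uuid;

/// Flag files as deleted; fail if any id did not match a live (not yet
/// flagged) row, instead of silently updating only a subset.
async fn flag_for_delete_checked<'q, E>(
    executor: E,
    ids: &[Uuid],
    marked_at_ns: i64,
) -> Result<(), String>
where
    E: Executor<'q, Database = Postgres>,
{
    let updated = sqlx::query_as::<_, (i64,)>(
        r#"
UPDATE parquet_file
SET to_delete = $1
WHERE object_store_id = ANY($2) AND to_delete IS NULL
RETURNING id;
        "#,
    )
    .bind(marked_at_ns) // $1
    .bind(ids) // $2
    .fetch_all(executor)
    .await
    .map_err(|e| e.to_string())?;

    // RETURNING yields exactly the rows that changed, so a length mismatch
    // means some id was unknown or already flagged for deletion.
    if updated.len() == ids.len() {
        Ok(())
    } else {
        Err(format!(
            "expected to flag {} file(s), matched {}",
            ids.len(),
            updated.len()
        ))
    }
}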
@@ -2159,7 +2059,7 @@ mod tests { let postgres: Arc = Arc::new(postgres); - crate::interface::test_helpers::test_catalog(|| async { + crate::interface_tests::test_catalog(|| async { // Clean the schema. pool .execute(format!("DROP SCHEMA {schema_name} CASCADE").as_str()) @@ -2191,55 +2091,6 @@ mod tests { .await; } - #[tokio::test] - async fn test_partition_create_or_get_idempotent() { - maybe_skip_integration!(); - - let postgres = setup_db().await; - let postgres: Arc = Arc::new(postgres); - let mut repos = postgres.repositories().await; - - let namespace = arbitrary_namespace(&mut *repos, "ns4").await; - let table_id = arbitrary_table(&mut *repos, "table", &namespace).await.id; - - let key = PartitionKey::from("bananas"); - - let hash_id = PartitionHashId::new(table_id, &key); - - let a = repos - .partitions() - .create_or_get(key.clone(), table_id) - .await - .expect("should create OK"); - - assert_eq!(a.hash_id().unwrap(), &hash_id); - // Test: sort_key_ids from partition_create_or_get_idempotent - assert!(a.sort_key_ids().is_empty()); - - // Call create_or_get for the same (key, table_id) pair, to ensure the write is idempotent. - let b = repos - .partitions() - .create_or_get(key.clone(), table_id) - .await - .expect("idempotent write should succeed"); - - assert_eq!(a, b); - - // Check that the hash_id is saved in the database and is returned when queried. - let table_partitions = postgres - .repositories() - .await - .partitions() - .list_by_table_id(table_id) - .await - .unwrap(); - assert_eq!(table_partitions.len(), 1); - assert_eq!(table_partitions[0].hash_id().unwrap(), &hash_id); - - // Test: sort_key_ids from partition_create_or_get_idempotent - assert!(table_partitions[0].sort_key_ids().is_empty()); - } - #[tokio::test] async fn existing_partitions_without_hash_id() { maybe_skip_integration!(); @@ -2247,7 +2098,7 @@ mod tests { let postgres = setup_db().await; let pool = postgres.pool.clone(); let postgres: Arc = Arc::new(postgres); - let mut repos = postgres.repositories().await; + let mut repos = postgres.repositories(); let namespace = arbitrary_namespace(&mut *repos, "ns4").await; let table = arbitrary_table(&mut *repos, "table", &namespace).await; @@ -2259,17 +2110,16 @@ mod tests { sqlx::query( r#" INSERT INTO partition - (partition_key, shard_id, table_id, sort_key, sort_key_ids) + (partition_key, table_id, sort_key_ids) VALUES - ( $1, $2, $3, '{}', '{}') + ( $1, $2, '{}') ON CONFLICT ON CONSTRAINT partition_key_unique DO UPDATE SET partition_key = partition.partition_key -RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at; +RETURNING id, hash_id, table_id, partition_key, sort_key_ids, new_file_at; "#, ) .bind(&key) // $1 - .bind(TRANSITION_SHARD_ID) // $2 - .bind(table_id) // $3 + .bind(table_id) // $2 .fetch_one(&pool) .await .unwrap(); @@ -2289,7 +2139,7 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file .expect("idempotent write should succeed"); // Test: sort_key_ids from freshly insert with empty value - assert!(inserted_again.sort_key_ids().is_empty()); + assert!(inserted_again.sort_key_ids().is_none()); assert_eq!(partition, &inserted_again); @@ -2301,10 +2151,7 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file .create(parquet_file_params) .await .unwrap(); - assert_matches!( - parquet_file.partition_id, - TransitionPartitionId::Deprecated(_) - ); + assert_eq!(parquet_file.partition_hash_id, None); // Add a partition record WITH a hash ID repos @@ -2404,164 
+2251,6 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file assert_eq!(application_name, TEST_APPLICATION_NAME_NEW); } - macro_rules! test_column_create_or_get_many_unchecked { - ( - $name:ident, - calls = {$([$($col_name:literal => $col_type:expr),+ $(,)?]),+}, - want = $($want:tt)+ - ) => { - paste::paste! { - #[tokio::test] - async fn []() { - maybe_skip_integration!(); - - let postgres = setup_db().await; - let metrics = Arc::clone(&postgres.metrics); - let postgres: Arc = Arc::new(postgres); - let mut repos = postgres.repositories().await; - - let namespace = arbitrary_namespace(&mut *repos, "ns4") - .await; - let table_id = arbitrary_table(&mut *repos, "table", &namespace) - .await - .id; - - $( - let mut insert = HashMap::new(); - $( - insert.insert($col_name, $col_type); - )+ - - let got = repos - .columns() - .create_or_get_many_unchecked(table_id, insert.clone()) - .await; - - // The returned columns MUST always match the requested - // column values if successful. - if let Ok(got) = &got { - assert_eq!(insert.len(), got.len()); - - for got in got { - assert_eq!(table_id, got.table_id); - let requested_column_type = insert - .get(got.name.as_str()) - .expect("Should have gotten back a column that was inserted"); - assert_eq!( - *requested_column_type, - ColumnType::try_from(got.column_type) - .expect("invalid column type") - ); - } - - assert_metric_hit(&metrics, "column_create_or_get_many_unchecked"); - } - )+ - - assert_matches!(got, $($want)+); - } - } - } - } - - // Issue a few calls to create_or_get_many that contain distinct columns and - // covers the full set of column types. - test_column_create_or_get_many_unchecked!( - insert, - calls = { - [ - "test1" => ColumnType::I64, - "test2" => ColumnType::U64, - "test3" => ColumnType::F64, - "test4" => ColumnType::Bool, - "test5" => ColumnType::String, - "test6" => ColumnType::Time, - "test7" => ColumnType::Tag, - ], - [ - "test8" => ColumnType::String, - "test9" => ColumnType::Bool, - ] - }, - want = Ok(_) - ); - - // Issue two calls with overlapping columns - request should succeed (upsert - // semantics). - test_column_create_or_get_many_unchecked!( - partial_upsert, - calls = { - [ - "test1" => ColumnType::I64, - "test2" => ColumnType::U64, - "test3" => ColumnType::F64, - "test4" => ColumnType::Bool, - ], - [ - "test1" => ColumnType::I64, - "test2" => ColumnType::U64, - "test3" => ColumnType::F64, - "test4" => ColumnType::Bool, - "test5" => ColumnType::String, - "test6" => ColumnType::Time, - "test7" => ColumnType::Tag, - "test8" => ColumnType::String, - ] - }, - want = Ok(_) - ); - - // Issue two calls with the same columns and types. - test_column_create_or_get_many_unchecked!( - full_upsert, - calls = { - [ - "test1" => ColumnType::I64, - "test2" => ColumnType::U64, - "test3" => ColumnType::F64, - "test4" => ColumnType::Bool, - ], - [ - "test1" => ColumnType::I64, - "test2" => ColumnType::U64, - "test3" => ColumnType::F64, - "test4" => ColumnType::Bool, - ] - }, - want = Ok(_) - ); - - // Issue two calls with overlapping columns with conflicting types and - // observe a correctly populated ColumnTypeMismatch error. - test_column_create_or_get_many_unchecked!( - partial_type_conflict, - calls = { - [ - "test1" => ColumnType::String, - "test2" => ColumnType::String, - "test3" => ColumnType::String, - "test4" => ColumnType::String, - ], - [ - "test1" => ColumnType::String, - "test2" => ColumnType::Bool, // This one differs - "test3" => ColumnType::String, - // 4 is missing. 
- "test5" => ColumnType::String, - "test6" => ColumnType::Time, - "test7" => ColumnType::Tag, - "test8" => ColumnType::String, - ] - }, - want = Err(e) => { - assert_matches!(e, Error::ColumnTypeMismatch { name, existing, new } => { - assert_eq!(name, "test2"); - assert_eq!(existing, ColumnType::String); - assert_eq!(new, ColumnType::Bool); - }) - } - ); - #[tokio::test] async fn test_billing_summary_on_parqet_file_creation() { maybe_skip_integration!(); @@ -2569,7 +2258,7 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file let postgres = setup_db().await; let pool = postgres.pool.clone(); let postgres: Arc = Arc::new(postgres); - let mut repos = postgres.repositories().await; + let mut repos = postgres.repositories(); let namespace = arbitrary_namespace(&mut *repos, "ns4").await; let table = arbitrary_table(&mut *repos, "table", &namespace).await; let key = "bananas"; @@ -2585,7 +2274,7 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file let f1 = repos.parquet_files().create(p1.clone()).await.unwrap(); // insert the same again with a different size; we should then have 3x1337 as total file // size - p1.object_store_id = Uuid::new_v4(); + p1.object_store_id = ObjectStoreId::new(); p1.file_size_bytes *= 2; let _f2 = repos .parquet_files() @@ -2604,7 +2293,13 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file // flag f1 for deletion and assert that the total file size is reduced accordingly. repos .parquet_files() - .create_upgrade_delete(&[f1.id], &[], &[], CompactionLevel::Initial) + .create_upgrade_delete( + partition.id, + &[f1.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) .await .expect("flag parquet file for deletion should succeed"); let total_file_size_bytes: i64 = @@ -2638,7 +2333,7 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file let postgres = setup_db().await; let pool = postgres.pool.clone(); let postgres: Arc = Arc::new(postgres); - let mut repos = postgres.repositories().await; + let mut repos = postgres.repositories(); let namespace_name = "apples"; @@ -2647,17 +2342,15 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file let insert_null_partition_template_namespace = sqlx::query( r#" INSERT INTO namespace ( - name, topic_id, query_pool_id, retention_period_ns, partition_template + name, retention_period_ns, partition_template ) -VALUES ( $1, $2, $3, $4, NULL ) +VALUES ( $1, $2, NULL ) RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, deleted_at, partition_template; "#, ) .bind(namespace_name) // $1 - .bind(SHARED_TOPIC_ID) // $2 - .bind(SHARED_QUERY_POOL_ID) // $3 - .bind(None::>); // $4 + .bind(None::>); // $2 insert_null_partition_template_namespace .fetch_one(&pool) @@ -2756,7 +2449,7 @@ RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, dele let postgres = setup_db().await; let pool = postgres.pool.clone(); let postgres: Arc = Arc::new(postgres); - let mut repos = postgres.repositories().await; + let mut repos = postgres.repositories(); let namespace_default_template_name = "oranges"; let namespace_default_template = repos diff --git a/iox_catalog/src/sqlite.rs b/iox_catalog/src/sqlite.rs index 7e29a2b9a4a..e91cde3e9bf 100644 --- a/iox_catalog/src/sqlite.rs +++ b/iox_catalog/src/sqlite.rs @@ -1,45 +1,46 @@ //! 
A SQLite backed implementation of the Catalog +use crate::interface::PartitionRepoExt; use crate::{ - interface::{ - self, CasFailure, Catalog, ColumnRepo, ColumnTypeMismatchSnafu, Error, NamespaceRepo, - ParquetFileRepo, PartitionRepo, RepoCollection, Result, SoftDeletedRows, TableRepo, - MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION, + constants::{ + MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE, MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION, }, - kafkaless_transition::{ - SHARED_QUERY_POOL, SHARED_QUERY_POOL_ID, SHARED_TOPIC_ID, SHARED_TOPIC_NAME, - TRANSITION_SHARD_ID, TRANSITION_SHARD_INDEX, + interface::{ + AlreadyExistsSnafu, CasFailure, Catalog, ColumnRepo, Error, NamespaceRepo, ParquetFileRepo, + PartitionRepo, RepoCollection, Result, SoftDeletedRows, TableRepo, }, metrics::MetricDecorator, }; use async_trait::async_trait; +use data_types::snapshot::partition::PartitionSnapshot; +use data_types::snapshot::table::TableSnapshot; use data_types::{ partition_template::{ NamespacePartitionTemplateOverride, TablePartitionTemplateOverride, TemplatePart, }, Column, ColumnId, ColumnSet, ColumnType, CompactionLevel, MaxColumnsPerTable, MaxTables, - Namespace, NamespaceId, NamespaceName, NamespaceServiceProtectionLimitsOverride, ParquetFile, - ParquetFileId, ParquetFileParams, Partition, PartitionHashId, PartitionId, PartitionKey, - SkippedCompaction, SortedColumnSet, Table, TableId, Timestamp, TransitionPartitionId, + Namespace, NamespaceId, NamespaceName, NamespaceServiceProtectionLimitsOverride, ObjectStoreId, + ParquetFile, ParquetFileId, ParquetFileParams, Partition, PartitionHashId, PartitionId, + PartitionKey, SkippedCompaction, SortKeyIds, Table, TableId, Timestamp, }; -use serde::{Deserialize, Serialize}; -use std::{collections::HashMap, fmt::Display}; -use std::{collections::HashSet, fmt::Write}; - -use crate::interface::MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE; use iox_time::{SystemProvider, TimeProvider}; use metric::Registry; use observability_deps::tracing::debug; use parking_lot::Mutex; +use serde::{Deserialize, Serialize}; use snafu::prelude::*; -use sqlx::sqlite::SqliteRow; -use sqlx::types::Json; use sqlx::{ - migrate::Migrator, sqlite::SqliteConnectOptions, types::Uuid, Executor, Pool, Row, Sqlite, - SqlitePool, + migrate::Migrator, + sqlite::{SqliteConnectOptions, SqliteRow}, + types::Json, + Executor, FromRow, Pool, Row, Sqlite, SqlitePool, +}; +use std::{ + collections::{HashMap, HashSet}, + fmt::Display, + str::FromStr, + sync::Arc, }; -use std::str::FromStr; -use std::sync::Arc; static MIGRATOR: Migrator = sqlx::migrate!("sqlite/migrations"); @@ -137,13 +138,9 @@ impl<'c> Executor<'c> for &'c mut SqliteTxnInner { impl SqliteCatalog { /// Connect to the catalog store. pub async fn connect(options: SqliteConnectionOptions, metrics: Arc) -> Result { - let opts = SqliteConnectOptions::from_str(&options.file_path) - .map_err(|e| Error::SqlxError { source: e })? 
- .create_if_missing(true); + let opts = SqliteConnectOptions::from_str(&options.file_path)?.create_if_missing(true); - let pool = SqlitePool::connect_with(opts) - .await - .map_err(|e| Error::SqlxError { source: e })?; + let pool = SqlitePool::connect_with(opts).await?; Ok(Self { metrics, pool, @@ -162,61 +159,12 @@ impl Display for SqliteCatalog { #[async_trait] impl Catalog for SqliteCatalog { async fn setup(&self) -> Result<()> { - MIGRATOR - .run(&self.pool) - .await - .map_err(|e| Error::Setup { source: e.into() })?; - - // We need to manually insert the topic here so that we can create the transition shard - // below. - sqlx::query( - r#" -INSERT INTO topic (name) -VALUES ($1) -ON CONFLICT (name) -DO NOTHING; - "#, - ) - .bind(SHARED_TOPIC_NAME) - .execute(&self.pool) - .await - .map_err(|e| Error::Setup { source: e })?; - - // The transition shard must exist and must have magic ID and INDEX. - sqlx::query( - r#" -INSERT INTO shard (id, topic_id, shard_index, min_unpersisted_sequence_number) -VALUES ($1, $2, $3, 0) -ON CONFLICT (topic_id, shard_index) -DO NOTHING; - "#, - ) - .bind(TRANSITION_SHARD_ID) - .bind(SHARED_TOPIC_ID) - .bind(TRANSITION_SHARD_INDEX) - .execute(&self.pool) - .await - .map_err(|e| Error::Setup { source: e })?; - - // We need to manually insert the query pool here so that we can create namespaces that - // reference it. - sqlx::query( - r#" -INSERT INTO query_pool (name) -VALUES ($1) -ON CONFLICT (name) -DO NOTHING; - "#, - ) - .bind(SHARED_QUERY_POOL) - .execute(&self.pool) - .await - .map_err(|e| Error::Setup { source: e })?; + MIGRATOR.run(&self.pool).await?; Ok(()) } - async fn repositories(&self) -> Box { + fn repositories(&self) -> Box { Box::new(MetricDecorator::new( SqliteTxn { inner: Mutex::new(SqliteTxnInner { @@ -225,6 +173,7 @@ DO NOTHING; time_provider: Arc::clone(&self.time_provider), }, Arc::clone(&self.metrics), + Arc::clone(&self.time_provider), )) } @@ -238,7 +187,6 @@ DO NOTHING; } } -#[async_trait] impl RepoCollection for SqliteTxn { fn namespaces(&mut self) -> &mut dyn NamespaceRepo { self @@ -279,29 +227,31 @@ impl NamespaceRepo for SqliteTxn { let rec = sqlx::query_as::<_, Namespace>( r#" -INSERT INTO namespace ( name, topic_id, query_pool_id, retention_period_ns, max_tables, max_columns_per_table, partition_template ) -VALUES ( $1, $2, $3, $4, $5, $6, $7 ) +INSERT INTO namespace ( name, retention_period_ns, max_tables, max_columns_per_table, partition_template ) +VALUES ( $1, $2, $3, $4, $5 ) RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, deleted_at, partition_template; "#, ) .bind(name.as_str()) // $1 - .bind(SHARED_TOPIC_ID) // $2 - .bind(SHARED_QUERY_POOL_ID) // $3 - .bind(retention_period_ns) // $4 - .bind(max_tables) // $5 - .bind(max_columns_per_table) // $6 - .bind(partition_template); // $7 + .bind(retention_period_ns) // $2 + .bind(max_tables) // $3 + .bind(max_columns_per_table) // $4 + .bind(partition_template); // $5 let rec = rec.fetch_one(self.inner.get_mut()).await.map_err(|e| { if is_unique_violation(&e) { - Error::NameExists { - name: name.to_string(), + Error::AlreadyExists { + descr: name.to_string(), } } else if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } + Error::NotFound { + descr: e.to_string(), + } } else { - Error::SqlxError { source: e } + Error::External { + source: Box::new(e), + } } })?; @@ -322,8 +272,7 @@ WHERE {v}; .as_str(), ) .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; Ok(rec) } @@ -353,7 +302,7 
@@ WHERE id=$1 AND {v}; return Ok(None); } - let namespace = rec.map_err(|e| Error::SqlxError { source: e })?; + let namespace = rec?; Ok(Some(namespace)) } @@ -383,7 +332,7 @@ WHERE name=$1 AND {v}; return Ok(None); } - let namespace = rec.map_err(|e| Error::SqlxError { source: e })?; + let namespace = rec?; Ok(Some(namespace)) } @@ -397,7 +346,7 @@ WHERE name=$1 AND {v}; .bind(name) // $2 .execute(self.inner.get_mut()) .await - .context(interface::CouldNotDeleteNamespaceSnafu) + .map_err(Error::from) .map(|_| ()) } @@ -417,10 +366,12 @@ RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, dele .await; let namespace = rec.map_err(|e| match e { - sqlx::Error::RowNotFound => Error::NamespaceNotFoundByName { - name: name.to_string(), + sqlx::Error::RowNotFound => Error::NotFound { + descr: name.to_string(), + }, + _ => Error::External { + source: Box::new(e), }, - _ => Error::SqlxError { source: e }, })?; Ok(namespace) @@ -446,10 +397,12 @@ RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, dele .await; let namespace = rec.map_err(|e| match e { - sqlx::Error::RowNotFound => Error::NamespaceNotFoundByName { - name: name.to_string(), + sqlx::Error::RowNotFound => Error::NotFound { + descr: name.to_string(), + }, + _ => Error::External { + source: Box::new(e), }, - _ => Error::SqlxError { source: e }, })?; Ok(namespace) @@ -475,10 +428,12 @@ RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, dele .await; let namespace = rec.map_err(|e| match e { - sqlx::Error::RowNotFound => Error::NamespaceNotFoundByName { - name: name.to_string(), + sqlx::Error::RowNotFound => Error::NotFound { + descr: name.to_string(), + }, + _ => Error::External { + source: Box::new(e), }, - _ => Error::SqlxError { source: e }, })?; Ok(namespace) @@ -520,24 +475,24 @@ RETURNING *; .fetch_one(executor) .await .map_err(|e| match e { - sqlx::Error::RowNotFound => Error::ColumnCreateLimitError { - column_name: name.to_string(), - table_id, + sqlx::Error::RowNotFound => Error::LimitExceeded { + descr: format!("couldn't create column {} in table {}; limit reached on namespace", name, table_id) }, _ => { if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } + Error::NotFound { descr: e.to_string() } } else { - Error::SqlxError { source: e } + Error::External { source: Box::new(e) } } }})?; ensure!( rec.column_type == column_type, - ColumnTypeMismatchSnafu { - name, - existing: rec.column_type, - new: column_type, + AlreadyExistsSnafu { + descr: format!( + "column {} is type {} but schema update has type {}", + name, rec.column_type, column_type + ), } ); @@ -552,13 +507,7 @@ impl TableRepo for SqliteTxn { partition_template: TablePartitionTemplateOverride, namespace_id: NamespaceId, ) -> Result
{ - let mut tx = self - .inner - .get_mut() - .pool - .begin() - .await - .map_err(|e| Error::StartTransaction { source: e })?; + let mut tx = self.inner.get_mut().pool.begin().await?; // A simple insert statement becomes quite complicated in order to avoid checking the table // limits in a select and then conditionally inserting (which would be racey). @@ -587,20 +536,25 @@ RETURNING *; .fetch_one(&mut *tx) .await .map_err(|e| match e { - sqlx::Error::RowNotFound => Error::TableCreateLimitError { - table_name: name.to_string(), - namespace_id, + sqlx::Error::RowNotFound => Error::LimitExceeded { + descr: format!( + "couldn't create table {}; limit reached on namespace {}", + name, namespace_id + ), }, _ => { if is_unique_violation(&e) { - Error::TableNameExists { - name: name.to_string(), - namespace_id, + Error::AlreadyExists { + descr: format!("table '{name}' in namespace {namespace_id}"), } } else if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } + Error::NotFound { + descr: e.to_string(), + } } else { - Error::SqlxError { source: e } + Error::External { + source: Box::new(e), + } } } })?; @@ -616,9 +570,7 @@ RETURNING *; } } - tx.commit() - .await - .map_err(|source| Error::FailedToCommit { source })?; + tx.commit().await?; Ok(table) } @@ -639,7 +591,7 @@ WHERE id = $1; return Ok(None); } - let table = rec.map_err(|e| Error::SqlxError { source: e })?; + let table = rec?; Ok(Some(table)) } @@ -665,7 +617,7 @@ WHERE namespace_id = $1 AND name = $2; return Ok(None); } - let table = rec.map_err(|e| Error::SqlxError { source: e })?; + let table = rec?; Ok(Some(table)) } @@ -680,8 +632,7 @@ WHERE namespace_id = $1; ) .bind(namespace_id) .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; Ok(rec) } @@ -689,11 +640,52 @@ WHERE namespace_id = $1; async fn list(&mut self) -> Result> { let rec = sqlx::query_as::<_, Table>("SELECT * FROM table_name;") .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; Ok(rec) } + + async fn snapshot(&mut self, table_id: TableId) -> Result { + let mut tx = self.inner.get_mut().pool.begin().await?; + + // This will upgrade the transaction to be exclusive + let rec = sqlx::query( + "UPDATE table_name SET generation = generation + 1 where id = $1 RETURNING *;", + ) + .bind(table_id) // $1 + .fetch_one(&mut *tx) + .await; + + if let Err(sqlx::Error::RowNotFound) = rec { + return Err(Error::NotFound { + descr: format!("table: {table_id}"), + }); + } + let row = rec?; + + let generation: i64 = row.get("generation"); + let table = Table::from_row(&row)?; + + let columns = sqlx::query_as::<_, Column>("SELECT * from column_name where table_id = $1;") + .bind(table_id) // $1 + .fetch_all(&mut *tx) + .await?; + + let partitions = + sqlx::query_as::<_, PartitionPod>("SELECT * from partition where table_id = $1;") + .bind(table_id) // $1 + .fetch_all(&mut *tx) + .await?; + + tx.commit().await?; + + Ok(TableSnapshot::encode( + table, + partitions.into_iter().map(Into::into).collect(), + columns, + generation as _, + )?) 
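The new `snapshot` implementation above leans on a small trick: issuing `UPDATE ... SET generation = generation + 1 ... RETURNING` as the first statement of the transaction both hands back a monotonically increasing snapshot generation and forces SQLite to take its write lock immediately, so concurrent snapshots of the same table serialize. A reduced sketch of just that part (the simplified signature and return value are assumptions for illustration):

use sqlx::{Row, SqlitePool};

/// Increment and read the snapshot generation for one table inside a single
/// transaction. The leading UPDATE upgrades the transaction to a write
/// transaction, so two concurrent callers cannot observe the same generation.
async fn bump_table_generation(pool: &SqlitePool, table_id: i64) -> Result<i64, sqlx::Error> {
    let mut tx = pool.begin().await?;

    let row = sqlx::query(
        "UPDATE table_name SET generation = generation + 1 WHERE id = $1 RETURNING generation;",
    )
    .bind(table_id) // $1
    .fetch_one(&mut *tx)
    .await?;
    let generation: i64 = row.get("generation");

    // ... read the table, column, and partition rows with `&mut *tx` here ...

    tx.commit().await?;
    Ok(generation)
}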
+ } } #[async_trait] @@ -717,8 +709,7 @@ WHERE table_name.namespace_id = $1; ) .bind(namespace_id) .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; Ok(rec) } @@ -732,8 +723,7 @@ WHERE table_id = $1; ) .bind(table_id) .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; Ok(rec) } @@ -741,8 +731,7 @@ WHERE table_id = $1; async fn list(&mut self) -> Result> { let rec = sqlx::query_as::<_, Column>("SELECT * FROM column_name;") .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; Ok(rec) } @@ -791,9 +780,13 @@ RETURNING *; .await .map_err(|e| { if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } + Error::NotFound { + descr: e.to_string(), + } } else { - Error::SqlxError { source: e } + Error::External { + source: Box::new(e), + } } })?; @@ -803,10 +796,11 @@ RETURNING *; let want = columns.get(existing.name.as_str()).unwrap(); ensure!( existing.column_type == *want, - ColumnTypeMismatchSnafu { - name: &existing.name, - existing: existing.column_type, - new: *want, + AlreadyExistsSnafu { + descr: format!( + "column {} is type {} but schema update has type {}", + existing.name, existing.column_type, want + ), } ); } @@ -824,21 +818,19 @@ struct PartitionPod { hash_id: Option, table_id: TableId, partition_key: PartitionKey, - sort_key: Json>, sort_key_ids: Json>, new_file_at: Option, } impl From for Partition { fn from(value: PartitionPod) -> Self { - let sort_key_ids = SortedColumnSet::from(value.sort_key_ids.0); + let sort_key_ids = SortKeyIds::from(value.sort_key_ids.0); - Self::new_with_hash_id_from_sqlite_catalog_only( + Self::new_catalog_only( value.id, value.hash_id, value.table_id, value.partition_key, - value.sort_key.0, sort_key_ids, value.new_file_at, ) @@ -857,59 +849,41 @@ impl PartitionRepo for SqliteTxn { let v = sqlx::query_as::<_, PartitionPod>( r#" INSERT INTO partition - (partition_key, shard_id, table_id, hash_id, sort_key, sort_key_ids) + (partition_key, table_id, hash_id, sort_key_ids) VALUES - ($1, $2, $3, $4, '[]', '[]') + ($1, $2, $3, '[]') ON CONFLICT (table_id, partition_key) DO UPDATE SET partition_key = partition.partition_key -RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at; +RETURNING id, hash_id, table_id, partition_key, sort_key_ids, new_file_at; "#, ) .bind(key) // $1 - .bind(TRANSITION_SHARD_ID) // $2 - .bind(table_id) // $3 - .bind(&hash_id) // $4 + .bind(table_id) // $2 + .bind(&hash_id) // $3 .fetch_one(self.inner.get_mut()) .await .map_err(|e| { if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } + Error::NotFound { + descr: e.to_string(), + } } else { - Error::SqlxError { source: e } + Error::External { + source: Box::new(e), + } } })?; Ok(v.into()) } - async fn get_by_id(&mut self, partition_id: PartitionId) -> Result> { - let rec = sqlx::query_as::<_, PartitionPod>( - r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at -FROM partition -WHERE id = $1; - "#, - ) - .bind(partition_id) // $1 - .fetch_one(self.inner.get_mut()) - .await; - - if let Err(sqlx::Error::RowNotFound) = rec { - return Ok(None); - } - - let partition = rec.map_err(|e| Error::SqlxError { source: e })?; - - Ok(Some(partition.into())) - } - - async fn get_by_id_batch(&mut self, partition_ids: Vec) -> Result> { + async fn get_by_id_batch(&mut self, partition_ids: &[PartitionId]) -> Result> { // We use a JSON-based "IS IN" check. 
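The JSON-based "IS IN" check used here (and in the parquet-file batch lookup further down) is the SQLite counterpart of the Postgres `= ANY($1)` binding: the ids are serialized to one JSON array and expanded server-side with `json_each`. A standalone sketch of the pattern, assuming the sqlx `json` feature and an illustrative return column:

use sqlx::{sqlite::SqliteRow, types::Json, Row, SqlitePool};

/// Fetch partition keys for a batch of partition ids. sqlx cannot bind a Rust
/// slice for SQLite, so the ids travel as a single JSON text parameter
/// (e.g. "[1,2,3]") and json_each() turns it back into rows to match against.
async fn partition_keys_by_ids(
    pool: &SqlitePool,
    ids: &[i64],
) -> Result<Vec<String>, sqlx::Error> {
    sqlx::query(
        r#"
SELECT partition_key
FROM partition
WHERE id IN (SELECT value FROM json_each($1));
        "#,
    )
    .bind(Json(ids)) // $1: the whole id list as one JSON array
    .map(|row: SqliteRow| row.get("partition_key"))
    .fetch_all(pool)
    .await
}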
let ids: Vec<_> = partition_ids.iter().map(|p| p.get()).collect(); sqlx::query_as::<_, PartitionPod>( r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at +SELECT id, hash_id, table_id, partition_key, sort_key_ids, new_file_at FROM partition WHERE id IN (SELECT value FROM json_each($1)); "#, @@ -918,77 +892,20 @@ WHERE id IN (SELECT value FROM json_each($1)); .fetch_all(self.inner.get_mut()) .await .map(|vals| vals.into_iter().map(Partition::from).collect()) - .map_err(|e| Error::SqlxError { source: e }) - } - - async fn get_by_hash_id( - &mut self, - partition_hash_id: &PartitionHashId, - ) -> Result> { - let rec = sqlx::query_as::<_, PartitionPod>( - r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at -FROM partition -WHERE hash_id = $1; - "#, - ) - .bind(partition_hash_id) // $1 - .fetch_one(self.inner.get_mut()) - .await; - - if let Err(sqlx::Error::RowNotFound) = rec { - return Ok(None); - } - - let partition = rec.map_err(|e| Error::SqlxError { source: e })?; - - Ok(Some(partition.into())) - } - - async fn get_by_hash_id_batch( - &mut self, - partition_hash_ids: &[&PartitionHashId], - ) -> Result> { - // We use a JSON-based "IS IN" check. - let ids: Vec<_> = partition_hash_ids - .iter() - .map(|id| { - // convert partiion hash ID to uppercase hex string - let bytes = id.as_bytes(); - let mut s = String::with_capacity(bytes.len() * 2); - for b in bytes { - write!(&mut s, "{:02X}", b).expect("never fails"); - } - s - }) - .collect(); - - sqlx::query_as::<_, PartitionPod>( - r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at -FROM partition -WHERE hex(hash_id) IN (SELECT value FROM json_each($1)); - "#, - ) - .bind(Json(&ids[..])) // $1 - .fetch_all(self.inner.get_mut()) - .await - .map(|vals| vals.into_iter().map(Partition::from).collect()) - .map_err(|e| Error::SqlxError { source: e }) + .map_err(Error::from) } async fn list_by_table_id(&mut self, table_id: TableId) -> Result> { Ok(sqlx::query_as::<_, PartitionPod>( r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at +SELECT id, hash_id, table_id, partition_key, sort_key_ids, new_file_at FROM partition WHERE table_id = $1; "#, ) .bind(table_id) // $1 .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })? + .await? .into_iter() .map(Into::into) .collect()) @@ -1003,7 +920,7 @@ WHERE table_id = $1; ) .fetch_all(self.inner.get_mut()) .await - .map_err(|e| Error::SqlxError { source: e }) + .map_err(Error::from) } /// Update the sort key for `partition_id` if and only if `old_sort_key` @@ -1014,57 +931,26 @@ WHERE table_id = $1; /// round trips to service a transaction in the happy path). async fn cas_sort_key( &mut self, - partition_id: &TransitionPartitionId, - old_sort_key: Option>, - old_sort_key_ids: Option, - new_sort_key: &[&str], - new_sort_key_ids: &SortedColumnSet, - ) -> Result, SortedColumnSet)>> { - // These asserts are here to cacth bugs. 
They will be removed when we remove the sort_key - // field from the Partition - assert_eq!( - old_sort_key.as_ref().map(|v| v.len()), - old_sort_key_ids.as_ref().map(|v| v.len()) - ); - assert_eq!(new_sort_key.len(), new_sort_key_ids.len()); + partition_id: PartitionId, + old_sort_key_ids: Option<&SortKeyIds>, + new_sort_key_ids: &SortKeyIds, + ) -> Result> { + let old_sort_key_ids: Vec = old_sort_key_ids.map(Into::into).unwrap_or_default(); - let old_sort_key = old_sort_key.unwrap_or_default(); - let raw_old_sort_key_ids: Vec<_> = old_sort_key_ids - .unwrap_or_default() - .iter() - .map(|c| c.get()) - .collect(); - let raw_new_sort_key_ids: Vec<_> = new_sort_key_ids.iter().map(|cid| cid.get()).collect(); + let raw_new_sort_key_ids: Vec = new_sort_key_ids.into(); // This `match` will go away when all partitions have hash IDs in the database. - let query = match partition_id { - TransitionPartitionId::Deterministic(hash_id) => sqlx::query_as::<_, PartitionPod>( - r#" -UPDATE partition -SET sort_key = $1, sort_key_ids = $4 -WHERE hash_id = $2 AND sort_key = $3 AND sort_key_ids = $5 -RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at; - "#, - ) - .bind(Json(new_sort_key)) // $1 - .bind(hash_id) // $2 - .bind(Json(&old_sort_key)) // $3 - .bind(Json(&raw_new_sort_key_ids)) // $4 - .bind(Json(&raw_old_sort_key_ids)), // $5 - TransitionPartitionId::Deprecated(id) => sqlx::query_as::<_, PartitionPod>( - r#" + let query = sqlx::query_as::<_, PartitionPod>( + r#" UPDATE partition -SET sort_key = $1, sort_key_ids = $4 -WHERE id = $2 AND sort_key = $3 AND sort_key_ids = $5 -RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at; +SET sort_key_ids = $1 +WHERE id = $2 AND sort_key_ids = $3 +RETURNING id, hash_id, table_id, partition_key, sort_key_ids, new_file_at; "#, - ) - .bind(Json(new_sort_key)) // $1 - .bind(id) // $2 - .bind(Json(&old_sort_key)) // $3 - .bind(Json(&raw_new_sort_key_ids)) // $4 - .bind(Json(&raw_old_sort_key_ids)), // $5 - }; + ) + .bind(Json(raw_new_sort_key_ids)) // $1 + .bind(partition_id) // $2 + .bind(Json(old_sort_key_ids)); // $3 let res = query.fetch_one(self.inner.get_mut()).await; @@ -1082,26 +968,26 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file // // NOTE: this is racy, but documented - this might return "Sort // key differs! Old key: " - let partition = crate::partition_lookup(self, partition_id) + + let partition = (self as &mut dyn PartitionRepo) + .get_by_id(partition_id) .await .map_err(CasFailure::QueryError)? 
- .ok_or(CasFailure::QueryError(Error::PartitionNotFound { - id: partition_id.clone(), + .ok_or(CasFailure::QueryError(Error::NotFound { + descr: partition_id.to_string(), }))?; - return Err(CasFailure::ValueMismatch(( - partition.sort_key, - partition.sort_key_ids, - ))); + return Err(CasFailure::ValueMismatch( + partition.sort_key_ids().cloned().unwrap_or_default(), + )); + } + Err(e) => { + return Err(CasFailure::QueryError(Error::External { + source: Box::new(e), + })) } - Err(e) => return Err(CasFailure::QueryError(Error::SqlxError { source: e })), }; - debug!( - ?partition_id, - ?old_sort_key, - ?new_sort_key, - "partition sort key cas successful" - ); + debug!(?partition_id, "partition sort key cas successful"); Ok(partition.into()) } @@ -1143,8 +1029,7 @@ skipped_at = EXCLUDED.skipped_at; .bind(limit_bytes as i64) .bind(std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH).unwrap().as_secs() as i64) .execute(self.inner.get_mut()) - .await - .context(interface::CouldNotRecordSkippedCompactionSnafu { partition_id })?; + .await?; Ok(()) } @@ -1160,7 +1045,7 @@ skipped_at = EXCLUDED.skipped_at; .fetch_all(self.inner.get_mut()) .await; - let skipped_partition_records = rec.map_err(|e| Error::SqlxError { source: e })?; + let skipped_partition_records = rec?; Ok(skipped_partition_records) } @@ -1173,7 +1058,7 @@ SELECT * FROM skipped_compactions ) .fetch_all(self.inner.get_mut()) .await - .context(interface::CouldNotListSkippedCompactionsSnafu) + .map_err(Error::from) } async fn delete_skipped_compactions( @@ -1190,13 +1075,13 @@ RETURNING * .bind(partition_id) .fetch_optional(self.inner.get_mut()) .await - .context(interface::CouldNotDeleteSkippedCompactionsSnafu) + .map_err(Error::from) } async fn most_recent_n(&mut self, n: usize) -> Result> { Ok(sqlx::query_as::<_, PartitionPod>( r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at +SELECT id, hash_id, table_id, partition_key, sort_key_ids, new_file_at FROM partition ORDER BY id DESC LIMIT $1; @@ -1204,8 +1089,7 @@ LIMIT $1; ) .bind(n as i64) // $1 .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })? + .await? .into_iter() .map(Into::into) .collect()) @@ -1233,25 +1117,74 @@ LIMIT $1; .bind(maximum_time) // $2 .fetch_all(self.inner.get_mut()) .await - .map_err(|e| Error::SqlxError { source: e }) + .map_err(Error::from) } async fn list_old_style(&mut self) -> Result> { Ok(sqlx::query_as::<_, PartitionPod>( r#" -SELECT id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at +SELECT id, hash_id, table_id, partition_key, sort_key_ids, new_file_at FROM partition WHERE hash_id IS NULL ORDER BY id DESC; "#, ) .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })? + .await? 
.into_iter() .map(Into::into) .collect()) } + + async fn snapshot(&mut self, partition_id: PartitionId) -> Result { + let mut tx = self.inner.get_mut().pool.begin().await?; + + // This will upgrade the transaction to be exclusive + let rec = sqlx::query( + "UPDATE partition SET generation = generation + 1 where id = $1 RETURNING *;", + ) + .bind(partition_id) // $1 + .fetch_one(&mut *tx) + .await; + if let Err(sqlx::Error::RowNotFound) = rec { + return Err(Error::NotFound { + descr: format!("partition: {partition_id}"), + }); + } + let row = rec?; + + let generation: i64 = row.get("generation"); + let partition = PartitionPod::from_row(&row)?; + + let (namespace_id,): (NamespaceId,) = + sqlx::query_as("SELECT namespace_id from table_name where id = $1") + .bind(partition.table_id) // $1 + .fetch_one(&mut *tx) + .await?; + + let files = + sqlx::query_as::<_, ParquetFilePod>("SELECT * from parquet_file where partition_id = $1 AND parquet_file.to_delete IS NULL;") + .bind(partition_id) // $1 + .fetch_all(&mut *tx) + .await?; + + let sc = sqlx::query_as::( + r#"SELECT * FROM skipped_compactions WHERE partition_id = $1;"#, + ) + .bind(partition_id) + .fetch_optional(&mut *tx) + .await?; + + tx.commit().await?; + + Ok(PartitionSnapshot::encode( + namespace_id, + partition.into(), + files.into_iter().map(Into::into).collect(), + sc, + generation as _, + )?) + } } fn from_column_set(v: &ColumnSet) -> Json> { @@ -1267,9 +1200,9 @@ struct ParquetFilePod { id: ParquetFileId, namespace_id: NamespaceId, table_id: TableId, - #[sqlx(flatten)] - partition_id: TransitionPartitionId, - object_store_id: Uuid, + partition_id: PartitionId, + partition_hash_id: Option, + object_store_id: ObjectStoreId, min_time: Timestamp, max_time: Timestamp, to_delete: Option, @@ -1288,6 +1221,7 @@ impl From for ParquetFile { namespace_id: value.namespace_id, table_id: value.table_id, partition_id: value.partition_id, + partition_hash_id: value.partition_hash_id, object_store_id: value.object_store_id, min_time: value.min_time, max_time: value.max_time, @@ -1304,39 +1238,13 @@ impl From for ParquetFile { #[async_trait] impl ParquetFileRepo for SqliteTxn { - async fn create(&mut self, parquet_file_params: ParquetFileParams) -> Result { - let executor = self.inner.get_mut(); - create_parquet_file(executor, parquet_file_params).await - } - - async fn list_all(&mut self) -> Result> { - // Deliberately doesn't use `SELECT *` to avoid the performance hit of fetching the large - // `parquet_metadata` column!! - Ok(sqlx::query_as::<_, ParquetFilePod>( - r#" -SELECT parquet_file.id, parquet_file.namespace_id, parquet_file.table_id, - parquet_file.partition_id, parquet_file.partition_hash_id, parquet_file.object_store_id, - parquet_file.min_time, parquet_file.max_time, parquet_file.to_delete, - parquet_file.file_size_bytes, parquet_file.row_count, parquet_file.compaction_level, - parquet_file.created_at, parquet_file.column_set, parquet_file.max_l0_created_at -FROM parquet_file; - "#, - ) - .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })? 
- .into_iter() - .map(Into::into) - .collect()) - } - - async fn flag_for_delete_by_retention(&mut self) -> Result> { + async fn flag_for_delete_by_retention(&mut self) -> Result> { let flagged_at = Timestamp::from(self.time_provider.now()); // TODO - include check of table retention period once implemented let flagged = sqlx::query( r#" WITH parquet_file_ids as ( - SELECT parquet_file.id + SELECT parquet_file.object_store_id FROM namespace, parquet_file WHERE namespace.retention_period_ns IS NOT NULL AND parquet_file.to_delete IS NULL @@ -1346,130 +1254,71 @@ WITH parquet_file_ids as ( ) UPDATE parquet_file SET to_delete = $1 -WHERE id IN (SELECT id FROM parquet_file_ids) -RETURNING id; +WHERE object_store_id IN (SELECT object_store_id FROM parquet_file_ids) +RETURNING partition_id, object_store_id; "#, ) .bind(flagged_at) // $1 .bind(MAX_PARQUET_FILES_SELECTED_ONCE_FOR_RETENTION) // $2 .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; - let flagged = flagged.into_iter().map(|row| row.get("id")).collect(); + let flagged = flagged + .into_iter() + .map(|row| (row.get("partition_id"), row.get("object_store_id"))) + .collect(); Ok(flagged) } - async fn list_by_namespace_not_to_delete( - &mut self, - namespace_id: NamespaceId, - ) -> Result> { - // Deliberately doesn't use `SELECT *` to avoid the performance hit of fetching the large - // `parquet_metadata` column!! - Ok(sqlx::query_as::<_, ParquetFilePod>( - r#" -SELECT parquet_file.id, parquet_file.namespace_id, parquet_file.table_id, - parquet_file.partition_id, parquet_file.partition_hash_id, parquet_file.object_store_id, - parquet_file.min_time, parquet_file.max_time, parquet_file.to_delete, - parquet_file.file_size_bytes, parquet_file.row_count, parquet_file.compaction_level, - parquet_file.created_at, parquet_file.column_set, parquet_file.max_l0_created_at -FROM parquet_file -INNER JOIN table_name on table_name.id = parquet_file.table_id -WHERE table_name.namespace_id = $1 - AND parquet_file.to_delete IS NULL; - "#, - ) - .bind(namespace_id) // $1 - .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })? - .into_iter() - .map(Into::into) - .collect()) - } - - async fn list_by_table_not_to_delete(&mut self, table_id: TableId) -> Result> { - Ok(sqlx::query_as::<_, ParquetFilePod>( - r#" -SELECT id, namespace_id, table_id, partition_id, partition_hash_id, object_store_id, - min_time, max_time, to_delete, file_size_bytes, - row_count, compaction_level, created_at, column_set, max_l0_created_at -FROM parquet_file -WHERE table_id = $1 AND to_delete IS NULL; - "#, - ) - .bind(table_id) // $1 - .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })? 
- .into_iter() - .map(Into::into) - .collect()) - } - - async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result> { + async fn delete_old_ids_only(&mut self, older_than: Timestamp) -> Result> { // see https://www.crunchydata.com/blog/simulating-update-or-delete-with-limit-in-sqlite-ctes-to-the-rescue let deleted = sqlx::query( r#" WITH parquet_file_ids as ( - SELECT id + SELECT object_store_id FROM parquet_file WHERE to_delete < $1 LIMIT $2 ) DELETE FROM parquet_file -WHERE id IN (SELECT id FROM parquet_file_ids) -RETURNING id; +WHERE object_store_id IN (SELECT object_store_id FROM parquet_file_ids) +RETURNING object_store_id; "#, ) .bind(older_than) // $1 .bind(MAX_PARQUET_FILES_SELECTED_ONCE_FOR_DELETE) // $2 .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .await?; - let deleted = deleted.into_iter().map(|row| row.get("id")).collect(); + let deleted = deleted + .into_iter() + .map(|row| row.get("object_store_id")) + .collect(); Ok(deleted) } - async fn list_by_partition_not_to_delete( + async fn list_by_partition_not_to_delete_batch( &mut self, - partition_id: &TransitionPartitionId, + partition_ids: Vec, ) -> Result> { - // This `match` will go away when all partitions have hash IDs in the database. - let query = match partition_id { - TransitionPartitionId::Deterministic(hash_id) => sqlx::query_as::<_, ParquetFilePod>( - r#" -SELECT parquet_file.id, namespace_id, parquet_file.table_id, partition_id, partition_hash_id, - object_store_id, min_time, max_time, parquet_file.to_delete, file_size_bytes, row_count, - compaction_level, created_at, column_set, max_l0_created_at -FROM parquet_file -INNER JOIN partition -ON partition.id = parquet_file.partition_id OR partition.hash_id = parquet_file.partition_hash_id -WHERE partition.hash_id = $1 - AND parquet_file.to_delete IS NULL; - "#, - ) - .bind(hash_id), // $1 - TransitionPartitionId::Deprecated(id) => sqlx::query_as::<_, ParquetFilePod>( - r#" + // We use a JSON-based "IS IN" check. + let ids: Vec<_> = partition_ids.iter().map(|p| p.get()).collect(); + + let query = sqlx::query_as::<_, ParquetFilePod>( + r#" SELECT parquet_file.id, namespace_id, parquet_file.table_id, partition_id, partition_hash_id, object_store_id, min_time, max_time, parquet_file.to_delete, file_size_bytes, row_count, compaction_level, created_at, column_set, max_l0_created_at FROM parquet_file -INNER JOIN partition -ON partition.id = parquet_file.partition_id OR partition.hash_id = parquet_file.partition_hash_id -WHERE partition.id = $1 +WHERE parquet_file.partition_id IN (SELECT value FROM json_each($1)) AND parquet_file.to_delete IS NULL; "#, - ) - .bind(id), // $1 - }; + ) + .bind(Json(&ids[..])); // $1 Ok(query .fetch_all(self.inner.get_mut()) - .await - .map_err(|e| Error::SqlxError { source: e })? + .await? 
.into_iter() .map(Into::into) .collect()) @@ -1477,7 +1326,7 @@ WHERE partition.id = $1 async fn get_by_object_store_id( &mut self, - object_store_id: Uuid, + object_store_id: ObjectStoreId, ) -> Result> { let rec = sqlx::query_as::<_, ParquetFilePod>( r#" @@ -1496,19 +1345,19 @@ WHERE object_store_id = $1; return Ok(None); } - let parquet_file = rec.map_err(|e| Error::SqlxError { source: e })?; + let parquet_file = rec?; Ok(Some(parquet_file.into())) } async fn exists_by_object_store_id_batch( &mut self, - object_store_ids: Vec, - ) -> Result> { + object_store_ids: Vec, + ) -> Result> { let in_value = object_store_ids .into_iter() // use a sqlite blob literal - .map(|id| format!("X'{}'", id.simple())) + .map(|id| format!("X'{}'", id.get_uuid().simple())) .collect::>() .join(","); @@ -1519,18 +1368,19 @@ FROM parquet_file WHERE object_store_id IN ({v});", v = in_value )) - .map(|slr: SqliteRow| slr.get::("object_store_id")) + .map(|slr: SqliteRow| slr.get::("object_store_id")) // limitation of sqlx: will not bind arrays // https://github.com/launchbadge/sqlx/blob/main/FAQ.md#how-can-i-do-a-select--where-foo-in--query .fetch_all(self.inner.get_mut()) .await - .map_err(|e| Error::SqlxError { source: e }) + .map_err(Error::from) } async fn create_upgrade_delete( &mut self, - delete: &[ParquetFileId], - upgrade: &[ParquetFileId], + partition_id: PartitionId, + delete: &[ObjectStoreId], + upgrade: &[ObjectStoreId], create: &[ParquetFileParams], target_level: CompactionLevel, ) -> Result> { @@ -1541,29 +1391,26 @@ WHERE object_store_id IN ({v});", delete_set.is_disjoint(&upgrade_set), "attempted to upgrade a file scheduled for delete" ); - let mut tx = self - .inner - .get_mut() - .pool - .begin() - .await - .map_err(|e| Error::StartTransaction { source: e })?; + let mut tx = self.inner.get_mut().pool.begin().await?; for id in delete { let marked_at = Timestamp::from(self.time_provider.now()); - flag_for_delete(&mut *tx, *id, marked_at).await?; + flag_for_delete(&mut *tx, partition_id, *id, marked_at).await?; } - update_compaction_level(&mut *tx, upgrade, target_level).await?; + update_compaction_level(&mut *tx, partition_id, upgrade, target_level).await?; let mut ids = Vec::with_capacity(create.len()); for file in create { + if file.partition_id != partition_id { + return Err(Error::External { + source: format!("Inconsistent ParquetFileParams, expected PartitionId({partition_id}) got PartitionId({})", file.partition_id).into(), + }); + } let res = create_parquet_file(&mut *tx, file.clone()).await?; ids.push(res.id); } - tx.commit() - .await - .map_err(|e| Error::FailedToCommit { source: e })?; + tx.commit().await?; Ok(ids) } @@ -1582,6 +1429,7 @@ where namespace_id, table_id, partition_id, + partition_hash_id, object_store_id, min_time, max_time, @@ -1593,96 +1441,116 @@ where max_l0_created_at, } = parquet_file_params; - let (partition_id, partition_hash_id) = match partition_id { - TransitionPartitionId::Deterministic(hash_id) => (None, Some(hash_id)), - TransitionPartitionId::Deprecated(id) => (Some(id), None), - }; let res = sqlx::query_as::<_, ParquetFilePod>( r#" INSERT INTO parquet_file ( - shard_id, table_id, partition_id, partition_hash_id, object_store_id, + table_id, partition_id, partition_hash_id, object_store_id, min_time, max_time, file_size_bytes, row_count, compaction_level, created_at, namespace_id, column_set, max_l0_created_at ) -VALUES ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14 ) +VALUES ( $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13 ) RETURNING 
id, table_id, partition_id, partition_hash_id, object_store_id, min_time, max_time, to_delete, file_size_bytes, row_count, compaction_level, created_at, namespace_id, column_set, max_l0_created_at; "#, ) - .bind(TRANSITION_SHARD_ID) // $1 - .bind(table_id) // $2 - .bind(partition_id) // $3 - .bind(partition_hash_id.as_ref()) // $4 - .bind(object_store_id) // $5 - .bind(min_time) // $6 - .bind(max_time) // $7 - .bind(file_size_bytes) // $8 - .bind(row_count) // $9 - .bind(compaction_level) // $10 - .bind(created_at) // $11 - .bind(namespace_id) // $12 - .bind(from_column_set(&column_set)) // $13 - .bind(max_l0_created_at) // $14 + .bind(table_id) // $1 + .bind(partition_id) // $2 + .bind(partition_hash_id.as_ref()) // $3 + .bind(object_store_id) // $4 + .bind(min_time) // $5 + .bind(max_time) // $6 + .bind(file_size_bytes) // $7 + .bind(row_count) // $8 + .bind(compaction_level) // $9 + .bind(created_at) // $10 + .bind(namespace_id) // $11 + .bind(from_column_set(&column_set)) // $12 + .bind(max_l0_created_at) // $13 .fetch_one(executor) .await; let rec = res.map_err(|e| { if is_unique_violation(&e) { - Error::FileExists { object_store_id } + Error::AlreadyExists { + descr: object_store_id.to_string(), + } } else if is_fk_violation(&e) { - Error::ForeignKeyViolation { source: e } + Error::NotFound { + descr: e.to_string(), + } } else { - Error::SqlxError { source: e } + Error::External { + source: Box::new(e), + } } })?; Ok(rec.into()) } -async fn flag_for_delete<'q, E>(executor: E, id: ParquetFileId, marked_at: Timestamp) -> Result<()> +async fn flag_for_delete<'q, E>( + executor: E, + partition_id: PartitionId, + id: ObjectStoreId, + marked_at: Timestamp, +) -> Result<()> where E: Executor<'q, Database = Sqlite>, { - let query = sqlx::query(r#"UPDATE parquet_file SET to_delete = $1 WHERE id = $2;"#) - .bind(marked_at) // $1 - .bind(id); // $2 + let updated = + sqlx::query_as::<_, (i64,)>(r#"UPDATE parquet_file SET to_delete = $1 WHERE object_store_id = $2 AND partition_id = $3 AND to_delete is NULL returning id;"#) + .bind(marked_at) // $1 + .bind(id) // $2 + .bind(partition_id) // $3 + .fetch_all(executor) + .await?; - query - .execute(executor) - .await - .map_err(|e| Error::SqlxError { source: e })?; + if updated.len() != 1 { + return Err(Error::NotFound { + descr: format!("parquet file {id} not found for delete"), + }); + } Ok(()) } async fn update_compaction_level<'q, E>( executor: E, - parquet_file_ids: &[ParquetFileId], + partition_id: PartitionId, + object_store_ids: &[ObjectStoreId], compaction_level: CompactionLevel, -) -> Result> +) -> Result<()> where E: Executor<'q, Database = Sqlite>, { - // We use a JSON-based "IS IN" check. 
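For `object_store_id`, which is stored as a 16-byte BLOB rather than an integer, the rewrite below drops the JSON route and splices SQLite blob literals (`X'...'`) directly into the statement; that is safe here because the values are hex-encoded UUIDs rather than user input. A small sketch of the literal construction (the surrounding query text is illustrative):

use uuid::Uuid;

/// Build an UPDATE that targets a fixed set of object store ids, rendered as
/// SQLite blob literals, e.g. X'67e5504410b1426f9247bb680e5fe0c8'.
fn upgrade_statement(ids: &[Uuid]) -> String {
    let in_list = ids
        .iter()
        .map(|id| format!("X'{}'", id.simple())) // hex without dashes
        .collect::<Vec<_>>()
        .join(",");
    format!(
        "UPDATE parquet_file SET compaction_level = $1 \
         WHERE object_store_id IN ({in_list}) AND to_delete IS NULL RETURNING id;"
    )
}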
- let ids: Vec<_> = parquet_file_ids.iter().map(|p| p.get()).collect(); - let query = sqlx::query( + let in_value = object_store_ids + .iter() + // use a sqlite blob literal + .map(|id| format!("X'{}'", id.get_uuid().simple())) + .collect::>() + .join(","); + + let updated = sqlx::query_as::<_, (i64,)>(&format!( r#" UPDATE parquet_file SET compaction_level = $1 -WHERE id IN (SELECT value FROM json_each($2)) -RETURNING id; +WHERE object_store_id IN ({v}) AND partition_id = $2 AND to_delete is NULL returning id; "#, - ) + v = in_value, + )) .bind(compaction_level) // $1 - .bind(Json(&ids[..])); // $2 - let updated = query - .fetch_all(executor) - .await - .map_err(|e| Error::SqlxError { source: e })?; + .bind(partition_id) // $2 + .fetch_all(executor) + .await?; - let updated = updated.into_iter().map(|row| row.get("id")).collect(); - Ok(updated) + if updated.len() != object_store_ids.len() { + return Err(Error::NotFound { + descr: "parquet file(s) not found for upgrade".to_string(), + }); + } + + Ok(()) } /// The error code returned by SQLite for a unique constraint violation. @@ -1722,27 +1590,15 @@ fn is_unique_violation(e: &sqlx::Error) -> bool { #[cfg(test)] mod tests { use super::*; + use crate::interface::ParquetFileRepoExt; use crate::test_helpers::{ arbitrary_namespace, arbitrary_parquet_file_params, arbitrary_table, }; use assert_matches::assert_matches; use data_types::partition_template::TemplatePart; use generated_types::influxdata::iox::partition_template::v1 as proto; - use metric::{Attributes, DurationHistogram, Metric}; use std::sync::Arc; - fn assert_metric_hit(metrics: &Registry, name: &'static str) { - let histogram = metrics - .get_instrument::>("catalog_op_duration") - .expect("failed to read metric") - .get_observer(&Attributes::from(&[("op", name), ("result", "success")])) - .expect("failed to get observer") - .fetch(); - - let hit_count = histogram.sample_count(); - assert!(hit_count > 0, "metric did not record any calls"); - } - async fn setup_db() -> SqliteCatalog { let dsn = std::env::var("TEST_INFLUXDB_SQLITE_DSN").unwrap_or("sqlite::memory:".to_string()); @@ -1757,7 +1613,7 @@ mod tests { #[tokio::test] async fn test_catalog() { - interface::test_helpers::test_catalog(|| async { + crate::interface_tests::test_catalog(|| async { let sqlite = setup_db().await; let sqlite: Arc = Arc::new(sqlite); sqlite @@ -1765,57 +1621,12 @@ mod tests { .await; } - #[tokio::test] - async fn test_partition_create_or_get_idempotent() { - let sqlite = setup_db().await; - let sqlite: Arc = Arc::new(sqlite); - let mut repos = sqlite.repositories().await; - - let namespace = arbitrary_namespace(&mut *repos, "ns4").await; - let table_id = arbitrary_table(&mut *repos, "table", &namespace).await.id; - - let key = PartitionKey::from("bananas"); - - let hash_id = PartitionHashId::new(table_id, &key); - - let a = repos - .partitions() - .create_or_get(key.clone(), table_id) - .await - .expect("should create OK"); - - assert_eq!(a.hash_id().unwrap(), &hash_id); - - // Call create_or_get for the same (key, table_id) pair, to ensure the write is idempotent. - let b = repos - .partitions() - .create_or_get(key.clone(), table_id) - .await - .expect("idempotent write should succeed"); - - assert_eq!(a, b); - - // Check that the hash_id is saved in the database and is returned when queried. 
- let table_partitions = sqlite - .repositories() - .await - .partitions() - .list_by_table_id(table_id) - .await - .unwrap(); - assert_eq!(table_partitions.len(), 1); - assert_eq!(table_partitions[0].hash_id().unwrap(), &hash_id); - - // Test: sort_key_ids from partition_create_or_get_idempotent - assert!(table_partitions[0].sort_key_ids().is_empty()); - } - #[tokio::test] async fn existing_partitions_without_hash_id() { let sqlite: SqliteCatalog = setup_db().await; let pool = sqlite.pool.clone(); let sqlite: Arc = Arc::new(sqlite); - let mut repos = sqlite.repositories().await; + let mut repos = sqlite.repositories(); let namespace = arbitrary_namespace(&mut *repos, "ns4").await; let table = arbitrary_table(&mut *repos, "table", &namespace).await; @@ -1827,17 +1638,16 @@ mod tests { sqlx::query( r#" INSERT INTO partition - (partition_key, shard_id, table_id, sort_key, sort_key_ids) + (partition_key, table_id, sort_key_ids) VALUES - ($1, $2, $3, '[]', '[]') + ($1, $2, '[]') ON CONFLICT (table_id, partition_key) DO UPDATE SET partition_key = partition.partition_key -RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file_at; +RETURNING id, hash_id, table_id, partition_key, sort_key_ids, new_file_at; "#, ) .bind(&key) // $1 - .bind(TRANSITION_SHARD_ID) // $2 - .bind(table_id) // $3 + .bind(table_id) // $2 .fetch_one(&pool) .await .unwrap(); @@ -1856,7 +1666,7 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file .expect("idempotent write should succeed"); // Test: sort_key_ids from freshly insert with empty value - assert!(inserted_again.sort_key_ids().is_empty()); + assert!(inserted_again.sort_key_ids().is_none()); assert_eq!(partition, &inserted_again); @@ -1868,10 +1678,7 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file .create(parquet_file_params) .await .unwrap(); - assert_matches!( - parquet_file.partition_id, - TransitionPartitionId::Deprecated(_) - ); + assert_eq!(parquet_file.partition_hash_id, None); // Add a partition record WITH a hash ID repos @@ -1886,168 +1693,12 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file assert_eq!(old_style_partitions[0].id, partition.id); } - macro_rules! test_column_create_or_get_many_unchecked { - ( - $name:ident, - calls = {$([$($col_name:literal => $col_type:expr),+ $(,)?]),+}, - want = $($want:tt)+ - ) => { - paste::paste! { - #[tokio::test] - async fn []() { - let sqlite = setup_db().await; - let metrics = Arc::clone(&sqlite.metrics); - let sqlite: Arc = Arc::new(sqlite); - let mut repos = sqlite.repositories().await; - - let namespace = arbitrary_namespace(&mut *repos, "ns4") - .await; - let table_id = arbitrary_table(&mut *repos, "table", &namespace) - .await - .id; - - $( - let mut insert = HashMap::new(); - $( - insert.insert($col_name, $col_type); - )+ - - let got = repos - .columns() - .create_or_get_many_unchecked(table_id, insert.clone()) - .await; - - // The returned columns MUST always match the requested - // column values if successful. 
- if let Ok(got) = &got { - assert_eq!(insert.len(), got.len()); - - for got in got { - assert_eq!(table_id, got.table_id); - let requested_column_type = insert - .get(got.name.as_str()) - .expect("Should have gotten back a column that was inserted"); - assert_eq!( - *requested_column_type, - ColumnType::try_from(got.column_type) - .expect("invalid column type") - ); - } - - assert_metric_hit(&metrics, "column_create_or_get_many_unchecked"); - } - )+ - - assert_matches!(got, $($want)+); - } - } - } - } - - // Issue a few calls to create_or_get_many that contain distinct columns and - // covers the full set of column types. - test_column_create_or_get_many_unchecked!( - insert, - calls = { - [ - "test1" => ColumnType::I64, - "test2" => ColumnType::U64, - "test3" => ColumnType::F64, - "test4" => ColumnType::Bool, - "test5" => ColumnType::String, - "test6" => ColumnType::Time, - "test7" => ColumnType::Tag, - ], - [ - "test8" => ColumnType::String, - "test9" => ColumnType::Bool, - ] - }, - want = Ok(_) - ); - - // Issue two calls with overlapping columns - request should succeed (upsert - // semantics). - test_column_create_or_get_many_unchecked!( - partial_upsert, - calls = { - [ - "test1" => ColumnType::I64, - "test2" => ColumnType::U64, - "test3" => ColumnType::F64, - "test4" => ColumnType::Bool, - ], - [ - "test1" => ColumnType::I64, - "test2" => ColumnType::U64, - "test3" => ColumnType::F64, - "test4" => ColumnType::Bool, - "test5" => ColumnType::String, - "test6" => ColumnType::Time, - "test7" => ColumnType::Tag, - "test8" => ColumnType::String, - ] - }, - want = Ok(_) - ); - - // Issue two calls with the same columns and types. - test_column_create_or_get_many_unchecked!( - full_upsert, - calls = { - [ - "test1" => ColumnType::I64, - "test2" => ColumnType::U64, - "test3" => ColumnType::F64, - "test4" => ColumnType::Bool, - ], - [ - "test1" => ColumnType::I64, - "test2" => ColumnType::U64, - "test3" => ColumnType::F64, - "test4" => ColumnType::Bool, - ] - }, - want = Ok(_) - ); - - // Issue two calls with overlapping columns with conflicting types and - // observe a correctly populated ColumnTypeMismatch error. - test_column_create_or_get_many_unchecked!( - partial_type_conflict, - calls = { - [ - "test1" => ColumnType::String, - "test2" => ColumnType::String, - "test3" => ColumnType::String, - "test4" => ColumnType::String, - ], - [ - "test1" => ColumnType::String, - "test2" => ColumnType::Bool, // This one differs - "test3" => ColumnType::String, - // 4 is missing. 
- "test5" => ColumnType::String, - "test6" => ColumnType::Time, - "test7" => ColumnType::Tag, - "test8" => ColumnType::String, - ] - }, - want = Err(e) => { - assert_matches!(e, Error::ColumnTypeMismatch { name, existing, new } => { - assert_eq!(name, "test2"); - assert_eq!(existing, ColumnType::String); - assert_eq!(new, ColumnType::Bool); - }) - } - ); - #[tokio::test] async fn test_billing_summary_on_parqet_file_creation() { let sqlite = setup_db().await; let pool = sqlite.pool.clone(); let sqlite: Arc = Arc::new(sqlite); - let mut repos = sqlite.repositories().await; + let mut repos = sqlite.repositories(); let namespace = arbitrary_namespace(&mut *repos, "ns4").await; let table = arbitrary_table(&mut *repos, "table", &namespace).await; let key = "bananas"; @@ -2067,7 +1718,7 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file .expect("create parquet file should succeed"); // insert the same again with a different size; we should then have 3x1337 as total file // size - p1.object_store_id = Uuid::new_v4(); + p1.object_store_id = ObjectStoreId::new(); p1.file_size_bytes *= 2; let _f2 = repos .parquet_files() @@ -2086,7 +1737,13 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file // flag f1 for deletion and assert that the total file size is reduced accordingly. repos .parquet_files() - .create_upgrade_delete(&[f1.id], &[], &[], CompactionLevel::Initial) + .create_upgrade_delete( + partition.id, + &[f1.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) .await .expect("flag parquet file for deletion should succeed"); let total_file_size_bytes: i64 = @@ -2117,7 +1774,7 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file let sqlite = setup_db().await; let pool = sqlite.pool.clone(); let sqlite: Arc = Arc::new(sqlite); - let mut repos = sqlite.repositories().await; + let mut repos = sqlite.repositories(); let namespace_name = "apples"; @@ -2126,17 +1783,15 @@ RETURNING id, hash_id, table_id, partition_key, sort_key, sort_key_ids, new_file let insert_null_partition_template_namespace = sqlx::query( r#" INSERT INTO namespace ( - name, topic_id, query_pool_id, retention_period_ns, partition_template + name, retention_period_ns, partition_template ) -VALUES ( $1, $2, $3, $4, NULL ) +VALUES ( $1, $2, NULL ) RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, deleted_at, partition_template; "#, ) .bind(namespace_name) // $1 - .bind(SHARED_TOPIC_ID) // $2 - .bind(SHARED_QUERY_POOL_ID) // $3 - .bind(None::>); // $4 + .bind(None::>); // $2 insert_null_partition_template_namespace .fetch_one(&pool) @@ -2233,7 +1888,7 @@ RETURNING id, name, retention_period_ns, max_tables, max_columns_per_table, dele let sqlite = setup_db().await; let pool = sqlite.pool.clone(); let sqlite: Arc = Arc::new(sqlite); - let mut repos = sqlite.repositories().await; + let mut repos = sqlite.repositories(); let namespace_default_template_name = "oranges"; let namespace_default_template = repos diff --git a/iox_catalog/src/test_helpers.rs b/iox_catalog/src/test_helpers.rs new file mode 100644 index 00000000000..0861d79fb8a --- /dev/null +++ b/iox_catalog/src/test_helpers.rs @@ -0,0 +1,92 @@ +//! 
Catalog helper functions for creation of catalog objects +use data_types::{ + partition_template::TablePartitionTemplateOverride, ColumnId, ColumnSet, CompactionLevel, + Namespace, NamespaceName, ObjectStoreId, ParquetFileParams, Partition, Table, TableSchema, + Timestamp, +}; + +use crate::interface::RepoCollection; + +/// When the details of the namespace don't matter, the test just needs *a* catalog namespace +/// with a particular name. +/// +/// Use [`NamespaceRepo::create`] directly if: +/// +/// - The values of the parameters to `create` need to be different than what's here +/// - The values of the parameters to `create` are relevant to the behavior under test +/// - You expect namespace creation to fail in the test +/// +/// [`NamespaceRepo::create`]: crate::interface::NamespaceRepo::create +pub async fn arbitrary_namespace<R: RepoCollection + ?Sized>( + repos: &mut R, + name: &str, +) -> Namespace { + let namespace_name = NamespaceName::new(name).unwrap(); + repos + .namespaces() + .create(&namespace_name, None, None, None) + .await + .unwrap() +} + +/// When the details of the table don't matter, the test just needs *a* catalog table +/// with a particular name in a particular namespace. +/// +/// Use [`TableRepo::create`] directly if: +/// +/// - The values of the parameters to `create` need to be different than what's here +/// - The values of the parameters to `create` are relevant to the behavior under test +/// - You expect table creation to fail in the test +/// +/// [`TableRepo::create`]: crate::interface::TableRepo::create +pub async fn arbitrary_table<R: RepoCollection + ?Sized>( + repos: &mut R, + name: &str, + namespace: &Namespace, +) -> Table { + repos + .tables() + .create( + name, + TablePartitionTemplateOverride::try_new(None, &namespace.partition_template).unwrap(), + namespace.id, + ) + .await + .unwrap() +} + +/// Load or create an arbitrary table schema in the same way that a write implicitly creates a +/// table, that is, with a time column. +pub async fn arbitrary_table_schema_load_or_create<R: RepoCollection + ?Sized>( + repos: &mut R, + name: &str, + namespace: &Namespace, +) -> TableSchema { + crate::util::table_load_or_create(repos, namespace.id, &namespace.partition_template, name) + .await + .unwrap() +} + +/// When the details of a Parquet file record don't matter, the test just needs *a* Parquet +/// file record in a particular namespace+table+partition. +pub fn arbitrary_parquet_file_params( + namespace: &Namespace, + table: &Table, + partition: &Partition, +) -> ParquetFileParams { + ParquetFileParams { + namespace_id: namespace.id, + table_id: table.id, + partition_id: partition.id, + partition_hash_id: partition.hash_id().cloned(), + object_store_id: ObjectStoreId::new(), + min_time: Timestamp::new(1), + max_time: Timestamp::new(10), + file_size_bytes: 1337, + row_count: 0, + compaction_level: CompactionLevel::Initial, + created_at: Timestamp::new(1), + column_set: ColumnSet::new([ColumnId::new(1), ColumnId::new(2)]), + max_l0_created_at: Timestamp::new(1), + } +} diff --git a/iox_catalog/src/util.rs b/iox_catalog/src/util.rs new file mode 100644 index 00000000000..d6d184fbf09 --- /dev/null +++ b/iox_catalog/src/util.rs @@ -0,0 +1,897 @@ +//! Helper methods to simplify catalog work. +//! +//! They all use the public [`Catalog`] interface and have no special access to internals, so in theory they can be +//! implemented downstream as well.
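A quick sketch of how the new `test_helpers` functions above are meant to compose inside a backend test. The test name and body here are assumptions for illustration only; the individual calls mirror ones that appear elsewhere in this patch (in-memory catalog construction, `create_or_get` for partitions, `ParquetFileRepoExt::create`):

use std::sync::Arc;

use data_types::PartitionKey;

use crate::{
    interface::{Catalog, ParquetFileRepoExt},
    mem::MemCatalog,
    test_helpers::{arbitrary_namespace, arbitrary_parquet_file_params, arbitrary_table},
};

#[tokio::test]
async fn example_of_the_helpers() {
    // An in-memory catalog, constructed the same way as in the tests further down.
    let catalog = MemCatalog::new(
        Default::default(),
        Arc::new(iox_time::SystemProvider::new()),
    );
    let mut repos = catalog.repositories();

    // *A* namespace, *a* table and *a* partition whose details don't matter.
    let namespace = arbitrary_namespace(&mut *repos, "ns").await;
    let table = arbitrary_table(&mut *repos, "table", &namespace).await;
    let partition = repos
        .partitions()
        .create_or_get(PartitionKey::from("key"), table.id)
        .await
        .unwrap();

    // *A* Parquet file record in that namespace+table+partition.
    let params = arbitrary_parquet_file_params(&namespace, &table, &partition);
    let _file = repos.parquet_files().create(params).await.unwrap();
}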
+ +use std::{ + borrow::Cow, + collections::{BTreeMap, HashMap, HashSet}, + sync::Arc, +}; + +use data_types::{ + partition_template::{NamespacePartitionTemplateOverride, TablePartitionTemplateOverride}, + ColumnType, ColumnsByName, Namespace, NamespaceId, NamespaceSchema, PartitionId, SortKeyIds, + TableId, TableSchema, +}; +use mutable_batch::MutableBatch; +use thiserror::Error; + +use crate::{ + constants::TIME_COLUMN, + interface::{CasFailure, Catalog, Error, RepoCollection, SoftDeletedRows}, +}; + +/// Gets the namespace schema including all tables and columns. +pub async fn get_schema_by_id( + id: NamespaceId, + repos: &mut R, + deleted: SoftDeletedRows, +) -> Result, crate::interface::Error> +where + R: RepoCollection + ?Sized, +{ + let Some(namespace) = repos.namespaces().get_by_id(id, deleted).await? else { + return Ok(None); + }; + + Ok(Some(get_schema_internal(namespace, repos).await?)) +} + +/// Gets the namespace schema including all tables and columns. +pub async fn get_schema_by_name( + name: &str, + repos: &mut R, + deleted: SoftDeletedRows, +) -> Result, crate::interface::Error> +where + R: RepoCollection + ?Sized, +{ + let Some(namespace) = repos.namespaces().get_by_name(name, deleted).await? else { + return Ok(None); + }; + + Ok(Some(get_schema_internal(namespace, repos).await?)) +} + +async fn get_schema_internal( + namespace: Namespace, + repos: &mut R, +) -> Result +where + R: RepoCollection + ?Sized, +{ + // get the columns first just in case someone else is creating schema while we're doing this. + let columns = repos.columns().list_by_namespace_id(namespace.id).await?; + let tables = repos.tables().list_by_namespace_id(namespace.id).await?; + + let mut namespace = NamespaceSchema::new_empty_from(&namespace); + + let mut table_id_to_schema = BTreeMap::new(); + for t in tables { + let table_schema = TableSchema::new_empty_from(&t); + table_id_to_schema.insert(t.id, (t.name, table_schema)); + } + + for c in columns { + let (_, t) = table_id_to_schema.get_mut(&c.table_id).unwrap(); + t.add_column(c); + } + + for (_, (table_name, schema)) in table_id_to_schema { + namespace.tables.insert(table_name, schema); + } + + Ok(namespace) +} + +/// Gets the schema for one particular table in a namespace. +pub async fn get_schema_by_namespace_and_table( + name: &str, + table_name: &str, + repos: &mut R, + deleted: SoftDeletedRows, +) -> Result, crate::interface::Error> +where + R: RepoCollection + ?Sized, +{ + let Some(namespace) = repos.namespaces().get_by_name(name, deleted).await? else { + return Ok(None); + }; + + let Some(table) = repos + .tables() + .get_by_namespace_and_name(namespace.id, table_name) + .await? + else { + return Ok(None); + }; + + let mut table_schema = TableSchema::new_empty_from(&table); + + let columns = repos.columns().list_by_table_id(table.id).await?; + for c in columns { + table_schema.add_column(c); + } + + let mut namespace = NamespaceSchema::new_empty_from(&namespace); + namespace + .tables + .insert(table_name.to_string(), table_schema); + + Ok(Some(namespace)) +} + +/// Gets all the table's columns. +pub async fn get_table_columns_by_id( + id: TableId, + repos: &mut R, +) -> Result +where + R: RepoCollection + ?Sized, +{ + let columns = repos.columns().list_by_table_id(id).await?; + + Ok(ColumnsByName::new(columns)) +} + +/// Fetch all [`NamespaceSchema`] in the catalog. +/// +/// This method performs the minimal number of queries needed to build the +/// result set. 
No table lock is obtained, nor are queries executed within a +/// transaction, but this method does return a point-in-time snapshot of the +/// catalog state. +/// +/// # Soft Deletion +/// +/// No schemas for soft-deleted namespaces are returned. +pub async fn list_schemas( + catalog: &dyn Catalog, +) -> Result, crate::interface::Error> { + let mut repos = catalog.repositories(); + + // In order to obtain a point-in-time snapshot, first fetch the columns, + // then the tables, and then resolve the namespace IDs to Namespace in order + // to construct the schemas. + // + // The set of columns returned forms the state snapshot, with the subsequent + // queries resolving only what is needed to construct schemas for the + // retrieved columns (ignoring any newly added tables/namespaces since the + // column snapshot was taken). + // + // This approach also tolerates concurrently deleted namespaces, which are + // simply ignored at the end when joining to the namespace query result. + + // First fetch all the columns - this is the state snapshot of the catalog + // schemas. + let columns = repos.columns().list().await?; + + // Construct the set of table IDs these columns belong to. + let retain_table_ids = columns.iter().map(|c| c.table_id).collect::>(); + + // Fetch all tables, and filter for those that are needed to construct + // schemas for "columns" only. + // + // Discard any tables that have no columns or have been created since + // the "columns" snapshot was retrieved, and construct a map of ID->Table. + let tables = repos + .tables() + .list() + .await? + .into_iter() + .filter_map(|t| { + if !retain_table_ids.contains(&t.id) { + return None; + } + + Some((t.id, t)) + }) + .collect::>(); + + // Drop the table ID set as it will not be referenced again. + drop(retain_table_ids); + + // Do all the I/O to fetch the namespaces in the background, while this + // thread constructs the NamespaceId->TableSchema map below. + let namespaces = tokio::spawn(async move { + repos + .namespaces() + .list(SoftDeletedRows::ExcludeDeleted) + .await + }); + + // A set of tables within a single namespace. + type NamespaceTables = BTreeMap; + + let mut joined = HashMap::::default(); + for column in columns { + // Resolve the table this column references + let table = tables.get(&column.table_id).expect("no table for column"); + + let table_schema = joined + // Find or create a record in the joined map + // for this namespace ID. + .entry(table.namespace_id) + .or_default() + // Fetch the schema record for this table, or create an empty one. + .entry(table.name.clone()) + .or_insert_with(|| TableSchema::new_empty_from(table)); + + table_schema.add_column(column); + } + + // The table map is no longer needed - immediately reclaim the memory. + drop(tables); + + // Convert the Namespace instances into NamespaceSchema instances. + let iter = namespaces + .await + .expect("namespace list task panicked")? + .into_iter() + // Ignore any namespaces that did not exist when the "columns" snapshot + // was created, or have no tables/columns (and therefore have no entry + // in "joined"). + .filter_map(move |v| { + // The catalog call explicitly asked for no soft deleted records. + assert!(v.deleted_at.is_none()); + + let mut ns = NamespaceSchema::new_empty_from(&v); + + ns.tables = joined.remove(&v.id)?; + Some((v, ns)) + }); + + Ok(iter) +} + +/// In a backoff loop, retry calling the compare-and-swap sort key catalog function if the catalog +/// returns a query error unrelated to the CAS operation. 
+/// +/// Returns with a value of `Ok` containing the new sort key if: +/// +/// - No concurrent updates were detected +/// - A concurrent update was detected, but the other update resulted in the same value this update +/// was attempting to set +/// +/// Returns with a value of `Err(newly_observed_value)` if a concurrent, conflicting update was +/// detected. It is expected that callers of this function will take the returned value into +/// account (in whatever manner is appropriate) before calling this function again. +/// +/// NOTE: it is expected that ONLY processes that ingest data (currently only the ingesters or the +/// bulk ingest API) update sort keys for existing partitions. Consider how calling this function +/// from new processes will interact with the existing calls. +pub async fn retry_cas_sort_key( + old_sort_key_ids: Option<&SortKeyIds>, + new_sort_key_ids: &SortKeyIds, + partition_id: PartitionId, + catalog: Arc, +) -> Result { + use backoff::Backoff; + use observability_deps::tracing::{info, warn}; + use std::ops::ControlFlow; + + Backoff::new(&Default::default()) + .retry_with_backoff("cas_sort_key", || { + let new_sort_key_ids = new_sort_key_ids.clone(); + let catalog = Arc::clone(&catalog); + async move { + let mut repos = catalog.repositories(); + match repos + .partitions() + .cas_sort_key(partition_id, old_sort_key_ids, &new_sort_key_ids) + .await + { + Ok(_) => ControlFlow::Break(Ok(new_sort_key_ids)), + Err(CasFailure::QueryError(e)) => ControlFlow::Continue(e), + Err(CasFailure::ValueMismatch(observed_sort_key_ids)) + if observed_sort_key_ids == new_sort_key_ids => + { + // A CAS failure occurred because of a concurrent + // sort key update, however the new catalog sort key + // exactly matches the sort key this node wants to + // commit. + // + // This is the sad-happy path, and this task can + // continue. + info!( + %partition_id, + ?old_sort_key_ids, + ?observed_sort_key_ids, + update_sort_key_ids=?new_sort_key_ids, + "detected matching concurrent sort key update" + ); + ControlFlow::Break(Ok(new_sort_key_ids)) + } + Err(CasFailure::ValueMismatch(observed_sort_key_ids)) => { + // Another ingester concurrently updated the sort + // key. + // + // This breaks a sort-key update invariant - sort + // key updates MUST be serialised. This operation must + // be retried. + // + // See: + // https://github.com/influxdata/influxdb_iox/issues/6439 + // + warn!( + %partition_id, + ?old_sort_key_ids, + ?observed_sort_key_ids, + update_sort_key_ids=?new_sort_key_ids, + "detected concurrent sort key update" + ); + // Stop the retry loop with an error containing the + // newly observed sort key. + ControlFlow::Break(Err(observed_sort_key_ids)) + } + } + } + }) + .await + .expect("retry forever") +} + +/// An [`crate::interface::Error`] scoped to a single table for schema validation errors. +#[derive(Debug, Error)] +#[error("table {}, {}", .0, .1)] +pub struct TableScopedError(String, Error); + +impl TableScopedError { + /// Return the table name for this error. + pub fn table(&self) -> &str { + &self.0 + } + + /// Return a reference to the error. + pub fn err(&self) -> &Error { + &self.1 + } + + /// Return ownership of the error, discarding the table name. + pub fn into_err(self) -> Error { + self.1 + } +} + +/// Given an iterator of `(table_name, batch)` to validate, this function +/// ensures all the columns within `batch` match the existing schema for +/// `table_name` in `schema`. 
If the column does not already exist in `schema`, +/// it is created and an updated [`NamespaceSchema`] is returned. +/// +/// This function pushes schema additions through to the backend catalog, and +/// relies on the catalog to serialize concurrent additions of a given column, +/// ensuring only one type is ever accepted per column. +pub async fn validate_or_insert_schema<'a, T, U, R>( + tables: T, + schema: &NamespaceSchema, + repos: &mut R, +) -> Result, TableScopedError> +where + T: IntoIterator + Send + Sync, + U: Iterator + Send, + R: RepoCollection + ?Sized, +{ + let tables = tables.into_iter(); + + // The (potentially updated) NamespaceSchema to return to the caller. + let mut schema = Cow::Borrowed(schema); + + for (table_name, batch) in tables { + validate_mutable_batch(batch, table_name, &mut schema, repos).await?; + } + + match schema { + Cow::Owned(v) => Ok(Some(v)), + Cow::Borrowed(_) => Ok(None), + } +} + +// &mut Cow is used to avoid a copy, so allow it +#[allow(clippy::ptr_arg)] +async fn validate_mutable_batch( + mb: &MutableBatch, + table_name: &str, + schema: &mut Cow<'_, NamespaceSchema>, + repos: &mut R, +) -> Result<(), TableScopedError> +where + R: RepoCollection + ?Sized, +{ + // Check if the table exists in the schema. + // + // Because the entry API requires &mut it is not used to avoid a premature + // clone of the Cow. + let mut table = match schema.tables.get(table_name) { + Some(t) => Cow::Borrowed(t), + None => { + // The table does not exist in the cached schema. + // + // Attempt to load an existing table from the catalog or create a new table in the + // catalog to populate the cache. + let table = + table_load_or_create(repos, schema.id, &schema.partition_template, table_name) + .await + .map_err(|e| TableScopedError(table_name.to_string(), e))?; + + assert!(schema + .to_mut() + .tables + .insert(table_name.to_string(), table) + .is_none()); + + Cow::Borrowed(schema.tables.get(table_name).unwrap()) + } + }; + + // The table is now in the schema (either by virtue of it already existing, + // or through adding it above). + // + // If the table itself needs to be updated during column validation it + // becomes a Cow::owned() copy and the modified copy should be inserted into + // the schema before returning. + validate_and_insert_columns( + mb.columns() + .map(|(name, col)| (name, col.influx_type().into())), + table_name, + &mut table, + repos, + ) + .await?; + + if let Cow::Owned(table) = table { + // The table schema was mutated and needs inserting into the namespace + // schema to make the changes visible to the caller. + assert!(schema + .to_mut() + .tables + .insert(table_name.to_string(), table) + .is_some()); + } + + Ok(()) +} + +/// Given an iterator of `(column_name, column_type)` to validate, this function ensures all the +/// columns match the existing `TableSchema` in `table`. If the column does not already exist in +/// `table`, it is created and the `table` is changed to the `Cow::Owned` variant. +/// +/// This function pushes schema additions through to the backend catalog, and relies on the catalog +/// to serialize concurrent additions of a given column, ensuring only one type is ever accepted +/// per column. 
+// &mut Cow is used to avoid a copy, so allow it +#[allow(clippy::ptr_arg)] +pub async fn validate_and_insert_columns( + columns: impl Iterator + Send, + table_name: &str, + table: &mut Cow<'_, TableSchema>, + repos: &mut R, +) -> Result<(), TableScopedError> +where + R: RepoCollection + ?Sized, +{ + let mut column_batch: HashMap<&str, ColumnType> = HashMap::new(); + + for (name, column_type) in columns { + // Check if the column exists in the cached schema. + // + // If it does, validate it. If it does not exist, create it and insert + // it into the cached schema. + + match table.columns.get(name.as_str()) { + Some(existing) if existing.column_type == column_type => { + // No action is needed as the column matches the existing column + // schema. + } + Some(existing) => { + // The column schema and the column in the schema change are of + // different types. + return Err(TableScopedError( + table_name.to_string(), + Error::AlreadyExists { + descr: format!( + "column {} is type {} but schema update has type {}", + name, existing.column_type, column_type + ), + }, + )); + } + None => { + // The column does not exist in the cache, add it to the column + // batch to be bulk inserted later. + let old = column_batch.insert(name.as_str(), column_type); + assert!( + old.is_none(), + "duplicate column name `{name}` in new column schema shouldn't be possible" + ); + } + } + } + + if !column_batch.is_empty() { + repos + .columns() + .create_or_get_many_unchecked(table.id, column_batch) + .await + .map_err(|e| TableScopedError(table_name.to_string(), e))? + .into_iter() + .for_each(|c| table.to_mut().add_column(c)); + } + + Ok(()) +} + +/// Load or create table. +pub async fn table_load_or_create( + repos: &mut R, + namespace_id: NamespaceId, + namespace_partition_template: &NamespacePartitionTemplateOverride, + table_name: &str, +) -> Result +where + R: RepoCollection + ?Sized, +{ + let table = match repos + .tables() + .get_by_namespace_and_name(namespace_id, table_name) + .await? + { + Some(table) => table, + None => { + // There is a possibility of a race condition here, if another request has also + // created this table after the `get_by_namespace_and_name` call but before + // this `create` call. In that (hopefully) rare case, do an additional fetch + // from the catalog for the record that should now exist. + let create_result = repos + .tables() + .create( + table_name, + // This table is being created implicitly by this write, so there's no + // possibility of a user-supplied partition template here, which is why there's + // a hardcoded `None`. If there is a namespace template, it must be valid because + // validity was checked during its creation, so that's why there's an `expect`. + TablePartitionTemplateOverride::try_new(None, namespace_partition_template) + .expect("no table partition template; namespace partition template has been validated"), + namespace_id, + ) + .await; + if let Err(Error::AlreadyExists { .. }) = create_result { + repos + .tables() + .get_by_namespace_and_name(namespace_id, table_name) + // Propagate any `Err` returned by the catalog + .await? + // Getting `Ok(None)` should be impossible if we're in this code path because + // the `create` request just said the table exists + .expect( + "Table creation failed because the table exists, so looking up the table \ + should return `Some(table)`, but it returned `None`", + ) + } else { + create_result? 
+ } + } + }; + + let mut table = TableSchema::new_empty_from(&table); + + // Always add a time column to all new tables. + let time_col = repos + .columns() + .create_or_get(TIME_COLUMN, table.id, ColumnType::Time) + .await?; + + table.add_column(time_col); + + Ok(table) +} + +#[cfg(test)] +mod tests { + use std::{collections::BTreeMap, sync::Arc}; + + use super::*; + use crate::{interface::SoftDeletedRows, mem::MemCatalog, util::get_schema_by_name}; + + // Generate a test that simulates multiple, sequential writes in `lp` and + // asserts the resulting schema. + // + // This test asserts the cached schema and the database entry are always in + // sync. + macro_rules! test_validate_schema { + ( + $name:ident, + lp = [$($lp:literal,)+], // An array of multi-line LP writes + want_observe_conflict = $want_observe_conflict:literal, // true if a schema validation error should be observed at some point + want_schema = {$($want_schema:tt) +} // The expected resulting schema after all writes complete. + ) => { + paste::paste! { + #[allow(clippy::bool_assert_comparison)] + #[tokio::test] + async fn []() { + use crate::{interface::Catalog, test_helpers::arbitrary_namespace}; + use std::ops::DerefMut; + use pretty_assertions::assert_eq; + const NAMESPACE_NAME: &str = "bananas"; + + let metrics = Arc::new(metric::Registry::default()); + let time_provider = Arc::new(iox_time::SystemProvider::new()); + let repo = MemCatalog::new(metrics, time_provider); + let mut txn = repo.repositories(); + + let namespace = arbitrary_namespace(&mut *txn, NAMESPACE_NAME) + .await; + let schema = NamespaceSchema::new_empty_from(&namespace); + + // Apply all the lp literals as individual writes, feeding + // the result of one validation into the next to drive + // incremental construction of the schemas. + let mut observed_conflict = false; + $( + let schema = { + let lp: String = $lp.to_string(); + + let writes = mutable_batch_lp::lines_to_batches(lp.as_str(), 42) + .expect("failed to build test writes from LP"); + + let got = validate_or_insert_schema(writes.iter().map(|(k, v)| (k.as_str(), v)), &schema, txn.deref_mut()) + .await; + + match got { + Err(TableScopedError(_, Error::AlreadyExists{ .. })) => { + observed_conflict = true; + schema + }, + Err(e) => panic!("unexpected error: {}", e), + Ok(Some(new_schema)) => new_schema, + Ok(None) => schema, + } + }; + )+ + + assert_eq!($want_observe_conflict, observed_conflict, "should error mismatch"); + + // Invariant: in absence of concurrency, the schema within + // the database must always match the incrementally built + // cached schema. + let db_schema = get_schema_by_name(NAMESPACE_NAME, txn.deref_mut(), SoftDeletedRows::ExcludeDeleted) + .await + .expect("database failed to query for namespace schema") + .expect("namespace exists"); + assert_eq!(schema, db_schema, "schema in DB and cached schema differ"); + + // Generate the map of tables => desired column types + let want_tables: BTreeMap, ColumnType>> = test_validate_schema!(@table, $($want_schema)+); + + // Generate a similarly structured map from the actual + // schema + let actual_tables: BTreeMap, ColumnType>> = schema + .tables + .iter() + .map(|(table, table_schema)| { + let desired_cols = table_schema + .columns + .iter() + .map(|(column, column_schema)| (Arc::clone(&column), column_schema.column_type)) + .collect::>(); + + (table.clone(), desired_cols) + }) + .collect(); + + // Assert the actual namespace contents matches the desired + // table schemas in the test args. 
+ assert_eq!(want_tables, actual_tables, "cached schema and desired schema differ"); + } + } + }; + // Generate a map of table names => column map (below) + // + // out: BTreeMap> + (@table, $($table_name:literal: [$($columns:tt) +],)*) => {{ + let mut tables = BTreeMap::new(); + $( + let want_cols = test_validate_schema!(@column, $($columns)+); + assert!(tables.insert($table_name.to_string(), want_cols).is_none()); + )* + tables + }}; + // Generate a map of column names => ColumnType + // + // out: BTreeMap + (@column, $($col_name:literal => $col_type:expr,)+) => {{ + let mut cols = BTreeMap::new(); + $( + assert!(cols.insert(Arc::from($col_name), $col_type).is_none()); + )* + cols + }}; + } + + test_validate_schema!( + one_write_multiple_tables, + lp = [ + " + m1,t1=a,t2=b f1=2i,f2=2.0 1\n\ + m1,t1=a f1=3i 2\n\ + m2,t3=b f1=true 1\n\ + ", + ], + want_observe_conflict = false, + want_schema = { + "m1": [ + "t1" => ColumnType::Tag, + "t2" => ColumnType::Tag, + "f1" => ColumnType::I64, + "f2" => ColumnType::F64, + "time" => ColumnType::Time, + ], + "m2": [ + "f1" => ColumnType::Bool, + "t3" => ColumnType::Tag, + "time" => ColumnType::Time, + ], + } + ); + + // test that a new table will be created + test_validate_schema!( + two_writes_incremental_new_table, + lp = [ + " + m1,t1=a,t2=b f1=2i,f2=2.0 1\n\ + m1,t1=a f1=3i 2\n\ + m2,t3=b f1=true 1\n\ + ", + " + m1,t1=c f1=1i 2\n\ + new_measurement,t9=a f10=true 1\n\ + ", + ], + want_observe_conflict = false, + want_schema = { + "m1": [ + "t1" => ColumnType::Tag, + "t2" => ColumnType::Tag, + "f1" => ColumnType::I64, + "f2" => ColumnType::F64, + "time" => ColumnType::Time, + ], + "m2": [ + "f1" => ColumnType::Bool, + "t3" => ColumnType::Tag, + "time" => ColumnType::Time, + ], + "new_measurement": [ + "t9" => ColumnType::Tag, + "f10" => ColumnType::Bool, + "time" => ColumnType::Time, + ], + } + ); + + // test that a new column for an existing table will be created + test_validate_schema!( + two_writes_incremental_new_column, + lp = [ + " + m1,t1=a,t2=b f1=2i,f2=2.0 1\n\ + m1,t1=a f1=3i 2\n\ + m2,t3=b f1=true 1\n\ + ", + "m1,new_tag=c new_field=1i 2", + ], + want_observe_conflict = false, + want_schema = { + "m1": [ + "t1" => ColumnType::Tag, + "t2" => ColumnType::Tag, + "f1" => ColumnType::I64, + "f2" => ColumnType::F64, + "time" => ColumnType::Time, + // These are the incremental additions: + "new_tag" => ColumnType::Tag, + "new_field" => ColumnType::I64, + ], + "m2": [ + "f1" => ColumnType::Bool, + "t3" => ColumnType::Tag, + "time" => ColumnType::Time, + ], + } + ); + + test_validate_schema!( + table_always_has_time_column, + lp = [ + "m1,t1=a f1=2i", + ], + want_observe_conflict = false, + want_schema = { + "m1": [ + "t1" => ColumnType::Tag, + "f1" => ColumnType::I64, + "time" => ColumnType::Time, + ], + } + ); + + test_validate_schema!( + two_writes_conflicting_column_types, + lp = [ + "m1,t1=a f1=2i", + // Second write has conflicting type for f1. 
+ "m1,t1=a f1=2.0", + ], + want_observe_conflict = true, + want_schema = { + "m1": [ + "t1" => ColumnType::Tag, + "f1" => ColumnType::I64, + "time" => ColumnType::Time, + ], + } + ); + + test_validate_schema!( + two_writes_tag_field_transposition, + lp = [ + // x is a tag + "m1,t1=a,x=t f1=2i", + // x is a field + "m1,t1=a x=t,f1=2i", + ], + want_observe_conflict = true, + want_schema = { + "m1": [ + "t1" => ColumnType::Tag, + "x" => ColumnType::Tag, + "f1" => ColumnType::I64, + "time" => ColumnType::Time, + ], + } + ); + + #[tokio::test] + async fn validate_table_create_race_doesnt_get_all_columns() { + use crate::{interface::Catalog, test_helpers::arbitrary_namespace}; + use std::{collections::BTreeSet, ops::DerefMut}; + const NAMESPACE_NAME: &str = "bananas"; + + let repo = MemCatalog::new( + Default::default(), + Arc::new(iox_time::SystemProvider::new()), + ); + let mut txn = repo.repositories(); + let namespace = arbitrary_namespace(&mut *txn, NAMESPACE_NAME).await; + + // One cached schema has no tables. + let empty_schema = NamespaceSchema::new_empty_from(&namespace); + + // Another cached schema gets a write that creates a table with some columns. + let schema_with_table = empty_schema.clone(); + let writes = mutable_batch_lp::lines_to_batches("m1,t1=a f1=2i", 42).unwrap(); + validate_or_insert_schema( + writes.iter().map(|(k, v)| (k.as_str(), v)), + &schema_with_table, + txn.deref_mut(), + ) + .await + .unwrap(); + + // then the empty schema adds the same table with some different columns + let other_writes = mutable_batch_lp::lines_to_batches("m1,t2=a f2=2i", 43).unwrap(); + let formerly_empty_schema = validate_or_insert_schema( + other_writes.iter().map(|(k, v)| (k.as_str(), v)), + &empty_schema, + txn.deref_mut(), + ) + .await + .unwrap() + .unwrap(); + + // the formerly-empty schema should NOT have all the columns; schema convergence is handled + // at a higher level by the namespace cache/gossip system + let table = formerly_empty_schema.tables.get("m1").unwrap(); + assert_eq!(table.columns.names(), BTreeSet::from(["t2", "f2", "time"])); + } +} diff --git a/iox_data_generator/Cargo.toml b/iox_data_generator/Cargo.toml index 4d96757474f..72898969d09 100644 --- a/iox_data_generator/Cargo.toml +++ b/iox_data_generator/Cargo.toml @@ -6,27 +6,30 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] bytes = "1.5" chrono = { version = "0.4", default-features = false } clap = { version = "4", features = ["derive", "env", "cargo"] } datafusion_util = { path = "../datafusion_util" } futures = "0.3" -handlebars = "4.4.0" +handlebars = "5.1.0" humantime = "2.1.0" influxdb2_client = { path = "../influxdb2_client" } -itertools = "0.11.0" +itertools = "0.12.0" mutable_batch_lp = { path = "../mutable_batch_lp" } mutable_batch = { path = "../mutable_batch" } parquet_file = { path = "../parquet_file" } rand = { version = "0.8.3", features = ["small_rng"] } -regex = "1.9" +regex = "1.10" schema = { path = "../schema" } serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.107" -snafu = "0.7" -tokio = { version = "1.32", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } -toml = "0.8.0" +serde_json = "1.0.111" +snafu = "0.8" +tokio = { version = "1.35", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } +toml = "0.8.8" tracing = "0.1" tracing-subscriber = "0.3" uuid = { version = "1", default_features = false } diff --git a/iox_data_generator/src/substitution.rs 
b/iox_data_generator/src/substitution.rs index 22ff6d5f6fa..b5e558a34fa 100644 --- a/iox_data_generator/src/substitution.rs +++ b/iox_data_generator/src/substitution.rs @@ -4,13 +4,13 @@ use crate::specification; use chrono::prelude::*; use handlebars::{ - Context, Handlebars, Helper, HelperDef, HelperResult, Output, RenderContext, RenderError, + Context, Handlebars, Helper, HelperDef, HelperResult, Output, RenderContext, RenderErrorReason, }; use rand::rngs::SmallRng; use rand::{distributions::Alphanumeric, seq::SliceRandom, Rng, RngCore}; use serde_json::Value; use snafu::{ResultExt, Snafu}; -use std::{collections::BTreeMap, convert::TryInto}; +use std::collections::BTreeMap; /// Substitution-specific Results pub type Result = std::result::Result; @@ -76,7 +76,7 @@ pub(crate) struct RandomHelper; impl HelperDef for RandomHelper { fn call<'reg: 'rc, 'rc>( &self, - h: &Helper<'_, '_>, + h: &Helper<'_>, _: &Handlebars<'_>, _: &Context, _: &mut RenderContext<'_, '_>, @@ -84,12 +84,20 @@ impl HelperDef for RandomHelper { ) -> HelperResult { let param = h .param(0) - .ok_or_else(|| RenderError::new("`random` requires a parameter"))? + .ok_or(RenderErrorReason::ParamNotFoundForIndex("random", 0))? .value() .as_u64() - .ok_or_else(|| RenderError::new("`random`'s parameter must be an unsigned integer"))? + .ok_or_else(|| { + RenderErrorReason::ParamTypeMismatchForName( + "random", + "0".to_string(), + "unsigned integer".to_string(), + ) + })? .try_into() - .map_err(|_| RenderError::new("`random`'s parameter must fit in a usize"))?; + .map_err(|_| { + RenderErrorReason::Other("`random`'s parameter must fit in a usize".to_string()) + })?; let mut rng = rand::thread_rng(); @@ -111,7 +119,7 @@ pub(crate) struct FormatNowHelper; impl HelperDef for FormatNowHelper { fn call<'reg: 'rc, 'rc>( &self, - h: &Helper<'_, '_>, + h: &Helper<'_>, _: &Handlebars<'_>, c: &Context, _: &mut RenderContext<'_, '_>, @@ -119,7 +127,7 @@ impl HelperDef for FormatNowHelper { ) -> HelperResult { let format = h .param(0) - .ok_or_else(|| RenderError::new("`format-time` requires a parameter"))? + .ok_or(RenderErrorReason::ParamNotFoundForIndex("format-time", 0))? .render(); let timestamp = c @@ -142,7 +150,7 @@ pub(crate) struct GuidHelper; impl HelperDef for GuidHelper { fn call<'reg: 'rc, 'rc>( &self, - _h: &Helper<'_, '_>, + _h: &Helper<'_>, _: &Handlebars<'_>, _: &Context, _: &mut RenderContext<'_, '_>, diff --git a/iox_data_generator/src/tag_pair.rs b/iox_data_generator/src/tag_pair.rs index 3adbff8d202..302fc4d8739 100644 --- a/iox_data_generator/src/tag_pair.rs +++ b/iox_data_generator/src/tag_pair.rs @@ -9,7 +9,7 @@ use std::fmt::Formatter; use std::sync::{Arc, Mutex}; /// Results specific to the tag_pair module -pub type Result = std::result::Result; +pub(crate) type Result = std::result::Result; /// Errors that may happen while creating or regenerating tag pairs #[derive(Snafu, Debug)] diff --git a/iox_query/Cargo.toml b/iox_query/Cargo.toml index 94b85ecd782..e4535319c03 100644 --- a/iox_query/Cargo.toml +++ b/iox_query/Cargo.toml @@ -6,6 +6,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + # This crate is designed to be independent of the rest of the IOx # server and specific storage systems such as Mutable Buffer and Read Buffer. # @@ -15,7 +18,7 @@ license.workspace = true # 2. Allow for query logic testing without bringing in all the storage systems. 
[dependencies] # In alphabetical order -arrow = { workspace = true, features = ["prettyprint"] } +arrow = { workspace = true } arrow_util = { path = "../arrow_util" } async-trait = "0.1" chrono = { version = "0.4", default-features = false } @@ -25,8 +28,9 @@ datafusion_util = { path = "../datafusion_util" } executor = { path = "../executor"} futures = "0.3" hashbrown = { workspace = true } -indexmap = { version = "2.0", features = ["std"] } -itertools = "0.11.0" +indexmap = { version = "2.1", features = ["std"] } +itertools = "0.12.0" +iox_time = { path = "../iox_time" } metric = { path = "../metric" } object_store = { workspace = true } observability_deps = { path = "../observability_deps" } @@ -35,11 +39,13 @@ parking_lot = "0.12" parquet_file = { path = "../parquet_file" } query_functions = { path = "../query_functions"} schema = { path = "../schema" } -snafu = "0.7" -tokio = { version = "1.32", features = ["macros", "parking_lot"] } +snafu = "0.8" +tokio = { version = "1.35", features = ["macros", "parking_lot"] } tokio-stream = "0.1" trace = { path = "../trace" } +tracker = { path = "../tracker" } predicate = { path = "../predicate" } +uuid = { version = "1", features = ["v4"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] # In alphabetical order diff --git a/iox_query/src/chunk_statistics.rs b/iox_query/src/chunk_statistics.rs index ca2ba742068..043034737bc 100644 --- a/iox_query/src/chunk_statistics.rs +++ b/iox_query/src/chunk_statistics.rs @@ -3,10 +3,12 @@ use std::{collections::HashMap, sync::Arc}; use data_types::TimestampMinMax; +use datafusion::common::stats::Precision; use datafusion::{ physical_plan::{ColumnStatistics, Statistics}, scalar::ScalarValue, }; +use datafusion_util::{option_to_precision, timestamptz_nano}; use schema::{InfluxColumnType, Schema}; /// Represent known min/max values for a specific column. @@ -23,12 +25,25 @@ pub struct ColumnRange { /// These ranges apply to ALL rows (esp. in ALL files and ingester chunks) within in given partition. pub type ColumnRanges = Arc, ColumnRange>>; +/// Returns the min/max values for the range, if present +fn range_to_min_max_stats( + range: Option<&ColumnRange>, +) -> (Precision, Precision) { + let Some(range) = range else { + return (Precision::Absent, Precision::Absent); + }; + ( + Precision::Exact(range.min_value.as_ref().clone()), + Precision::Exact(range.max_value.as_ref().clone()), + ) +} + /// Create chunk [statistics](Statistics). 
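For orientation on the `chunk_statistics.rs` hunks that follow: with this upgrade, DataFusion's `Statistics` no longer uses `Option` values plus a separate `is_exact` flag; every statistic is a `Precision`, and `column_statistics` is a plain `Vec`. A small standalone sketch of the new shape, with arbitrary values that are not taken from IOx:

use datafusion::common::stats::Precision;
use datafusion::physical_plan::{ColumnStatistics, Statistics};
use datafusion::scalar::ScalarValue;

/// Statistics for a single-column chunk with 42 rows whose min/max are known
/// exactly; anything unknown is expressed as `Precision::Absent` rather than `None`.
fn example_stats() -> Statistics {
    Statistics {
        num_rows: Precision::Exact(42),
        total_byte_size: Precision::Absent,
        column_statistics: vec![ColumnStatistics {
            null_count: Precision::Exact(0),
            min_value: Precision::Exact(ScalarValue::from(10i64)),
            max_value: Precision::Exact(ScalarValue::from(20i64)),
            distinct_count: Precision::Absent,
        }],
    }
}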
pub fn create_chunk_statistics( - row_count: u64, + row_count: Option, schema: &Schema, ts_min_max: Option, - ranges: &ColumnRanges, + ranges: Option<&ColumnRanges>, ) -> Statistics { let mut columns = Vec::with_capacity(schema.len()); @@ -38,43 +53,46 @@ pub fn create_chunk_statistics( // prefer explicitely given time range but fall back to column ranges let (min_value, max_value) = match ts_min_max { Some(ts_min_max) => ( - Some(ScalarValue::TimestampNanosecond(Some(ts_min_max.min), None)), - Some(ScalarValue::TimestampNanosecond(Some(ts_min_max.max), None)), + Precision::Exact(timestamptz_nano(ts_min_max.min)), + Precision::Exact(timestamptz_nano(ts_min_max.max)), ), None => { - let range = ranges.get::(field.name().as_ref()); - ( - range.map(|r| r.min_value.as_ref().clone()), - range.map(|r| r.max_value.as_ref().clone()), - ) + let range = + ranges.and_then(|ranges| ranges.get::(field.name().as_ref())); + + range_to_min_max_stats(range) } }; ColumnStatistics { - null_count: Some(0), + null_count: Precision::Exact(0), + min_value, max_value, + distinct_count: Precision::Absent, + } + } + _ => { + let range = ranges.and_then(|ranges| ranges.get::(field.name().as_ref())); + + let (min_value, max_value) = range_to_min_max_stats(range); + + ColumnStatistics { + null_count: Precision::Absent, min_value, - distinct_count: None, + max_value, + distinct_count: Precision::Absent, } } - _ => ranges - .get::(field.name().as_ref()) - .map(|range| ColumnStatistics { - null_count: None, - max_value: Some(range.max_value.as_ref().clone()), - min_value: Some(range.min_value.as_ref().clone()), - distinct_count: None, - }) - .unwrap_or_default(), }; columns.push(stats) } + let num_rows = option_to_precision(row_count); + Statistics { - num_rows: Some(row_count as usize), - total_byte_size: None, - column_statistics: Some(columns), - is_exact: true, + num_rows, + total_byte_size: Precision::Absent, + column_statistics: columns, } } @@ -89,12 +107,24 @@ mod tests { let schema = SchemaBuilder::new().build().unwrap(); let row_count = 0; - let actual = create_chunk_statistics(row_count, &schema, None, &Default::default()); + let actual = create_chunk_statistics(Some(row_count), &schema, None, None); + let expected = Statistics { + num_rows: Precision::Exact(row_count), + total_byte_size: Precision::Absent, + column_statistics: vec![], + }; + assert_eq!(actual, expected); + } + + #[test] + fn test_create_chunk_statistics_no_columns_null_rows() { + let schema = SchemaBuilder::new().build().unwrap(); + + let actual = create_chunk_statistics(None, &schema, None, None); let expected = Statistics { - num_rows: Some(row_count as usize), - total_byte_size: None, - column_statistics: Some(vec![]), - is_exact: true, + num_rows: Precision::Absent, + total_byte_size: Precision::Absent, + column_statistics: vec![], }; assert_eq!(actual, expected); } @@ -127,37 +157,45 @@ mod tests { ), ])); - for row_count in [0u64, 1337u64] { - let actual = create_chunk_statistics(row_count, &schema, Some(ts_min_max), &ranges); + for row_count in [0usize, 1337usize] { + let actual = + create_chunk_statistics(Some(row_count), &schema, Some(ts_min_max), Some(&ranges)); let expected = Statistics { - num_rows: Some(row_count as usize), - total_byte_size: None, - column_statistics: Some(vec![ + num_rows: Precision::Exact(row_count), + total_byte_size: Precision::Absent, + column_statistics: vec![ + // tag1 ColumnStatistics { - null_count: None, - min_value: Some(ScalarValue::from("aaa")), - max_value: Some(ScalarValue::from("bbb")), - 
distinct_count: None, + null_count: Precision::Absent, + min_value: Precision::Exact(ScalarValue::from("aaa")), + max_value: Precision::Exact(ScalarValue::from("bbb")), + distinct_count: Precision::Absent, }, + // tag2 ColumnStatistics::default(), + // field_bool ColumnStatistics::default(), + // field_float ColumnStatistics::default(), + // field_integer ColumnStatistics { - null_count: None, - min_value: Some(ScalarValue::from(10i64)), - max_value: Some(ScalarValue::from(20i64)), - distinct_count: None, + null_count: Precision::Absent, + min_value: Precision::Exact(ScalarValue::from(10i64)), + max_value: Precision::Exact(ScalarValue::from(20i64)), + distinct_count: Precision::Absent, }, + // field_string ColumnStatistics::default(), + // field_uinteger ColumnStatistics::default(), + // time ColumnStatistics { - null_count: Some(0), - min_value: Some(ScalarValue::TimestampNanosecond(Some(10), None)), - max_value: Some(ScalarValue::TimestampNanosecond(Some(20), None)), - distinct_count: None, + null_count: Precision::Exact(0), + min_value: Precision::Exact(timestamptz_nano(10)), + max_value: Precision::Exact(timestamptz_nano(20)), + distinct_count: Precision::Absent, }, - ]), - is_exact: true, + ], }; assert_eq!(actual, expected); } @@ -166,21 +204,22 @@ mod tests { #[test] fn test_create_chunk_statistics_ts_min_max_overrides_column_range() { let schema = full_schema(); - let row_count = 42u64; + let row_count = 42usize; let ts_min_max = TimestampMinMax { min: 10, max: 20 }; let ranges = Arc::new(HashMap::from([( Arc::from(TIME_COLUMN_NAME), ColumnRange { - min_value: Arc::new(ScalarValue::TimestampNanosecond(Some(12), None)), - max_value: Arc::new(ScalarValue::TimestampNanosecond(Some(22), None)), + min_value: Arc::new(timestamptz_nano(12)), + max_value: Arc::new(timestamptz_nano(22)), }, )])); - let actual = create_chunk_statistics(row_count, &schema, Some(ts_min_max), &ranges); + let actual = + create_chunk_statistics(Some(row_count), &schema, Some(ts_min_max), Some(&ranges)); let expected = Statistics { - num_rows: Some(row_count as usize), - total_byte_size: None, - column_statistics: Some(vec![ + num_rows: Precision::Exact(row_count), + total_byte_size: Precision::Absent, + column_statistics: vec![ ColumnStatistics::default(), ColumnStatistics::default(), ColumnStatistics::default(), @@ -189,13 +228,12 @@ mod tests { ColumnStatistics::default(), ColumnStatistics::default(), ColumnStatistics { - null_count: Some(0), - min_value: Some(ScalarValue::TimestampNanosecond(Some(10), None)), - max_value: Some(ScalarValue::TimestampNanosecond(Some(20), None)), - distinct_count: None, + null_count: Precision::Exact(0), + min_value: Precision::Exact(timestamptz_nano(10)), + max_value: Precision::Exact(timestamptz_nano(20)), + distinct_count: Precision::Absent, }, - ]), - is_exact: true, + ], }; assert_eq!(actual, expected); } @@ -203,20 +241,20 @@ mod tests { #[test] fn test_create_chunk_statistics_ts_min_max_none_so_fallback_to_column_range() { let schema = full_schema(); - let row_count = 42u64; + let row_count = 42usize; let ranges = Arc::new(HashMap::from([( Arc::from(TIME_COLUMN_NAME), ColumnRange { - min_value: Arc::new(ScalarValue::TimestampNanosecond(Some(12), None)), - max_value: Arc::new(ScalarValue::TimestampNanosecond(Some(22), None)), + min_value: Arc::new(timestamptz_nano(12)), + max_value: Arc::new(timestamptz_nano(22)), }, )])); - let actual = create_chunk_statistics(row_count, &schema, None, &ranges); + let actual = create_chunk_statistics(Some(row_count), &schema, None, 
Some(&ranges)); let expected = Statistics { - num_rows: Some(row_count as usize), - total_byte_size: None, - column_statistics: Some(vec![ + num_rows: Precision::Exact(row_count), + total_byte_size: Precision::Absent, + column_statistics: vec![ ColumnStatistics::default(), ColumnStatistics::default(), ColumnStatistics::default(), @@ -225,13 +263,12 @@ mod tests { ColumnStatistics::default(), ColumnStatistics::default(), ColumnStatistics { - null_count: Some(0), - min_value: Some(ScalarValue::TimestampNanosecond(Some(12), None)), - max_value: Some(ScalarValue::TimestampNanosecond(Some(22), None)), - distinct_count: None, + null_count: Precision::Exact(0), + min_value: Precision::Exact(timestamptz_nano(12)), + max_value: Precision::Exact(timestamptz_nano(22)), + distinct_count: Precision::Absent, }, - ]), - is_exact: true, + ], }; assert_eq!(actual, expected); } diff --git a/iox_query/src/exec.rs b/iox_query/src/exec.rs index dfaaf360450..abb8ba59d3a 100644 --- a/iox_query/src/exec.rs +++ b/iox_query/src/exec.rs @@ -10,6 +10,7 @@ mod non_null_checker; pub mod query_tracing; mod schema_pivot; pub mod seriesset; +pub mod sleep; pub(crate) mod split; pub mod stringset; use datafusion_util::config::register_iox_object_store; @@ -60,6 +61,18 @@ pub struct ExecutorConfig { pub mem_pool_size: usize, } +impl ExecutorConfig { + pub fn testing() -> Self { + Self { + num_threads: NonZeroUsize::new(1).unwrap(), + target_query_partitions: NonZeroUsize::new(1).unwrap(), + object_stores: HashMap::default(), + metric_registry: Arc::new(Registry::default()), + mem_pool_size: TESTING_MEM_POOL_SIZE, + } + } +} + impl Display for ExecutorConfig { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( @@ -172,13 +185,7 @@ impl Executor { /// Get testing executor that runs a on single thread and a low memory bound /// to preserve resources. pub fn new_testing() -> Self { - let config = ExecutorConfig { - num_threads: NonZeroUsize::new(1).unwrap(), - target_query_partitions: NonZeroUsize::new(1).unwrap(), - object_stores: HashMap::default(), - metric_registry: Arc::new(Registry::default()), - mem_pool_size: TESTING_MEM_POOL_SIZE, - }; + let config = ExecutorConfig::testing(); let executors = Arc::new(DedicatedExecutors::new_testing()); Self::new_with_config_and_executors(config, executors) } @@ -274,6 +281,11 @@ impl Executor { pub fn pool(&self) -> Arc { Arc::clone(&self.runtime.memory_pool) } + + /// Returns underlying config. 
+ pub fn config(&self) -> &ExecutorConfig { + &self.config + } } // No need to implement `Drop` because this is done by DedicatedExecutor already @@ -742,8 +754,10 @@ mod tests { Ok(Box::pin(stream)) } - fn statistics(&self) -> datafusion::physical_plan::Statistics { - Default::default() + fn statistics(&self) -> Result { + Ok(datafusion::physical_plan::Statistics::new_unknown( + &self.schema(), + )) } } diff --git a/iox_query/src/exec/context.rs b/iox_query/src/exec/context.rs index 4060eb48c83..ad60c7a3cc9 100644 --- a/iox_query/src/exec/context.rs +++ b/iox_query/src/exec/context.rs @@ -6,6 +6,7 @@ use super::{ gapfill::{plan_gap_fill, GapFill}, non_null_checker::NonNullCheckerNode, seriesset::{series::Either, SeriesSet}, + sleep::SleepNode, split::StreamSplitNode, }; use crate::{ @@ -34,6 +35,7 @@ use arrow::record_batch::RecordBatch; use async_trait::async_trait; use datafusion::{ catalog::CatalogProvider, + common::ParamValues, execution::{ context::{QueryPlanner, SessionState, TaskContext}, memory_pool::MemoryPool, @@ -55,7 +57,7 @@ use query_functions::{register_scalar_functions, selectors::register_selector_ag use std::{fmt, num::NonZeroUsize, sync::Arc}; use trace::{ ctx::SpanContext, - span::{MetaValue, Span, SpanExt, SpanRecorder}, + span::{MetaValue, Span, SpanEvent, SpanExt, SpanRecorder}, }; // Reuse DataFusion error and Result types for this module @@ -150,6 +152,9 @@ impl ExtensionPlanner for IOxExtensionPlanner { physical_inputs, )?; Some(Arc::new(gap_fill_exec) as Arc) + } else if let Some(sleep) = any.downcast_ref::() { + let sleep = sleep.plan(planner, logical_inputs, physical_inputs, session_state)?; + Some(Arc::new(sleep) as _) } else { None }; @@ -252,12 +257,12 @@ impl IOxSessionConfig { .session_config .with_extension(Arc::new(recorder.span().cloned())); - let state = SessionState::with_config_rt(session_config, self.runtime) + let state = SessionState::new_with_config_rt(session_config, self.runtime) .with_query_planner(Arc::new(IOxQueryPlanner {})); let state = register_iox_physical_optimizers(state); let state = register_iox_logical_optimizers(state); - let inner = SessionContext::with_state(state); + let inner = SessionContext::new_with_state(state); register_selector_aggregates(&inner); register_scalar_functions(&inner); if let Some(default_catalog) = self.default_catalog { @@ -340,9 +345,27 @@ impl IOxSessionContext { /// in the SQL have been registered with this context. Use /// `create_physical_plan` to actually execute the query. pub async fn sql_to_logical_plan(&self, sql: &str) -> Result { + Self::sql_to_logical_plan_with_params(self, sql, ParamValues::List(vec![])).await + } + + /// Plan a SQL statement, providing a list of parameter values + /// to supply to `$placeholder` variables. This assumes that + /// any tables referenced in the SQL have been registered with + /// this context. Use `create_physical_plan` to actually execute + /// the query. + pub async fn sql_to_logical_plan_with_params( + &self, + sql: &str, + params: impl Into + Send, + ) -> Result { let ctx = self.child_ctx("sql_to_logical_plan"); debug!(text=%sql, "planning SQL query"); - let plan = ctx.inner.state().create_logical_plan(sql).await?; + let plan = ctx + .inner + .state() + .create_logical_plan(sql) + .await? + .with_param_values(params.into())?; // ensure the plan does not contain unwanted statements let verifier = SQLOptions::new() .with_allow_ddl(false) // no CREATE ... 
@@ -363,9 +386,20 @@ impl IOxSessionContext { /// Plan a SQL statement and convert it to an execution plan. This assumes that any /// tables referenced in the SQL have been registered with this context pub async fn sql_to_physical_plan(&self, sql: &str) -> Result> { - let logical_plan = self.sql_to_logical_plan(sql).await?; + Self::sql_to_physical_plan_with_params(self, sql, ParamValues::List(vec![])).await + } + /// Plan a SQL statement and convert it to an execution plan, providing a list of + /// parameter values to supply to `$placeholder` variables. This assumes that any + /// tables referenced in the SQL have been registered with this context + pub async fn sql_to_physical_plan_with_params( + &self, + sql: &str, + params: impl Into + Send, + ) -> Result> { let ctx = self.child_ctx("sql_to_physical_plan"); + + let logical_plan = ctx.sql_to_logical_plan_with_params(sql, params).await?; ctx.create_physical_plan(&logical_plan).await } @@ -378,7 +412,7 @@ impl IOxSessionContext { debug!(text=%logical_plan.display_indent_schema(), "create_physical_plan: initial plan"); let physical_plan = ctx.inner.state().create_physical_plan(logical_plan).await?; - ctx.recorder.event("physical plan"); + ctx.recorder.event(SpanEvent::new("physical plan")); debug!(text=%displayable(physical_plan.as_ref()).indent(false), "create_physical_plan: plan to run"); Ok(physical_plan) } @@ -671,7 +705,7 @@ impl IOxSessionContext { /// Record an event on the span recorder pub fn record_event(&mut self, name: &'static str) { - self.recorder.event(name); + self.recorder.event(SpanEvent::new(name)); } /// Record an event on the span recorder diff --git a/iox_query/src/exec/field.rs b/iox_query/src/exec/field.rs index a0614b01691..58388900c86 100644 --- a/iox_query/src/exec/field.rs +++ b/iox_query/src/exec/field.rs @@ -55,7 +55,7 @@ impl From<&[&str]> for FieldColumns { } /// Column indexes for a field: a value and corresponding timestamp -#[derive(Debug, PartialEq, Eq, Clone)] +#[derive(Debug, PartialEq, Eq, Clone, Copy)] pub struct FieldIndex { pub value_index: usize, pub timestamp_index: usize, diff --git a/iox_query/src/exec/fieldlist.rs b/iox_query/src/exec/fieldlist.rs index c2bbf4d2dc7..e74954362a3 100644 --- a/iox_query/src/exec/fieldlist.rs +++ b/iox_query/src/exec/fieldlist.rs @@ -190,7 +190,7 @@ mod tests { array::{Int64Array, StringArray}, datatypes::{DataType as ArrowDataType, Field as ArrowField, Schema}, }; - use schema::TIME_DATA_TYPE; + use schema::{TIME_DATA_TIMEZONE, TIME_DATA_TYPE}; #[test] fn test_convert_single_batch() { @@ -200,9 +200,10 @@ mod tests { ])); let string_array: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar", "baz", "foo"])); - let timestamp_array: ArrayRef = Arc::new(TimestampNanosecondArray::from_iter_values(vec![ - 1000, 2000, 3000, 4000, - ])); + let timestamp_array: ArrayRef = Arc::new( + TimestampNanosecondArray::from_iter_values(vec![1000, 2000, 3000, 4000]) + .with_timezone_opt(TIME_DATA_TIMEZONE()), + ); let actual = do_conversion( Arc::clone(&schema), @@ -226,9 +227,10 @@ mod tests { // expect same even if the timestamp order is different let string_array: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar", "baz", "foo"])); - let timestamp_array: ArrayRef = Arc::new(TimestampNanosecondArray::from_iter_values(vec![ - 1000, 4000, 2000, 3000, - ])); + let timestamp_array: ArrayRef = Arc::new( + TimestampNanosecondArray::from_iter_values(vec![1000, 4000, 2000, 3000]) + .with_timezone_opt(TIME_DATA_TIMEZONE()), + ); let actual = do_conversion(schema, 
vec![vec![string_array, timestamp_array]]) .expect("convert correctly"); @@ -247,12 +249,16 @@ mod tests { ])); let string_array1: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar"])); - let timestamp_array1: ArrayRef = - Arc::new(TimestampNanosecondArray::from_iter_values(vec![1000, 3000])); + let timestamp_array1: ArrayRef = Arc::new( + TimestampNanosecondArray::from_iter_values(vec![1000, 3000]) + .with_timezone_opt(TIME_DATA_TIMEZONE()), + ); let string_array2: ArrayRef = Arc::new(StringArray::from(vec!["foo", "foo"])); - let timestamp_array2: ArrayRef = - Arc::new(TimestampNanosecondArray::from_iter_values(vec![1000, 4000])); + let timestamp_array2: ArrayRef = Arc::new( + TimestampNanosecondArray::from_iter_values(vec![1000, 4000]) + .with_timezone_opt(TIME_DATA_TIMEZONE()), + ); let actual = do_conversion( schema, @@ -287,9 +293,10 @@ mod tests { // string array has no actual values, so should not be returned as a field let string_array: ArrayRef = Arc::new(StringArray::from(vec![None::<&str>, None, None, None])); - let timestamp_array: ArrayRef = Arc::new(TimestampNanosecondArray::from_iter_values(vec![ - 1000, 2000, 3000, 4000, - ])); + let timestamp_array: ArrayRef = Arc::new( + TimestampNanosecondArray::from_iter_values(vec![1000, 2000, 3000, 4000]) + .with_timezone_opt(TIME_DATA_TIMEZONE()), + ); let actual = do_conversion(schema, vec![vec![string_array, timestamp_array]]) .expect("convert correctly"); @@ -314,9 +321,10 @@ mod tests { let string_array: ArrayRef = Arc::new(StringArray::from(vec!["foo", "bar", "baz", "foo"])); let int_array: ArrayRef = Arc::new(Int64Array::from(vec![Some(10), Some(20), Some(30), None])); - let timestamp_array: ArrayRef = Arc::new(TimestampNanosecondArray::from_iter_values(vec![ - 1000, 2000, 3000, 4000, - ])); + let timestamp_array: ArrayRef = Arc::new( + TimestampNanosecondArray::from_iter_values(vec![1000, 2000, 3000, 4000]) + .with_timezone_opt(TIME_DATA_TIMEZONE()), + ); let expected = FieldList { fields: vec![ diff --git a/iox_query/src/exec/gapfill/algo.rs b/iox_query/src/exec/gapfill/algo.rs index 05fcaa8f2c3..0733038f8f6 100644 --- a/iox_query/src/exec/gapfill/algo.rs +++ b/iox_query/src/exec/gapfill/algo.rs @@ -274,7 +274,7 @@ impl GapFiller { output_arrays.sort_by(|(a, _), (b, _)| a.cmp(b)); let output_arrays: Vec<_> = output_arrays.into_iter().map(|(_, arr)| arr).collect(); let batch = RecordBatch::try_new(Arc::clone(&schema), output_arrays) - .map_err(DataFusionError::ArrowError)?; + .map_err(|err| DataFusionError::ArrowError(err, None))?; self.cursor = final_cursor; Ok(batch) @@ -596,7 +596,8 @@ impl Cursor { self.build_vec(params, input_time_array, series_ends, &mut aggr_builder)?; let take_arr = UInt64Array::from(aggr_builder.take_idxs); - take::take(input_aggr_array, &take_arr, None).map_err(DataFusionError::ArrowError) + take::take(input_aggr_array, &take_arr, None) + .map_err(|err| DataFusionError::ArrowError(err, None)) } /// Builds an array using the [`take`](take::take) kernel @@ -668,7 +669,8 @@ impl Cursor { }); let take_arr = UInt64Array::from(take_idxs); - take::take(input_aggr_array, &take_arr, None).map_err(DataFusionError::ArrowError) + take::take(input_aggr_array, &take_arr, None) + .map_err(|err| DataFusionError::ArrowError(err, None)) } /// Builds an array using the [`interleave`](arrow::compute::interleave) kernel @@ -969,15 +971,15 @@ impl StashedAggrBuilder<'_> { /// kernel. 
fn create_stash(input_aggr_array: &ArrayRef, offset: u64) -> Result { let take_arr: UInt64Array = vec![None, Some(offset)].into(); - let stash = - take::take(input_aggr_array, &take_arr, None).map_err(DataFusionError::ArrowError)?; + let stash = take::take(input_aggr_array, &take_arr, None) + .map_err(|err| DataFusionError::ArrowError(err, None))?; Ok(stash) } /// Build the output column. fn build(&self) -> Result { arrow::compute::interleave(&[&self.stash, self.input_aggr_array], &self.interleave_idxs) - .map_err(DataFusionError::ArrowError) + .map_err(|err| DataFusionError::ArrowError(err, None)) } fn buffered_input(offset: usize) -> (usize, usize) { @@ -1043,7 +1045,7 @@ mod tests { use arrow_util::test_util::batches_to_lines; use datafusion::error::Result; use hashbrown::HashMap; - use schema::InfluxColumnType; + use schema::{InfluxColumnType, TIME_DATA_TIMEZONE}; use crate::exec::gapfill::{ algo::{AggrColState, Cursor}, @@ -1188,12 +1190,14 @@ mod tests { let output_batch_size = 10000; let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); - let time_arr: TimestampNanosecondArray = cursor - .clone_for_aggr_col(None) - .unwrap() - .build_time_vec(¶ms, &[series], &input_times) - .unwrap() - .into(); + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &[series], &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let arr = cursor .build_aggr_fill_null(¶ms, &[series], &input_times, &input_aggr_array) .unwrap(); @@ -1234,12 +1238,14 @@ mod tests { let output_batch_size = 10000; let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); - let time_arr: TimestampNanosecondArray = cursor - .clone_for_aggr_col(None) - .unwrap() - .build_time_vec(¶ms, &[series], &input_times) - .unwrap() - .into(); + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &[series], &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let arr = cursor.build_aggr_fill_null(¶ms, &[series], &input_times, &input_aggr_array)?; insta::assert_yaml_snapshot!(array_to_lines(&time_arr, &arr), @r###" @@ -1287,12 +1293,14 @@ mod tests { let output_batch_size = 10000; let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); - let time_arr: TimestampNanosecondArray = cursor - .clone_for_aggr_col(None) - .unwrap() - .build_time_vec(¶ms, &[series], &input_times) - .unwrap() - .into(); + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &[series], &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let arr = cursor .build_aggr_fill_prev(¶ms, &[series], &input_times, &input_aggr_array) .unwrap(); @@ -1343,12 +1351,14 @@ mod tests { let output_batch_size = 10000; let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); - let time_arr: TimestampNanosecondArray = cursor - .clone_for_aggr_col(None) - .unwrap() - .build_time_vec(¶ms, &[series], &input_times) - .unwrap() - .into(); + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &[series], &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let arr = cursor .build_aggr_fill_prev(¶ms, &[series], &input_times, &input_aggr_array) .unwrap(); @@ -1384,7 +1394,8 @@ mod tests { // 1000 Some(1050), // 1100 - ]); + ]) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let input_aggr_array: 
ArrayRef = Arc::new(Float64Array::from(vec![10.0, 11.0])); let series_ends = vec![1, 2]; @@ -1399,12 +1410,14 @@ mod tests { let output_batch_size = 10000; let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); - let time_arr: TimestampNanosecondArray = cursor - .clone_for_aggr_col(None) - .unwrap() - .build_time_vec(¶ms, &series_ends, &input_times) - .unwrap() - .into(); + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &series_ends, &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let arr = cursor .build_aggr_fill_null(¶ms, &series_ends, &input_times, &input_aggr_array) .unwrap(); @@ -1439,7 +1452,8 @@ mod tests { Some(1000), Some(1050), Some(1100), - ]); + ]) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let input_aggr_array: ArrayRef = Arc::new(Float64Array::from(vec![ // 950 // 1000 @@ -1463,12 +1477,14 @@ mod tests { let output_batch_size = 10000; let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); - let time_arr: TimestampNanosecondArray = cursor - .clone_for_aggr_col(None) - .unwrap() - .build_time_vec(¶ms, &series_ends, &input_times) - .unwrap() - .into(); + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &series_ends, &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let arr = cursor .build_aggr_fill_prev(¶ms, &series_ends, &input_times, &input_aggr_array) .unwrap(); @@ -1511,7 +1527,8 @@ mod tests { Some(1050), Some(1100), Some(1100), - ]); + ]) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let input_aggr_array: ArrayRef = Arc::new(Float64Array::from(vec![ // Some(9.0) // 950 // ^^^^^^^^^ this element has been sliced off @@ -1552,12 +1569,14 @@ mod tests { .collect(), }; - let time_arr: TimestampNanosecondArray = cursor - .clone_for_aggr_col(None) - .unwrap() - .build_time_vec(¶ms, &series_ends, &input_times) - .unwrap() - .into(); + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &series_ends, &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let arr = cursor .build_aggr_fill_prev_stashed(¶ms, &series_ends, &input_times, &input_aggr_array) .unwrap(); diff --git a/iox_query/src/exec/gapfill/algo/interpolate.rs b/iox_query/src/exec/gapfill/algo/interpolate.rs index 0e3c68e7949..277e01b6fec 100644 --- a/iox_query/src/exec/gapfill/algo/interpolate.rs +++ b/iox_query/src/exec/gapfill/algo/interpolate.rs @@ -353,6 +353,7 @@ mod test { use arrow::array::{ArrayRef, Float64Array, Int64Array, TimestampNanosecondArray, UInt64Array}; use hashbrown::HashMap; + use schema::TIME_DATA_TIMEZONE; use crate::exec::gapfill::{ algo::tests::{array_to_lines, assert_cursor_end_state, new_cursor_with_batch_size}, @@ -404,12 +405,14 @@ mod test { let output_batch_size = 10000; let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); - let time_arr: TimestampNanosecondArray = cursor - .clone_for_aggr_col(None) - .unwrap() - .build_time_vec(¶ms, &series_ends, &input_times) - .unwrap() - .into(); + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &series_ends, &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let arr = cursor .build_aggr_fill_interpolate(¶ms, &series_ends, &input_times, &input_aggr_array) .unwrap(); @@ -476,12 +479,14 @@ mod test { let output_batch_size = 10000; let mut 
cursor = new_cursor_with_batch_size(¶ms, output_batch_size); - let time_arr: TimestampNanosecondArray = cursor - .clone_for_aggr_col(None) - .unwrap() - .build_time_vec(¶ms, &series_ends, &input_times) - .unwrap() - .into(); + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &series_ends, &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let arr = cursor .build_aggr_fill_interpolate(¶ms, &series_ends, &input_times, &input_aggr_array) .unwrap(); @@ -548,12 +553,14 @@ mod test { let output_batch_size = 10000; let mut cursor = new_cursor_with_batch_size(¶ms, output_batch_size); - let time_arr: TimestampNanosecondArray = cursor - .clone_for_aggr_col(None) - .unwrap() - .build_time_vec(¶ms, &series_ends, &input_times) - .unwrap() - .into(); + let time_arr = TimestampNanosecondArray::from( + cursor + .clone_for_aggr_col(None) + .unwrap() + .build_time_vec(¶ms, &series_ends, &input_times) + .unwrap(), + ) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let arr = cursor .build_aggr_fill_interpolate(¶ms, &series_ends, &input_times, &input_aggr_array) .unwrap(); diff --git a/iox_query/src/exec/gapfill/buffered_input.rs b/iox_query/src/exec/gapfill/buffered_input.rs index a7ea743c00e..59ae3111d40 100644 --- a/iox_query/src/exec/gapfill/buffered_input.rs +++ b/iox_query/src/exec/gapfill/buffered_input.rs @@ -189,8 +189,8 @@ impl BufferedInput { .iter() .map(|c| SortField::new(batch.column(*c).data_type().clone())) .collect(); - let row_converter = - RowConverter::new(sort_fields).map_err(DataFusionError::ArrowError)?; + let row_converter = RowConverter::new(sort_fields) + .map_err(|err| DataFusionError::ArrowError(err, None))?; self.row_converter = Some(row_converter); } Ok(self.row_converter.as_mut().expect("cannot be none")) @@ -206,7 +206,7 @@ impl BufferedInput { .collect(); self.get_row_converter()? 
.convert_columns(&columns) - .map_err(DataFusionError::ArrowError) + .map_err(|err| DataFusionError::ArrowError(err, None)) } /// Returns the row-oriented representation of the last buffered row that may appear in the next diff --git a/iox_query/src/exec/gapfill/exec_tests.rs b/iox_query/src/exec/gapfill/exec_tests.rs index 78eee423644..cc0a19086e2 100644 --- a/iox_query/src/exec/gapfill/exec_tests.rs +++ b/iox_query/src/exec/gapfill/exec_tests.rs @@ -1456,8 +1456,8 @@ impl TryFrom for Vec { ))); } - let one_batch = - RecordBatch::try_new(value.schema(), arrs).map_err(DataFusionError::ArrowError)?; + let one_batch = RecordBatch::try_new(value.schema(), arrs) + .map_err(|err| DataFusionError::ArrowError(err, None))?; let mut batches = vec![]; let mut offset = 0; while offset < one_batch.num_rows() { @@ -1479,7 +1479,7 @@ struct TestCase { impl TestCase { fn run(self) -> Result> { block_on(async { - let session_ctx = SessionContext::with_config( + let session_ctx = SessionContext::new_with_config( SessionConfig::default().with_batch_size(self.output_batch_size), ) .into(); @@ -1489,7 +1489,7 @@ impl TestCase { fn run_with_memory_limit(self, limit: usize) -> Result> { block_on(async { - let session_ctx = SessionContext::with_config_rt( + let session_ctx = SessionContext::new_with_config_rt( SessionConfig::default().with_batch_size(self.output_batch_size), RuntimeEnv::new(RuntimeConfig::default().with_memory_limit(limit, 1.0))?.into(), ) @@ -1560,10 +1560,7 @@ fn phys_fill_strategies( let end = start + records.agg_cols.len() + records.struct_cols.len(); let mut v = Vec::with_capacity(records.agg_cols.len()); for f in &records.schema().fields()[start..end] { - v.push(( - phys_col(f.name(), &records.schema())?, - fill_strategy.clone(), - )); + v.push((phys_col(f.name(), &records.schema())?, fill_strategy)); } Ok(v) } diff --git a/iox_query/src/exec/gapfill/mod.rs b/iox_query/src/exec/gapfill/mod.rs index 90b20254be8..30ef8a52275 100644 --- a/iox_query/src/exec/gapfill/mod.rs +++ b/iox_query/src/exec/gapfill/mod.rs @@ -70,7 +70,7 @@ pub struct GapFillParams { } /// Describes how to fill gaps in an aggregate column. -#[derive(Clone, Debug, Hash, PartialEq, Eq)] +#[derive(Clone, Debug, Hash, PartialEq, Eq, Copy)] pub enum FillStrategy { /// Fill with null values. /// This is the InfluxQL behavior for `FILL(NULL)` or `FILL(NONE)`. 
@@ -318,7 +318,7 @@ pub(crate) fn plan_gap_fill( .map(|(e, fs)| { Ok(( create_physical_expr(e, input_dfschema, input_schema, execution_props)?, - fs.clone(), + *fs, )) }) .collect::, FillStrategy)>>>()?; @@ -534,8 +534,8 @@ impl ExecutionPlan for GapFillExec { )?)) } - fn statistics(&self) -> Statistics { - Statistics::default() + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema())) } } @@ -589,9 +589,10 @@ mod test { datasource::empty::EmptyTable, error::Result, logical_expr::{logical_plan, Extension, UserDefinedLogicalNode}, - prelude::{col, lit, lit_timestamp_nano}, + prelude::{col, lit}, scalar::ScalarValue, }; + use datafusion_util::lit_timestamptz_nano; use test_helpers::assert_error; @@ -628,7 +629,7 @@ mod test { time_column: col("time"), origin: None, time_range: Range { - start: Bound::Included(lit_timestamp_nano(1000)), + start: Bound::Included(lit_timestamptz_nano(1000)), end: Bound::Unbounded, }, fill_strategy: fill_strategy_null(vec![col("temp")]), @@ -669,7 +670,7 @@ mod test { origin: None, time_range: Range { start: Bound::Unbounded, - end: Bound::Excluded(lit_timestamp_nano(2000)), + end: Bound::Excluded(lit_timestamptz_nano(2000)), }, fill_strategy: fill_strategy_null(vec![col("temp")]), }, @@ -679,8 +680,8 @@ mod test { time_column: col("time"), origin: None, time_range: Range { - start: Bound::Included(lit_timestamp_nano(1000)), - end: Bound::Excluded(lit_timestamp_nano(2000)), + start: Bound::Included(lit_timestamptz_nano(1000)), + end: Bound::Excluded(lit_timestamptz_nano(2000)), }, fill_strategy: fill_strategy_null(vec![col("temp")]), }, @@ -688,10 +689,10 @@ mod test { GapFillParams { stride: lit(ScalarValue::IntervalDayTime(Some(60_000))), time_column: col("time"), - origin: Some(lit_timestamp_nano(1_000_000_000)), + origin: Some(lit_timestamptz_nano(1_000_000_000)), time_range: Range { start: Bound::Unbounded, - end: Bound::Excluded(lit_timestamp_nano(2000)), + end: Bound::Excluded(lit_timestamptz_nano(2000)), }, fill_strategy: fill_strategy_null(vec![col("temp")]), }, @@ -699,10 +700,10 @@ mod test { GapFillParams { stride: lit(ScalarValue::IntervalDayTime(Some(60_000))), time_column: col("time"), - origin: Some(lit_timestamp_nano(1_000_000_000)), + origin: Some(lit_timestamptz_nano(1_000_000_000)), time_range: Range { - start: Bound::Included(lit_timestamp_nano(1000)), - end: Bound::Excluded(lit_timestamp_nano(2000)), + start: Bound::Included(lit_timestamptz_nano(1000)), + end: Bound::Excluded(lit_timestamptz_nano(2000)), }, fill_strategy: fill_strategy_null(vec![col("temp")]), }, @@ -734,8 +735,8 @@ mod test { time_column: col("time"), origin: None, time_range: Range { - start: Bound::Included(lit_timestamp_nano(1000)), - end: Bound::Excluded(lit_timestamp_nano(2000)), + start: Bound::Included(lit_timestamptz_nano(1000)), + end: Bound::Excluded(lit_timestamptz_nano(2000)), }, fill_strategy: fill_strategy_null(vec![col("temp")]), }, @@ -784,7 +785,7 @@ mod test { - " SortExec: expr=[date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0 ASC]" - " AggregateExec: mode=Final, gby=[date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@0 as date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))], aggr=[AVG(temps.temp)]" - " AggregateExec: mode=Partial, gby=[date_bin(60000000000, time@0, 0) as date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))], 
aggr=[AVG(temps.temp)]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); Ok(()) @@ -814,7 +815,7 @@ mod test { - " SortExec: expr=[loc@0 ASC,concat(Utf8(\"zz\"),temps.loc)@2 ASC,date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1 ASC]" - " AggregateExec: mode=Final, gby=[loc@0 as loc, date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\"))@1 as date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\")), concat(Utf8(\"zz\"),temps.loc)@2 as concat(Utf8(\"zz\"),temps.loc)], aggr=[AVG(temps.temp)]" - " AggregateExec: mode=Partial, gby=[loc@1 as loc, date_bin(60000000000, time@0, 0) as date_bin_gapfill(IntervalMonthDayNano(\"60000000000\"),temps.time,Utf8(\"1970-01-01T00:00:00Z\")), concat(zz, loc@1) as concat(Utf8(\"zz\"),temps.loc)], aggr=[AVG(temps.temp)]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); Ok(()) diff --git a/iox_query/src/exec/gapfill/params.rs b/iox_query/src/exec/gapfill/params.rs index b3ead749be1..5e9d0c42bc7 100644 --- a/iox_query/src/exec/gapfill/params.rs +++ b/iox_query/src/exec/gapfill/params.rs @@ -98,7 +98,7 @@ impl GapFillParams { "fill strategy aggr expr was not a column: {e:?}", )))? .index(); - Ok((idx, fs.clone())) + Ok((idx, *fs)) }) .collect::>>()?; diff --git a/iox_query/src/exec/gapfill/stream.rs b/iox_query/src/exec/gapfill/stream.rs index 823c3c173a0..499de06b077 100644 --- a/iox_query/src/exec/gapfill/stream.rs +++ b/iox_query/src/exec/gapfill/stream.rs @@ -182,14 +182,15 @@ impl GapFillStream { let old_size = batches.iter().map(|rb| rb.get_array_memory_size()).sum(); let mut batch = arrow::compute::concat_batches(&self.schema, &batches) - .map_err(DataFusionError::ArrowError)?; + .map_err(|err| DataFusionError::ArrowError(err, None))?; self.reservation.try_grow(batch.get_array_memory_size())?; if batches.len() > 1 { // Optimize the dictionaries. The output of this operator uses the take kernel to produce // its output. Since the input batches will usually be smaller than the output, it should // be less work to optimize here vs optimizing the output. - batch = optimize_dictionaries(&batch).map_err(DataFusionError::ArrowError)?; + batch = optimize_dictionaries(&batch) + .map_err(|err| DataFusionError::ArrowError(err, None))?; } self.reservation.shrink(old_size); @@ -205,7 +206,7 @@ impl GapFillStream { let input_time_array = self .time_expr .evaluate(&input_batch)? - .into_array(input_batch.num_rows()); + .into_array(input_batch.num_rows())?; let input_time_array: &TimestampNanosecondArray = input_time_array .as_any() .downcast_ref() @@ -247,7 +248,8 @@ impl GapFillStream { .map(|e| { Ok(( expr_to_index(e), - e.evaluate(input_batch)?.into_array(input_batch.num_rows()), + e.evaluate(input_batch)? + .into_array(input_batch.num_rows())?, )) }) .collect::>>() @@ -261,7 +263,8 @@ impl GapFillStream { .map(|e| { Ok(( expr_to_index(e), - e.evaluate(input_batch)?.into_array(input_batch.num_rows()), + e.evaluate(input_batch)? 
+ .into_array(input_batch.num_rows())?, )) }) .collect::>>() diff --git a/iox_query/src/exec/non_null_checker.rs b/iox_query/src/exec/non_null_checker.rs index 1de84cc656e..8a60bd73f57 100644 --- a/iox_query/src/exec/non_null_checker.rs +++ b/iox_query/src/exec/non_null_checker.rs @@ -46,6 +46,7 @@ use arrow::{ datatypes::{DataType, Field, Schema, SchemaRef}, record_batch::RecordBatch, }; +use datafusion::logical_expr::expr_vec_fmt; use datafusion::{ common::{DFSchemaRef, ToDFSchema}, error::{DataFusionError, Result}, @@ -79,6 +80,10 @@ pub struct NonNullCheckerNode { } impl NonNullCheckerNode { + /// Creates a new NonNullChecker node + /// + /// # Panics + /// If the input schema is empty pub fn new(value: &str, input: LogicalPlan) -> Self { let schema = make_non_null_checker_output_schema(); @@ -91,6 +96,8 @@ impl NonNullCheckerNode { .map(|field| Expr::Column(field.qualified_column())) .collect::>(); + assert!(!exprs.is_empty(), "NonNullChecker: input schema was empty"); + Self { input, schema, @@ -130,17 +137,23 @@ impl UserDefinedLogicalNodeCore for NonNullCheckerNode { self.exprs.clone() } - /// For example: `NonNullChecker('the_value')` + /// For example: `NonNullChecker('the_value'), exprs=[foo]` fn fmt_for_explain(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}('{}')", self.name(), self.value) + write!( + f, + "{}('{}') exprs={}", + self.name(), + self.value, + expr_vec_fmt!(self.exprs) + ) } fn from_template(&self, exprs: &[Expr], inputs: &[LogicalPlan]) -> Self { - assert_eq!(inputs.len(), 1, "NonNullChecker: input sizes inconistent"); + assert_eq!(inputs.len(), 1, "NonNullChecker: input sizes inconsistent"); assert_eq!( exprs.len(), self.exprs.len(), - "NonNullChecker: expression sizes inconistent" + "NonNullChecker: expression sizes inconsistent" ); Self::new(self.value.as_ref(), inputs[0].clone()) } @@ -276,9 +289,8 @@ impl ExecutionPlan for NonNullCheckerExec { Some(self.metrics.clone_inner()) } - fn statistics(&self) -> Statistics { - // don't know anything about the statistics - Statistics::default() + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema())) } } diff --git a/iox_query/src/exec/query_tracing.rs b/iox_query/src/exec/query_tracing.rs index a4b81bd2c39..de639c33b7a 100644 --- a/iox_query/src/exec/query_tracing.rs +++ b/iox_query/src/exec/query_tracing.rs @@ -672,8 +672,10 @@ mod tests { unimplemented!() } - fn statistics(&self) -> datafusion::physical_plan::Statistics { - unimplemented!() + fn statistics(&self) -> Result { + Ok(datafusion::physical_plan::Statistics::new_unknown( + &self.schema(), + )) } fn metrics(&self) -> Option { diff --git a/iox_query/src/exec/schema_pivot.rs b/iox_query/src/exec/schema_pivot.rs index b6192f61f1a..a3e3d3adb2f 100644 --- a/iox_query/src/exec/schema_pivot.rs +++ b/iox_query/src/exec/schema_pivot.rs @@ -251,9 +251,8 @@ impl ExecutionPlan for SchemaPivotExec { Some(self.metrics.clone_inner()) } - fn statistics(&self) -> Statistics { - // don't know anything about the statistics - Statistics::default() + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema())) } } diff --git a/iox_query/src/exec/seriesset/converter.rs b/iox_query/src/exec/seriesset/converter.rs index 2ad6a63fd6c..81e83844927 100644 --- a/iox_query/src/exec/seriesset/converter.rs +++ b/iox_query/src/exec/seriesset/converter.rs @@ -48,7 +48,7 @@ pub enum Error { pub type Result = std::result::Result; // Handles converting record batches into SeriesSets -#[derive(Debug, Default)] +#[derive(Debug, 
Default, Copy, Clone)] pub struct SeriesSetConverter {} impl SeriesSetConverter { @@ -386,7 +386,7 @@ impl Stream for SeriesSetConverterStream { Err(e) => { // internal state is broken, end this stream this.we_finished = true; - return Poll::Ready(Some(Err(DataFusionError::ArrowError(e)))); + return Poll::Ready(Some(Err(DataFusionError::ArrowError(e, None)))); } }; @@ -435,7 +435,7 @@ impl Stream for SeriesSetConverterStream { Err(e) => { // internal state is broken, end this stream this.we_finished = true; - return Poll::Ready(Some(Err(DataFusionError::ArrowError(e)))); + return Poll::Ready(Some(Err(DataFusionError::ArrowError(e, None)))); } }; @@ -625,9 +625,8 @@ impl PartialEq for SortableSeries { impl Eq for SortableSeries {} impl PartialOrd for SortableSeries { - #[allow(clippy::non_canonical_partial_ord_impl)] fn partial_cmp(&self, other: &Self) -> Option { - self.tag_vals.partial_cmp(&other.tag_vals) + Some(self.cmp(other)) } } diff --git a/iox_query/src/exec/sleep.rs b/iox_query/src/exec/sleep.rs new file mode 100644 index 00000000000..b7fa5050fd3 --- /dev/null +++ b/iox_query/src/exec/sleep.rs @@ -0,0 +1,265 @@ +/// Implementation of a "sleep" operation in DataFusion. +/// +/// The sleep operation passes through its input data and sleeps asynchronously for a duration determined by an +/// expression. The async sleep is implemented as a special [execution plan](SleepExpr) so we can perform this as part +/// of the async data stream. In contrast to a UDF, this will NOT block any threads. +use std::{sync::Arc, time::Duration}; + +use arrow::{ + array::{Array, Float32Array, Float64Array, Int64Array}, + datatypes::{DataType, SchemaRef, TimeUnit}, +}; +use datafusion::{ + common::DFSchemaRef, + error::DataFusionError, + execution::{context::SessionState, TaskContext}, + logical_expr::{LogicalPlan, UserDefinedLogicalNodeCore}, + physical_plan::{ + stream::RecordBatchStreamAdapter, DisplayAs, DisplayFormatType, ExecutionPlan, + PhysicalExpr, SendableRecordBatchStream, Statistics, + }, + physical_planner::PhysicalPlanner, + prelude::Expr, +}; +use futures::TryStreamExt; + +/// Logical plan note that represents a "sleep" operation. +/// +/// This will be lowered to [`SleepExpr`]. +/// +/// See [module](super) docs for more details. 
+#[derive(Clone, Debug, Hash, PartialEq, Eq)] +pub struct SleepNode { + input: LogicalPlan, + duration: Vec, +} + +impl SleepNode { + pub fn new(input: LogicalPlan, duration: Vec) -> Self { + Self { input, duration } + } + + pub fn plan( + &self, + planner: &dyn PhysicalPlanner, + logical_inputs: &[&LogicalPlan], + physical_inputs: &[Arc], + session_state: &SessionState, + ) -> Result { + let duration = self + .duration + .iter() + .map(|e| { + planner.create_physical_expr( + e, + logical_inputs[0].schema(), + &physical_inputs[0].schema(), + session_state, + ) + }) + .collect::, _>>()?; + Ok(SleepExpr::new(Arc::clone(&physical_inputs[0]), duration)) + } +} + +impl UserDefinedLogicalNodeCore for SleepNode { + fn name(&self) -> &str { + "Sleep" + } + + fn inputs(&self) -> Vec<&LogicalPlan> { + vec![&self.input] + } + + fn schema(&self) -> &DFSchemaRef { + self.input.schema() + } + + fn expressions(&self) -> Vec { + self.duration.clone() + } + + fn fmt_for_explain(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let duration = self + .duration + .iter() + .map(|e| e.to_string()) + .collect::>() + .join(", "); + + write!(f, "{}: duration=[{}]", self.name(), duration) + } + + fn from_template(&self, exprs: &[Expr], inputs: &[LogicalPlan]) -> Self { + Self::new(inputs[0].clone(), exprs.to_vec()) + } +} + +/// Physical node that implements a "sleep" operation. +/// +/// This was lowered from [`SleepNode`]. +/// +/// See [module](super) docs for more details. +#[derive(Debug)] +pub struct SleepExpr { + /// Input data. + input: Arc, + + /// Expression that determines the sum of the sleep duration. + duration: Vec>, +} + +impl SleepExpr { + pub fn new(input: Arc, duration: Vec>) -> Self { + Self { input, duration } + } +} + +impl DisplayAs for SleepExpr { + fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + let duration = self + .duration + .iter() + .map(|e| e.to_string()) + .collect::>() + .join(", "); + + write!(f, "Sleep: duration=[{}]", duration) + } + } + } +} + +impl ExecutionPlan for SleepExpr { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn schema(&self) -> SchemaRef { + self.input.schema() + } + + fn output_partitioning(&self) -> datafusion::physical_plan::Partitioning { + self.input.output_partitioning() + } + + fn output_ordering(&self) -> Option<&[datafusion::physical_expr::PhysicalSortExpr]> { + self.input.output_ordering() + } + + fn children(&self) -> Vec> { + vec![Arc::clone(&self.input)] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> datafusion::error::Result> { + assert_eq!(children.len(), 1); + + Ok(Arc::new(Self::new( + Arc::clone(&children[0]), + self.duration.clone(), + ))) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> datafusion::error::Result { + let stream = self.input.execute(partition, context)?; + + let duration = self.duration.clone(); + let stream = RecordBatchStreamAdapter::new( + stream.schema(), + stream.and_then(move |batch| { + let duration = duration.clone(); + + async move { + let mut sum = Duration::ZERO; + for expr in duration { + let array = expr.evaluate(&batch)?.into_array(batch.num_rows())?; + let d = array_to_duration(&array)?; + if let Some(d) = d { + sum += d; + } + } + if !sum.is_zero() { + tokio::time::sleep(sum).await; + } + Ok(batch) + } + }), + ); + Ok(Box::pin(stream)) + } + + fn statistics(&self) -> Result { + Ok(Statistics::new_unknown(&self.schema())) 
+ } +} + +fn array_to_duration(array: &dyn Array) -> Result, DataFusionError> { + match array.data_type() { + DataType::Null => Ok(None), + DataType::Duration(tunit) => { + let array = arrow::compute::cast(array, &DataType::Int64)?; + let array = array + .as_any() + .downcast_ref::() + .expect("just casted"); + let Some(sum) = arrow::compute::sum(array) else { + return Ok(None); + }; + if sum < 0 { + return Err(DataFusionError::Execution(format!( + "duration must be non-negative but is {sum}{tunit:?}" + ))); + } + let sum = sum as u64; + let duration = match tunit { + TimeUnit::Second => Duration::from_secs(sum), + TimeUnit::Millisecond => Duration::from_millis(sum), + TimeUnit::Microsecond => Duration::from_micros(sum), + TimeUnit::Nanosecond => Duration::from_nanos(sum), + }; + Ok(Some(duration)) + } + DataType::Float32 => { + let array = array + .as_any() + .downcast_ref::() + .expect("just checked"); + let Some(sum) = arrow::compute::sum(array) else { + return Ok(None); + }; + if sum < 0.0 || !sum.is_finite() { + return Err(DataFusionError::Execution(format!( + "duration must be non-negative but is {sum}s" + ))); + } + Ok(Some(Duration::from_secs_f32(sum))) + } + DataType::Float64 => { + let array = array + .as_any() + .downcast_ref::() + .expect("just checked"); + let Some(sum) = arrow::compute::sum(array) else { + return Ok(None); + }; + if sum < 0.0 || !sum.is_finite() { + return Err(DataFusionError::Execution(format!( + "duration must be non-negative but is {sum}s" + ))); + } + Ok(Some(Duration::from_secs_f64(sum))) + } + other => Err(DataFusionError::Internal(format!( + "Expected duration pattern to sleep(...), got: {other:?}" + ))), + } +} diff --git a/iox_query/src/exec/split.rs b/iox_query/src/exec/split.rs index 736ab02131a..30108844d74 100644 --- a/iox_query/src/exec/split.rs +++ b/iox_query/src/exec/split.rs @@ -271,10 +271,10 @@ impl ExecutionPlan for StreamSplitExec { Some(self.metrics.clone_inner()) } - fn statistics(&self) -> Statistics { + fn statistics(&self) -> Result { // For now, don't return any statistics (in the future we // could potentially estimate the output cardinalities) - Statistics::default() + Ok(Statistics::new_unknown(&self.schema())) } } @@ -567,7 +567,7 @@ mod tests { let input = make_input(vec![vec![batch0, batch1]]); // int_col < 3 - let split_expr = df_physical_expr(input.as_ref(), col("int_col").lt(lit(3))).unwrap(); + let split_expr = df_physical_expr(input.schema(), col("int_col").lt(lit(3))).unwrap(); let split_exec: Arc = Arc::new(StreamSplitExec::new(input, vec![split_expr])); @@ -625,12 +625,12 @@ mod tests { let input = make_input(vec![vec![batch0, batch1]]); // int_col < 2 let split_expr1 = - df_physical_expr(input.as_ref(), col("int_col").lt(lit::(2))).unwrap(); + df_physical_expr(input.schema(), col("int_col").lt(lit::(2))).unwrap(); // 2 <= int_col < 3 let expr = col("int_col") .gt_eq(lit::(2)) .and(col("int_col").lt(lit::(3))); - let split_expr2 = df_physical_expr(input.as_ref(), expr).unwrap(); + let split_expr2 = df_physical_expr(input.schema(), expr).unwrap(); let split_exec: Arc = Arc::new(StreamSplitExec::new(input, vec![split_expr1, split_expr2])); @@ -679,7 +679,7 @@ mod tests { let input = make_input(vec![vec![batch0]]); // use `false` to send all outputs to second stream - let split_expr = df_physical_expr(input.as_ref(), lit(false)).unwrap(); + let split_expr = df_physical_expr(input.schema(), lit(false)).unwrap(); let split_exec: Arc = Arc::new(StreamSplitExec::new(input, vec![split_expr])); @@ -713,8 +713,8 @@ mod tests { 
// Test 1: 3 streams but all data is sent to the second one let input = make_input(vec![vec![batch0.clone()]]); // use `false` & `true` to send all outputs to second stream - let split_expr1 = df_physical_expr(input.as_ref(), lit(false)).unwrap(); - let split_expr2 = df_physical_expr(input.as_ref(), lit(true)).unwrap(); + let split_expr1 = df_physical_expr(input.schema(), lit(false)).unwrap(); + let split_expr2 = df_physical_expr(input.schema(), lit(true)).unwrap(); let split_exec: Arc = Arc::new(StreamSplitExec::new(input, vec![split_expr1, split_expr2])); @@ -743,8 +743,8 @@ mod tests { let input = make_input(vec![vec![batch0.clone()]]); // use `false` & `false` to send all outputs to third stream - let split_expr1 = df_physical_expr(input.as_ref(), lit(false)).unwrap(); - let split_expr2 = df_physical_expr(input.as_ref(), lit(false)).unwrap(); + let split_expr1 = df_physical_expr(input.schema(), lit(false)).unwrap(); + let split_expr2 = df_physical_expr(input.schema(), lit(false)).unwrap(); let split_exec: Arc = Arc::new(StreamSplitExec::new(input, vec![split_expr1, split_expr2])); @@ -773,8 +773,8 @@ mod tests { let input = make_input(vec![vec![batch0]]); // use `true` & `false` to send all outputs to first stream - let split_expr1 = df_physical_expr(input.as_ref(), lit(true)).unwrap(); - let split_expr2 = df_physical_expr(input.as_ref(), lit(false)).unwrap(); + let split_expr1 = df_physical_expr(input.schema(), lit(true)).unwrap(); + let split_expr2 = df_physical_expr(input.schema(), lit(false)).unwrap(); let split_exec: Arc = Arc::new(StreamSplitExec::new(input, vec![split_expr1, split_expr2])); @@ -812,7 +812,7 @@ mod tests { let input = make_input(vec![vec![batch0]]); // int_col < 3 - let split_expr = df_physical_expr(input.as_ref(), col("int_col").lt(lit(3))).unwrap(); + let split_expr = df_physical_expr(input.schema(), col("int_col").lt(lit(3))).unwrap(); let split_exec: Arc = Arc::new(StreamSplitExec::new(input, vec![split_expr])); @@ -853,12 +853,12 @@ mod tests { let input = make_input(vec![vec![batch0]]); // int_col < 2 let split_expr1 = - df_physical_expr(input.as_ref(), col("int_col").lt(lit::(2))).unwrap(); + df_physical_expr(input.schema(), col("int_col").lt(lit::(2))).unwrap(); // 2 <= int_col < 3 let expr = col("int_col") .gt_eq(lit::(2)) .and(col("int_col").lt(lit::(3))); - let split_expr2 = df_physical_expr(input.as_ref(), expr).unwrap(); + let split_expr2 = df_physical_expr(input.schema(), expr).unwrap(); let split_exec: Arc = Arc::new(StreamSplitExec::new(input, vec![split_expr1, split_expr2])); @@ -908,7 +908,7 @@ mod tests { let input = make_input(vec![vec![batch0]]); // int_col (not a boolean) - let split_expr = df_physical_expr(input.as_ref(), col("int_col")).unwrap(); + let split_expr = df_physical_expr(input.schema(), col("int_col")).unwrap(); let split_exec: Arc = Arc::new(StreamSplitExec::new(input, vec![split_expr])); diff --git a/iox_query/src/frontend/reorg.rs b/iox_query/src/frontend/reorg.rs index 1149d6cb443..9bf8259a76a 100644 --- a/iox_query/src/frontend/reorg.rs +++ b/iox_query/src/frontend/reorg.rs @@ -2,10 +2,8 @@ use std::sync::Arc; -use datafusion::{ - logical_expr::LogicalPlan, - prelude::{col, lit_timestamp_nano}, -}; +use datafusion::{logical_expr::LogicalPlan, prelude::col}; +use datafusion_util::lit_timestamptz_nano; use observability_deps::tracing::debug; use schema::{sort::SortKey, Schema, TIME_COLUMN_NAME}; @@ -44,7 +42,7 @@ impl From for Error { /// Planner for physically rearranging chunk data. 
This planner /// creates COMPACT and SPLIT plans for use in the database lifecycle manager -#[derive(Debug, Default)] +#[derive(Debug, Default, Copy, Clone)] pub struct ReorgPlanner {} impl ReorgPlanner { @@ -203,7 +201,7 @@ impl ReorgPlanner { let mut split_exprs = Vec::with_capacity(split_times.len()); // time <= split_times[0] - split_exprs.push(col(TIME_COLUMN_NAME).lt_eq(lit_timestamp_nano(split_times[0]))); + split_exprs.push(col(TIME_COLUMN_NAME).lt_eq(lit_timestamptz_nano(split_times[0]))); // split_times[i-1] , time <= split_time[i] for i in 1..split_times.len() { if split_times[i - 1] >= split_times[i] { @@ -217,8 +215,8 @@ impl ReorgPlanner { } split_exprs.push( col(TIME_COLUMN_NAME) - .gt(lit_timestamp_nano(split_times[i - 1])) - .and(col(TIME_COLUMN_NAME).lt_eq(lit_timestamp_nano(split_times[i]))), + .gt(lit_timestamptz_nano(split_times[i - 1])) + .and(col(TIME_COLUMN_NAME).lt_eq(lit_timestamptz_nano(split_times[i]))), ); } let plan = make_stream_split(plan, split_exprs); @@ -389,12 +387,79 @@ mod test { } #[tokio::test] - async fn test_compact_plan() { + async fn test_compact_plan_default_sort() { + test_helpers::maybe_start_logging(); + + let (schema, chunks) = get_test_chunks().await; + + let sort_key = SortKeyBuilder::with_capacity(2) + .with_col("tag1") + .with_col(TIME_COLUMN_NAME) + .build(); + + let compact_plan = ReorgPlanner::new() + .compact_plan(Arc::from("t"), &schema, chunks, sort_key) + .expect("created compact plan"); + + let executor = Executor::new_testing(); + let physical_plan = executor + .new_context(ExecutorType::Reorg) + .create_physical_plan(&compact_plan) + .await + .unwrap(); + + // It is critical that the plan only sorts the inputs and is not resorted after the UnionExec. + insta::assert_yaml_snapshot!( + format_execution_plan(&physical_plan), + @r###" + --- + - " SortPreservingMergeExec: [tag1@2 ASC,time@3 ASC]" + - " UnionExec" + - " SortExec: expr=[tag1@2 ASC,time@3 ASC]" + - " RecordBatchesExec: chunks=1, projection=[field_int, field_int2, tag1, time]" + - " ProjectionExec: expr=[field_int@1 as field_int, field_int2@2 as field_int2, tag1@3 as tag1, time@4 as time]" + - " DeduplicateExec: [tag1@3 ASC,time@4 ASC]" + - " SortExec: expr=[tag1@3 ASC,time@4 ASC,__chunk_order@0 ASC]" + - " RecordBatchesExec: chunks=1, projection=[__chunk_order, field_int, field_int2, tag1, time]" + "### + ); + + assert_eq!( + physical_plan.output_partitioning().partition_count(), + 1, + "{:?}", + physical_plan.output_partitioning() + ); + + let batches = test_collect(physical_plan).await; + + // sorted on state ASC and time ASC (defaults) + let expected = vec![ + "+-----------+------------+------+--------------------------------+", + "| field_int | field_int2 | tag1 | time |", + "+-----------+------------+------+--------------------------------+", + "| 100 | | AL | 1970-01-01T00:00:00.000000050Z |", + "| 70 | | CT | 1970-01-01T00:00:00.000000100Z |", + "| 1000 | | MT | 1970-01-01T00:00:00.000001Z |", + "| 5 | | MT | 1970-01-01T00:00:00.000005Z |", + "| 10 | | MT | 1970-01-01T00:00:00.000007Z |", + "| 70 | 70 | UT | 1970-01-01T00:00:00.000220Z |", + "| 50 | 50 | VT | 1970-01-01T00:00:00.000210Z |", + "| 1000 | 1000 | WA | 1970-01-01T00:00:00.000028Z |", + "+-----------+------------+------+--------------------------------+", + ]; + + assert_batches_eq!(&expected, &batches); + } + + #[tokio::test] + async fn test_compact_plan_alternate_sort() { test_helpers::maybe_start_logging(); let (schema, chunks) = get_test_chunks().await; let sort_key = 
SortKeyBuilder::with_capacity(2) + // use something other than the default sort .with_col_opts("tag1", true, true) .with_col_opts(TIME_COLUMN_NAME, false, false) .build(); @@ -417,12 +482,12 @@ mod test { - " SortPreservingMergeExec: [tag1@2 DESC,time@3 ASC NULLS LAST]" - " UnionExec" - " SortExec: expr=[tag1@2 DESC,time@3 ASC NULLS LAST]" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field_int, field_int2, tag1, time]" - " SortExec: expr=[tag1@2 DESC,time@3 ASC NULLS LAST]" - " ProjectionExec: expr=[field_int@1 as field_int, field_int2@2 as field_int2, tag1@3 as tag1, time@4 as time]" - " DeduplicateExec: [tag1@3 ASC,time@4 ASC]" - " SortExec: expr=[tag1@3 ASC,time@4 ASC,__chunk_order@0 ASC]" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[__chunk_order, field_int, field_int2, tag1, time]" "### ); @@ -435,7 +500,7 @@ mod test { let batches = test_collect(physical_plan).await; - // sorted on state ASC and time + // sorted on state DESC and time ASC let expected = vec![ "+-----------+------------+------+--------------------------------+", "| field_int | field_int2 | tag1 | time |", @@ -486,12 +551,12 @@ mod test { - " SortPreservingMergeExec: [time@3 ASC NULLS LAST,tag1@2 ASC]" - " UnionExec" - " SortExec: expr=[time@3 ASC NULLS LAST,tag1@2 ASC]" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field_int, field_int2, tag1, time]" - " SortExec: expr=[time@3 ASC NULLS LAST,tag1@2 ASC]" - " ProjectionExec: expr=[field_int@1 as field_int, field_int2@2 as field_int2, tag1@3 as tag1, time@4 as time]" - " DeduplicateExec: [tag1@3 ASC,time@4 ASC]" - " SortExec: expr=[tag1@3 ASC,time@4 ASC,__chunk_order@0 ASC]" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[__chunk_order, field_int, field_int2, tag1, time]" "### ); @@ -567,12 +632,12 @@ mod test { - " SortPreservingMergeExec: [time@3 ASC NULLS LAST,tag1@2 ASC]" - " UnionExec" - " SortExec: expr=[time@3 ASC NULLS LAST,tag1@2 ASC]" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field_int, field_int2, tag1, time]" - " SortExec: expr=[time@3 ASC NULLS LAST,tag1@2 ASC]" - " ProjectionExec: expr=[field_int@1 as field_int, field_int2@2 as field_int2, tag1@3 as tag1, time@4 as time]" - " DeduplicateExec: [tag1@3 ASC,time@4 ASC]" - " SortExec: expr=[tag1@3 ASC,time@4 ASC,__chunk_order@0 ASC]" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[__chunk_order, field_int, field_int2, tag1, time]" "### ); diff --git a/iox_query/src/frontend/sql.rs b/iox_query/src/frontend/sql.rs index 28536e41cf2..4008e3c8f27 100644 --- a/iox_query/src/frontend/sql.rs +++ b/iox_query/src/frontend/sql.rs @@ -1,10 +1,10 @@ use std::sync::Arc; use crate::exec::context::IOxSessionContext; -use datafusion::{error::Result, physical_plan::ExecutionPlan}; +use datafusion::{common::ParamValues, error::Result, physical_plan::ExecutionPlan}; /// This struct can create plans for running SQL queries against databases -#[derive(Debug, Default)] +#[derive(Debug, Default, Copy, Clone)] pub struct SqlQueryPlanner {} impl SqlQueryPlanner { @@ -17,8 +17,10 @@ impl SqlQueryPlanner { pub async fn query( &self, query: &str, + params: impl Into + Send, ctx: &IOxSessionContext, ) -> Result> { - ctx.sql_to_physical_plan(query).await + let ctx = ctx.child_ctx("SqlQueryPlanner::query"); + ctx.sql_to_physical_plan_with_params(query, params).await } } diff --git a/iox_query/src/lib.rs b/iox_query/src/lib.rs 
index a4c7d3f95c7..e5afb924238 100644 --- a/iox_query/src/lib.rs +++ b/iox_query/src/lib.rs @@ -11,11 +11,14 @@ clippy::dbg_macro, unused_crate_dependencies )] +#![allow(unreachable_pub)] use datafusion_util::MemoryStream; use futures::TryStreamExt; -use trace::ctx::SpanContext; +use query_log::{QueryCompletedToken, QueryText, StateReceived}; +use trace::{ctx::SpanContext, span::Span}; +use tracker::InstrumentedAsyncOwnedSemaphorePermit; // Workaround for "unused crate" lint false positives. use workspace_hack as _; @@ -45,6 +48,7 @@ pub mod physical_optimizer; pub mod plan; pub mod provider; pub mod pruning; +pub mod query_log; pub mod statistics; pub mod util; @@ -98,54 +102,6 @@ pub trait QueryChunk: Debug + Send + Sync + 'static { fn as_any(&self) -> &dyn Any; } -/// A `QueryCompletedToken` is returned by `record_query` implementations of -/// a `QueryNamespace`. It is used to trigger side-effects (such as query timing) -/// on query completion. -/// -pub struct QueryCompletedToken { - /// If this query completed successfully - success: bool, - - /// Function invoked when the token is dropped. It is passed the - /// vaue of `self.success` - f: Option>, -} - -impl Debug for QueryCompletedToken { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("QueryCompletedToken") - .field("success", &self.success) - .finish() - } -} - -impl QueryCompletedToken { - pub fn new(f: impl FnOnce(bool) + Send + 'static) -> Self { - Self { - success: false, - f: Some(Box::new(f)), - } - } - - /// Record that this query completed successfully - pub fn set_success(&mut self) { - self.success = true; - } -} - -impl Drop for QueryCompletedToken { - fn drop(&mut self) { - if let Some(f) = self.f.take() { - (f)(self.success) - } - } -} - -/// Boxed description of a query that knows how to render to a string -/// -/// This avoids storing potentially large strings -pub type QueryText = Box; - /// `QueryNamespace` is the main trait implemented by the IOx subsystems that store actual data. /// /// Namespaces store data organized by partitions and each partition stores data in Chunks. @@ -186,12 +142,33 @@ pub trait QueryNamespace: Debug + Send + Sync { span_ctx: Option<&SpanContext>, query_type: &'static str, query_text: QueryText, - ) -> QueryCompletedToken; + ) -> QueryCompletedToken; /// Returns a new execution context suitable for running queries fn new_query_context(&self, span_ctx: Option) -> IOxSessionContext; } +/// Trait that allows the query engine (which includes flight and storage/InfluxRPC) to access a +/// virtual set of namespaces. +/// +/// This is the only entry point for the query engine. This trait and the traits reachable by it (e.g. +/// [`QueryNamespace`]) are the only wait to access the catalog and payload data. +#[async_trait] +pub trait QueryNamespaceProvider: std::fmt::Debug + Send + Sync + 'static { + /// Get namespace if it exists. + /// + /// System tables may contain debug information depending on `include_debug_info_tables`. + async fn db( + &self, + name: &str, + span: Option, + include_debug_info_tables: bool, + ) -> Option>; + + /// Acquire concurrency-limiting sempahore + async fn acquire_semaphore(&self, span: Option) -> InstrumentedAsyncOwnedSemaphorePermit; +} + /// Raw data of a [`QueryChunk`]. pub enum QueryChunkData { /// Record batches. @@ -236,110 +213,6 @@ impl std::fmt::Debug for QueryChunkData { } } -impl

<P> QueryChunk for Arc<P>
-where - P: QueryChunk, -{ - fn stats(&self) -> Arc { - self.as_ref().stats() - } - - fn schema(&self) -> &Schema { - self.as_ref().schema() - } - - fn partition_id(&self) -> &TransitionPartitionId { - self.as_ref().partition_id() - } - - fn sort_key(&self) -> Option<&SortKey> { - self.as_ref().sort_key() - } - - fn id(&self) -> ChunkId { - self.as_ref().id() - } - - fn may_contain_pk_duplicates(&self) -> bool { - self.as_ref().may_contain_pk_duplicates() - } - - fn data(&self) -> QueryChunkData { - self.as_ref().data() - } - - fn chunk_type(&self) -> &str { - self.as_ref().chunk_type() - } - - fn order(&self) -> ChunkOrder { - self.as_ref().order() - } - - fn as_any(&self) -> &dyn Any { - // present the underlying implementation, not the wrapper - self.as_ref().as_any() - } -} - -impl QueryChunk for Arc { - fn stats(&self) -> Arc { - self.as_ref().stats() - } - - fn schema(&self) -> &Schema { - self.as_ref().schema() - } - - fn partition_id(&self) -> &TransitionPartitionId { - self.as_ref().partition_id() - } - - fn sort_key(&self) -> Option<&SortKey> { - self.as_ref().sort_key() - } - - fn id(&self) -> ChunkId { - self.as_ref().id() - } - - fn may_contain_pk_duplicates(&self) -> bool { - self.as_ref().may_contain_pk_duplicates() - } - - fn data(&self) -> QueryChunkData { - self.as_ref().data() - } - - fn chunk_type(&self) -> &str { - self.as_ref().chunk_type() - } - - fn order(&self) -> ChunkOrder { - self.as_ref().order() - } - - fn as_any(&self) -> &dyn Any { - // present the underlying implementation, not the wrapper - self.as_ref().as_any() - } -} - -/// return true if all the chunks include distinct counts for all columns. -pub fn chunks_have_distinct_counts<'a>( - chunks: impl IntoIterator>, -) -> bool { - // If at least one of the provided chunk cannot provide stats, - // do not need to compute potential duplicates. We will treat - // as all of them have duplicates - chunks.into_iter().all(|chunk| { - let Some(col_stats) = &chunk.stats().column_statistics else { - return false; - }; - col_stats.iter().all(|col| col.distinct_count.is_some()) - }) -} - // Note: I would like to compile this module only in the 'test' cfg, // but when I do so then other modules can not find them. For example: // diff --git a/iox_query/src/logical_optimizer/extract_sleep.rs b/iox_query/src/logical_optimizer/extract_sleep.rs new file mode 100644 index 00000000000..2f11446ec29 --- /dev/null +++ b/iox_query/src/logical_optimizer/extract_sleep.rs @@ -0,0 +1,100 @@ +use std::sync::Arc; + +use datafusion::logical_expr::expr::ScalarFunction; +use datafusion::{ + common::{tree_node::TreeNodeRewriter, DFSchema}, + error::DataFusionError, + logical_expr::{expr_rewriter::rewrite_preserving_name, Extension, LogicalPlan}, + optimizer::{OptimizerConfig, OptimizerRule}, + prelude::{lit, Expr}, + scalar::ScalarValue, +}; +use query_functions::SLEEP_UDF_NAME; + +use crate::exec::sleep::SleepNode; + +/// Rewrites the ["sleep" UDF](SLEEP_UDF_NAME) to a NULL expression and a [`SleepNode`]. +/// +/// See [`crate::exec::sleep`] for more details. +#[derive(Debug, Clone)] +pub struct ExtractSleep {} + +impl ExtractSleep { + /// Create new optimizer rule. 
+ pub fn new() -> Self { + Self {} + } +} + +impl OptimizerRule for ExtractSleep { + fn name(&self) -> &str { + "extract_sleep" + } + + fn try_optimize( + &self, + plan: &LogicalPlan, + _config: &dyn OptimizerConfig, + ) -> datafusion::error::Result> { + optimize(plan).map(Some) + } +} + +fn optimize(plan: &LogicalPlan) -> Result { + let new_inputs = plan + .inputs() + .iter() + .map(|input| optimize(input)) + .collect::, DataFusionError>>()?; + + let mut schema = + new_inputs + .iter() + .map(|input| input.schema()) + .fold(DFSchema::empty(), |mut lhs, rhs| { + lhs.merge(rhs); + lhs + }); + + schema.merge(plan.schema()); + + let mut expr_rewriter = Rewriter::default(); + + let new_exprs = plan + .expressions() + .into_iter() + .map(|expr| rewrite_preserving_name(expr, &mut expr_rewriter)) + .collect::, DataFusionError>>()?; + let mut plan = plan.with_new_exprs(new_exprs, &new_inputs)?; + + if !expr_rewriter.found_exprs.is_empty() { + plan = LogicalPlan::Extension(Extension { + node: Arc::new(SleepNode::new(plan, expr_rewriter.found_exprs)), + }); + } + + Ok(plan) +} + +#[derive(Default)] +struct Rewriter { + found_exprs: Vec, +} + +impl TreeNodeRewriter for Rewriter { + type N = Expr; + + fn mutate(&mut self, expr: Expr) -> Result { + match expr { + Expr::ScalarFunction(ScalarFunction { func_def, mut args }) => { + if func_def.name() == SLEEP_UDF_NAME { + self.found_exprs.append(&mut args); + return Ok(lit(ScalarValue::Null)); + } + + Ok(Expr::ScalarFunction(ScalarFunction { func_def, args })) + } + _ => Ok(expr), + } + } +} diff --git a/iox_query/src/logical_optimizer/handle_gapfill.rs b/iox_query/src/logical_optimizer/handle_gapfill.rs index 291b88e986b..bd046b14df1 100644 --- a/iox_query/src/logical_optimizer/handle_gapfill.rs +++ b/iox_query/src/logical_optimizer/handle_gapfill.rs @@ -4,16 +4,17 @@ pub mod range_predicate; use crate::exec::gapfill::{FillStrategy, GapFill, GapFillParams}; +use datafusion::logical_expr::ScalarFunctionDefinition; use datafusion::{ common::tree_node::{RewriteRecursion, TreeNode, TreeNodeRewriter, VisitRecursion}, error::{DataFusionError, Result}, logical_expr::{ - expr::{Alias, ScalarFunction, ScalarUDF}, + expr::{Alias, ScalarFunction}, utils::expr_to_columns, Aggregate, BuiltinScalarFunction, Extension, LogicalPlan, Projection, }, optimizer::{optimizer::ApplyOrder, OptimizerConfig, OptimizerRule}, - prelude::{col, Expr}, + prelude::{col, Column, Expr}, }; use hashbrown::{hash_map, HashMap}; use query_functions::gapfill::{DATE_BIN_GAPFILL_UDF_NAME, INTERPOLATE_UDF_NAME, LOCF_UDF_NAME}; @@ -100,8 +101,12 @@ impl OptimizerRule for HandleGapFill { fn handle_gap_fill(plan: &LogicalPlan) -> Result> { let res = match plan { - LogicalPlan::Aggregate(aggr) => handle_aggregate(aggr)?, - LogicalPlan::Projection(proj) => handle_projection(proj)?, + LogicalPlan::Aggregate(aggr) => { + handle_aggregate(aggr).map_err(|e| e.context("handle_aggregate"))? + } + LogicalPlan::Projection(proj) => { + handle_projection(proj).map_err(|e| e.context("handle_projection"))? + } _ => None, }; @@ -129,7 +134,9 @@ fn handle_aggregate(aggr: &Aggregate) -> Result> { new_group_expr, date_bin_gapfill_index, date_bin_gapfill_args, - } = if let Some(v) = replace_date_bin_gapfill(group_expr)? { + } = if let Some(v) = + replace_date_bin_gapfill(group_expr).map_err(|e| e.context("replace_date_bin_gapfill"))? 
+ { v } else { return Ok(None); @@ -145,14 +152,16 @@ fn handle_aggregate(aggr: &Aggregate) -> Result> { new_group_expr, aggr_expr.clone(), Arc::clone(schema), - )?; + ) + .map_err(|e| e.context("Aggregate::try_new_with_schema"))?; let new_aggr_plan = LogicalPlan::Aggregate(new_aggr_plan); - check_node(&new_aggr_plan)?; + check_node(&new_aggr_plan).map_err(|e| e.context("check_node"))?; new_aggr_plan }; let new_gap_fill_plan = - build_gapfill_node(new_aggr_plan, date_bin_gapfill_index, date_bin_gapfill_args)?; + build_gapfill_node(new_aggr_plan, date_bin_gapfill_index, date_bin_gapfill_args) + .map_err(|e| e.context("build_gapfill_node"))?; Ok(Some(new_gap_fill_plan)) } @@ -174,23 +183,33 @@ fn build_gapfill_node( // Ensure that stride argument is a scalar let stride = args_iter.next().unwrap(); - validate_scalar_expr("stride argument to DATE_BIN_GAPFILL", &stride)?; + validate_scalar_expr("stride argument to DATE_BIN_GAPFILL", &stride) + .map_err(|e| e.context("validate_scalar_expr"))?; + + fn get_column(expr: Expr) -> Result { + match expr { + Expr::Column(c) => Ok(c), + Expr::Cast(c) => get_column(*c.expr), + _ => Err(DataFusionError::Plan( + "DATE_BIN_GAPFILL requires a column as the source argument".to_string(), + )), + } + } // Ensure that the source argument is a column - let time_col = args_iter.next().unwrap().try_into_col().map_err(|_| { - DataFusionError::Plan( - "DATE_BIN_GAPFILL requires a column as the source argument".to_string(), - ) - })?; + let time_col = + get_column(args_iter.next().unwrap()).map_err(|e| e.context("get time column"))?; // Ensure that a time range was specified and is valid for gap filling - let time_range = range_predicate::find_time_range(new_aggr_plan.inputs()[0], &time_col)?; - validate_time_range(&time_range)?; + let time_range = range_predicate::find_time_range(new_aggr_plan.inputs()[0], &time_col) + .map_err(|e| e.context("find time range"))?; + validate_time_range(&time_range).map_err(|e| e.context("validate time range"))?; // Ensure that origin argument is a scalar let origin = args_iter.next(); if let Some(ref origin) = origin { - validate_scalar_expr("origin argument to DATE_BIN_GAPFILL", origin)?; + validate_scalar_expr("origin argument to DATE_BIN_GAPFILL", origin) + .map_err(|e| e.context("validate origin"))?; } // Make sure the time output to the gapfill node matches what the @@ -219,18 +238,21 @@ fn build_gapfill_node( .collect(); Ok(LogicalPlan::Extension(Extension { - node: Arc::new(GapFill::try_new( - Arc::new(new_aggr_plan), - new_group_expr, - aggr_expr, - GapFillParams { - stride, - time_column, - origin, - time_range, - fill_strategy: fill_behavior, - }, - )?), + node: Arc::new( + GapFill::try_new( + Arc::new(new_aggr_plan), + new_group_expr, + aggr_expr, + GapFillParams { + stride, + time_column, + origin, + time_range, + fill_strategy: fill_behavior, + }, + ) + .map_err(|e| e.context("GapFill::try_new"))?, + ), })) } @@ -358,7 +380,7 @@ impl TreeNodeRewriter for DateBinGapfillRewriter { type N = Expr; fn pre_visit(&mut self, expr: &Expr) -> Result { match expr { - Expr::ScalarUDF(ScalarUDF { fun, .. }) if fun.name == DATE_BIN_GAPFILL_UDF_NAME => { + Expr::ScalarFunction(fun) if fun.func_def.name() == DATE_BIN_GAPFILL_UDF_NAME => { Ok(RewriteRecursion::Mutate) } _ => Ok(RewriteRecursion::Continue), @@ -370,10 +392,12 @@ impl TreeNodeRewriter for DateBinGapfillRewriter { // so that everything stays wired up. 
let orig_name = expr.display_name()?; match expr { - Expr::ScalarUDF(ScalarUDF { fun, args }) if fun.name == DATE_BIN_GAPFILL_UDF_NAME => { + Expr::ScalarFunction(ScalarFunction { func_def, args }) + if func_def.name() == DATE_BIN_GAPFILL_UDF_NAME => + { self.args = Some(args.clone()); Ok(Expr::ScalarFunction(ScalarFunction { - fun: BuiltinScalarFunction::DateBin, + func_def: ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::DateBin), args, }) .alias(orig_name)) @@ -422,7 +446,11 @@ fn handle_projection(proj: &Projection) -> Result> { }; let new_proj_exprs = proj_exprs .iter() - .map(|e| e.clone().rewrite(&mut fill_fn_rewriter)) + .map(|expr| { + expr.clone() + .rewrite(&mut fill_fn_rewriter) + .map_err(|e| e.context(format!("rewrite: {expr}"))) + }) .collect::>>()?; let FillFnRewriter { aggr_col_fill_map } = fill_fn_rewriter; @@ -434,7 +462,7 @@ fn handle_projection(proj: &Projection) -> Result> { // to reflect the new fill strategy. let mut new_gapfill = child_gapfill.clone(); for (e, fs) in aggr_col_fill_map { - let udf = fill_strategy_to_udf(&fs)?; + let udf = fill_strategy_to_udf(&fs).map_err(|e| e.context("fill_strategy_to_udf"))?; if new_gapfill.replace_fill_strategy(&e, fs).is_none() { // There was a gap filling function called on a non-aggregate column. return Err(DataFusionError::Plan(format!( @@ -470,7 +498,7 @@ impl TreeNodeRewriter for FillFnRewriter { type N = Expr; fn pre_visit(&mut self, expr: &Expr) -> Result { match expr { - Expr::ScalarUDF(ScalarUDF { fun, .. }) if udf_to_fill_strategy(&fun.name).is_some() => { + Expr::ScalarFunction(fun) if udf_to_fill_strategy(fun.func_def.name()).is_some() => { Ok(RewriteRecursion::Mutate) } _ => Ok(RewriteRecursion::Continue), @@ -480,14 +508,14 @@ impl TreeNodeRewriter for FillFnRewriter { fn mutate(&mut self, expr: Expr) -> Result { let orig_name = expr.display_name()?; match expr { - Expr::ScalarUDF(ScalarUDF { ref fun, .. }) - if udf_to_fill_strategy(&fun.name).is_none() => + Expr::ScalarFunction(ref fun) + if udf_to_fill_strategy(fun.func_def.name()).is_none() => { Ok(expr) } - Expr::ScalarUDF(ScalarUDF { fun, mut args }) => { - let fs = udf_to_fill_strategy(&fun.name).expect("must be a fill fn"); - let arg = args.remove(0); + Expr::ScalarFunction(mut fun) => { + let fs = udf_to_fill_strategy(fun.func_def.name()).expect("must be a fill fn"); + let arg = fun.args.remove(0); self.add_fill_strategy(arg.clone(), fs)?; Ok(arg.alias(orig_name)) } @@ -524,7 +552,7 @@ fn count_udf(e: &Expr, name: &str) -> Result { fn matches_udf(e: &Expr, name: &str) -> bool { matches!( e, - Expr::ScalarUDF(ScalarUDF { fun, .. 
}) if fun.name == name + Expr::ScalarFunction(fun) if fun.func_def.name() == name ) } @@ -556,18 +584,19 @@ mod test { use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; use datafusion::error::Result; - use datafusion::logical_expr::expr::ScalarUDF; + use datafusion::logical_expr::builder::table_scan_with_filters; use datafusion::logical_expr::{logical_plan, LogicalPlan, LogicalPlanBuilder}; use datafusion::optimizer::optimizer::Optimizer; use datafusion::optimizer::OptimizerContext; - use datafusion::prelude::{avg, case, col, lit, lit_timestamp_nano, min, Expr}; + use datafusion::prelude::{avg, case, col, lit, min, Expr}; use datafusion::scalar::ScalarValue; + use datafusion_util::lit_timestamptz_nano; use query_functions::gapfill::{ DATE_BIN_GAPFILL_UDF_NAME, INTERPOLATE_UDF_NAME, LOCF_UDF_NAME, }; - fn table_scan() -> Result { - let schema = Schema::new(vec![ + fn schema() -> Schema { + Schema::new(vec![ Field::new( "time", DataType::Timestamp(TimeUnit::Nanosecond, None), @@ -580,8 +609,11 @@ mod test { ), Field::new("loc", DataType::Utf8, false), Field::new("temp", DataType::Float64, false), - ]); - logical_plan::table_scan(Some("temps"), &schema, None)?.build() + ]) + } + + fn table_scan() -> Result { + logical_plan::table_scan(Some("temps"), &schema(), None)?.build() } fn date_bin_gapfill(interval: Expr, time: Expr) -> Result { @@ -597,33 +629,27 @@ mod test { if let Some(origin) = origin { args.push(origin) } - Ok(Expr::ScalarUDF(ScalarUDF { - fun: query_functions::registry().udf(DATE_BIN_GAPFILL_UDF_NAME)?, - args, - })) + + Ok(query_functions::registry() + .udf(DATE_BIN_GAPFILL_UDF_NAME)? + .call(args)) } fn locf(arg: Expr) -> Result { - Ok(Expr::ScalarUDF(ScalarUDF { - fun: query_functions::registry().udf(LOCF_UDF_NAME)?, - args: vec![arg], - })) + Ok(query_functions::registry() + .udf(LOCF_UDF_NAME)? + .call(vec![arg])) } fn interpolate(arg: Expr) -> Result { - Ok(Expr::ScalarUDF(ScalarUDF { - fun: query_functions::registry().udf(INTERPOLATE_UDF_NAME)?, - args: vec![arg], - })) + Ok(query_functions::registry() + .udf(INTERPOLATE_UDF_NAME)? + .call(vec![arg])) } fn optimize(plan: &LogicalPlan) -> Result> { let optimizer = Optimizer::with_rules(vec![Arc::new(HandleGapFill)]); - optimizer.optimize_recursively( - optimizer.rules.first().unwrap(), - plan, - &OptimizerContext::new(), - ) + optimizer.optimize_recursively(&optimizer.rules[0], plan, &OptimizerContext::new()) } fn assert_optimizer_err(plan: &LogicalPlan, expected: &str) { @@ -713,8 +739,8 @@ mod test { let plan = LogicalPlanBuilder::from(table_scan()?) .filter( col("time") - .gt_eq(lit_timestamp_nano(1000)) - .and(col("time").lt(lit_timestamp_nano(2000))), + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), )? .aggregate( vec![ @@ -742,8 +768,8 @@ mod test { let plan = LogicalPlanBuilder::from(table_scan()?) .filter( col("time") - .gt_eq(lit_timestamp_nano(1000)) - .and(col("time").lt(lit_timestamp_nano(2000))), + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), )? .aggregate( vec![ @@ -771,8 +797,8 @@ mod test { let plan = LogicalPlanBuilder::from(table_scan()?) .filter( col("time") - .gt_eq(lit_timestamp_nano(1000)) - .and(col("time").lt(lit_timestamp_nano(2000))), + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), )? .aggregate( vec![date_bin_gapfill_with_origin( @@ -803,8 +829,8 @@ mod test { let plan = LogicalPlanBuilder::from(table_scan()?) 
.filter( col("time") - .gt_eq(lit_timestamp_nano(1000)) - .and(col("time").lt(lit_timestamp_nano(2000))), + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), )? .aggregate( vec![date_bin_gapfill(stride, col("time"))?], @@ -826,20 +852,20 @@ mod test { "Error during planning: gap-filling query is missing both upper and lower time bounds", ), ( - col("time").gt_eq(lit_timestamp_nano(1000)), + col("time").gt_eq(lit_timestamptz_nano(1000)), "Error during planning: gap-filling query is missing upper time bound", ), ( - col("time").lt(lit_timestamp_nano(2000)), + col("time").lt(lit_timestamptz_nano(2000)), "Error during planning: gap-filling query is missing lower time bound", ), ( col("time").gt_eq(col("time2")).and( - col("time").lt(lit_timestamp_nano(2000))), + col("time").lt(lit_timestamptz_nano(2000))), "Error during planning: lower time bound for gap fill query must evaluate to a scalar", ), ( - col("time").gt_eq(lit_timestamp_nano(2000)).and( + col("time").gt_eq(lit_timestamptz_nano(2000)).and( col("time").lt(col("time2"))), "Error during planning: upper time bound for gap fill query must evaluate to a scalar", ) @@ -874,8 +900,8 @@ mod test { let plan = LogicalPlanBuilder::from(table_scan()?) .filter( col("time") - .gt_eq(lit_timestamp_nano(1000)) - .and(col("time").lt(lit_timestamp_nano(2000))), + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), )? .aggregate( vec![date_bin_gapfill( @@ -903,14 +929,14 @@ mod test { let plan = LogicalPlanBuilder::from(table_scan()?) .filter( col("time") - .gt_eq(lit_timestamp_nano(1000)) - .and(col("time").lt(lit_timestamp_nano(2000))), + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), )? .aggregate( vec![date_bin_gapfill_with_origin( lit(ScalarValue::IntervalDayTime(Some(60_000))), col("time"), - Some(lit_timestamp_nano(7)), + Some(lit_timestamptz_nano(7)), )?], vec![avg(col("temp"))], )? @@ -933,8 +959,8 @@ mod test { let plan = LogicalPlanBuilder::from(table_scan()?) .filter( col("time") - .gt_eq(lit_timestamp_nano(1000)) - .and(col("time").lt(lit_timestamp_nano(2000))), + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), )? .aggregate( vec![ @@ -980,8 +1006,8 @@ mod test { let plan = LogicalPlanBuilder::from(table_scan()?) .filter( col("time") - .gt_eq(lit_timestamp_nano(1000)) - .and(col("time").lt(lit_timestamp_nano(2000))), + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), )? .aggregate( vec![date_bin_gapfill( @@ -1014,8 +1040,8 @@ mod test { let plan = LogicalPlanBuilder::from(table_scan()?) .filter( col("time") - .gt_eq(lit_timestamp_nano(1000)) - .and(col("time").lt(lit_timestamp_nano(2000))), + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), )? .aggregate( vec![date_bin_gapfill( @@ -1049,8 +1075,8 @@ mod test { let plan = LogicalPlanBuilder::from(table_scan()?) .filter( col("time") - .gt_eq(lit_timestamp_nano(1000)) - .and(col("time").lt(lit_timestamp_nano(2000))), + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), )? .aggregate( vec![date_bin_gapfill( @@ -1083,8 +1109,8 @@ mod test { let plan = LogicalPlanBuilder::from(table_scan()?) .filter( col("time") - .gt_eq(lit_timestamp_nano(1000)) - .and(col("time").lt(lit_timestamp_nano(2000))), + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), )? 
.aggregate( vec![date_bin_gapfill( @@ -1112,4 +1138,39 @@ mod test { "###); Ok(()) } + + #[test] + fn scan_filter_not_part_of_projection() { + let schema = schema(); + let plan = table_scan_with_filters( + Some("temps"), + &schema, + Some(vec![schema.index_of("time").unwrap()]), + vec![ + col("temps.time").gt_eq(lit_timestamptz_nano(1000)), + col("temps.time").lt(lit_timestamptz_nano(2000)), + col("temps.loc").eq(lit("foo")), + ], + ) + .unwrap() + .aggregate( + vec![ + date_bin_gapfill(lit(ScalarValue::IntervalDayTime(Some(60_000))), col("time")) + .unwrap(), + ], + std::iter::empty::(), + ) + .unwrap() + .build() + .unwrap(); + + insta::assert_yaml_snapshot!( + format_optimized_plan(&plan).unwrap(), + @r###" + --- + - "GapFill: groupBy=[date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time)], aggr=[[]], time_column=date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time), stride=IntervalDayTime(\"60000\"), range=Included(Literal(TimestampNanosecond(1000, None)))..Excluded(Literal(TimestampNanosecond(2000, None)))" + - " Aggregate: groupBy=[[date_bin(IntervalDayTime(\"60000\"), temps.time) AS date_bin_gapfill(IntervalDayTime(\"60000\"),temps.time)]], aggr=[[]]" + - " TableScan: temps projection=[time], full_filters=[temps.time >= TimestampNanosecond(1000, None), temps.time < TimestampNanosecond(2000, None), temps.loc = Utf8(\"foo\")]" + "###); + } } diff --git a/iox_query/src/logical_optimizer/handle_gapfill/range_predicate.rs b/iox_query/src/logical_optimizer/handle_gapfill/range_predicate.rs index 97a31e232bd..26b9682b454 100644 --- a/iox_query/src/logical_optimizer/handle_gapfill/range_predicate.rs +++ b/iox_query/src/logical_optimizer/handle_gapfill/range_predicate.rs @@ -1,5 +1,8 @@ //! Find the time range from the filters in a logical plan. -use std::ops::{Bound, Range}; +use std::{ + ops::{Bound, Range}, + sync::Arc, +}; use datafusion::{ common::{ @@ -7,8 +10,9 @@ use datafusion::{ DFSchema, }, error::Result, - logical_expr::{Between, BinaryExpr, LogicalPlan, Operator}, - optimizer::utils::split_conjunction, + logical_expr::{ + utils::split_conjunction, Between, BinaryExpr, LogicalPlan, LogicalPlanBuilder, Operator, + }, prelude::{Column, Expr}, }; @@ -57,12 +61,23 @@ impl TreeNodeVisitor for TimeRangeVisitor { } LogicalPlan::TableScan(t) => { let range = self.range.clone(); + + // filters may use columns that are NOT part of a projection, so we need the underlying schema. Because + // that's a bit of a mess in DF, we reconstruct the schema using the plan builder. 
+ let unprojected_scan = LogicalPlanBuilder::scan_with_filters( + t.table_name.to_owned(), + Arc::clone(&t.source), + None, + t.filters.clone(), + ) + .map_err(|e| e.context("reconstruct unprojected scheam"))?; + let unprojected_schema = unprojected_scan.schema(); let range = t .filters .iter() .flat_map(split_conjunction) .try_fold(range, |range, expr| { - range.with_expr(&t.projected_schema, &self.col, expr) + range.with_expr(unprojected_schema, &self.col, expr) })?; self.range = range; Ok(VisitRecursion::Continue) @@ -166,9 +181,10 @@ mod tests { logical_plan::{self, builder::LogicalTableSource}, Between, LogicalPlan, LogicalPlanBuilder, }, - prelude::{col, lit, lit_timestamp_nano, Column, Expr, Partitioning}, + prelude::{col, lit, Column, Expr, Partitioning}, sql::TableReference, }; + use datafusion_util::lit_timestamptz_nano; use super::find_time_range; @@ -225,88 +241,88 @@ mod tests { ), ( "time_gt_val", - col("time").gt(lit_timestamp_nano(1000)), + col("time").gt(lit_timestamptz_nano(1000)), Range { - start: Bound::Excluded(lit_timestamp_nano(1000)), + start: Bound::Excluded(lit_timestamptz_nano(1000)), end: Bound::Unbounded, }, ), ( "time_gt_eq_val", - col("time").gt_eq(lit_timestamp_nano(1000)), + col("time").gt_eq(lit_timestamptz_nano(1000)), Range { - start: Bound::Included(lit_timestamp_nano(1000)), + start: Bound::Included(lit_timestamptz_nano(1000)), end: Bound::Unbounded, }, ), ( "time_lt_val", - col("time").lt(lit_timestamp_nano(1000)), + col("time").lt(lit_timestamptz_nano(1000)), Range { start: Bound::Unbounded, - end: Bound::Excluded(lit_timestamp_nano(1000)), + end: Bound::Excluded(lit_timestamptz_nano(1000)), }, ), ( "time_lt_eq_val", - col("time").lt_eq(lit_timestamp_nano(1000)), + col("time").lt_eq(lit_timestamptz_nano(1000)), Range { start: Bound::Unbounded, - end: Bound::Included(lit_timestamp_nano(1000)), + end: Bound::Included(lit_timestamptz_nano(1000)), }, ), ( "val_gt_time", - lit_timestamp_nano(1000).gt(col("time")), + lit_timestamptz_nano(1000).gt(col("time")), Range { start: Bound::Unbounded, - end: Bound::Excluded(lit_timestamp_nano(1000)), + end: Bound::Excluded(lit_timestamptz_nano(1000)), }, ), ( "val_gt_eq_time", - lit_timestamp_nano(1000).gt_eq(col("time")), + lit_timestamptz_nano(1000).gt_eq(col("time")), Range { start: Bound::Unbounded, - end: Bound::Included(lit_timestamp_nano(1000)), + end: Bound::Included(lit_timestamptz_nano(1000)), }, ), ( "val_lt_time", - lit_timestamp_nano(1000).lt(col("time")), + lit_timestamptz_nano(1000).lt(col("time")), Range { - start: Bound::Excluded(lit_timestamp_nano(1000)), + start: Bound::Excluded(lit_timestamptz_nano(1000)), end: Bound::Unbounded, }, ), ( "val_lt_eq_time", - lit_timestamp_nano(1000).lt_eq(col("time")), + lit_timestamptz_nano(1000).lt_eq(col("time")), Range { - start: Bound::Included(lit_timestamp_nano(1000)), + start: Bound::Included(lit_timestamptz_nano(1000)), end: Bound::Unbounded, }, ), ( "and", col("time") - .gt_eq(lit_timestamp_nano(1000)) - .and(col("time").lt(lit_timestamp_nano(2000))), + .gt_eq(lit_timestamptz_nano(1000)) + .and(col("time").lt(lit_timestamptz_nano(2000))), Range { - start: Bound::Included(lit_timestamp_nano(1000)), - end: Bound::Excluded(lit_timestamp_nano(2000)), + start: Bound::Included(lit_timestamptz_nano(1000)), + end: Bound::Excluded(lit_timestamptz_nano(2000)), }, ), ( "between", between( col("time"), - lit_timestamp_nano(1000), - lit_timestamp_nano(2000), + lit_timestamptz_nano(1000), + lit_timestamptz_nano(2000), ), Range { - start: 
Bound::Included(lit_timestamp_nano(1000)), - end: Bound::Included(lit_timestamp_nano(2000)), + start: Bound::Included(lit_timestamptz_nano(1000)), + end: Bound::Included(lit_timestamptz_nano(2000)), }, ), ]; @@ -330,11 +346,11 @@ mod tests { // - even when predicates are in different filter nodes // - through projections that alias columns let plan = LogicalPlanBuilder::from(table_scan()?) - .filter(col("time").gt_eq(lit_timestamp_nano(1000)))? + .filter(col("time").gt_eq(lit_timestamptz_nano(1000)))? .sort(vec![col("time")])? .limit(0, Some(10))? .project(vec![col("time").alias("other_time")])? - .filter(col("other_time").lt(lit_timestamp_nano(2000)))? + .filter(col("other_time").lt(lit_timestamptz_nano(2000)))? .distinct()? .repartition(Partitioning::RoundRobinBatch(1))? .project(vec![col("other_time").alias("my_time")])? @@ -342,8 +358,8 @@ mod tests { let time_col = Column::from_name("my_time"); let actual = find_time_range(&plan, &time_col)?; let expected = Range { - start: Bound::Included(lit_timestamp_nano(1000)), - end: Bound::Excluded(lit_timestamp_nano(2000)), + start: Bound::Included(lit_timestamptz_nano(1000)), + end: Bound::Excluded(lit_timestamptz_nano(2000)), }; assert_eq!(expected, actual); Ok(()) diff --git a/iox_query/src/logical_optimizer/influx_regex_to_datafusion_regex.rs b/iox_query/src/logical_optimizer/influx_regex_to_datafusion_regex.rs index 216b663011e..3660cdbbd2a 100644 --- a/iox_query/src/logical_optimizer/influx_regex_to_datafusion_regex.rs +++ b/iox_query/src/logical_optimizer/influx_regex_to_datafusion_regex.rs @@ -1,9 +1,8 @@ +use datafusion::logical_expr::expr::ScalarFunction; use datafusion::{ common::{tree_node::TreeNodeRewriter, DFSchema}, error::DataFusionError, - logical_expr::{ - expr::ScalarUDF, expr_rewriter::rewrite_preserving_name, LogicalPlan, Operator, - }, + logical_expr::{expr_rewriter::rewrite_preserving_name, LogicalPlan, Operator}, optimizer::{OptimizerConfig, OptimizerRule}, prelude::{binary_expr, lit, Expr}, scalar::ScalarValue, @@ -73,14 +72,14 @@ impl TreeNodeRewriter for InfluxRegexToDataFusionRegex { fn mutate(&mut self, expr: Expr) -> Result { match expr { - Expr::ScalarUDF(ScalarUDF { fun, mut args }) => { + Expr::ScalarFunction(ScalarFunction { func_def, mut args }) => { + let name = func_def.name(); if (args.len() == 2) - && ((fun.name == REGEX_MATCH_UDF_NAME) - || (fun.name == REGEX_NOT_MATCH_UDF_NAME)) + && ((name == REGEX_MATCH_UDF_NAME) || (name == REGEX_NOT_MATCH_UDF_NAME)) { if let Expr::Literal(ScalarValue::Utf8(Some(s))) = &args[1] { let s = clean_non_meta_escapes(s); - let op = match fun.name.as_str() { + let op = match name { REGEX_MATCH_UDF_NAME => Operator::RegexMatch, REGEX_NOT_MATCH_UDF_NAME => Operator::RegexNotMatch, _ => unreachable!(), @@ -89,7 +88,7 @@ impl TreeNodeRewriter for InfluxRegexToDataFusionRegex { } } - Ok(Expr::ScalarUDF(ScalarUDF { fun, args })) + Ok(Expr::ScalarFunction(ScalarFunction { func_def, args })) } _ => Ok(expr), } diff --git a/iox_query/src/logical_optimizer/mod.rs b/iox_query/src/logical_optimizer/mod.rs index 6e88a65bd23..42b72e18be3 100644 --- a/iox_query/src/logical_optimizer/mod.rs +++ b/iox_query/src/logical_optimizer/mod.rs @@ -3,9 +3,11 @@ use std::sync::Arc; use datafusion::execution::context::SessionState; use self::{ - handle_gapfill::HandleGapFill, influx_regex_to_datafusion_regex::InfluxRegexToDataFusionRegex, + extract_sleep::ExtractSleep, handle_gapfill::HandleGapFill, + influx_regex_to_datafusion_regex::InfluxRegexToDataFusionRegex, }; +mod extract_sleep; mod 
handle_gapfill; mod influx_regex_to_datafusion_regex; pub use handle_gapfill::range_predicate; @@ -16,5 +18,6 @@ pub use handle_gapfill::range_predicate; pub fn register_iox_logical_optimizers(state: SessionState) -> SessionState { state .add_optimizer_rule(Arc::new(InfluxRegexToDataFusionRegex::new())) + .add_optimizer_rule(Arc::new(ExtractSleep::new())) .add_optimizer_rule(Arc::new(HandleGapFill::new())) } diff --git a/iox_query/src/physical_optimizer/chunk_extraction.rs b/iox_query/src/physical_optimizer/chunk_extraction.rs index a462b2973ea..488b5df7854 100644 --- a/iox_query/src/physical_optimizer/chunk_extraction.rs +++ b/iox_query/src/physical_optimizer/chunk_extraction.rs @@ -5,8 +5,8 @@ use datafusion::{ datasource::physical_plan::ParquetExec, error::DataFusionError, physical_plan::{ - empty::EmptyExec, union::UnionExec, visit_execution_plan, ExecutionPlan, - ExecutionPlanVisitor, + empty::EmptyExec, placeholder_row::PlaceholderRowExec, union::UnionExec, + visit_execution_plan, ExecutionPlan, ExecutionPlanVisitor, }, }; use observability_deps::tracing::debug; @@ -141,14 +141,12 @@ impl ExecutionPlanVisitor for ExtractChunksVisitor { self.add_chunk(Arc::clone(&ext.chunk)); } } - } else if let Some(empty_exec) = plan_any.downcast_ref::() { + } else if plan_any.downcast_ref::().is_some() { // should not produce dummy data - if empty_exec.produce_one_row() { - return Err(DataFusionError::External( - String::from("EmptyExec produces row").into(), - )); - } - + return Err(DataFusionError::External( + String::from("EmptyExec produces row").into(), + )); + } else if let Some(empty_exec) = plan_any.downcast_ref::() { self.add_schema_from_exec(empty_exec).map_err(|e| { DataFusionError::Context("add schema from EmptyExec".to_owned(), Box::new(e)) })?; @@ -228,8 +226,8 @@ mod tests { let schema1 = iox_schema.as_arrow(); let schema2 = iox_schema.select_by_indices(&[]).as_arrow(); let plan = UnionExec::new(vec![ - Arc::new(EmptyExec::new(false, schema1)), - Arc::new(EmptyExec::new(false, schema2)), + Arc::new(EmptyExec::new(schema1)), + Arc::new(EmptyExec::new(schema2)), ]); assert!(extract_chunks(&plan).is_none()); } @@ -237,7 +235,7 @@ mod tests { #[test] fn test_empty_exec_with_rows() { let schema = chunk(1).schema().as_arrow(); - let plan = EmptyExec::new(true, schema); + let plan = PlaceholderRowExec::new(schema); assert!(extract_chunks(&plan).is_none()); } @@ -248,7 +246,7 @@ mod tests { DataType::Float64, true, )])); - let plan = EmptyExec::new(false, Arc::clone(&schema)); + let plan = EmptyExec::new(Arc::clone(&schema)); let (schema2, chunks, sort_key) = extract_chunks(&plan).unwrap(); assert_eq!(schema, schema2); assert!(chunks.is_empty()); @@ -274,7 +272,7 @@ mod tests { let schema = chunk1.schema().as_arrow(); let plan = chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk1)], 2); let plan = FilterExec::try_new( - df_physical_expr(plan.as_ref(), col("tag1").eq(lit("foo"))).unwrap(), + df_physical_expr(plan.schema(), col("tag1").eq(lit("foo"))).unwrap(), plan, ) .unwrap(); diff --git a/iox_query/src/physical_optimizer/combine_chunks.rs b/iox_query/src/physical_optimizer/combine_chunks.rs index a0138cd4761..d09681ea47d 100644 --- a/iox_query/src/physical_optimizer/combine_chunks.rs +++ b/iox_query/src/physical_optimizer/combine_chunks.rs @@ -1,20 +1,50 @@ use std::sync::Arc; +use arrow::compute::SortOptions; use datafusion::{ - common::tree_node::{Transformed, TreeNode}, + common::{ + plan_err, + tree_node::{Transformed, TreeNode}, + }, config::ConfigOptions, 
    error::{DataFusionError, Result},
    physical_optimizer::PhysicalOptimizerRule,
    physical_plan::{union::UnionExec, ExecutionPlan},
};
+use observability_deps::tracing::trace;
+use schema::TIME_COLUMN_NAME;

 use crate::{
-    physical_optimizer::chunk_extraction::extract_chunks, provider::chunks_to_physical_nodes,
+    physical_optimizer::{
+        chunk_extraction::extract_chunks,
+        sort::util::{collect_statistics_min_max, sort_by_value_ranges},
+    },
+    provider::chunks_to_physical_nodes,
 };

 /// Collects [`QueryChunk`]s and re-creates a appropriate physical nodes.
 ///
-/// This only works if there no filters, projections, sorts, or de-duplicate operations in the affected subtree.
+/// Invariants of inputs of the union:
+/// 1. They do not overlap on time ranges (done in previous step: TimeSplit)
+/// 2. Each input of the union is either with_chunks or other_plans.
+///    - An input with_chunks is a plan that contains only (union of) ParquetExecs or RecordBatchesExec
+///    - An input of other_plans is a plan that contains at least one node that is not a ParquetExec or
+///      RecordBatchesExec or Union of them. Examples of those other nodes are FilterExec, DeduplicateExec,
+///      ProjectionExec, etc.
+//
+/// Goals of this optimization step:
+/// i.  Combine **possible** plans with_chunks into a single union
+/// ii. - Keep the combined plan non-overlapped on time ranges. This will likely help later optimization steps.
+///     - If time ranges cannot be computed, combine all plans with_chunks into a single union.
+///
+/// Example: w = with_chunks, o = other_plans
+///   Input: |--P1 w --| |--P2 w --| |-- P3 o --| |-- P4 w --| |-- P5 w --| |-- P6 o --| |--P7 w --|
+///   Output when time ranges can be computed: Only two sets of plans that are combined: [P1, P2], [P4, P5]
+///     |------ P1 & P2 w ----| |-- P3 o --| |------ P4 & P5 w ------| |-- P6 o --| |--P7 w --|
+///   Output when time ranges cannot be computed: all plans with_chunks are combined into a single union
+///     |-------------------------- P1, P2, P4, P5, P7 w -------------------------------------|
+///     |-- P3 o --| |-- P6 o --|
+///
 ///
 /// This is mostly useful after multiple re-arrangements (e.g. [`PartitionSplit`]-[`TimeSplit`]-[`RemoveDedup`]) created
 /// a bunch of freestanding chunks that can be re-arranged into more packed, more efficient physical nodes.
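To make the grouping described above concrete, here is a minimal standalone sketch of the idea; it is not code from this patch. Contiguous with_chunks inputs are collected into one group, while any other plan closes the current run and stays in its own group. The names `group_contiguous` and `is_with_chunks` are illustrative stand-ins for the `sort_and_group_plans`/`extract_chunks` machinery in the hunks that follow.

// Hypothetical illustration only; `P` stands in for `Arc<dyn ExecutionPlan>` and
// `is_with_chunks` for `extract_chunks(plan.as_ref()).is_some()`.
fn group_contiguous<P>(plans: Vec<P>, is_with_chunks: impl Fn(&P) -> bool) -> Vec<Vec<P>> {
    let mut groups: Vec<Vec<P>> = Vec::new();
    let mut run: Vec<P> = Vec::new();
    for plan in plans {
        if is_with_chunks(&plan) {
            // extend the current run of combinable plans
            run.push(plan);
        } else {
            // a non-combinable plan closes the run and stays on its own
            if !run.is_empty() {
                groups.push(std::mem::take(&mut run));
            }
            groups.push(vec![plan]);
        }
    }
    if !run.is_empty() {
        groups.push(run);
    }
    groups
}

fn main() {
    // flags follow the w w o w w o w pattern from the example above
    let flags = vec![true, true, false, true, true, false, true];
    let groups = group_contiguous(flags, |is_w| *is_w);
    // five groups, matching [P1, P2], [P3], [P4, P5], [P6], [P7]
    assert_eq!(groups.len(), 5);
}

Grouping only contiguous runs (rather than all with_chunks inputs) is what keeps the combined plan non-overlapping on time ranges when the inputs have already been ordered by their time statistics.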
@@ -35,32 +65,31 @@ impl PhysicalOptimizerRule for CombineChunks { ) -> Result> { plan.transform_up(&|plan| { if let Some(union_exec) = plan.as_any().downcast_ref::() { - let (inputs_with_chunks, inputs_other): (Vec<_>, Vec<_>) = union_exec - .inputs() - .iter() - .cloned() - .partition(|plan| { - extract_chunks(plan.as_ref()).is_some() - }); - - if inputs_with_chunks.is_empty() { - return Ok(Transformed::No(plan)); - } - let union_of_chunks = UnionExec::new(inputs_with_chunks); - - if let Some((schema, chunks, output_sort_key)) = extract_chunks(&union_of_chunks) { - let union_of_chunks = chunks_to_physical_nodes( - &schema, - output_sort_key.as_ref(), - chunks, - config.execution.target_partitions, - ); - let Some(union_of_chunks) = union_of_chunks.as_any().downcast_ref::() else { - return Err(DataFusionError::External(format!("Expected chunks_to_physical_nodes to produce UnionExec but got {union_of_chunks:?}").into())); - }; - let final_union = UnionExec::new(union_of_chunks.inputs().iter().cloned().chain(inputs_other).collect()); - return Ok(Transformed::Yes(Arc::new(final_union))); - } + // sort and group the inputs by time range + let inputs = union_exec.inputs(); + // We only need to ensure the input are sorted by time range, + // any order is fine and hence we choose to go with ASC here + let groups = sort_and_group_plans( + inputs.clone(), + TIME_COLUMN_NAME, + SortOptions { + descending: false, + nulls_first: false, + }, + )?; + + // combine plans from each group + let plans = groups + .into_iter() + .map(|group| combine_plans(group, config)) + .collect::>>()? + .into_iter() + .flatten() + .collect::>(); + + let final_union = UnionExec::new(plans); + trace!(?final_union, "-------- final union"); + return Ok(Transformed::Yes(Arc::new(final_union))); } Ok(Transformed::No(plan)) @@ -76,6 +105,117 @@ impl PhysicalOptimizerRule for CombineChunks { } } +/// Sort the given plans on the given column name and a given sort order. +/// +/// Then group them into non-overlapped groups based on the ranges of the given column, and return the groups. +/// +/// # Input Invariants +/// - Plans do not overlap on the given column +/// +/// # Output Invariants +/// - Plans in the same group do not overlap on the given column +/// -The groups do not overlap on the given column +/// +/// # Example +/// Input: +/// +/// ```text +/// 7 plans with value ranges : |--P1 w --| |--P2 w --| |-- P3 o --| |-- P4 w --| |-- P5 w --| |-- P6 o --| |--P7 w --| +/// ``` +/// +/// Output: +/// +/// ```text +/// 5 groups: [P1, P2], [P3], [P4, P5], [P6], [P7] +/// ``` +fn sort_and_group_plans( + plans: Vec>, + col_name: &str, + sort_options: SortOptions, +) -> Result>>> { + if plans.len() <= 1 { + return Ok(vec![plans]); + } + + let Some(value_ranges) = collect_statistics_min_max(&plans, col_name)? else { + // No statistics to sort and group the plans. + // Return all plans in the same group + trace!("-------- combine chunks - cannot collect statistics min max for column {col_name}"); + return Ok(vec![plans]); + }; + + // Sort the plans by their value ranges + trace!("-------- value_ranges: {:?}", value_ranges); + let Some(plans_value_ranges) = sort_by_value_ranges(plans.clone(), value_ranges, sort_options)? + else { + // The inputs are not being sorted by value ranges, cannot group them + // Return all plans in the same group + trace!("-------- inputs are not sorted by value ranges. 
No optimization"); + return Ok(vec![plans]); + }; + + // Group plans that can be combined + let plans = plans_value_ranges.plans; + let mut final_groups = Vec::with_capacity(plans.len()); + let mut combinable_plans = Vec::new(); + for plan in plans { + if extract_chunks(plan.as_ref()).is_some() { + combinable_plans.push(plan); + } else { + if !combinable_plans.is_empty() { + final_groups.push(combinable_plans); + combinable_plans = Vec::new(); + } + final_groups.push(vec![plan]); + } + } + + if !combinable_plans.is_empty() { + final_groups.push(combinable_plans); + } + + Ok(final_groups) +} + +/// Combine the given plans with chunks into a single union. The other plans stay as is. +fn combine_plans( + plans: Vec>, + config: &ConfigOptions, +) -> Result>> { + let (inputs_with_chunks, inputs_other): (Vec<_>, Vec<_>) = plans + .iter() + .cloned() + .partition(|plan| extract_chunks(plan.as_ref()).is_some()); + + if inputs_with_chunks.is_empty() { + return Ok(plans); + } + let union_of_chunks = UnionExec::new(inputs_with_chunks); + + if let Some((schema, chunks, output_sort_key)) = extract_chunks(&union_of_chunks) { + let union_of_chunks = chunks_to_physical_nodes( + &schema, + output_sort_key.as_ref(), + chunks, + config.execution.target_partitions, + ); + let Some(union_of_chunks) = union_of_chunks.as_any().downcast_ref::() else { + return plan_err!("Expected chunks_to_physical_nodes to produce UnionExec but got {union_of_chunks:?}"); + }; + + // return other_plans and the union_of_chunks + let plans = union_of_chunks + .inputs() + .iter() + .cloned() + .chain(inputs_other) + .collect(); + return Ok(plans); + } + + Ok(plans) +} + #[cfg(test)] mod tests { use datafusion::{ @@ -89,11 +229,24 @@ mod tests { #[test] fn test_combine_single_union_tree() { - let chunk1 = TestChunk::new("table").with_id(1); - let chunk2 = TestChunk::new("table").with_id(2).with_dummy_parquet_file(); - let chunk3 = TestChunk::new("table").with_id(3); - let chunk4 = TestChunk::new("table").with_id(4).with_dummy_parquet_file(); - let chunk5 = TestChunk::new("table").with_id(5).with_dummy_parquet_file(); + let chunk1 = TestChunk::new("table") + .with_id(1) + .with_time_column_with_stats(Some(1), Some(2)); + let chunk2 = TestChunk::new("table") + .with_id(2) + .with_dummy_parquet_file() + .with_time_column_with_stats(Some(3), Some(4)); + let chunk3 = TestChunk::new("table") + .with_id(3) + .with_time_column_with_stats(Some(5), Some(6)); + let chunk4 = TestChunk::new("table") + .with_id(4) + .with_dummy_parquet_file() + .with_time_column_with_stats(Some(7), Some(8)); + let chunk5 = TestChunk::new("table") + .with_id(5) + .with_dummy_parquet_file() + .with_time_column_with_stats(Some(9), Some(10)); let schema = chunk1.schema().as_arrow(); let plan = Arc::new(UnionExec::new(vec![ chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk1), Arc::new(chunk2)], 2), @@ -114,16 +267,75 @@ mod tests { input: - " UnionExec" - " UnionExec" - - " RecordBatchesExec: chunks=1" - - " ParquetExec: file_groups={1 group: [[2.parquet]]}" + - " RecordBatchesExec: chunks=1, projection=[time]" + - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[time]" - " UnionExec" - - " RecordBatchesExec: chunks=1" - - " ParquetExec: file_groups={2 groups: [[4.parquet], [5.parquet]]}" + - " RecordBatchesExec: chunks=1, projection=[time]" + - " ParquetExec: file_groups={2 groups: [[4.parquet], [5.parquet]]}, projection=[time]" output: Ok: - " UnionExec" - - " RecordBatchesExec: chunks=2" - - " ParquetExec: file_groups={2 groups: 
[[2.parquet, 5.parquet], [4.parquet]]}" + - " RecordBatchesExec: chunks=2, projection=[time]" + - " ParquetExec: file_groups={2 groups: [[2.parquet, 5.parquet], [4.parquet]]}, projection=[time]" + "### + ); + } + + #[test] + fn test_only_combine_contiguous_arms() { + let chunk1 = TestChunk::new("table") + .with_id(1) + .with_dummy_parquet_file() + .with_time_column_with_stats(Some(1), Some(2)); + let chunk2 = TestChunk::new("table") + .with_id(2) + .with_dummy_parquet_file() + .with_time_column_with_stats(Some(3), Some(4)); + let chunk3 = TestChunk::new("table") + .with_id(3) + .with_dummy_parquet_file() + .with_time_column_with_stats(Some(5), Some(6)); + let chunk4 = TestChunk::new("table") + .with_id(4) + .with_dummy_parquet_file() + .with_time_column_with_stats(Some(7), Some(8)); + let schema = chunk1.schema().as_arrow(); + let plan = Arc::new(UnionExec::new(vec![ + chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk1)], 2), + chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk2)], 2), + Arc::new( + FilterExec::try_new( + Arc::new(Literal::new(ScalarValue::from(false))), + chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk3)], 2), + ) + .unwrap(), + ), + chunks_to_physical_nodes(&schema, None, vec![Arc::new(chunk4)], 2), + ])); + let opt = CombineChunks; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " UnionExec" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[1.parquet]]}, projection=[time]" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[time]" + - " FilterExec: false" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[3.parquet]]}, projection=[time]" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[4.parquet]]}, projection=[time]" + output: + Ok: + - " UnionExec" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[time]" + - " FilterExec: false" + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[3.parquet]]}, projection=[time]" + - " ParquetExec: file_groups={1 group: [[4.parquet]]}, projection=[time]" "### ); } @@ -183,10 +395,10 @@ mod tests { @r###" --- input: - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } diff --git a/iox_query/src/physical_optimizer/dedup/dedup_null_columns.rs b/iox_query/src/physical_optimizer/dedup/dedup_null_columns.rs index 9f6539ea4c6..341ae4774cc 100644 --- a/iox_query/src/physical_optimizer/dedup/dedup_null_columns.rs +++ b/iox_query/src/physical_optimizer/dedup/dedup_null_columns.rs @@ -119,11 +119,11 @@ mod tests { --- input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - " DeduplicateExec: []" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } diff --git a/iox_query/src/physical_optimizer/dedup/dedup_sort_order.rs b/iox_query/src/physical_optimizer/dedup/dedup_sort_order.rs index 08e94e87dc6..c4b39248d2d 100644 --- a/iox_query/src/physical_optimizer/dedup/dedup_sort_order.rs +++ b/iox_query/src/physical_optimizer/dedup/dedup_sort_order.rs @@ -191,11 +191,11 @@ mod tests { --- input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } diff --git 
a/iox_query/src/physical_optimizer/dedup/partition_split.rs b/iox_query/src/physical_optimizer/dedup/partition_split.rs index 07154149854..386cd9cd94e 100644 --- a/iox_query/src/physical_optimizer/dedup/partition_split.rs +++ b/iox_query/src/physical_optimizer/dedup/partition_split.rs @@ -126,11 +126,11 @@ mod tests { --- input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } @@ -150,13 +150,13 @@ mod tests { input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={1 group: [[3.parquet]]}, projection=[field, tag1, tag2, time]" output: Ok: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={1 group: [[3.parquet]]}, projection=[field, tag1, tag2, time]" "### ); @@ -183,18 +183,18 @@ mod tests { input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={2 groups: [[3.parquet, 5.parquet], [4.parquet, 6.parquet]]}, projection=[field, tag1, tag2, time]" output: Ok: - " UnionExec" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={2 groups: [[3.parquet, 6.parquet], [5.parquet]]}, projection=[field, tag1, tag2, time]" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={1 group: [[4.parquet]]}, projection=[field, tag1, tag2, time]" "### ); @@ -238,18 +238,18 @@ mod tests { input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={2 groups: [[3.parquet, 5.parquet], [4.parquet, 6.parquet]]}, projection=[field, tag1, tag2, time]" output: Ok: - " UnionExec" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={2 groups: [[3.parquet, 6.parquet], [5.parquet]]}, projection=[field, tag1, tag2, time]" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={1 group: [[4.parquet]]}, projection=[field, tag1, tag2, time]" "### ); @@ -275,12 +275,12 @@ mod tests { input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=3" + - " RecordBatchesExec: chunks=3, projection=[field, tag1, tag2, time]" output: Ok: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=3" + - " RecordBatchesExec: chunks=3, projection=[field, tag1, tag2, time]" "### ); } diff --git 
a/iox_query/src/physical_optimizer/dedup/remove_dedup.rs b/iox_query/src/physical_optimizer/dedup/remove_dedup.rs index 4bfab071505..9558c5a205f 100644 --- a/iox_query/src/physical_optimizer/dedup/remove_dedup.rs +++ b/iox_query/src/physical_optimizer/dedup/remove_dedup.rs @@ -80,10 +80,10 @@ mod tests { --- input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } @@ -101,11 +101,11 @@ mod tests { input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" output: Ok: - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" "### ); } @@ -123,12 +123,12 @@ mod tests { input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" output: Ok: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" "### ); } @@ -147,12 +147,12 @@ mod tests { input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" output: Ok: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" "### ); } diff --git a/iox_query/src/physical_optimizer/dedup/time_split.rs b/iox_query/src/physical_optimizer/dedup/time_split.rs index 57f18baf5fa..29acccb9f7a 100644 --- a/iox_query/src/physical_optimizer/dedup/time_split.rs +++ b/iox_query/src/physical_optimizer/dedup/time_split.rs @@ -119,11 +119,11 @@ mod tests { --- input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } @@ -145,13 +145,13 @@ mod tests { input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={1 group: [[3.parquet]]}, projection=[field, tag1, tag2, time]" output: Ok: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={1 group: [[3.parquet]]}, projection=[field, tag1, tag2, time]" "### ); @@ -186,18 +186,18 @@ mod tests { input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={2 groups: [[3.parquet, 5.parquet], [4.parquet, 6.parquet]]}, projection=[field, tag1, tag2, time]" output: Ok: - " UnionExec" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={2 groups: [[6.parquet, 5.parquet], [3.parquet]]}, 
projection=[field, tag1, tag2, time]" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time]" - " ParquetExec: file_groups={1 group: [[4.parquet]]}, projection=[field, tag1, tag2, time]" "### ); @@ -223,12 +223,12 @@ mod tests { input: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=3" + - " RecordBatchesExec: chunks=3, projection=[field, tag1, tag2, time]" output: Ok: - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=3" + - " RecordBatchesExec: chunks=3, projection=[field, tag1, tag2, time]" "### ); } diff --git a/iox_query/src/physical_optimizer/mod.rs b/iox_query/src/physical_optimizer/mod.rs index c12331e427a..a0bf7a4cb0c 100644 --- a/iox_query/src/physical_optimizer/mod.rs +++ b/iox_query/src/physical_optimizer/mod.rs @@ -10,7 +10,7 @@ use self::{ }, predicate_pushdown::PredicatePushdown, projection_pushdown::ProjectionPushdown, - sort::parquet_sortness::ParquetSortness, + sort::{order_union_sorted_inputs::OrderUnionSortedInputs, parquet_sortness::ParquetSortness}, union::{nested_union::NestedUnion, one_union::OneUnion}, }; @@ -25,6 +25,9 @@ mod union; #[cfg(test)] mod test_util; +#[cfg(test)] +mod tests; + /// Register IOx-specific [`PhysicalOptimizerRule`]s with the SessionContext pub fn register_iox_physical_optimizers(state: SessionState) -> SessionState { // prepend IOx-specific rules to DataFusion builtins @@ -42,7 +45,12 @@ pub fn register_iox_physical_optimizers(state: SessionState) -> SessionState { Arc::new(NestedUnion), Arc::new(OneUnion), ]; + + // Append DataFUsion physical rules to the IOx-specific rules optimizers.append(&mut state.physical_optimizers().to_vec()); + // Add a rule to optimize plan with limit + optimizers.push(Arc::new(OrderUnionSortedInputs)); + state.with_physical_optimizer_rules(optimizers) } diff --git a/iox_query/src/physical_optimizer/predicate_pushdown.rs b/iox_query/src/physical_optimizer/predicate_pushdown.rs index 3e3b8b92f50..ab8ccd4bcfc 100644 --- a/iox_query/src/physical_optimizer/predicate_pushdown.rs +++ b/iox_query/src/physical_optimizer/predicate_pushdown.rs @@ -38,10 +38,8 @@ impl PhysicalOptimizerRule for PredicatePushdown { let child = children.remove(0); let child_any = child.as_any(); - if let Some(child_empty) = child_any.downcast_ref::() { - if !child_empty.produce_one_row() { - return Ok(Transformed::Yes(child)); - } + if child_any.downcast_ref::().is_some() { + return Ok(Transformed::Yes(child)); } else if let Some(child_union) = child_any.downcast_ref::() { let new_inputs = child_union .inputs() @@ -170,6 +168,7 @@ mod tests { physical_expr::PhysicalSortExpr, physical_plan::{ expressions::{BinaryExpr, Column, Literal}, + placeholder_row::PlaceholderRowExec, PhysicalExpr, Statistics, }, scalar::ScalarValue, @@ -184,11 +183,7 @@ mod tests { fn test_empty_no_rows() { let schema = schema(); let plan = Arc::new( - FilterExec::try_new( - predicate_tag(&schema), - Arc::new(EmptyExec::new(false, schema)), - ) - .unwrap(), + FilterExec::try_new(predicate_tag(&schema), Arc::new(EmptyExec::new(schema))).unwrap(), ); let opt = PredicatePushdown; insta::assert_yaml_snapshot!( @@ -197,10 +192,10 @@ mod tests { --- input: - " FilterExec: tag1@0 = foo" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } @@ -211,7 +206,7 @@ mod tests 
{ let plan = Arc::new( FilterExec::try_new( predicate_tag(&schema), - Arc::new(EmptyExec::new(true, schema)), + Arc::new(PlaceholderRowExec::new(schema)), ) .unwrap(), ); @@ -222,11 +217,11 @@ mod tests { --- input: - " FilterExec: tag1@0 = foo" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" output: Ok: - " FilterExec: tag1@0 = foo" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" "### ); } @@ -239,7 +234,7 @@ mod tests { predicate_tag(&schema), Arc::new(UnionExec::new( (0..2) - .map(|_| Arc::new(EmptyExec::new(true, Arc::clone(&schema))) as _) + .map(|_| Arc::new(PlaceholderRowExec::new(Arc::clone(&schema))) as _) .collect(), )), ) @@ -253,15 +248,15 @@ mod tests { input: - " FilterExec: tag1@0 = foo" - " UnionExec" - - " EmptyExec: produce_one_row=true" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" + - " PlaceholderRowExec" output: Ok: - " UnionExec" - " FilterExec: tag1@0 = foo" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" - " FilterExec: tag1@0 = foo" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" "### ); } @@ -274,7 +269,7 @@ mod tests { predicate_tag(&schema), Arc::new(UnionExec::new(vec![Arc::new(UnionExec::new( (0..2) - .map(|_| Arc::new(EmptyExec::new(true, Arc::clone(&schema))) as _) + .map(|_| Arc::new(PlaceholderRowExec::new(Arc::clone(&schema))) as _) .collect(), ))])), ) @@ -289,16 +284,16 @@ mod tests { - " FilterExec: tag1@0 = foo" - " UnionExec" - " UnionExec" - - " EmptyExec: produce_one_row=true" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" + - " PlaceholderRowExec" output: Ok: - " UnionExec" - " UnionExec" - " FilterExec: tag1@0 = foo" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" - " FilterExec: tag1@0 = foo" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" "### ); } @@ -310,12 +305,11 @@ mod tests { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(&schema), file_groups: vec![], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(&schema), projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![], - infinite_source: false, }; let plan = Arc::new( FilterExec::try_new( @@ -351,7 +345,7 @@ mod tests { FilterExec::try_new( predicate_field(&schema), Arc::new(DeduplicateExec::new( - Arc::new(EmptyExec::new(true, Arc::clone(&schema))), + Arc::new(PlaceholderRowExec::new(Arc::clone(&schema))), sort_expr(&schema), false, )), @@ -366,12 +360,12 @@ mod tests { input: - " FilterExec: field@2 = val" - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC]" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" output: Ok: - " FilterExec: field@2 = val" - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC]" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" "### ); } @@ -383,7 +377,7 @@ mod tests { FilterExec::try_new( predicate_tag(&schema), Arc::new(DeduplicateExec::new( - Arc::new(EmptyExec::new(true, Arc::clone(&schema))), + Arc::new(PlaceholderRowExec::new(Arc::clone(&schema))), sort_expr(&schema), false, )), @@ -398,12 +392,12 @@ mod tests { input: - " FilterExec: tag1@0 = foo" - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC]" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" output: Ok: - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC]" - " FilterExec: tag1@0 = foo" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" "### ); } @@ -422,7 +416,7 @@ mod tests { ]) .expect("not empty"), Arc::new(DeduplicateExec::new( - 
Arc::new(EmptyExec::new(true, Arc::clone(&schema))), + Arc::new(PlaceholderRowExec::new(Arc::clone(&schema))), sort_expr(&schema), false, )), @@ -437,13 +431,13 @@ mod tests { input: - " FilterExec: tag1@0 = foo AND tag1@0 = tag2@1 AND field@2 = val AND tag1@0 = field@2 AND true" - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC]" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" output: Ok: - " FilterExec: field@2 = val AND tag1@0 = field@2" - " DeduplicateExec: [tag1@0 ASC,tag2@1 ASC]" - " FilterExec: tag1@0 = foo AND tag1@0 = tag2@1 AND true" - - " EmptyExec: produce_one_row=true" + - " PlaceholderRowExec" "### ); } diff --git a/iox_query/src/physical_optimizer/projection_pushdown.rs b/iox_query/src/physical_optimizer/projection_pushdown.rs index df26c84ecf8..0efe5977b95 100644 --- a/iox_query/src/physical_optimizer/projection_pushdown.rs +++ b/iox_query/src/physical_optimizer/projection_pushdown.rs @@ -18,6 +18,7 @@ use datafusion::{ empty::EmptyExec, expressions::Column, filter::FilterExec, + placeholder_row::PlaceholderRowExec, projection::ProjectionExec, sorts::{sort::SortExec, sort_preserving_merge::SortPreservingMergeExec}, union::UnionExec, @@ -63,10 +64,15 @@ impl PhysicalOptimizerRule for ProjectionPushdown { let child_any = child.as_any(); if let Some(child_empty) = child_any.downcast_ref::() { - let new_child = EmptyExec::new( - child_empty.produce_one_row(), - Arc::new(child_empty.schema().project(&column_indices)?), - ); + let new_child = + EmptyExec::new(Arc::new(child_empty.schema().project(&column_indices)?)); + return Ok(Transformed::Yes(Arc::new(new_child))); + } else if let Some(child_placeholder) = + child_any.downcast_ref::() + { + let new_child = PlaceholderRowExec::new(Arc::new( + child_placeholder.schema().project(&column_indices)?, + )); return Ok(Transformed::Yes(Arc::new(new_child))); } else if let Some(child_union) = child_any.downcast_ref::() { let new_inputs = child_union @@ -453,7 +459,7 @@ mod tests { let plan = Arc::new( ProjectionExec::try_new( vec![(expr_col("tag1", &schema), String::from("tag1"))], - Arc::new(EmptyExec::new(false, schema)), + Arc::new(EmptyExec::new(schema)), ) .unwrap(), ); @@ -465,10 +471,10 @@ mod tests { --- input: - " ProjectionExec: expr=[tag1@0 as tag1]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); @@ -492,7 +498,7 @@ mod tests { (expr_col("tag1", &schema), String::from("tag1")), (expr_col("field", &schema), String::from("field")), ], - Arc::new(EmptyExec::new(false, schema)), + Arc::new(EmptyExec::new(schema)), ) .unwrap(), ); @@ -504,10 +510,10 @@ mod tests { --- input: - " ProjectionExec: expr=[tag2@1 as tag2, tag1@0 as tag1, field@2 as field]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); @@ -531,7 +537,7 @@ mod tests { let plan = Arc::new( ProjectionExec::try_new( vec![(expr_col("tag2", &schema), String::from("tag1"))], - Arc::new(EmptyExec::new(false, schema)), + Arc::new(EmptyExec::new(schema)), ) .unwrap(), ); @@ -542,11 +548,11 @@ mod tests { --- input: - " ProjectionExec: expr=[tag2@1 as tag1]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - " ProjectionExec: expr=[tag2@1 as tag1]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } @@ -560,7 +566,7 @@ mod tests { (expr_col("tag1", &schema), String::from("tag1")), (expr_col("tag2", &schema), String::from("tag3")), ], - 
Arc::new(EmptyExec::new(false, schema)), + Arc::new(EmptyExec::new(schema)), ) .unwrap(), ); @@ -571,11 +577,11 @@ mod tests { --- input: - " ProjectionExec: expr=[tag1@0 as tag1, tag2@1 as tag3]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - " ProjectionExec: expr=[tag1@0 as tag1, tag2@1 as tag3]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } @@ -589,7 +595,7 @@ mod tests { Arc::new(Literal::new(ScalarValue::from("foo"))), String::from("tag1"), )], - Arc::new(EmptyExec::new(false, schema)), + Arc::new(EmptyExec::new(schema)), ) .unwrap(), ); @@ -600,11 +606,11 @@ mod tests { --- input: - " ProjectionExec: expr=[foo as tag1]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - " ProjectionExec: expr=[foo as tag1]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } @@ -725,7 +731,7 @@ mod tests { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(&schema), file_groups: vec![], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(&schema), projection: Some(projection), limit: None, table_partition_cols: vec![], @@ -743,7 +749,6 @@ mod tests { options: Default::default(), }, ]], - infinite_source: false, }; let inner = ParquetExec::new(base_config, Some(expr_string_cmp("tag1", &schema)), None); let plan = Arc::new( @@ -987,12 +992,12 @@ mod tests { --- input: - " ProjectionExec: expr=[tag1@0 as tag1]" - - " SortExec: fetch=42, expr=[tag2@1 DESC]" + - " SortExec: TopK(fetch=42), expr=[tag2@1 DESC]" - " Test" output: Ok: - " ProjectionExec: expr=[tag1@0 as tag1]" - - " SortExec: fetch=42, expr=[tag2@1 DESC]" + - " SortExec: TopK(fetch=42), expr=[tag2@1 DESC]" - " ProjectionExec: expr=[tag1@0 as tag1, tag2@1 as tag2]" - " Test" "### @@ -1033,12 +1038,12 @@ mod tests { --- input: - " ProjectionExec: expr=[tag1@0 as tag1]" - - " SortExec: fetch=42, expr=[tag2@1 DESC]" + - " SortExec: TopK(fetch=42), expr=[tag2@1 DESC]" - " Test" output: Ok: - " ProjectionExec: expr=[tag1@0 as tag1]" - - " SortExec: fetch=42, expr=[tag2@1 DESC]" + - " SortExec: TopK(fetch=42), expr=[tag2@1 DESC]" - " ProjectionExec: expr=[tag1@0 as tag1, tag2@1 as tag2]" - " Test" "### @@ -1089,7 +1094,7 @@ mod tests { #[test] fn test_nested_proj_inner_is_impure() { let schema = schema(); - let plan = Arc::new(EmptyExec::new(false, schema)); + let plan = Arc::new(EmptyExec::new(schema)); let plan = Arc::new( ProjectionExec::try_new( vec![ @@ -1121,11 +1126,11 @@ mod tests { input: - " ProjectionExec: expr=[tag1@0 as tag1]" - " ProjectionExec: expr=[foo as tag1, bar as tag2]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - " ProjectionExec: expr=[foo as tag1]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } @@ -1133,7 +1138,7 @@ mod tests { #[test] fn test_nested_proj_inner_is_pure() { let schema = schema(); - let plan = Arc::new(EmptyExec::new(false, schema)); + let plan = Arc::new(EmptyExec::new(schema)); let plan = Arc::new( ProjectionExec::try_new( vec![ @@ -1160,10 +1165,10 @@ mod tests { input: - " ProjectionExec: expr=[tag1@0 as tag1]" - " ProjectionExec: expr=[tag1@0 as tag1, tag2@1 as tag2]" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); let empty_exec = test @@ -1297,10 +1302,10 @@ mod tests { --- input: - " ProjectionExec: expr=[tag1@0 as tag1]" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[tag1, tag2, field]" output: Ok: - - 
" RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[tag1]" "### ); @@ -1326,12 +1331,11 @@ mod tests { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(&schema), file_groups: vec![], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(&schema), projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![], - infinite_source: false, }; let plan = Arc::new(ParquetExec::new(base_config, None, None)); let plan = Arc::new(UnionExec::new(vec![plan])); @@ -1695,8 +1699,10 @@ mod tests { unimplemented!() } - fn statistics(&self) -> datafusion::physical_plan::Statistics { - Statistics::default() + fn statistics(&self) -> Result { + Ok(datafusion::physical_plan::Statistics::new_unknown( + &self.schema(), + )) } } diff --git a/iox_query/src/physical_optimizer/sort/mod.rs b/iox_query/src/physical_optimizer/sort/mod.rs index d0cdabb621a..9a9be8b7a80 100644 --- a/iox_query/src/physical_optimizer/sort/mod.rs +++ b/iox_query/src/physical_optimizer/sort/mod.rs @@ -2,5 +2,7 @@ //! //! [`SortExec`]: datafusion::physical_plan::sorts::sort::SortExec +pub mod order_union_sorted_inputs; pub mod parquet_sortness; pub mod push_sort_through_union; +pub mod util; diff --git a/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs.rs b/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs.rs new file mode 100644 index 00000000000..026610870b6 --- /dev/null +++ b/iox_query/src/physical_optimizer/sort/order_union_sorted_inputs.rs @@ -0,0 +1,1487 @@ +use std::sync::Arc; + +use datafusion::{ + common::tree_node::{Transformed, TreeNode}, + config::ConfigOptions, + error::Result, + physical_optimizer::PhysicalOptimizerRule, + physical_plan::{ + displayable, expressions::Column, sorts::sort_preserving_merge::SortPreservingMergeExec, + union::UnionExec, ExecutionPlan, + }, +}; +use observability_deps::tracing::{trace, warn}; + +use crate::{ + physical_optimizer::sort::util::{collect_statistics_min_max, sort_by_value_ranges}, + provider::progressive_eval::ProgressiveEvalExec, +}; + +/// IOx specific optimization that eliminates a `SortPreservingMerge` +/// by reordering inputs in terms of their value ranges. If all inputs are non overlapping and ordered +/// by value range, they can be concatenated by `ProgressiveEval` while +/// maintaining the desired output order without actually merging. +/// +/// Find this structure: +/// SortPreservingMergeExec - on one column (DESC or ASC) +/// UnionExec +/// and if +/// - all inputs of UnionExec are already sorted (or has SortExec) with sortExpr also on time DESC or ASC accarsdingly and +/// - the streams do not overlap in values of the sorted column +/// do: +/// - order them by the sorted column DESC or ASC accordingly and +/// - replace SortPreservingMergeExec with ProgressiveEvalExec +/// +/// Notes: The difference between SortPreservingMergeExec & ProgressiveEvalExec +/// - SortPreservingMergeExec do the merge of sorted input streams. It needs each stream sorted but the streams themselves +/// can be in any random order and they can also overlap in values of sorted columns. +/// - ProgressiveEvalExec only outputs data in their input order of the streams and not do any merges. Thus in order to +/// output data in the right sort order, these three conditions must be true: +/// 1. Each input stream must sorted on the same column DESC or ASC accordingly +/// 2. The streams must be sorted on the column DESC or ASC accordingly +/// 3. 
The streams must not overlap in the values of that column.
+///
+/// Example: for col_name ranges:
+/// |--- r1---|-- r2 ---|-- r3 ---|-- r4 --|
+///
+/// Here is what the input looks like:
+///
+/// SortPreservingMergeExec: time@2 DESC, fetch=1
+/// UnionExec
+/// SortExec: expr=col_name@2 DESC <--- input stream with col_name range r3
+/// ...
+/// SortExec: expr=col_name@2 DESC <--- input stream with col_name range r1
+/// ...
+/// SortExec: expr=col_name@2 DESC <--- input stream with col_name range r4
+/// ...
+/// SortExec: expr=col_name@2 DESC <--- input stream with col_name range r2 -- assuming this SortExec has 2 output sorted streams
+/// ...
+///
+/// The streams do not overlap in time, and they are already sorted by time DESC.
+///
+/// The output will be the same except that all the input streams will be sorted by time DESC too, and it looks like
+///
+/// SortPreservingMergeExec: time@2 DESC, fetch=1
+/// UnionExec
+/// SortExec: expr=col_name@2 DESC <--- input stream with col_name range r1
+/// ...
+/// SortPreservingMergeExec: -- need this extra to merge the 2 streams into one
+/// SortExec: expr=col_name@2 DESC <--- input stream with col_name range r2
+/// ...
+/// SortExec: expr=col_name@2 DESC <--- input stream with col_name range r3
+/// ...
+/// SortExec: expr=col_name@2 DESC <--- input stream with col_name range r4
+/// ...
+///
+
+pub(crate) struct OrderUnionSortedInputs;
+
+impl PhysicalOptimizerRule for OrderUnionSortedInputs {
+ fn optimize(
+ &self,
+ plan: Arc<dyn ExecutionPlan>,
+ _config: &ConfigOptions,
+ ) -> Result<Arc<dyn ExecutionPlan>> {
+ plan.transform_up(&|plan| {
+ // Find SortPreservingMergeExec
+ let Some(sort_preserving_merge_exec) =
+ plan.as_any().downcast_ref::<SortPreservingMergeExec>()
+ else {
+ return Ok(Transformed::No(plan));
+ };
+
+ // Check if the sortExpr is only on one column
+ let sort_expr = sort_preserving_merge_exec.expr();
+ if sort_expr.len() != 1 {
+ trace!(
+ ?sort_expr,
+ "-------- sortExpr is not on one column. No optimization"
+ );
+ return Ok(Transformed::No(plan));
+ };
+ let Some(sorted_col) = sort_expr[0].expr.as_any().downcast_ref::<Column>() else {
+ trace!(
+ ?sort_expr,
+ "-------- sortExpr is not on pure column but expression. No optimization"
+ );
+ return Ok(Transformed::No(plan));
+ };
+ let sort_options = sort_expr[0].options;
+
+ // Find UnionExec
+ let Some(union_exec) = sort_preserving_merge_exec
+ .input()
+ .as_any()
+ .downcast_ref::<UnionExec>()
+ else {
+ trace!("-------- SortPreservingMergeExec input is not UnionExec. No optimization");
+ return Ok(Transformed::No(plan));
+ };
+
+ // Check that all inputs of UnionExec are already sorted on the same sort_expr as the SortPreservingMergeExec
+ let Some(union_output_ordering) = union_exec.output_ordering() else {
+ warn!(plan=%displayable(plan.as_ref()).indent(false), "Union input to SortPreservingMerge is not sorted");
+ return Ok(Transformed::No(plan));
+ };
+
+ // Check if the first PhysicalSortExpr is the same as the sortExpr[0] in SortPreservingMergeExec
+ if sort_expr[0] != union_output_ordering[0] {
+ warn!(?sort_expr, ?union_output_ordering, plan=%displayable(plan.as_ref()).indent(false), "-------- Sort order of SortPreservingMerge and its children are different");
+ return Ok(Transformed::No(plan));
+ }
+
+ let Some(value_ranges) = collect_statistics_min_max(union_exec.inputs(), sorted_col.name())?
+ else {
+ return Ok(Transformed::No(plan));
+ };
+
+ // Sort the inputs by their value ranges
+ trace!("-------- value_ranges: {:?}", value_ranges);
+ let Some(plans_value_ranges) =
+ sort_by_value_ranges(union_exec.inputs().to_vec(), value_ranges, sort_options)?
+ else {
+ trace!("-------- inputs are not sorted by value ranges. No optimization");
+ return Ok(Transformed::No(plan));
+ };
+
+ // If each input of UnionExec outputs many sorted streams, data of different streams may overlap and
+ // even if they do not overlap, their streams can be in any order. We need to (sort) merge them first
+ // to have a single output stream to guarantee the output is sorted.
+ let new_inputs = plans_value_ranges.plans
+ .iter()
+ .map(|input| {
+ if input.output_partitioning().partition_count() > 1 {
+ // Add SortPreservingMergeExec on top of this input
+ let sort_preserving_merge_exec = Arc::new(
+ SortPreservingMergeExec::new(sort_expr.to_vec(), Arc::clone(input))
+ .with_fetch(sort_preserving_merge_exec.fetch()),
+ );
+ Ok(sort_preserving_merge_exec as _)
+ } else {
+ Ok(Arc::clone(input))
+ }
+ })
+ .collect::<Result<Vec<_>>>()?;
+
+ let new_union_exec = Arc::new(UnionExec::new(new_inputs));
+
+ // Replace SortPreservingMergeExec with ProgressiveEvalExec
+ let progresive_eval_exec = Arc::new(ProgressiveEvalExec::new(
+ new_union_exec,
+ Some(plans_value_ranges.value_ranges),
+ sort_preserving_merge_exec.fetch(),
+ ));
+
+ Ok(Transformed::Yes(progresive_eval_exec))
+ })
+ }
+
+ fn name(&self) -> &str {
+ "order_union_sorted_inputs"
+ }
+
+ fn schema_check(&self) -> bool {
+ true
+ }
+}
+
+#[cfg(test)]
+mod test {
+ use std::sync::Arc;
+
+ use arrow::{compute::SortOptions, datatypes::SchemaRef};
+ use datafusion::{
+ logical_expr::Operator,
+ physical_expr::PhysicalSortExpr,
+ physical_plan::{
+ expressions::{BinaryExpr, Column},
+ limit::GlobalLimitExec,
+ projection::ProjectionExec,
+ repartition::RepartitionExec,
+ sorts::{sort::SortExec, sort_preserving_merge::SortPreservingMergeExec},
+ union::UnionExec,
+ ExecutionPlan, Partitioning, PhysicalExpr,
+ },
+ scalar::ScalarValue,
+ };
+ use schema::{InfluxFieldType, SchemaBuilder as IOxSchemaBuilder};
+
+ use crate::{
+ physical_optimizer::{
+ sort::order_union_sorted_inputs::OrderUnionSortedInputs, test_util::OptimizationTest,
+ },
+ provider::{chunks_to_physical_nodes, DeduplicateExec, RecordBatchesExec},
+ statistics::{column_statistics_min_max, compute_stats_column_min_max},
+ test::{format_execution_plan, TestChunk},
+ QueryChunk, CHUNK_ORDER_COLUMN_NAME,
+ };
+
+ // ------------------------------------------------------------------
+ // Positive tests: the right structure found -> plan optimized
+ // ------------------------------------------------------------------
+
+ #[test]
+ fn test_limit_mix_record_batch_parquet_1_desc() {
+ test_helpers::maybe_start_logging();
+
+ // Input plan:
+ //
+ // GlobalLimitExec: skip=0, fetch=2
+ // SortPreservingMerge: [time@2 DESC]
+ // UnionExec
+ // SortExec: expr=[time@2 DESC] -- time range [1000, 2000]
+ // ParquetExec -- [1000, 2000]
+ // SortExec: expr=[time@2 DESC] -- time range [2001, 3500] from combining the time ranges of the two record batches
+ // UnionExec
+ // RecordBatchesExec -- 3 chunks [2001, 3000]
+ // RecordBatchesExec -- 2 chunks [2500, 3500]
+ //
+ // Output plan: the 2 SortExecs will have their order swapped so that time range [2001, 3500] comes first
+
+ let schema = schema();
+
+ let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000);
+ let plan_batches1 = record_batches_exec_with_value_range(3,
2001, 3000); + let plan_batches2 = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_batches1, plan_batches2])); + + let sort_order = ordering_with_options([("time", SortOp::Desc)], &schema); + let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + // min max of plan_sorted1 is [1000, 2000] + // structure of plan_sorted1 + let p_sort1 = Arc::clone(&plan_sort1) as Arc; + insta::assert_yaml_snapshot!( + format_execution_plan(&p_sort1), + @r###" + --- + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + let min_max_sort1 = compute_stats_column_min_max(&*plan_sort1, "time").unwrap(); + let min_max = column_statistics_min_max(&min_max_sort1).unwrap(); + assert_eq!( + min_max, + ( + ScalarValue::TimestampNanosecond(Some(1000), None), + ScalarValue::TimestampNanosecond(Some(2000), None) + ) + ); + // + // min max of plan_sorted2 is [2001, 3500] + let p_sort2 = Arc::clone(&plan_sort2) as Arc; + insta::assert_yaml_snapshot!( + format_execution_plan(&p_sort2), + @r###" + --- + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + "### + ); + let min_max_sort2 = compute_stats_column_min_max(&*plan_sort2, "time").unwrap(); + let min_max = column_statistics_min_max(&min_max_sort2).unwrap(); + assert_eq!( + min_max, + ( + ScalarValue::TimestampNanosecond(Some(2001), None), + ScalarValue::TimestampNanosecond(Some(3500), None) + ) + ); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + + // min max of plan_spm is [1000, 3500] + let p_spm = Arc::clone(&plan_spm) as Arc; + insta::assert_yaml_snapshot!( + format_execution_plan(&p_spm), + @r###" + --- + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + "### + ); + let min_max_spm = compute_stats_column_min_max(&*plan_spm, "time").unwrap(); + let min_max = column_statistics_min_max(&min_max_spm).unwrap(); + assert_eq!( + min_max, + ( + ScalarValue::TimestampNanosecond(Some(1000), None), + ScalarValue::TimestampNanosecond(Some(3500), None) + ) + ); + + let plan_limit = Arc::new(GlobalLimitExec::new(plan_spm, 0, Some(1))); + + // Output plan: the 2 SortExecs will be swapped the order + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_limit, opt), + @r###" + --- + input: + - " GlobalLimitExec: skip=0, fetch=1" + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, 
__chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + output: + Ok: + - " GlobalLimitExec: skip=0, fetch=1" + - " ProgressiveEvalExec: input_ranges=[(TimestampNanosecond(2001, None), TimestampNanosecond(3500, None)), (TimestampNanosecond(1000, None), TimestampNanosecond(2000, None))]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + #[test] + fn test_limit_mix_record_batch_parquet_2_desc() { + test_helpers::maybe_start_logging(); + + // Input plan: + // + // GlobalLimitExec: skip=0, fetch=2 + // SortPreservingMerge: [time@2 DESC] + // UnionExec + // SortExec: expr=[time@2 DESC] -- time range [1000, 2000] + // ParquetExec -- [1000, 2000] + // SortExec: expr=[time@2 DESC] -- time range [2001, 3500] from combine time range of two record batches + // UnionExec + // SortExec: expr=[time@2 DESC] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // ParquetExec -- [2001, 3000] + // + // Output plan: the 2 SortExecs will be swapped the order to have time range [2001, 3500] first + + let schema = schema(); + let order = ordering_with_options( + [ + ("col2", SortOp::Asc), + ("col1", SortOp::Asc), + ("time", SortOp::Asc), + ], + &schema, + ); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_sort1 = Arc::new(SortExec::new(order.clone(), plan_batches)); + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_sort1, plan_parquet2])); + + let sort_order = ordering_with_options([("time", SortOp::Desc)], &schema); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort3 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort2, plan_sort3])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + + let plan_limit = Arc::new(GlobalLimitExec::new(plan_spm, 0, Some(1))); + + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_limit, opt), + @r###" + --- + input: + - " GlobalLimitExec: skip=0, fetch=1" + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " 
GlobalLimitExec: skip=0, fetch=1" + - " ProgressiveEvalExec: input_ranges=[(TimestampNanosecond(2001, None), TimestampNanosecond(3500, None)), (TimestampNanosecond(1000, None), TimestampNanosecond(2000, None))]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // test on non-time column & order desc + #[test] + fn test_limit_mix_record_batch_parquet_non_time_sort_desc() { + test_helpers::maybe_start_logging(); + + // Input plan: + // + // GlobalLimitExec: skip=0, fetch=2 + // SortPreservingMerge: [field1@2 DESC] + // UnionExec + // SortExec: expr=[field1@2 DESC] -- time range [1000, 2000] + // ParquetExec -- [1000, 2000] + // SortExec: expr=[field1@2 DESC] -- time range [2001, 3500] from combine time range of two record batches + // UnionExec + // SortExec: expr=[field1@2 DESC] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // ParquetExec -- [2001, 3000] + // + // Output plan: the 2 SortExecs will be swapped the order to have time range [2001, 3500] first + + let schema = schema(); + let order = ordering_with_options( + [ + ("col2", SortOp::Asc), + ("col1", SortOp::Asc), + ("field1", SortOp::Asc), + ("time", SortOp::Asc), + ], + &schema, + ); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_sort1 = Arc::new(SortExec::new(order.clone(), plan_batches)); + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_sort1, plan_parquet2])); + + let sort_order = ordering_with_options([("field1", SortOp::Desc)], &schema); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort3 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort2, plan_sort3])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + + let plan_limit = Arc::new(GlobalLimitExec::new(plan_spm, 0, Some(1))); + + // Output plan: the 2 SortExecs will be swapped the order + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_limit, opt), + @r###" + --- + input: + - " GlobalLimitExec: skip=0, fetch=1" + - " SortPreservingMergeExec: [field1@2 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[field1@2 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[field1@2 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,field1@2 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + 
Ok: + - " GlobalLimitExec: skip=0, fetch=1" + - " ProgressiveEvalExec: input_ranges=[(Int64(2001), Int64(3500)), (Int64(1000), Int64(2000))]" + - " UnionExec" + - " SortExec: expr=[field1@2 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,field1@2 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[field1@2 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // test on non-time column & order asc + #[test] + fn test_limit_mix_record_batch_parquet_non_time_sort_asc() { + test_helpers::maybe_start_logging(); + + // Input plan: + // + // GlobalLimitExec: skip=0, fetch=2 + // SortPreservingMerge: [field1@2 ASC] + // UnionExec + // SortExec: expr=[field1@2 ASC] -- time range [1000, 2000] + // ParquetExec -- [1000, 2000] + // SortExec: expr=[field1@2 ASC] -- time range [2001, 3500] from combine time range of two record batches + // UnionExec + // SortExec: expr=[field1@2 ASC] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // ParquetExec -- [2001, 3000] + // + // Output plan: same as input plan + + let schema = schema(); + let order = ordering_with_options( + [ + ("col2", SortOp::Asc), + ("col1", SortOp::Asc), + ("field1", SortOp::Asc), + ("time", SortOp::Asc), + ], + &schema, + ); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_sort1 = Arc::new(SortExec::new(order.clone(), plan_batches)); + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_sort1, plan_parquet2])); + + let sort_order = ordering_with_options([("field1", SortOp::Asc)], &schema); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort3 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort2, plan_sort3])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + + let plan_limit = Arc::new(GlobalLimitExec::new(plan_spm, 0, Some(1))); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_limit, opt), + @r###" + --- + input: + - " GlobalLimitExec: skip=0, fetch=1" + - " SortPreservingMergeExec: [field1@2 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[field1@2 ASC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[field1@2 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,field1@2 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " GlobalLimitExec: skip=0, fetch=1" + - " ProgressiveEvalExec: input_ranges=[(Int64(1000), Int64(2000)), (Int64(2001), 
Int64(3500))]" + - " UnionExec" + - " SortExec: expr=[field1@2 ASC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[field1@2 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,field1@2 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // No limit & but the input is in the right sort preserving merge struct --> optimize + #[test] + fn test_spm_time_desc() { + test_helpers::maybe_start_logging(); + + // plan: + // SortPreservingMerge: [time@2 DESC] + // UnionExec + // SortExec: expr=[time@2 DESC] + // ParquetExec + // SortExec: expr=[time@2 DESC] + // UnionExec + // RecordBatchesExec + // ParquetExec + // + // Output: 2 SortExec are swapped + + let schema = schema(); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_batches, plan_parquet2])); + + let sort_order = ordering_with_options([("time", SortOp::Desc)], &schema); + let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + + // Output plan: the 2 SortExecs will be swapped the order + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_spm, opt), + @r###" + --- + input: + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " ProgressiveEvalExec: input_ranges=[(TimestampNanosecond(2001, None), TimestampNanosecond(3500, None)), (TimestampNanosecond(1000, None), TimestampNanosecond(2000, None))]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // No limit & but the input is in the right sort preserving merge struct --> optimize + #[test] + fn test_spm_non_time_desc() { + test_helpers::maybe_start_logging(); + + // plan: + // SortPreservingMerge: [field1@2 DESC] + 
// UnionExec + // SortExec: expr=[field1@2 DESC] + // ParquetExec + // SortExec: expr=[field1@2 DESC] + // UnionExec + // RecordBatchesExec + // ParquetExec + // + // Output: 2 SortExec are swapped + + let schema = schema(); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_batches, plan_parquet2])); + + let sort_order = ordering_with_options([("field1", SortOp::Desc)], &schema); + let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + + // Output plan: the 2 SortExecs will be swapped the order + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_spm, opt), + @r###" + --- + input: + - " SortPreservingMergeExec: [field1@2 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[field1@2 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[field1@2 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " ProgressiveEvalExec: input_ranges=[(Int64(2001), Int64(3500)), (Int64(1000), Int64(2000))]" + - " UnionExec" + - " SortExec: expr=[field1@2 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[field1@2 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // No limit & but the input is in the right sort preserving merge struct --> optimize + #[test] + fn test_spm_non_time_asc() { + test_helpers::maybe_start_logging(); + + // plan: + // SortPreservingMerge: [field1@2 ASC] + // UnionExec + // SortExec: expr=[field1@2 ASC] + // ParquetExec + // SortExec: expr=[field1@2 ASC] + // UnionExec + // RecordBatchesExec + // ParquetExec + // + // Output: 2 SortExec ordered as above + + let schema = schema(); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_batches, plan_parquet2])); + + let sort_order = ordering_with_options([("field1", SortOp::Asc)], &schema); + let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + 
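+ // field1 value ranges here are [1000, 2000] and [2001, 3500]. With an ASC sort they are
+ // already in range order, so the union inputs keep their positions; apart from the
+ // SortPreservingMergeExec being replaced by a ProgressiveEvalExec, the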
+ // output stays the same as input + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_spm, opt), + @r###" + --- + input: + - " SortPreservingMergeExec: [field1@2 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[field1@2 ASC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[field1@2 ASC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " ProgressiveEvalExec: input_ranges=[(Int64(1000), Int64(2000)), (Int64(2001), Int64(3500))]" + - " UnionExec" + - " SortExec: expr=[field1@2 ASC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[field1@2 ASC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // Plan starts with SortPreservingMerge and includes deduplication & projections. + // All conditions meet --> optimize + #[test] + fn test_spm_time_desc_with_dedupe_and_proj() { + test_helpers::maybe_start_logging(); + + // plan: + // SortPreservingMerge: [time@2 DESC] + // UnionExec + // SortExec: expr=[time@2 DESC] -- time range [1000, 2000] + // ProjectionExec: expr=[time] + // ParquetExec -- [1000, 2000] + // SortExec: expr=[time@2 DESC] -- time range [2001, 3500] from combine time range of record batches & parquet + // ProjectionExec: expr=[time] + // DeduplicateExec: [col1, col2, time] + // SortPreservingMergeExec: [col1 ASC, col2 ASC, time ASC] + // UnionExec + // SortExec: expr=[col1 ASC, col2 ASC, time ASC] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // SortExec: expr=[col1 ASC, col2 ASC, time ASC] + // ParquetExec -- [2001, 3000] + // + // Output: 2 SortExec are swapped + + let schema = schema(); + + let final_sort_order = ordering_with_options([("time", SortOp::Desc)], &schema); + + // Sort plan of the first parquet: + // SortExec: expr=[time@2 DESC] -- time range [1000, 2000] + // ProjectionExec: expr=[time] + // ParquetExec + let plan_parquet_1 = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_projection_1 = Arc::new( + ProjectionExec::try_new( + vec![(expr_col("time", &schema), String::from("time"))], + plan_parquet_1, + ) + .unwrap(), + ); + let plan_sort1 = Arc::new(SortExec::new(final_sort_order.clone(), plan_projection_1)); + + // Sort plan of the second parquet and the record batch + // SortExec: expr=[time@2 DESC] -- time range [2001, 3500] from combine time range of record batches & parquet + // ProjectionExec: expr=[time] + // DeduplicateExec: [col1, col2, time] + // SortPreservingMergeExec: [col1 ASC, col2 ASC, time ASC] + // UnionExec + // SortExec: expr=[col1 ASC, col2 ASC, time ASC] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // SortExec: expr=[col1 ASC, col2 ASC, time ASC] + // ParquetExec -- [2001, 3000] + let plan_parquet_2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + let 
dedupe_sort_order = ordering_with_options( + [ + ("col1", SortOp::Asc), + ("col2", SortOp::Asc), + ("time", SortOp::Asc), + ], + &schema, + ); + let plan_sort_rb = Arc::new(SortExec::new(dedupe_sort_order.clone(), plan_batches)); + let plan_sort_pq = Arc::new(SortExec::new(dedupe_sort_order.clone(), plan_parquet_2)); + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_sort_rb, plan_sort_pq])); + let plan_spm_1 = Arc::new(SortPreservingMergeExec::new( + dedupe_sort_order.clone(), + plan_union_1, + )); + let plan_dedupe = Arc::new(DeduplicateExec::new(plan_spm_1, dedupe_sort_order, false)); + let plan_projection_2 = Arc::new( + ProjectionExec::try_new( + vec![(expr_col("time", &schema), String::from("time"))], + plan_dedupe, + ) + .unwrap(), + ); + let plan_sort2 = Arc::new(SortExec::new(final_sort_order.clone(), plan_projection_2)); + + // Union them together + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + // SortPreservingMerge them + let plan_spm = Arc::new(SortPreservingMergeExec::new( + final_sort_order.clone(), + plan_union_2, + )); + + // compute statistics + let min_max_spm = compute_stats_column_min_max(&*plan_spm, "time").unwrap(); + let min_max = column_statistics_min_max(&min_max_spm).unwrap(); + assert_eq!( + min_max, + ( + ScalarValue::TimestampNanosecond(Some(1000), None), + ScalarValue::TimestampNanosecond(Some(3500), None) + ) + ); + + // Output plan: the 2 SortExecs will be swapped the order + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_spm, opt), + @r###" + --- + input: + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ProjectionExec: expr=[time@3 as time]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ProjectionExec: expr=[time@3 as time]" + - " DeduplicateExec: [col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortPreservingMergeExec: [col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " ProgressiveEvalExec: input_ranges=[(TimestampNanosecond(2001, None), TimestampNanosecond(3500, None)), (TimestampNanosecond(1000, None), TimestampNanosecond(2000, None))]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ProjectionExec: expr=[time@3 as time]" + - " DeduplicateExec: [col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortPreservingMergeExec: [col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], 
output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ProjectionExec: expr=[time@3 as time]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // ------------------------------------------------------------------ + // Negative tests: the right structure not found -> nothing optimized + // ------------------------------------------------------------------ + + // Right stucture but sort on 2 columns --> plan stays the same + #[test] + fn test_negative_spm_2_column_sort_desc() { + test_helpers::maybe_start_logging(); + + // plan: + // SortPreservingMerge: [time@3 DESC, field1@2 DESC] + // UnionExec + // SortExec: expr=[time@3 DESC, field1@2 DESC] + // ParquetExec + // SortExec: expr=[time@3 DESC, field1@2 DESC] + // UnionExec + // RecordBatchesExec + // ParquetExec + // + // Output: same as input + + let schema = schema(); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_batches, plan_parquet2])); + + let sort_order = + ordering_with_options([("time", SortOp::Desc), ("field1", SortOp::Desc)], &schema); + let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_spm, opt), + @r###" + --- + input: + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST,field1@2 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST,field1@2 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST,field1@2 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST,field1@2 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST,field1@2 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST,field1@2 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // No limit & random plan --> plan stay the same + #[test] + fn test_negative_no_limit() { + test_helpers::maybe_start_logging(); + + let schema = schema(); + let order = ordering_with_options( + [ + ("col2", SortOp::Asc), + ("col1", SortOp::Asc), + ("time", SortOp::Asc), + ], + &schema, + ); + + let 
plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_batches = record_batches_exec_with_value_range(2, 1500, 2500); + + let plan = Arc::new(UnionExec::new(vec![plan_batches, plan_parquet])); + let plan = + Arc::new(RepartitionExec::try_new(plan, Partitioning::RoundRobinBatch(8)).unwrap()); + let hash_exprs = order.iter().cloned().map(|e| e.expr).collect(); + let plan = + Arc::new(RepartitionExec::try_new(plan, Partitioning::Hash(hash_exprs, 8)).unwrap()); + let plan = Arc::new(SortExec::new(order.clone(), plan)); + let plan = Arc::new(DeduplicateExec::new(plan, order, true)); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan, opt), + @r###" + --- + input: + - " DeduplicateExec: [col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3], 8), input_partitions=8" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=3" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " DeduplicateExec: [col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3], 8), input_partitions=8" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=3" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // has limit but no sort preserving merge --> plan stay the same + #[test] + fn test_negative_limit_no_preserving_merge() { + test_helpers::maybe_start_logging(); + + let schema = schema(); + + let plan_batches1 = record_batches_exec_with_value_range(1, 1000, 2000); + let plan_batches2 = record_batches_exec_with_value_range(3, 2001, 3000); + let plan_batches3 = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_batches2, plan_batches3])); + + let sort_order = ordering_with_options([("time", SortOp::Desc)], &schema); + let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_batches1)); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + let plan_limit = Arc::new(GlobalLimitExec::new(plan_union_2, 0, Some(1))); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_limit, opt), + @r###" + --- + input: + - " GlobalLimitExec: skip=0, fetch=1" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " RecordBatchesExec: chunks=1, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + output: + Ok: + - " 
GlobalLimitExec: skip=0, fetch=1" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " RecordBatchesExec: chunks=1, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=3, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + "### + ); + } + + // right structure and same sort order but inputs of uion overlap --> plan stay the same + #[test] + fn test_negative_overlap() { + test_helpers::maybe_start_logging(); + + // Input plan: + // + // GlobalLimitExec: skip=0, fetch=2 + // SortPreservingMerge: [time@2 DESC] + // UnionExec + // SortExec: expr=[time@2 DESC] -- time range [1000, 2000] that overlaps with the other SorExec + // ParquetExec -- [1000, 2000] + // SortExec: expr=[time@2 DESC] -- time range [2000, 3500] from combine time range of two record batches + // UnionExec + // SortExec: expr=[time@2 DESC] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // ParquetExec -- [2000, 3000] + + let schema = schema(); + let order = ordering_with_options( + [ + ("col2", SortOp::Asc), + ("col1", SortOp::Asc), + ("time", SortOp::Asc), + ], + &schema, + ); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2000, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_sort1 = Arc::new(SortExec::new(order.clone(), plan_batches)); + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_sort1, plan_parquet2])); + + let sort_order = ordering_with_options([("time", SortOp::Desc)], &schema); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort3 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort2, plan_sort3])); + + let plan_spm = Arc::new(SortPreservingMergeExec::new( + sort_order.clone(), + plan_union_2, + )); + + let plan_limit = Arc::new(GlobalLimitExec::new(plan_spm, 0, Some(1))); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_limit, opt), + @r###" + --- + input: + - " GlobalLimitExec: skip=0, fetch=1" + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " GlobalLimitExec: skip=0, fetch=1" + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col2@1 ASC NULLS LAST,col1@0 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, 
field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // No limit & but the input is in the right union struct --> plan stay the same + #[test] + fn test_negative_no_sortpreservingmerge_input_union() { + test_helpers::maybe_start_logging(); + + // plan: + // UnionExec + // SortExec: expr=[time@2 DESC] + // ParquetExec + // SortExec: expr=[time@2 DESC] + // UnionExec + // RecordBatchesExec + // ParquetExec + + let schema = schema(); + + let plan_parquet = parquet_exec_with_value_range(&schema, 1000, 2000); + let plan_parquet2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_batches, plan_parquet2])); + + let sort_order = ordering_with_options([("time", SortOp::Desc)], &schema); + let plan_sort1 = Arc::new(SortExec::new(sort_order.clone(), plan_parquet)); + let plan_sort2 = Arc::new(SortExec::new(sort_order.clone(), plan_union_1)); + + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + // input and output are the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_union_2, opt), + @r###" + --- + input: + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " UnionExec" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // Projection expression (field + field) ==> not optimze. 
Plan stays the same + #[test] + fn test_negative_spm_time_desc_with_dedupe_and_proj_on_expr() { + test_helpers::maybe_start_logging(); + + // plan: + // SortPreservingMerge: [time@2 DESC] + // UnionExec + // SortExec: expr=[time@2 DESC] -- time range [1000, 2000] + // ProjectionExec: expr=[field1 + field1, time] <-- NOTE: has expresssion col1+col2 + // ParquetExec -- [1000, 2000] + // SortExec: expr=[time@2 DESC] -- time range [2001, 3500] from combine time range of record batches & parquet + // ProjectionExec: expr=[field1 + field1, time] <-- NOTE: has expresssion col1+col2 + // DeduplicateExec: [col1, col2, time] + // SortPreservingMergeExec: [col1 ASC, col2 ASC, time ASC] + // UnionExec + // SortExec: expr=[col1 ASC, col2 ASC, time ASC] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // SortExec: expr=[col1 ASC, col2 ASC, time ASC] + // ParquetExec -- [2001, 3000] + + let schema = schema(); + + let final_sort_order = ordering_with_options([("time", SortOp::Desc)], &schema); + + // Sort plan of the first parquet: + // SortExec: expr=[time@2 DESC] -- time range [1000, 2000] + // ProjectionExec: expr=[field1 + field1, time] + // ParquetExec + let plan_parquet_1 = parquet_exec_with_value_range(&schema, 1000, 2000); + + let field_expr = Arc::new(BinaryExpr::new( + Arc::new(Column::new_with_schema("field1", &schema).unwrap()), + Operator::Plus, + Arc::new(Column::new_with_schema("field1", &schema).unwrap()), + )); + let plan_projection_1 = Arc::new( + ProjectionExec::try_new( + vec![ + (Arc::::clone(&field_expr), String::from("field")), + (expr_col("time", &schema), String::from("time")), + ], + plan_parquet_1, + ) + .unwrap(), + ); + let plan_sort1 = Arc::new(SortExec::new(final_sort_order.clone(), plan_projection_1)); + + // Sort plan of the second parquet and the record batch + // SortExec: expr=[time@2 DESC] -- time range [2001, 3500] from combine time range of record batches & parquet + // ProjectionExec: expr=[field1 + field1, time] + // DeduplicateExec: [col1, col2, time] + // SortPreservingMergeExec: [col1 ASC, col2 ASC, time ASC] + // UnionExec + // SortExec: expr=[col1 ASC, col2 ASC, time ASC] + // RecordBatchesExec -- 2 chunks [2500, 3500] + // SortExec: expr=[col1 ASC, col2 ASC, time ASC] + // ParquetExec -- [2001, 3000] + let plan_parquet_2 = parquet_exec_with_value_range(&schema, 2001, 3000); + let plan_batches = record_batches_exec_with_value_range(2, 2500, 3500); + let dedupe_sort_order = ordering_with_options( + [ + ("col1", SortOp::Asc), + ("col2", SortOp::Asc), + ("time", SortOp::Asc), + ], + &schema, + ); + let plan_sort_rb = Arc::new(SortExec::new(dedupe_sort_order.clone(), plan_batches)); + let plan_sort_pq = Arc::new(SortExec::new(dedupe_sort_order.clone(), plan_parquet_2)); + let plan_union_1 = Arc::new(UnionExec::new(vec![plan_sort_rb, plan_sort_pq])); + let plan_spm_1 = Arc::new(SortPreservingMergeExec::new( + dedupe_sort_order.clone(), + plan_union_1, + )); + let plan_dedupe = Arc::new(DeduplicateExec::new(plan_spm_1, dedupe_sort_order, false)); + let plan_projection_2 = Arc::new( + ProjectionExec::try_new( + vec![ + (field_expr, String::from("field")), + (expr_col("time", &schema), String::from("time")), + ], + plan_dedupe, + ) + .unwrap(), + ); + let plan_sort2 = Arc::new(SortExec::new(final_sort_order.clone(), plan_projection_2)); + + // Union them together + let plan_union_2 = Arc::new(UnionExec::new(vec![plan_sort1, plan_sort2])); + + // SortPreservingMerge them + let plan_spm = Arc::new(SortPreservingMergeExec::new( + final_sort_order.clone(), + 
plan_union_2, + )); + + // compute statistics: no stats becasue the ProjectionExec includes expression + let min_max_spm = compute_stats_column_min_max(&*plan_spm, "time").unwrap(); + let min_max = column_statistics_min_max(&min_max_spm); + assert!(min_max.is_none()); + + // output plan stays the same + let opt = OrderUnionSortedInputs; + insta::assert_yaml_snapshot!( + OptimizationTest::new(plan_spm, opt), + @r###" + --- + input: + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ProjectionExec: expr=[field1@2 + field1@2 as field, time@3 as time]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ProjectionExec: expr=[field1@2 + field1@2 as field, time@3 as time]" + - " DeduplicateExec: [col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortPreservingMergeExec: [col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + output: + Ok: + - " SortPreservingMergeExec: [time@3 DESC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ProjectionExec: expr=[field1@2 + field1@2 as field, time@3 as time]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + - " SortExec: expr=[time@3 DESC NULLS LAST]" + - " ProjectionExec: expr=[field1@2 + field1@2 as field, time@3 as time]" + - " DeduplicateExec: [col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " SortPreservingMergeExec: [col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " UnionExec" + - " SortExec: expr=[col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " SortExec: expr=[col1@0 ASC NULLS LAST,col2@1 ASC NULLS LAST,time@3 ASC NULLS LAST]" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" + "### + ); + } + + // ------------------------------------------------------------------ + // Helper functions + // ------------------------------------------------------------------ + + fn schema() -> SchemaRef { + IOxSchemaBuilder::new() + .tag("col1") + .tag("col2") + .influx_field("field1", InfluxFieldType::Float) + .timestamp() + .influx_field(CHUNK_ORDER_COLUMN_NAME, InfluxFieldType::Integer) + .build() + .unwrap() + .into() + } + + fn expr_col(name: &str, schema: &SchemaRef) -> Arc { + Arc::new(Column::new_with_schema(name, schema).unwrap()) + } + + // test chunk with time range and field1's value range + fn test_chunk(min: i64, max: i64, parquet_data: bool) -> Arc { + let chunk = TestChunk::new("t") + .with_time_column_with_stats(Some(min), Some(max)) + .with_tag_column_with_stats("col1", Some("AL"), Some("MT")) + .with_tag_column_with_stats("col2", Some("MA"), Some("VY")) + 
.with_i64_field_column_with_stats("field1", Some(min), Some(max));
+
+ let chunk = if parquet_data {
+ chunk.with_dummy_parquet_file()
+ } else {
+ chunk
+ };
+
+ Arc::new(chunk) as Arc<dyn QueryChunk>
+ }
+
+ fn record_batches_exec_with_value_range(
+ n_chunks: usize,
+ min: i64,
+ max: i64,
+ ) -> Arc<dyn ExecutionPlan> {
+ let chunks = std::iter::repeat(test_chunk(min, max, false))
+ .take(n_chunks)
+ .collect::<Vec<_>>();
+
+ Arc::new(RecordBatchesExec::new(chunks, schema(), None))
+ }
+
+ fn parquet_exec_with_value_range(
+ schema: &SchemaRef,
+ min: i64,
+ max: i64,
+ ) -> Arc<dyn ExecutionPlan> {
+ let chunk = test_chunk(min, max, true);
+ let plan = chunks_to_physical_nodes(schema, None, vec![chunk], 1);
+
+ if let Some(union_exec) = plan.as_any().downcast_ref::<UnionExec>() {
+ if union_exec.inputs().len() == 1 {
+ Arc::clone(&union_exec.inputs()[0])
+ } else {
+ plan
+ }
+ } else {
+ plan
+ }
+ }
+
+ fn ordering_with_options<const N: usize>(
+ cols: [(&str, SortOp); N],
+ schema: &SchemaRef,
+ ) -> Vec<PhysicalSortExpr> {
+ cols.into_iter()
+ .map(|col| PhysicalSortExpr {
+ expr: Arc::new(Column::new_with_schema(col.0, schema.as_ref()).unwrap()),
+ options: SortOptions {
+ descending: col.1 == SortOp::Desc,
+ nulls_first: false,
+ },
+ })
+ .collect()
+ }
+
+ #[derive(Debug, PartialEq)]
+ enum SortOp {
+ Asc,
+ Desc,
+ }
+}
diff --git a/iox_query/src/physical_optimizer/sort/parquet_sortness.rs b/iox_query/src/physical_optimizer/sort/parquet_sortness.rs
index bf2c9440733..c0f4a132dab 100644
--- a/iox_query/src/physical_optimizer/sort/parquet_sortness.rs
+++ b/iox_query/src/physical_optimizer/sort/parquet_sortness.rs
@@ -180,8 +180,8 @@ mod tests {
datasource::{listing::PartitionedFile, object_store::ObjectStoreUrl},
physical_expr::PhysicalSortExpr,
physical_plan::{
- empty::EmptyExec, expressions::Column, sorts::sort::SortExec, union::UnionExec,
- Statistics,
+ expressions::Column, placeholder_row::PlaceholderRowExec, sorts::sort::SortExec,
+ union::UnionExec, Statistics,
},
};
use object_store::{path::Path, ObjectMeta};
@@ -202,12 +202,11 @@ mod tests {
object_store_url: ObjectStoreUrl::parse("test://").unwrap(),
file_schema: Arc::clone(&schema),
file_groups: vec![vec![file(1), file(2)]],
- statistics: Statistics::default(),
+ statistics: Statistics::new_unknown(&schema),
projection: None,
limit: None,
table_partition_cols: vec![],
output_ordering: vec![ordering(["col2", "col1"], &schema)],
- infinite_source: false,
};
let inner = ParquetExec::new(base_config, None, None);
let plan = Arc::new(
@@ -220,11 +219,11 @@ mod tests {
@r###"
---
input:
- - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]"
+ - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]"
- " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]"
output:
Ok:
- - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]"
+ - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]"
- " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]"
"###
);
@@ -237,12 +236,11 @@ mod tests {
object_store_url: ObjectStoreUrl::parse("test://").unwrap(),
file_schema: Arc::clone(&schema),
file_groups: vec![vec![file(1), file(2)]],
- statistics: Statistics::default(),
+ statistics: Statistics::new_unknown(&schema),
projection: None,
limit: None,
table_partition_cols: vec![],
output_ordering: vec![ordering(["col2", "col1", CHUNK_ORDER_COLUMN_NAME], &schema)],
- infinite_source: false,
};
let inner = ParquetExec::new(base_config, None, None);
let plan =
Arc::new(DeduplicateExec::new( @@ -273,12 +271,11 @@ mod tests { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(&schema), file_groups: vec![vec![file(1), file(2)], vec![file(3)]], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(&schema), projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![ordering(["col2", "col1"], &schema)], - infinite_source: false, }; let inner = ParquetExec::new(base_config, None, None); let plan = Arc::new( @@ -296,11 +293,11 @@ mod tests { @r###" --- input: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" - " ParquetExec: file_groups={2 groups: [[1.parquet, 2.parquet], [3.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]" output: Ok: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" - " ParquetExec: file_groups={3 groups: [[1.parquet], [2.parquet], [3.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]" "### ); @@ -315,12 +312,11 @@ mod tests { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(&schema), file_groups: vec![vec![file(1)], vec![file(2)]], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(&schema), projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![ordering(["col2", "col1"], &schema)], - infinite_source: false, }; let inner = ParquetExec::new(base_config, None, None); let plan = Arc::new( @@ -333,11 +329,11 @@ mod tests { @r###" --- input: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]" output: Ok: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]" "### ); @@ -350,12 +346,11 @@ mod tests { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(&schema), file_groups: vec![vec![file(1), file(2)]], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(&schema), projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![ordering(["col1", "col2"], &schema)], - infinite_source: false, }; let inner = ParquetExec::new(base_config, None, None); let plan = Arc::new( @@ -368,11 +363,11 @@ mod tests { @r###" --- input: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col1@0 ASC, col2@1 ASC]" output: Ok: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col1@0 ASC, col2@1 ASC]" "### ); @@ -385,12 +380,11 @@ mod tests { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(&schema), file_groups: vec![vec![file(1), file(2)]], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(&schema), 
projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![], - infinite_source: false, }; let inner = ParquetExec::new(base_config, None, None); let plan = Arc::new( @@ -403,11 +397,11 @@ mod tests { @r###" --- input: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3]" output: Ok: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3]" "### ); @@ -420,12 +414,11 @@ mod tests { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(&schema), file_groups: vec![vec![file(1), file(2), file(3)]], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(&schema), projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![ordering(["col2", "col1"], &schema)], - infinite_source: false, }; let inner = ParquetExec::new(base_config, None, None); let plan = Arc::new( @@ -443,11 +436,11 @@ mod tests { @r###" --- input: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet, 3.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]" output: Ok: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet, 3.parquet]]}, projection=[col1, col2, col3], output_ordering=[col2@1 ASC, col1@0 ASC]" "### ); @@ -456,7 +449,7 @@ mod tests { #[test] fn test_other_node() { let schema = schema(); - let inner = EmptyExec::new(true, Arc::clone(&schema)); + let inner = PlaceholderRowExec::new(Arc::clone(&schema)); let plan = Arc::new( SortExec::new(ordering(["col2", "col1"], &schema), Arc::new(inner)) .with_fetch(Some(42)), @@ -467,12 +460,12 @@ mod tests { @r###" --- input: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" - - " EmptyExec: produce_one_row=true" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " PlaceholderRowExec" output: Ok: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" - - " EmptyExec: produce_one_row=true" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " PlaceholderRowExec" "### ); } @@ -484,12 +477,11 @@ mod tests { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(&schema), file_groups: vec![vec![file(1), file(2)]], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(&schema), projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![ordering(["col2", "col1"], &schema)], - infinite_source: false, }; let plan = Arc::new(ParquetExec::new(base_config, None, None)); let opt = ParquetSortness; @@ -513,12 +505,11 @@ mod tests { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(&schema), file_groups: vec![vec![file(1), file(2)]], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(&schema), projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![ordering(["col1", "col2"], &schema)], - infinite_source: false, }; let plan = Arc::new(ParquetExec::new(base_config, None, None)); let plan = @@ -531,13 +522,13 @@ mod 
tests { @r###" --- input: - - " SortExec: fetch=42, expr=[col1@0 ASC,col2@1 ASC]" - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col1@0 ASC,col2@1 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col1@0 ASC, col2@1 ASC]" output: Ok: - - " SortExec: fetch=42, expr=[col1@0 ASC,col2@1 ASC]" - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col1@0 ASC,col2@1 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col1@0 ASC, col2@1 ASC]" "### ); @@ -550,12 +541,11 @@ mod tests { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(&schema), file_groups: vec![vec![file(1), file(2)]], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(&schema), projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![ordering(["col1", "col2"], &schema)], - infinite_source: false, }; let plan = Arc::new(ParquetExec::new(base_config, None, None)); let plan = @@ -568,13 +558,13 @@ mod tests { @r###" --- input: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" - - " SortExec: fetch=42, expr=[col1@0 ASC,col2@1 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col1@0 ASC,col2@1 ASC]" - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col1@0 ASC, col2@1 ASC]" output: Ok: - - " SortExec: fetch=42, expr=[col2@1 ASC,col1@0 ASC]" - - " SortExec: fetch=42, expr=[col1@0 ASC,col2@1 ASC]" + - " SortExec: TopK(fetch=42), expr=[col2@1 ASC,col1@0 ASC]" + - " SortExec: TopK(fetch=42), expr=[col1@0 ASC,col2@1 ASC]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, col3], output_ordering=[col1@0 ASC, col2@1 ASC]" "### ); @@ -588,12 +578,11 @@ mod tests { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(&schema), file_groups: vec![vec![file(1), file(2)]], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(&schema), projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![ordering(["col2", "col1", CHUNK_ORDER_COLUMN_NAME], &schema)], - infinite_source: false, }; let plan_parquet = Arc::new(ParquetExec::new(base_config, None, None)); let plan_batches = Arc::new(RecordBatchesExec::new(vec![], Arc::clone(&schema), None)); @@ -612,13 +601,13 @@ mod tests { input: - " DeduplicateExec: [col2@1 ASC,col1@0 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=0" + - " RecordBatchesExec: chunks=0, projection=[col1, col2, col3, __chunk_order]" - " ParquetExec: file_groups={1 group: [[1.parquet, 2.parquet]]}, projection=[col1, col2, col3, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, __chunk_order@3 ASC]" output: Ok: - " DeduplicateExec: [col2@1 ASC,col1@0 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=0" + - " RecordBatchesExec: chunks=0, projection=[col1, col2, col3, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, col3, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, __chunk_order@3 ASC]" "### ); @@ -650,6 +639,7 @@ mod tests { last_modified: Default::default(), size: 0, e_tag: None, + 
version: None, }, partition_values: vec![], range: None, diff --git a/iox_query/src/physical_optimizer/sort/push_sort_through_union.rs b/iox_query/src/physical_optimizer/sort/push_sort_through_union.rs index 6563a86512e..f76772abe3f 100644 --- a/iox_query/src/physical_optimizer/sort/push_sort_through_union.rs +++ b/iox_query/src/physical_optimizer/sort/push_sort_through_union.rs @@ -1,13 +1,13 @@ use std::sync::Arc; use datafusion::{ - common::tree_node::{RewriteRecursion, Transformed, TreeNode, TreeNodeRewriter}, - config::ConfigOptions, - error::Result, - physical_expr::{ - utils::ordering_satisfy_requirement, - {PhysicalSortExpr, PhysicalSortRequirement}, + common::{ + internal_err, + tree_node::{RewriteRecursion, Transformed, TreeNode, TreeNodeRewriter}, }, + config::ConfigOptions, + error::{DataFusionError, Result}, + physical_expr::{PhysicalSortExpr, PhysicalSortRequirement}, physical_optimizer::PhysicalOptimizerRule, physical_plan::{ repartition::RepartitionExec, sorts::sort::SortExec, union::UnionExec, ExecutionPlan, @@ -67,7 +67,7 @@ impl PhysicalOptimizerRule for PushSortThroughUnion { return Ok(Transformed::No(plan)); }; - if !sort_should_be_pushed_down(sort_exec) { + if !sort_should_be_pushed_down(sort_exec)? { return Ok(Transformed::No(plan)); } @@ -80,16 +80,16 @@ impl PhysicalOptimizerRule for PushSortThroughUnion { // As a sanity check, make sure plan has the same ordering as before. // If this fails, there is a bug in this optimization. - let required_order = sort_exec.output_ordering().map(sort_exprs_to_requirement); - if !ordering_satisfy_requirement( - plan.output_ordering(), - required_order.as_deref(), - || plan.equivalence_properties(), - || plan.ordering_equivalence_properties(), - ) { - return Err(datafusion::error::DataFusionError::Internal( - "PushSortThroughUnion corrupted plan sort order".into(), - )); + let Some(required_order) = sort_exec.output_ordering().map(sort_exprs_to_requirement) + else { + return internal_err!("No sort order after a sort"); + }; + + if !plan + .equivalence_properties() + .ordering_satisfy_requirement(&required_order) + { + return internal_err!("PushSortThroughUnion corrupted plan sort order"); } Ok(Transformed::Yes(plan)) @@ -106,7 +106,7 @@ impl PhysicalOptimizerRule for PushSortThroughUnion { } /// Returns true if the [`SortExec`] can be pushed down beneath a [`UnionExec`]. -fn sort_should_be_pushed_down(sort_exec: &SortExec) -> bool { +fn sort_should_be_pushed_down(sort_exec: &SortExec) -> Result { // Skip over any RepartitionExecs let mut input = sort_exec.input(); while input.as_any().is::() { @@ -118,22 +118,21 @@ fn sort_should_be_pushed_down(sort_exec: &SortExec) -> bool { } let Some(union_exec) = input.as_any().downcast_ref::() else { - return false; + return Ok(false); }; - let required_ordering = sort_exec.output_ordering().map(sort_exprs_to_requirement); + let Some(required_order) = sort_exec.output_ordering().map(sort_exprs_to_requirement) else { + return internal_err!("No sort order after a sort"); + }; // Push down the sort if any of the children are already sorted. // This means we will need to sort fewer rows than if we didn't // push down the sort. 
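+    // For example: if the UnionExec has a ParquetExec child whose output_ordering already satisfies the required order + // and an unsorted RecordBatchesExec child, only the RecordBatchesExec side needs a SortExec after the push down + // (illustrative summary of the snapshot tests below).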
- union_exec.children().iter().any(|child| { - ordering_satisfy_requirement( - child.output_ordering(), - required_ordering.as_deref(), - || child.equivalence_properties(), - || child.ordering_equivalence_properties(), - ) - }) + Ok(union_exec.children().iter().any(|child| { + child + .equivalence_properties() + .ordering_satisfy_requirement(&required_order) + })) } /// Rewrites a plan: @@ -166,23 +165,21 @@ impl TreeNodeRewriter for SortRewriter { Arc::clone(repartition_exec.input()), repartition_exec.output_partitioning(), )? - .with_preserve_order(true), + .with_preserve_order(), )) } else if let Some(union_exec) = plan.as_any().downcast_ref::() { // Any children of the UnionExec that are not already sorted, // need to be sorted. - let required_ordering = Some(sort_exprs_to_requirement(self.ordering.as_ref())); + let required_ordering = sort_exprs_to_requirement(self.ordering.as_ref()); let new_children = union_exec .children() .into_iter() .map(|child| { - if !ordering_satisfy_requirement( - child.output_ordering(), - required_ordering.as_deref(), - || child.equivalence_properties(), - || child.ordering_equivalence_properties(), - ) { + if !child + .equivalence_properties() + .ordering_satisfy_requirement(&required_ordering) + { let sort_exec = SortExec::new(self.ordering.clone(), child) .with_preserve_partitioning(true); Arc::new(sort_exec) @@ -266,16 +263,16 @@ mod test { - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" output: Ok: - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" - - " SortPreservingRepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - - " SortPreservingRepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8, preserve_order=true, sort_exprs=col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4, preserve_order=true, sort_exprs=col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC" - " UnionExec" - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" "### ); @@ -317,17 +314,17 @@ mod test { - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" output: 
Ok: - " SortExec: expr=[time@3 ASC]" - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" - - " SortPreservingRepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - - " SortPreservingRepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8, preserve_order=true, sort_exprs=col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4, preserve_order=true, sort_exprs=col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC" - " UnionExec" - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" "### ); @@ -358,14 +355,14 @@ mod test { - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" output: Ok: - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" - " UnionExec" - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" "### ); @@ -402,8 +399,8 @@ mod test { - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" - " UnionExec" - - " RecordBatchesExec: chunks=2" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" output: Ok: - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" @@ -411,8 +408,8 @@ mod test { - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" - " UnionExec" - - " RecordBatchesExec: chunks=2" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" "### ); } @@ -454,8 +451,8 @@ mod test { output: Ok: - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" - - " SortPreservingRepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - - " SortPreservingRepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" + - " RepartitionExec: 
partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8, preserve_order=true, sort_exprs=col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4, preserve_order=true, sort_exprs=col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC" - " UnionExec" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" @@ -537,16 +534,16 @@ mod test { - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" output: Ok: - " DeduplicateExec: [col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" - - " SortPreservingRepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - - " SortPreservingRepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" + - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8, preserve_order=true, sort_exprs=col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC" + - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4, preserve_order=true, sort_exprs=col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC" - " UnionExec" - " SortExec: expr=[col2@1 ASC,col1@0 ASC,time@3 ASC,__chunk_order@4 ASC]" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" "### ); @@ -586,7 +583,7 @@ mod test { - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" output: Ok: @@ -596,7 +593,7 @@ mod test { - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col2@1 ASC, col1@0 ASC, time@3 ASC, __chunk_order@4 ASC]" "### ); @@ -635,7 +632,7 @@ mod test { - " RepartitionExec: 
partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col1@0 ASC, col2@1 ASC, time@3 ASC, __chunk_order@4 ASC]" output: Ok: @@ -644,7 +641,7 @@ mod test { - " RepartitionExec: partitioning=Hash([col2@1, col1@0, time@3, __chunk_order@4], 8), input_partitions=8" - " RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=4" - " UnionExec" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[col1, col2, field1, time, __chunk_order]" - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[col1, col2, field1, time, __chunk_order], output_ordering=[col1@0 ASC, col2@1 ASC, time@3 ASC, __chunk_order@4 ASC]" "### ); @@ -662,12 +659,11 @@ mod test { object_store_url: ObjectStoreUrl::parse("test://").unwrap(), file_schema: Arc::clone(schema), file_groups: vec![vec![file(1)], vec![file(2)]], - statistics: Statistics::default(), + statistics: Statistics::new_unknown(schema), projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![order.to_vec()], - infinite_source: false, }; Arc::new(ParquetExec::new(base_config, None, None)) } @@ -691,6 +687,7 @@ mod test { last_modified: Default::default(), size: 0, e_tag: None, + version: None, }, partition_values: vec![], range: None, diff --git a/iox_query/src/physical_optimizer/sort/util.rs b/iox_query/src/physical_optimizer/sort/util.rs new file mode 100644 index 00000000000..274b016c225 --- /dev/null +++ b/iox_query/src/physical_optimizer/sort/util.rs @@ -0,0 +1,102 @@ +use std::sync::Arc; + +use crate::statistics::{column_statistics_min_max, compute_stats_column_min_max, overlap}; +use arrow::compute::{rank, SortOptions}; +use datafusion::{error::Result, physical_plan::ExecutionPlan, scalar::ScalarValue}; +use observability_deps::tracing::trace; + +/// Compute statistics for the given plans on a given column name +/// Return none if the statistics are not available +pub(crate) fn collect_statistics_min_max( + plans: &[Arc], + col_name: &str, +) -> Result>> { + // temp solution while waiting for DF's statistics to get mature + // Compute min max stats for all inputs of UnionExec on the sorted column + // https://github.com/apache/arrow-datafusion/issues/8078 + let col_stats = plans + .iter() + .map(|plan| compute_stats_column_min_max(&**plan, col_name)) + .collect::>>()?; + + // If min and max not available, return none + let mut value_ranges = Vec::with_capacity(col_stats.len()); + for stats in &col_stats { + let Some((min, max)) = column_statistics_min_max(stats) else { + trace!("-------- min_max not available"); + return Ok(None); + }; + + value_ranges.push((min, max)); + } + + // todo: use this when DF satistics is ready + // // Get statistics for the inputs of UnionExec on the sorted column + // let Some(value_ranges) = statistics_min_max(plans, col_name) + // else { + // return Ok(None); + // }; + + Ok(Some(value_ranges)) +} + +/// Plans and their corresponding value ranges +pub(crate) struct PlansValueRanges { + pub plans: Vec>, + // Min and max values of the plan on a specific column + pub value_ranges: Vec<(ScalarValue, ScalarValue)>, +} + +/// Sort the given plans by value ranges +/// 
Return none if +/// . the number of plans is not the same as the number of value ranges +/// . the value ranges overlap +pub(crate) fn sort_by_value_ranges( + plans: Vec>, + value_ranges: Vec<(ScalarValue, ScalarValue)>, + sort_options: SortOptions, +) -> Result> { + if plans.len() != value_ranges.len() { + trace!( + plans.len = plans.len(), + value_ranges.len = value_ranges.len(), + "--------- number of plans is not the same as the number of value ranges" + ); + return Ok(None); + } + + if overlap(&value_ranges)? { + trace!("--------- value ranges overlap"); + return Ok(None); + } + + // get the min value of each value range + let min_iter = value_ranges.iter().map(|(min, _)| min.clone()); + let mins = ScalarValue::iter_to_array(min_iter)?; + + // rank the min values + let ranks = rank(&*mins, Some(sort_options))?; + + // sort the plans by the ranks of their min values + let mut plan_rank_zip: Vec<(Arc, u32)> = + plans.into_iter().zip(ranks.clone()).collect::>(); + plan_rank_zip.sort_by(|(_, min1), (_, min2)| min1.cmp(min2)); + let plans = plan_rank_zip + .into_iter() + .map(|(plan, _)| plan) + .collect::>(); + + // Sort the value ranges by the ranks of their min values + let mut value_range_rank_zip: Vec<((ScalarValue, ScalarValue), u32)> = + value_ranges.into_iter().zip(ranks).collect::>(); + value_range_rank_zip.sort_by(|(_, min1), (_, min2)| min1.cmp(min2)); + let value_ranges = value_range_rank_zip + .into_iter() + .map(|(value_range, _)| value_range) + .collect::>(); + + Ok(Some(PlansValueRanges { + plans, + value_ranges, + })) +} diff --git a/iox_query/src/physical_optimizer/tests.rs b/iox_query/src/physical_optimizer/tests.rs new file mode 100644 index 00000000000..4e582278627 --- /dev/null +++ b/iox_query/src/physical_optimizer/tests.rs @@ -0,0 +1,210 @@ +//! Optimizer edge cases. +//! +//! These are NOT part of the usual end2end query tests because they depend on very specific chunk arrangements that are +//! hard to reproduce in an end2end setting. + +use std::sync::Arc; + +use arrow::datatypes::DataType; +use datafusion::{ + common::DFSchema, + datasource::provider_as_source, + logical_expr::{col, count, lit, Expr, ExprSchemable, LogicalPlanBuilder}, + scalar::ScalarValue, +}; +use schema::sort::SortKey; +use test_helpers::maybe_start_logging; + +use crate::{ + exec::{DedicatedExecutors, Executor, ExecutorConfig, ExecutorType}, + provider::ProviderBuilder, + test::{format_execution_plan, TestChunk}, + QueryChunk, +}; + +/// Test that reconstructs specific case where parquet files may unnecessarily be sorted. 
+/// +/// See: +/// - +/// - +#[tokio::test] +async fn test_parquet_should_not_be_resorted() { + // DF session setup + let config = ExecutorConfig { + target_query_partitions: 16.try_into().unwrap(), + ..ExecutorConfig::testing() + }; + let exec = Executor::new_with_config_and_executors( + config, + Arc::new(DedicatedExecutors::new_testing()), + ); + let ctx = exec.new_context(ExecutorType::Query); + let state = ctx.inner().state(); + + // chunks + let c = TestChunk::new("t") + .with_tag_column("tag") + .with_time_column_with_full_stats(Some(0), Some(10), 10_000, None); + let c_mem = c.clone().with_may_contain_pk_duplicates(true); + let c_file = c + .clone() + .with_dummy_parquet_file() + .with_may_contain_pk_duplicates(false) + .with_sort_key(SortKey::from_columns([Arc::from("tag"), Arc::from("time")])); + let schema = c.schema().clone(); + let provider = ProviderBuilder::new("t".into(), schema) + .add_chunk(Arc::new(c_mem.clone().with_id(1).with_order(i64::MAX))) + .add_chunk(Arc::new(c_file.clone().with_id(2).with_order(2))) + .add_chunk(Arc::new(c_file.clone().with_id(3).with_order(3))) + .build() + .unwrap(); + + // initial plan + // NOTE: we NEED two time predicates for the bug to trigger! + let expr = col("time") + .gt(lit(ScalarValue::TimestampNanosecond(Some(0), None))) + .and(col("time").gt(lit(ScalarValue::TimestampNanosecond(Some(2), None)))); + + let plan = + LogicalPlanBuilder::scan("t".to_owned(), provider_as_source(Arc::new(provider)), None) + .unwrap() + .filter(expr) + .unwrap() + .aggregate( + std::iter::empty::(), + [count(lit(true)).alias("count")], + ) + .unwrap() + .project([col("count")]) + .unwrap() + .build() + .unwrap(); + + let plan = state.create_physical_plan(&plan).await.unwrap(); + + // The output of the parquet files should not be resorted + insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + @r###" + --- + - " AggregateExec: mode=Final, gby=[], aggr=[count]" + - " CoalescePartitionsExec" + - " AggregateExec: mode=Partial, gby=[], aggr=[count]" + - " RepartitionExec: partitioning=RoundRobinBatch(16), input_partitions=1" + - " ProjectionExec: expr=[]" + - " DeduplicateExec: [tag@1 ASC,time@2 ASC]" + - " SortPreservingMergeExec: [tag@1 ASC,time@2 ASC,__chunk_order@0 ASC]" + - " UnionExec" + - " SortExec: expr=[tag@1 ASC,time@2 ASC,__chunk_order@0 ASC]" + - " CoalesceBatchesExec: target_batch_size=8192" + - " FilterExec: time@2 > 0 AND time@2 > 2" + - " RepartitionExec: partitioning=RoundRobinBatch(16), input_partitions=1" + - " RecordBatchesExec: chunks=1, projection=[__chunk_order, tag, time]" + - " SortExec: expr=[tag@1 ASC,time@2 ASC,__chunk_order@0 ASC]" + - " CoalesceBatchesExec: target_batch_size=8192" + - " FilterExec: time@2 > 0 AND time@2 > 2" + - " RepartitionExec: partitioning=RoundRobinBatch(16), input_partitions=2" + - " ParquetExec: file_groups={2 groups: [[2.parquet], [3.parquet]]}, projection=[__chunk_order, tag, time], output_ordering=[tag@1 ASC, time@2 ASC, __chunk_order@0 ASC], predicate=time@1 > 0 AND time@1 > 2, pruning_predicate=time_max@0 > 0 AND time_max@0 > 2" + "### + ); +} + +/// Bug reproducer for: +/// - +/// - +#[tokio::test] +async fn test_parquet_must_resorted() { + maybe_start_logging(); + + // DF session setup + let config = ExecutorConfig { + target_query_partitions: 6.try_into().unwrap(), + ..ExecutorConfig::testing() + }; + let exec = Executor::new_with_config_and_executors( + config, + Arc::new(DedicatedExecutors::new_testing()), + ); + let ctx = exec.new_context(ExecutorType::Query); + let state = 
ctx.inner().state(); + + // chunks + let c = TestChunk::new("t") + .with_tag_column("tag") + .with_f64_field_column("field") + .with_time_column_with_full_stats(Some(0), Some(10), 10_000, None) + .with_may_contain_pk_duplicates(false) + .with_sort_key(SortKey::from_columns([Arc::from("tag"), Arc::from("time")])); + let schema = c.schema().clone(); + let df_schema = DFSchema::try_from(schema.as_arrow().as_ref().clone()).unwrap(); + let provider = ProviderBuilder::new("t".into(), schema) + // need a small file followed by a big one + .add_chunk(Arc::new( + c.clone() + .with_id(1) + .with_order(1) + .with_dummy_parquet_file_and_size(1), + )) + .add_chunk(Arc::new( + c.clone() + .with_id(2) + .with_order(2) + .with_dummy_parquet_file_and_size(100_000_000), + )) + .build() + .unwrap(); + + // initial plan + let expr = col("tag") + .gt(lit("foo")) + .and(col("time").gt(lit(ScalarValue::TimestampNanosecond(Some(2), None)))) + .and( + col("field") + .cast_to(&DataType::Utf8, &df_schema) + .unwrap() + .not_eq(lit("")), + ); + + let plan = + LogicalPlanBuilder::scan("t".to_owned(), provider_as_source(Arc::new(provider)), None) + .unwrap() + .filter(expr) + .unwrap() + .project([col("tag")]) + .unwrap() + .build() + .unwrap(); + + let plan = state.create_physical_plan(&plan).await.unwrap(); + + // The output of the parquet files must be sorted prior to merging + // if the first file_group has more than one file + // + // Prior to https://github.com/influxdata/influxdb_iox/issues/9450, the plan + // called for the ParquetExec to read the files in parallel (using subranges) like: + // ``` + // {6 groups: [[1.parquet:0..1, 2.parquet:0..16666666], [2.parquet:16666666..33333333],... + // ``` + // + // Groups with more than one file produce an output partition that is the + // result of concatenating them together, so even if the output of each + // individual file is sorted, the output of the partition is not, due to the + // concatenation. 
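+    // For example: if one group read 1.parquet (say, tag values "a".."c") followed by 2.parquet (say, "b".."d"), + // the concatenated partition would emit "a".."c" and then "b".."d", which is not globally sorted on tag even + // though each file is; in the plan below each file therefore ends up in its own group (illustrative values).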
+ insta::assert_yaml_snapshot!( + format_execution_plan(&plan), + @r###" + --- + - " ProjectionExec: expr=[tag@1 as tag]" + - " CoalesceBatchesExec: target_batch_size=8192" + - " FilterExec: CAST(field@0 AS Utf8) != " + - " RepartitionExec: partitioning=RoundRobinBatch(6), input_partitions=1" + - " ProjectionExec: expr=[field@1 as field, tag@3 as tag]" + - " DeduplicateExec: [tag@3 ASC,time@2 ASC]" + - " SortPreservingMergeExec: [tag@3 ASC,time@2 ASC,__chunk_order@0 ASC]" + - " CoalesceBatchesExec: target_batch_size=8192" + - " FilterExec: tag@3 > foo AND time@2 > 2" + - " RepartitionExec: partitioning=RoundRobinBatch(6), input_partitions=2, preserve_order=true, sort_exprs=tag@3 ASC,time@2 ASC,__chunk_order@0 ASC" + - " ParquetExec: file_groups={2 groups: [[1.parquet], [2.parquet]]}, projection=[__chunk_order, field, time, tag], output_ordering=[tag@3 ASC, time@2 ASC, __chunk_order@0 ASC], predicate=tag@1 > foo AND time@2 > 2, pruning_predicate=tag_max@0 > foo AND time_max@1 > 2" + "### + ); +} diff --git a/iox_query/src/physical_optimizer/union/nested_union.rs b/iox_query/src/physical_optimizer/union/nested_union.rs index 6e94423dec9..7a051396d1a 100644 --- a/iox_query/src/physical_optimizer/union/nested_union.rs +++ b/iox_query/src/physical_optimizer/union/nested_union.rs @@ -95,11 +95,11 @@ mod tests { --- input: - " UnionExec" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - " UnionExec" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } @@ -118,15 +118,15 @@ mod tests { input: - " UnionExec" - " UnionExec" - - " EmptyExec: produce_one_row=false" - - " EmptyExec: produce_one_row=false" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" + - " EmptyExec" + - " EmptyExec" output: Ok: - " UnionExec" - - " EmptyExec: produce_one_row=false" - - " EmptyExec: produce_one_row=false" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" + - " EmptyExec" + - " EmptyExec" "### ); } @@ -148,16 +148,16 @@ mod tests { input: - " UnionExec" - " UnionExec" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" - " UnionExec" - - " EmptyExec: produce_one_row=false" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" + - " EmptyExec" output: Ok: - " UnionExec" - - " EmptyExec: produce_one_row=false" - - " EmptyExec: produce_one_row=false" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" + - " EmptyExec" + - " EmptyExec" "### ); } @@ -171,16 +171,16 @@ mod tests { @r###" --- input: - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } fn other_node() -> Arc { - Arc::new(EmptyExec::new(false, schema())) + Arc::new(EmptyExec::new(schema())) } fn schema() -> SchemaRef { diff --git a/iox_query/src/physical_optimizer/union/one_union.rs b/iox_query/src/physical_optimizer/union/one_union.rs index c43bedcadea..15f277a40af 100644 --- a/iox_query/src/physical_optimizer/union/one_union.rs +++ b/iox_query/src/physical_optimizer/union/one_union.rs @@ -77,10 +77,10 @@ mod tests { --- input: - " UnionExec" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } @@ -95,13 +95,13 @@ mod tests { --- input: - " UnionExec" - - " EmptyExec: produce_one_row=false" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" + - " EmptyExec" output: Ok: - " UnionExec" - - " EmptyExec: produce_one_row=false" - - " EmptyExec: produce_one_row=false" + - " EmptyExec" + - " EmptyExec" "### ); } @@ -115,16 +115,16 @@ 
mod tests { @r###" --- input: - - " EmptyExec: produce_one_row=false" + - " EmptyExec" output: Ok: - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } fn other_node() -> Arc { - Arc::new(EmptyExec::new(false, schema())) + Arc::new(EmptyExec::new(schema())) } fn schema() -> SchemaRef { diff --git a/iox_query/src/provider.rs b/iox_query/src/provider.rs index 30f50c20b69..3fab97e578c 100644 --- a/iox_query/src/provider.rs +++ b/iox_query/src/provider.rs @@ -11,8 +11,10 @@ use datafusion::{ datasource::{provider_as_source, TableProvider}, error::{DataFusionError, Result as DataFusionResult}, execution::context::SessionState, - logical_expr::{LogicalPlanBuilder, TableProviderFilterPushDown, TableType}, - optimizer::utils::{conjunction, split_conjunction}, + logical_expr::{ + utils::{conjunction, split_conjunction}, + LogicalPlanBuilder, TableProviderFilterPushDown, TableType, + }, physical_plan::{ expressions::col as physical_col, filter::FilterExec, projection::ProjectionExec, ExecutionPlan, @@ -35,6 +37,7 @@ mod adapter; mod deduplicate; pub mod overlap; mod physical; +pub(crate) mod progressive_eval; mod record_batch_exec; pub use self::overlap::group_potential_duplicates; pub use deduplicate::{DeduplicateExec, RecordBatchDeduplicator}; @@ -82,7 +85,7 @@ impl From for ArrowError { impl From for DataFusionError { // Wrap an error into a datafusion error fn from(e: Error) -> Self { - Self::ArrowError(e.into()) + Self::ArrowError(e.into(), None) } } @@ -195,6 +198,14 @@ impl TableProvider for ChunkTableProvider { self.arrow_schema() } + /// Creates a plan like the following: + /// + /// ```text + /// Project (keep only columns needed in the rest of the plan) + /// Filter (optional, apply any push down predicates) + /// Deduplicate (optional, if chunks overlap) + /// ... Scan of Chunks (RecordBatchExec / ParquetExec / UnionExec, etc) ... + /// ``` async fn scan( &self, ctx: &SessionState, @@ -256,7 +267,7 @@ impl TableProvider for ChunkTableProvider { if let Some(expr) = maybe_expr { Arc::new(FilterExec::try_new( - df_physical_expr(plan.as_ref(), expr)?, + df_physical_expr(plan.schema(), expr)?, plan, )?) 
} else { @@ -358,7 +369,7 @@ mod test { - " ProjectionExec: expr=[field@0 as field, tag1@1 as tag1, tag2@2 as tag2, time@3 as time]" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" "### ); @@ -375,7 +386,7 @@ mod test { - " ProjectionExec: expr=[tag1@1 as tag1, time@3 as time]" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" "### ); @@ -396,7 +407,7 @@ mod test { - " FilterExec: false" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" "### ); @@ -410,7 +421,7 @@ mod test { - " ProjectionExec: expr=[field@0 as field, tag1@1 as tag1, tag2@2 as tag2, time@3 as time]" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" "### ); @@ -456,7 +467,7 @@ mod test { --- - " ProjectionExec: expr=[field@0 as field, tag1@1 as tag1, tag2@2 as tag2, time@3 as time]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" "### ); @@ -472,7 +483,7 @@ mod test { --- - " ProjectionExec: expr=[tag1@1 as tag1, time@3 as time]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" "### ); @@ -501,7 +512,7 @@ mod test { - " ProjectionExec: expr=[field@0 as field, tag1@1 as tag1, tag2@2 as tag2, time@3 as time]" - " FilterExec: false AND tag1@1 = CAST(foo AS Dictionary(Int32, Utf8))" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" "### ); @@ -514,7 +525,7 @@ mod test { --- - " ProjectionExec: expr=[field@0 as field, tag1@1 as tag1, tag2@2 as tag2, time@3 as time]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" "### ); @@ -565,7 +576,7 @@ mod test { - " 
FilterExec: time@3 > 100" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" "### ); @@ -583,7 +594,7 @@ mod test { - " FilterExec: time@3 > 100" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" "### ); @@ -607,7 +618,7 @@ mod test { - " FilterExec: false AND time@3 > 100" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" "### ); @@ -622,7 +633,7 @@ mod test { - " FilterExec: time@3 > 100" - " DeduplicateExec: [tag1@1 ASC,tag2@2 ASC,time@3 ASC]" - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[field, tag1, tag2, time, __chunk_order]" - " ParquetExec: file_groups={1 group: [[2.parquet]]}, projection=[field, tag1, tag2, time, __chunk_order], output_ordering=[__chunk_order@4 ASC]" "### ); diff --git a/iox_query/src/provider/adapter.rs b/iox_query/src/provider/adapter.rs index 5d928852d41..a0f1ad9b8b9 100644 --- a/iox_query/src/provider/adapter.rs +++ b/iox_query/src/provider/adapter.rs @@ -204,11 +204,13 @@ impl SchemaAdapterStream { .mappings .iter() .map(|mapping| match mapping { - ColumnMapping::FromInput(input_index) => Arc::clone(batch.column(*input_index)), - ColumnMapping::MakeNull(data_type) => new_null_array(data_type, batch.num_rows()), + ColumnMapping::FromInput(input_index) => Ok(Arc::clone(batch.column(*input_index))), + ColumnMapping::MakeNull(data_type) => { + Ok(new_null_array(data_type, batch.num_rows())) + } ColumnMapping::Virtual(value) => value.to_array_of_size(batch.num_rows()), }) - .collect::>(); + .collect::, DataFusionError>>()?; Ok(RecordBatch::try_new( Arc::clone(&self.output_schema), diff --git a/iox_query/src/provider/deduplicate.rs b/iox_query/src/provider/deduplicate.rs index 5744f4ec5df..45c02503d60 100644 --- a/iox_query/src/provider/deduplicate.rs +++ b/iox_query/src/provider/deduplicate.rs @@ -1,6 +1,5 @@ //! 
Implemention of DeduplicateExec operator (resolves primary key conflicts) plumbing and tests mod algo; -mod key_ranges; use std::{collections::HashSet, fmt, sync::Arc}; @@ -11,6 +10,7 @@ use crate::CHUNK_ORDER_COLUMN_NAME; use self::algo::get_col_name; pub use self::algo::RecordBatchDeduplicator; +use datafusion::physical_expr::EquivalenceProperties; use datafusion::{ error::{DataFusionError, Result}, execution::context::TaskContext, @@ -188,6 +188,7 @@ impl ExecutionPlan for DeduplicateExec { } fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + trace!("Deduplicate output ordering: {:?}", self.sort_keys); Some(&self.sort_keys) } @@ -209,6 +210,11 @@ impl ExecutionPlan for DeduplicateExec { vec![Arc::clone(&self.input)] } + fn equivalence_properties(&self) -> EquivalenceProperties { + // deduplicate does not change the equivalence properties + self.input.equivalence_properties() + } + fn with_new_children( self: Arc, children: Vec>, @@ -271,12 +277,9 @@ impl ExecutionPlan for DeduplicateExec { Some(self.metrics.clone_inner()) } - fn statistics(&self) -> Statistics { + fn statistics(&self) -> Result { // use a guess from our input but they are NOT exact - Statistics { - is_exact: false, - ..self.input.statistics() - } + Ok(self.input.statistics()?.into_inexact()) } } @@ -369,6 +372,7 @@ mod test { use super::*; use arrow::array::{DictionaryArray, Int64Array}; + use schema::TIME_DATA_TIMEZONE; use std::iter::FromIterator; #[tokio::test] @@ -465,7 +469,8 @@ mod test { let f1 = Float64Array::from(vec![Some(1.0), None]); let f2 = Float64Array::from(vec![None, Some(3.0)]); - let time = TimestampNanosecondArray::from(vec![Some(100), Some(100)]); + let time = TimestampNanosecondArray::from(vec![Some(100), Some(100)]) + .with_timezone_opt(TIME_DATA_TIMEZONE()); let batch = RecordBatch::try_from_iter(vec![ ("f1", Arc::new(f1) as ArrayRef), @@ -1219,9 +1224,9 @@ mod test { Ok(AdapterStream::adapt_unbounded(self.schema(), rx, handle)) } - fn statistics(&self) -> Statistics { + fn statistics(&self) -> Result { // don't know anything about the statistics - Statistics::default() + Ok(Statistics::new_unknown(&self.schema())) } } diff --git a/iox_query/src/provider/deduplicate/algo.rs b/iox_query/src/provider/deduplicate/algo.rs index d0a13ce2bc9..a4c24e6e344 100644 --- a/iox_query/src/provider/deduplicate/algo.rs +++ b/iox_query/src/provider/deduplicate/algo.rs @@ -16,8 +16,6 @@ use datafusion::physical_plan::{ }; use observability_deps::tracing::{debug, trace}; -use crate::provider::deduplicate::key_ranges::key_ranges; - // Handles the deduplication across potentially multiple // [`RecordBatch`]es which are already sorted on a primary key, // including primary keys which straddle RecordBatch boundaries @@ -240,12 +238,7 @@ impl RecordBatchDeduplicator { is_sort_key[index] = true; - let array = batch.column(index); - - arrow::compute::SortColumn { - values: Arc::clone(array), - options: Some(skey.options), - } + Arc::clone(batch.column(index)) }) .collect(); // @@ -256,19 +249,18 @@ impl RecordBatchDeduplicator { // the column with the highest cardinality let len = columns.len(); if len > 1 { - if let DataType::Timestamp(TimeUnit::Nanosecond, _) = - columns[len - 1].values.data_type() - { + if let DataType::Timestamp(TimeUnit::Nanosecond, _) = columns[len - 1].data_type() { columns.swap(len - 2, len - 1); } } // Reverse the list - let columns: Vec<_> = columns.into_iter().rev().collect(); + columns.reverse(); // Compute partitions (aka breakpoints between the ranges) // Each range (or partition) 
includes a unique sort key value which is // a unique combination of PK columns. PK columns consist of all tags and the time col. - let ranges = key_ranges(&columns)?.collect(); + let partitions = arrow::compute::partition(&columns)?; + let ranges = partitions.ranges(); Ok(DuplicateRanges { is_sort_key, @@ -411,8 +403,6 @@ mod test { use datafusion::physical_plan::expressions::col; use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricBuilder}; - use crate::provider::deduplicate::key_ranges::range; - use super::*; #[tokio::test] @@ -844,4 +834,8 @@ mod test { let metrics = ExecutionPlanMetricsSet::new(); MetricBuilder::new(&metrics).counter("num_dupes", 0) } + + fn range(start: usize, end: usize) -> Range { + Range { start, end } + } } diff --git a/iox_query/src/provider/overlap.rs b/iox_query/src/provider/overlap.rs index 4ba1d84e6fd..4b90162bb98 100644 --- a/iox_query/src/provider/overlap.rs +++ b/iox_query/src/provider/overlap.rs @@ -23,7 +23,7 @@ pub fn group_potential_duplicates( // If at least one of the chunks has no time range, // all chunks are considered to overlap with each other. if ts.iter().any(|ts| ts.is_none()) { - debug!("At least one chunk has not timestamp mim max"); + debug!("At least one chunk has not timestamp min max"); return vec![chunks]; } @@ -92,21 +92,16 @@ pub fn group_potential_duplicates( } fn timestamp_min_max(chunk: &dyn QueryChunk) -> Option { + let stats = chunk.stats(); chunk - .stats() - .column_statistics - .as_ref() - .and_then(|stats| { - chunk - .schema() - .find_index_of(TIME_COLUMN_NAME) - .map(|idx| &stats[idx]) - }) + .schema() + .find_index_of(TIME_COLUMN_NAME) + .map(|idx| &stats.column_statistics[idx]) .and_then(|stats| { if let ( - Some(ScalarValue::TimestampNanosecond(Some(min), None)), - Some(ScalarValue::TimestampNanosecond(Some(max), None)), - ) = (&stats.min_value, &stats.max_value) + Some(ScalarValue::TimestampNanosecond(Some(min), _)), + Some(ScalarValue::TimestampNanosecond(Some(max), _)), + ) = (stats.min_value.get_value(), stats.max_value.get_value()) { Some(TimestampMinMax::new(*min, *max)) } else { diff --git a/iox_query/src/provider/physical.rs b/iox_query/src/provider/physical.rs index 9616653c9df..3114cf8b397 100644 --- a/iox_query/src/provider/physical.rs +++ b/iox_query/src/provider/physical.rs @@ -1,10 +1,11 @@ //! 
Implementation of a DataFusion PhysicalPlan node across partition chunks +use crate::statistics::build_statistics_for_chunks; use crate::{ provider::record_batch_exec::RecordBatchesExec, util::arrow_sort_key_exprs, QueryChunk, QueryChunkData, CHUNK_ORDER_COLUMN_NAME, }; -use arrow::datatypes::{DataType, Fields, Schema as ArrowSchema, SchemaRef}; +use arrow::datatypes::{Fields, Schema as ArrowSchema, SchemaRef}; use datafusion::{ datasource::{ listing::PartitionedFile, @@ -12,10 +13,7 @@ use datafusion::{ physical_plan::{FileScanConfig, ParquetExec}, }, physical_expr::PhysicalSortExpr, - physical_plan::{ - empty::EmptyExec, expressions::Column, union::UnionExec, ColumnStatistics, ExecutionPlan, - Statistics, - }, + physical_plan::{empty::EmptyExec, expressions::Column, union::UnionExec, ExecutionPlan}, scalar::ScalarValue, }; use object_store::ObjectMeta; @@ -145,7 +143,7 @@ pub fn chunks_to_physical_nodes( target_partitions: usize, ) -> Arc { if chunks.is_empty() { - return Arc::new(EmptyExec::new(false, Arc::clone(schema))); + return Arc::new(EmptyExec::new(Arc::clone(schema))); } let mut record_batch_chunks: Vec> = vec![]; @@ -199,24 +197,12 @@ pub fn chunks_to_physical_nodes( // ensure that chunks are actually ordered by chunk order chunks.sort_by_key(|(_meta, c)| c.order()); - #[allow(clippy::manual_try_fold)] - let num_rows = chunks.iter().map(|(_meta, c)| c.stats().num_rows).fold( - Some(0usize), - |accu, x| match (accu, x) { - (Some(accu), Some(x)) => Some(accu + x), - _ => None, - }, - ); - let chunk_order_min = chunks + // Compute statistics for the chunks + let query_chunks = chunks .iter() - .map(|(_meta, c)| c.order().get()) - .min() - .expect("at least one chunk"); - let chunk_order_max = chunks - .iter() - .map(|(_meta, c)| c.order().get()) - .max() - .expect("at least one chunk"); + .map(|(_meta, chunk)| Arc::clone(chunk)) + .collect::>(); + let statistics = build_statistics_for_chunks(&query_chunks, Arc::clone(schema)); let file_groups = distribute( chunks.into_iter().map(|(object_meta, chunk)| { @@ -242,7 +228,10 @@ pub fn chunks_to_physical_nodes( let output_ordering = sort_key.map(|sort_key| arrow_sort_key_exprs(&sort_key, schema)); let (table_partition_cols, file_schema, output_ordering) = if has_chunk_order_col { - let table_partition_cols = vec![(CHUNK_ORDER_COLUMN_NAME.to_owned(), DataType::Int64)]; + let table_partition_cols = vec![schema + .field_with_name(CHUNK_ORDER_COLUMN_NAME) + .unwrap() + .clone()]; let file_schema = Arc::new(ArrowSchema::new( schema .fields @@ -269,40 +258,6 @@ pub fn chunks_to_physical_nodes( (vec![], Arc::clone(schema), output_ordering) }; - let statistics = Statistics { - num_rows, - total_byte_size: None, - column_statistics: Some( - schema - .fields - .iter() - .map(|f| { - let null_count = if f.is_nullable() { None } else { Some(0) }; - - let (min_value, max_value) = if f.name() == CHUNK_ORDER_COLUMN_NAME { - ( - Some(ScalarValue::from(chunk_order_min)), - Some(ScalarValue::from(chunk_order_max)), - ) - } else { - (None, None) - }; - - ColumnStatistics { - null_count, - min_value, - max_value, - distinct_count: None, - } - }) - .collect(), - ), - - // this does NOT account for predicate pushdown - // Also see https://github.com/apache/arrow-datafusion/issues/5614 - is_exact: false, - }; - // No sort order is represented by an empty Vec let output_ordering = vec![output_ordering.unwrap_or_default()]; @@ -315,7 +270,6 @@ pub fn chunks_to_physical_nodes( limit: None, table_partition_cols, output_ordering, - infinite_source: false, }; let 
meta_size_hint = None; @@ -350,10 +304,15 @@ where #[cfg(test)] mod tests { - use schema::{sort::SortKeyBuilder, SchemaBuilder, TIME_COLUMN_NAME}; + use datafusion::{ + common::stats::Precision, + physical_plan::{ColumnStatistics, Statistics}, + }; + use schema::{sort::SortKeyBuilder, InfluxFieldType, SchemaBuilder, TIME_COLUMN_NAME}; use crate::{ chunk_order_field, + statistics::build_statistics_for_chunks, test::{format_execution_plan, TestChunk}, }; @@ -455,7 +414,7 @@ mod tests { format_execution_plan(&plan), @r###" --- - - " EmptyExec: produce_one_row=false" + - " EmptyExec" "### ); } @@ -575,9 +534,192 @@ mod tests { @r###" --- - " UnionExec" - - " RecordBatchesExec: chunks=1" + - " RecordBatchesExec: chunks=1, projection=[tag, __chunk_order]" - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[tag, __chunk_order], output_ordering=[__chunk_order@1 ASC]" "### ); } + + // reproducer of https://github.com/influxdata/idpe/issues/18287 + #[test] + fn reproduce_schema_bug_in_parquet_exec() { + // schema with one tag, one filed, time and CHUNK_ORDER_COLUMN_NAME + let schema: SchemaRef = SchemaBuilder::new() + .tag("tag") + .influx_field("field", InfluxFieldType::Float) + .timestamp() + .influx_field(CHUNK_ORDER_COLUMN_NAME, InfluxFieldType::Integer) + .build() + .unwrap() + .into(); + + // create a test chunk with one tag, one filed, time and CHUNK_ORDER_COLUMN_NAME + let record_batch_chunk = Arc::new( + TestChunk::new("t") + .with_tag_column_with_stats("tag", Some("AL"), Some("MT")) + .with_time_column_with_stats(Some(10), Some(20)) + .with_i64_field_column_with_stats("field", Some(0), Some(100)) + .with_i64_field_column_with_stats(CHUNK_ORDER_COLUMN_NAME, Some(5), Some(6)), + ); + + // create them same test chunk but with a parquet file + let parquet_chunk = Arc::new( + TestChunk::new("t") + .with_tag_column_with_stats("tag", Some("AL"), Some("MT")) + .with_i64_field_column_with_stats("field", Some(0), Some(100)) + .with_time_column_with_stats(Some(10), Some(20)) + .with_i64_field_column_with_stats(CHUNK_ORDER_COLUMN_NAME, Some(5), Some(6)) + .with_dummy_parquet_file(), + ); + + // Build a RecordBatchsExec for record_batch_chunk + // + // Use chunks_to_physical_nodes to build a plan with UnionExec on top of RecordBatchesExec + // Note: I purposely use chunks_to_physical_node to create plan for both record_batch_chunk and parquet_chunk to + // consistently create their plan. 
Also chunks_to_physical_node is used to do create plan in optimization + // passes that I will need + let plan = chunks_to_physical_nodes( + &schema, + None, + vec![Arc::clone(&record_batch_chunk) as Arc], + 1, + ); + // remove union + let Some(union_exec) = plan.as_any().downcast_ref::() else { + panic!("plan is not a UnionExec"); + }; + let plan_record_batches_exec = Arc::clone(&union_exec.inputs()[0]); + // verify this is a RecordBatchesExec + assert!(plan_record_batches_exec + .as_any() + .downcast_ref::() + .is_some()); + + // Build a ParquetExec for parquet_chunk + // + // Use chunks_to_physical_nodes to build a plan with UnionExec on top of ParquetExec + let plan = chunks_to_physical_nodes( + &schema, + None, + vec![Arc::clone(&parquet_chunk) as Arc], + 1, + ); + // remove union + let Some(union_exec) = plan.as_any().downcast_ref::() else { + panic!("plan is not a UnionExec"); + }; + let plan_parquet_exec = Arc::clone(&union_exec.inputs()[0]); + // verify this is a ParquetExec + assert!(plan_parquet_exec + .as_any() + .downcast_ref::() + .is_some()); + + // Schema of 2 chunks are the same + assert_eq!(record_batch_chunk.schema(), parquet_chunk.schema()); + + // Schema of the corresponding plans are also the same + assert_eq!( + plan_record_batches_exec.schema(), + plan_parquet_exec.schema() + ); + + // Statistics of 2 chunks are the same + let record_batch_stats = + build_statistics_for_chunks(&[record_batch_chunk], Arc::clone(&schema)); + let parquet_stats = build_statistics_for_chunks(&[parquet_chunk], schema); + assert_eq!(record_batch_stats, parquet_stats); + + // Statistics of the corresponding plans should also be the same except the CHUNK_ORDER_COLUMN_NAME + // Notes: + // 1. We do compute stats for CHUNK_ORDER_COLUMN_NAME and store it as in FileScanConfig.statistics + // See: https://github.com/influxdata/influxdb_iox/blob/0e5b97d9e913111641f65b9af31e3b3f45f3b14b/iox_query/src/provider/physical.rs#L311C24-L311C24 + // So, if we get statistics there, we have everything + // 2. However, if we get statistics through the DF plan's statistics() method, we will not get stats for CHUNK_ORDER_COLUMN_NAME + // The reason is we store CHUNK_ORDER_COLUMN_NAME as table_partition_cols in DF and DF has not computed stats for it yet. + // See: https://github.com/apache/arrow-datafusion/blob/a9d66e2b492843c2fb335a7dfe27fed073629b09/datafusion/core/src/datasource/physical_plan/file_scan_config.rs#L139 + // When we get the plan's statistics, we won't care about CHUNK_ORDER_COLUMN_NAME becasue it is not a real column. + // Thus, we are good for now. 
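+ // A minimal sketch of the two read paths compared further down (only APIs already used in
+ // this test; `plan_parquet_exec` must first be downcast to `ParquetExec` for the second line):
+ //     let plan_stats = plan_parquet_exec.statistics().unwrap();       // DF-propagated; no range for __chunk_order
+ //     let file_stats = &plan_parquet_exec.base_config().statistics;   // FileScanConfig; exact range for __chunk_order
+ //     plan_stats.column_statistics.last().unwrap().min_value.get_value(); // -> None (Precision::Absent)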
In the future, if we want a 100% consistent for CHUNK_ORDER_COLUMN_NAME, we need + // to modify DF to compute stats for table_partition_cols + // + // Here both parquet's plan stats and FileScanConfig stats + // + // Cast to ParquetExec to get statistics + let plan_parquet_exec = plan_parquet_exec + .as_any() + .downcast_ref::() + .unwrap(); + // stats of the parquet plan generally computed from propagating stats from input plans/chunks/columns + let parquet_plan_stats = plan_parquet_exec.statistics().unwrap(); + // stats stored in FileScanConfig + let parqet_file_stats = &plan_parquet_exec.base_config().statistics; + + // stats of IOx specific recod batch plan + let record_batch_plan_stats = plan_record_batches_exec.statistics().unwrap(); + + // Record batch plan stats is the same as parquet file stats and includes everything + assert_eq!(record_batch_plan_stats, *parqet_file_stats); + + // Verify content + // + // Actual columns have stats + let col_stats = vec![ + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Utf8(Some("MT".to_string()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("AL".to_string()))), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Int64(Some(100))), + min_value: Precision::Exact(ScalarValue::Int64(Some(0))), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::TimestampNanosecond(Some(20), None)), + min_value: Precision::Exact(ScalarValue::TimestampNanosecond(Some(10), None)), + distinct_count: Precision::Absent, + }, + ]; + // + // Add CHUNK_ORDER_COLUMN_NAME with stats + let mut parquet_file_col_stats = col_stats.clone(); + parquet_file_col_stats.push(ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Int64(Some(6))), + min_value: Precision::Exact(ScalarValue::Int64(Some(0))), + distinct_count: Precision::Absent, + }); + // + // Add CHUNK_ORDER_COLUMN_NAME without stats + let mut parquet_plan_stats_col_stats = col_stats; + parquet_plan_stats_col_stats.push(ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Absent, + min_value: Precision::Absent, + distinct_count: Precision::Absent, + }); + // + let expected_parquet_plan_stats = Statistics { + num_rows: Precision::Exact(0), + total_byte_size: Precision::Absent, + column_statistics: parquet_plan_stats_col_stats, + }; + // + let expected_parquet_file_stats = Statistics { + num_rows: Precision::Exact(0), + total_byte_size: Precision::Absent, + column_statistics: parquet_file_col_stats, + }; + + // Content of Record batch plan stats that include stats of CHUNK_ORDER_COLUMN_NAME + assert_eq!(record_batch_plan_stats, expected_parquet_file_stats); + // Content of parquet file stats that also include stats of CHUNK_ORDER_COLUMN_NAME + assert_eq!(*parqet_file_stats, expected_parquet_file_stats); + // + // Content of parquet plan stats that does not include stats of CHUNK_ORDER_COLUMN_NAME + assert_eq!(parquet_plan_stats, expected_parquet_plan_stats); + } } diff --git a/iox_query/src/provider/progressive_eval.rs b/iox_query/src/provider/progressive_eval.rs new file mode 100644 index 00000000000..80109e4baca --- /dev/null +++ b/iox_query/src/provider/progressive_eval.rs @@ -0,0 +1,1206 @@ +// ProgressiveEvalExec (step 1 in https://docs.google.com/document/d/1x1yf9ggyxD4JPT8Gf9YlIKxUawqoKTJ1HFyTbGin9xY/edit) +// This will be 
moved to DF once it is ready + +//! Defines the progressive eval plan + +use std::any::Any; +use std::sync::Arc; + +use arrow::datatypes::SchemaRef; +use arrow::record_batch::RecordBatch; +use datafusion::common::{internal_err, DataFusionError, Result}; +use datafusion::execution::TaskContext; +use datafusion::physical_expr::{EquivalenceProperties, PhysicalSortExpr, PhysicalSortRequirement}; +use datafusion::physical_plan::metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}; +use datafusion::physical_plan::stream::RecordBatchReceiverStream; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, Partitioning, RecordBatchStream, + SendableRecordBatchStream, Statistics, +}; +use datafusion::scalar::ScalarValue; +use futures::{ready, Stream, StreamExt}; +use std::pin::Pin; +use std::task::{Context, Poll}; + +use observability_deps::tracing::{debug, trace}; + +/// ProgressiveEval return a stream of record batches in the order of its inputs. +/// It will stop when the number of output rows reach the given limit. +/// +/// This takes an input execution plan and a number n, and provided each partition of +/// the input plan is in an expected order, this operator will return top record batches that covers the top n rows +/// in the order of the input plan. +/// +/// ```text +/// ┌─────────────────────────┐ +/// │ ┌───┬───┬───┬───┐ │ +/// │ │ A │ B │ C │ D │ │──┐ +/// │ └───┴───┴───┴───┘ │ │ +/// └─────────────────────────┘ │ ┌───────────────────┐ ┌───────────────────────────────┐ +/// Stream 1 │ │ │ │ ┌───┬───╦═══╦───┬───╦═══╗ │ +/// ├─▶│ ProgressiveEval │───▶│ │ A │ B ║ C ║ D │ M ║ N ║ ... │ +/// │ │ │ │ └───┴─▲─╩═══╩───┴───╩═══╝ │ +/// ┌─────────────────────────┐ │ └───────────────────┘ └─┬─────┴───────────────────────┘ +/// │ ╔═══╦═══╗ │ │ +/// │ ║ M ║ N ║ │──┘ │ +/// │ ╚═══╩═══╝ │ Output only include top record batches that cover top N rows +/// └─────────────────────────┘ +/// Stream 2 +/// +/// +/// Input Streams Output stream +/// (in some order) (in same order) +/// ``` +#[derive(Debug)] +pub(crate) struct ProgressiveEvalExec { + /// Input plan + input: Arc, + + /// Corresponding value ranges of the input plan + /// None if the value ranges are not available + value_ranges: Option>, + + /// Execution metrics + metrics: ExecutionPlanMetricsSet, + + /// Optional number of rows to fetch. 
Stops producing rows after this fetch + fetch: Option, +} + +impl ProgressiveEvalExec { + /// Create a new progressive execution plan + pub fn new( + input: Arc, + value_ranges: Option>, + fetch: Option, + ) -> Self { + Self { + input, + value_ranges, + metrics: ExecutionPlanMetricsSet::new(), + fetch, + } + } + + /// Input schema + pub fn input(&self) -> &Arc { + &self.input + } +} + +impl DisplayAs for ProgressiveEvalExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!(f, "ProgressiveEvalExec: ")?; + if let Some(fetch) = self.fetch { + write!(f, "fetch={fetch}, ")?; + }; + if let Some(value_ranges) = &self.value_ranges { + write!(f, "input_ranges={value_ranges:?}")?; + }; + + Ok(()) + } + } + } +} + +impl ExecutionPlan for ProgressiveEvalExec { + /// Return a reference to Any that can be used for downcasting + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.input.schema() + } + + /// Get the output partitioning of this plan + fn output_partitioning(&self) -> Partitioning { + // This node serializes all the data to a single partition + Partitioning::UnknownPartitioning(1) + } + + /// Specifies whether this plan generates an infinite stream of records. + /// If the plan does not support pipelining, but its input(s) are + /// infinite, returns an error to indicate this. + fn unbounded_output(&self, children: &[bool]) -> Result { + Ok(children[0]) + } + + fn required_input_distribution(&self) -> Vec { + vec![Distribution::UnspecifiedDistribution] + } + + fn benefits_from_input_partitioning(&self) -> Vec { + vec![false] + } + + fn required_input_ordering(&self) -> Vec>> { + self.input() + .output_ordering() + .map(|_| None) + .into_iter() + .collect() + } + + fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + self.input.output_ordering() + } + + /// ProgressiveEvalExec will only accept sorted input + /// and will maintain the input order + fn maintains_input_order(&self) -> Vec { + vec![true] + } + + fn children(&self) -> Vec> { + vec![Arc::::clone(&self.input)] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + Ok(Arc::new(Self::new( + Arc::::clone(&children[0]), + self.value_ranges.clone(), + self.fetch, + ))) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + trace!( + "Start ProgressiveEvalExec::execute for partition: {}", + partition + ); + if 0 != partition { + return internal_err!("ProgressiveEvalExec invalid partition {partition}"); + } + + let input_partitions = self.input.output_partitioning().partition_count(); + trace!( + "Number of input partitions of ProgressiveEvalExec::execute: {}", + input_partitions + ); + let schema = self.schema(); + + // Have the input streams run in parallel + // todo: maybe in the future we do not need this parallelism if number of fecthed rows is in the fitst stream + let receivers = (0..input_partitions) + .map(|partition| { + let stream = self + .input + .execute(partition, Arc::::clone(&context))?; + + Ok(spawn_buffered(stream, 1)) + }) + .collect::>()?; + + debug!("Done setting up sender-receiver for ProgressiveEvalExec::execute"); + + let result = ProgressiveEvalStream::new( + receivers, + schema, + BaselineMetrics::new(&self.metrics, partition), + self.fetch, + )?; + + debug!("Got stream result from ProgressiveEvalStream::new_from_receivers"); + + Ok(Box::pin(result)) + } + + fn metrics(&self) -> Option { + 
Some(self.metrics.clone_inner()) + } + + fn statistics(&self) -> Result { + self.input.statistics() + } + + fn equivalence_properties(&self) -> EquivalenceProperties { + // progressive eval does not change the equivalence properties of its input + self.input.equivalence_properties() + } +} + +/// Concat input streams until reaching the fetch limit +struct ProgressiveEvalStream { + /// input streams + input_streams: Vec, + + /// The schema of the input and output. + schema: SchemaRef, + + /// used to record execution metrics + metrics: BaselineMetrics, + + /// Index of current stream + current_stream_idx: usize, + + /// If the stream has encountered an error + aborted: bool, + + /// Optional number of rows to fetch + fetch: Option, + + /// number of rows produced + produced: usize, +} + +impl ProgressiveEvalStream { + fn new( + input_streams: Vec, + schema: SchemaRef, + metrics: BaselineMetrics, + fetch: Option, + ) -> Result { + Ok(Self { + input_streams, + schema, + metrics, + current_stream_idx: 0, + aborted: false, + fetch, + produced: 0, + }) + } +} + +impl Stream for ProgressiveEvalStream { + type Item = Result; + + // Return the next record batch until reaching the fetch limit or the end of all input streams + // Return pending if the next record batch is not ready + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + // Error in previous poll + if self.aborted { + return Poll::Ready(None); + } + + // Have reached the fetch limit + if self.produced >= self.fetch.unwrap_or(std::usize::MAX) { + return Poll::Ready(None); + } + + // Have reached the end of all input streams + if self.current_stream_idx >= self.input_streams.len() { + return Poll::Ready(None); + } + + // Get next record batch + let mut poll; + loop { + let idx = self.current_stream_idx; + poll = self.input_streams[idx].poll_next_unpin(cx); + match poll { + // This input stream no longer has data, move to next stream + Poll::Ready(None) => { + self.current_stream_idx += 1; + if self.current_stream_idx >= self.input_streams.len() { + break; + } + } + _ => break, + } + } + + let poll = match ready!(poll) { + // This input stream has data, return its next record batch + Some(Ok(batch)) => { + self.produced += batch.num_rows(); + Poll::Ready(Some(Ok(batch))) + } + // This input stream has an error, return the error and set aborted to true to stop polling next round + Some(Err(e)) => { + self.aborted = true; + Poll::Ready(Some(Err(e))) + } + // This input stream has no more data, return None (aka finished) + None => { + // Reaching here means data of all streams have read + assert!( + self.current_stream_idx >= self.input_streams.len(), + "ProgressiveEvalStream::poll_next should not return None before all input streams are read",); + + Poll::Ready(None) + } + }; + + self.metrics.record_poll(poll) + } +} + +impl RecordBatchStream for ProgressiveEvalStream { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } +} + +// todo: this is a copy from DF code. 
When this ProgressiveEval operator is moved to DF, this can be removed +/// If running in a tokio context spawns the execution of `stream` to a separate task +/// allowing it to execute in parallel with an intermediate buffer of size `buffer` +pub(crate) fn spawn_buffered( + mut input: SendableRecordBatchStream, + buffer: usize, +) -> SendableRecordBatchStream { + // Use tokio only if running from a multi-thread tokio context + match tokio::runtime::Handle::try_current() { + Ok(handle) if handle.runtime_flavor() == tokio::runtime::RuntimeFlavor::MultiThread => { + let mut builder = RecordBatchReceiverStream::builder(input.schema(), buffer); + + let sender = builder.tx(); + + builder.spawn(async move { + while let Some(item) = input.next().await { + if sender.send(item).await.is_err() { + // receiver dropped when query is shutdown early (e.g., limit) or error, + // no need to return propagate the send error. + return Ok(()); + } + } + + Ok(()) + }); + + builder.build() + } + _ => input, + } +} + +#[cfg(test)] +mod tests { + use std::iter::FromIterator; + use std::sync::Weak; + + use arrow::array::ArrayRef; + use arrow::array::{Int32Array, StringArray, TimestampNanosecondArray}; + use arrow::datatypes::Schema; + use arrow::datatypes::{DataType, Field}; + use arrow::record_batch::RecordBatch; + use datafusion::assert_batches_eq; + use datafusion::physical_plan::collect; + use datafusion::physical_plan::memory::MemoryExec; + use datafusion::physical_plan::metrics::{MetricValue, Timestamp}; + use futures::{Future, FutureExt}; + + use super::*; + + #[tokio::test] + async fn test_no_input_stream() { + let task_ctx = Arc::new(TaskContext::default()); + _test_progressive_eval( + &[], + None, + None, // no fetch limit --> return all rows + &["++", "++"], + task_ctx, + ) + .await; + } + + #[tokio::test] + async fn test_one_input_stream() { + let task_ctx = Arc::new(TaskContext::default()); + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("a"), + Some("c"), + Some("e"), + Some("g"), + Some("j"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + // return all + _test_progressive_eval( + &[vec![b1.clone()]], + None, + None, // no fetch limit --> return all rows + &[ + "+---+---+-------------------------------+", + "| a | b | c |", + "+---+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | c | 1970-01-01T00:00:00.000000007 |", + "| 7 | e | 1970-01-01T00:00:00.000000006 |", + "| 9 | g | 1970-01-01T00:00:00.000000005 |", + "| 3 | j | 1970-01-01T00:00:00.000000008 |", + "+---+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + + // fetch no rows + _test_progressive_eval( + &[vec![b1.clone()]], + None, + Some(0), + &["++", "++"], + Arc::clone(&task_ctx), + ) + .await; + + // still return all even select 3 rows becasue first record batch is returned + _test_progressive_eval( + &[vec![b1.clone()]], + None, + Some(3), + &[ + "+---+---+-------------------------------+", + "| a | b | c |", + "+---+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | c | 1970-01-01T00:00:00.000000007 |", + "| 7 | e | 1970-01-01T00:00:00.000000006 |", + "| 9 | g | 1970-01-01T00:00:00.000000005 |", + "| 3 | j | 1970-01-01T00:00:00.000000008 |", + "+---+---+-------------------------------+", + ], + 
Arc::clone(&task_ctx), + ) + .await; + + // return all because fetch limit is larger + _test_progressive_eval( + &[vec![b1.clone()]], + None, + Some(7), + &[ + "+---+---+-------------------------------+", + "| a | b | c |", + "+---+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | c | 1970-01-01T00:00:00.000000007 |", + "| 7 | e | 1970-01-01T00:00:00.000000006 |", + "| 9 | g | 1970-01-01T00:00:00.000000005 |", + "| 3 | j | 1970-01-01T00:00:00.000000008 |", + "+---+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + } + + #[tokio::test] + async fn test_return_all() { + let task_ctx = Arc::new(TaskContext::default()); + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("a"), + Some("c"), + Some("e"), + Some("g"), + Some("j"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![10, 20, 70, 90, 30])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("b"), + Some("d"), + Some("f"), + Some("h"), + Some("j"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![4, 6, 2, 2, 6])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + // [b1, b2] + _test_progressive_eval( + &[vec![b1.clone()], vec![b2.clone()]], + None, + None, // no fetch limit --> return all rows + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | c | 1970-01-01T00:00:00.000000007 |", + "| 7 | e | 1970-01-01T00:00:00.000000006 |", + "| 9 | g | 1970-01-01T00:00:00.000000005 |", + "| 3 | j | 1970-01-01T00:00:00.000000008 |", + "| 10 | b | 1970-01-01T00:00:00.000000004 |", + "| 20 | d | 1970-01-01T00:00:00.000000006 |", + "| 70 | f | 1970-01-01T00:00:00.000000002 |", + "| 90 | h | 1970-01-01T00:00:00.000000002 |", + "| 30 | j | 1970-01-01T00:00:00.000000006 |", + "+----+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + + // [b2, b1] + _test_progressive_eval( + &[vec![b2], vec![b1]], + None, + None, + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 10 | b | 1970-01-01T00:00:00.000000004 |", + "| 20 | d | 1970-01-01T00:00:00.000000006 |", + "| 70 | f | 1970-01-01T00:00:00.000000002 |", + "| 90 | h | 1970-01-01T00:00:00.000000002 |", + "| 30 | j | 1970-01-01T00:00:00.000000006 |", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | c | 1970-01-01T00:00:00.000000007 |", + "| 7 | e | 1970-01-01T00:00:00.000000006 |", + "| 9 | g | 1970-01-01T00:00:00.000000005 |", + "| 3 | j | 1970-01-01T00:00:00.000000008 |", + "+----+---+-------------------------------+", + ], + task_ctx, + ) + .await; + } + + #[tokio::test] + async fn test_return_all_on_different_length_batches() { + let task_ctx = Arc::new(TaskContext::default()); + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("a"), + Some("b"), + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = 
Arc::new(Int32Array::from(vec![70, 90, 30])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![4, 6, 2])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + // [b1, b2] + _test_progressive_eval( + &[vec![b1.clone()], vec![b2.clone()]], + None, + None, + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | d | 1970-01-01T00:00:00.000000005 |", + "| 3 | e | 1970-01-01T00:00:00.000000008 |", + "| 70 | c | 1970-01-01T00:00:00.000000004 |", + "| 90 | d | 1970-01-01T00:00:00.000000006 |", + "| 30 | e | 1970-01-01T00:00:00.000000002 |", + "+----+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + + // [b2, b1] + _test_progressive_eval( + &[vec![b2], vec![b1]], + None, + None, + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 70 | c | 1970-01-01T00:00:00.000000004 |", + "| 90 | d | 1970-01-01T00:00:00.000000006 |", + "| 30 | e | 1970-01-01T00:00:00.000000002 |", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | d | 1970-01-01T00:00:00.000000005 |", + "| 3 | e | 1970-01-01T00:00:00.000000008 |", + "+----+---+-------------------------------+", + ], + task_ctx, + ) + .await; + } + + #[tokio::test] + async fn test_fetch_limit_1() { + let task_ctx = Arc::new(TaskContext::default()); + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("a"), + Some("b"), + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![70, 90, 30])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![4, 6, 2])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + // [b2, b1] + // b2 has 3 rows. b1 has 5 rows + // Fetch limit is 1 --> return all 3 rows of the first batch (b2) that covers that limit + _test_progressive_eval( + &[vec![b2.clone()], vec![b1.clone()]], + None, + Some(1), + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 70 | c | 1970-01-01T00:00:00.000000004 |", + "| 90 | d | 1970-01-01T00:00:00.000000006 |", + "| 30 | e | 1970-01-01T00:00:00.000000002 |", + "+----+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + + // [b1, b2] + // b1 has 5 rows. 
b2 has 3 rows + // Fetch limit is 1 --> return all 5 rows of the first batch (b1) that covers that limit + _test_progressive_eval( + &[vec![b1], vec![b2]], + None, + Some(1), + &[ + "+---+---+-------------------------------+", + "| a | b | c |", + "+---+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | d | 1970-01-01T00:00:00.000000005 |", + "| 3 | e | 1970-01-01T00:00:00.000000008 |", + "+---+---+-------------------------------+", + ], + task_ctx, + ) + .await; + } + + #[tokio::test] + async fn test_fetch_limit_equal_first_batch_size() { + let task_ctx = Arc::new(TaskContext::default()); + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("a"), + Some("b"), + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![70, 90, 30])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![4, 6, 2])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + // [b2, b1] + // b2 has 3 rows. b1 has 5 rows + // Fetch limit is 3 --> return all 3 rows of the first batch (b2) that covers that limit + _test_progressive_eval( + &[vec![b2.clone()], vec![b1.clone()]], + None, + Some(3), + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 70 | c | 1970-01-01T00:00:00.000000004 |", + "| 90 | d | 1970-01-01T00:00:00.000000006 |", + "| 30 | e | 1970-01-01T00:00:00.000000002 |", + "+----+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + + // [b1, b2] + // b1 has 5 rows. b2 has 3 rows + // Fetch limit is 5 --> return all 5 rows of first batch (b1) that covers that limit + _test_progressive_eval( + &[vec![b1], vec![b2]], + None, + Some(5), + &[ + "+---+---+-------------------------------+", + "| a | b | c |", + "+---+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | d | 1970-01-01T00:00:00.000000005 |", + "| 3 | e | 1970-01-01T00:00:00.000000008 |", + "+---+---+-------------------------------+", + ], + task_ctx, + ) + .await; + } + + #[tokio::test] + async fn test_fetch_limit_over_first_batch_size() { + let task_ctx = Arc::new(TaskContext::default()); + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("a"), + Some("b"), + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![70, 90, 30])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("c"), + Some("d"), + Some("e"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![4, 6, 2])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + // [b2, b1] + // b2 has 3 rows. 
b1 has 5 rows + // Fetch limit is 4 --> return all rows of both batches in the order of b2, b1 + _test_progressive_eval( + &[vec![b2.clone()], vec![b1.clone()]], + None, + Some(4), + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 70 | c | 1970-01-01T00:00:00.000000004 |", + "| 90 | d | 1970-01-01T00:00:00.000000006 |", + "| 30 | e | 1970-01-01T00:00:00.000000002 |", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | d | 1970-01-01T00:00:00.000000005 |", + "| 3 | e | 1970-01-01T00:00:00.000000008 |", + "+----+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + + // [b1, b2] + // b1 has 5 rows. b2 has 3 rows + // Fetch limit is 6 --> return all rows of both batches in the order of b1, b2 + _test_progressive_eval( + &[vec![b1], vec![b2]], + None, + Some(6), + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | d | 1970-01-01T00:00:00.000000005 |", + "| 3 | e | 1970-01-01T00:00:00.000000008 |", + "| 70 | c | 1970-01-01T00:00:00.000000004 |", + "| 90 | d | 1970-01-01T00:00:00.000000006 |", + "| 30 | e | 1970-01-01T00:00:00.000000002 |", + "+----+---+-------------------------------+", + ], + task_ctx, + ) + .await; + } + + #[tokio::test] + async fn test_three_partitions_with_nulls() { + let task_ctx = Arc::new(TaskContext::default()); + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 7, 9, 3])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("a"), + Some("b"), + Some("c"), + None, + Some("f"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![8, 7, 6, 5, 8])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![10, 20, 70])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + Some("e"), + Some("g"), + Some("h"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![40, 60, 20])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![100, 200, 700, 900])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![ + None, + Some("g"), + Some("h"), + Some("i"), + ])); + let c: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![4, 6, 2, 2])); + let b3 = RecordBatch::try_from_iter(vec![("a", a), ("b", b), ("c", c)]).unwrap(); + + // [b1, b2, b3] + // b1 has 5 rows. b2 has 3 rows. b3 has 4 rows + // Fetch limit is 1 --> return all rows of the b1 + _test_progressive_eval( + &[vec![b1.clone()], vec![b2.clone()], vec![b3.clone()]], + None, + Some(1), + &[ + "+---+---+-------------------------------+", + "| a | b | c |", + "+---+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | | 1970-01-01T00:00:00.000000005 |", + "| 3 | f | 1970-01-01T00:00:00.000000008 |", + "+---+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + + // [b1, b2, b3] + // b1 has 5 rows. b2 has 3 rows. 
b3 has 4 rows + // Fetch limit is 7 --> return all rows of the b1 & b2 in the order of b1, b2 + _test_progressive_eval( + &[vec![b1.clone()], vec![b2.clone()], vec![b3.clone()]], + None, + Some(7), + &[ + "+----+---+-------------------------------+", + "| a | b | c |", + "+----+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | | 1970-01-01T00:00:00.000000005 |", + "| 3 | f | 1970-01-01T00:00:00.000000008 |", + "| 10 | e | 1970-01-01T00:00:00.000000040 |", + "| 20 | g | 1970-01-01T00:00:00.000000060 |", + "| 70 | h | 1970-01-01T00:00:00.000000020 |", + "+----+---+-------------------------------+", + ], + Arc::clone(&task_ctx), + ) + .await; + + // [b1, b2, b3] + // b1 has 5 rows. b2 has 3 rows. b3 has 4 rows + // Fetch limit is 50 --> return all rows of all batches in the order of b1, b2, b3 + _test_progressive_eval( + &[vec![b1], vec![b2], vec![b3]], + None, + Some(50), + &[ + "+-----+---+-------------------------------+", + "| a | b | c |", + "+-----+---+-------------------------------+", + "| 1 | a | 1970-01-01T00:00:00.000000008 |", + "| 2 | b | 1970-01-01T00:00:00.000000007 |", + "| 7 | c | 1970-01-01T00:00:00.000000006 |", + "| 9 | | 1970-01-01T00:00:00.000000005 |", + "| 3 | f | 1970-01-01T00:00:00.000000008 |", + "| 10 | e | 1970-01-01T00:00:00.000000040 |", + "| 20 | g | 1970-01-01T00:00:00.000000060 |", + "| 70 | h | 1970-01-01T00:00:00.000000020 |", + "| 100 | | 1970-01-01T00:00:00.000000004 |", + "| 200 | g | 1970-01-01T00:00:00.000000006 |", + "| 700 | h | 1970-01-01T00:00:00.000000002 |", + "| 900 | i | 1970-01-01T00:00:00.000000002 |", + "+-----+---+-------------------------------+", + ], + task_ctx, + ) + .await; + } + + async fn _test_progressive_eval( + partitions: &[Vec], + value_ranges: Option>, + fetch: Option, + exp: &[&str], + context: Arc, + ) { + let schema = if partitions.is_empty() { + // just whatwever schema + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2])); + let batch = RecordBatch::try_from_iter(vec![("a", a)]).unwrap(); + batch.schema() + } else { + partitions[0][0].schema() + }; + + let exec = MemoryExec::try_new(partitions, schema, None).unwrap(); + let progressive = Arc::new(ProgressiveEvalExec::new( + Arc::new(exec), + value_ranges, + fetch, + )); + + let collected = collect(progressive, context).await.unwrap(); + assert_batches_eq!(exp, collected.as_slice()); + } + + #[tokio::test] + async fn test_merge_metrics() { + let task_ctx = Arc::new(TaskContext::default()); + let a: ArrayRef = Arc::new(Int32Array::from(vec![1, 2])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![Some("a"), Some("c")])); + let b1 = RecordBatch::try_from_iter(vec![("a", a), ("b", b)]).unwrap(); + + let a: ArrayRef = Arc::new(Int32Array::from(vec![10, 20])); + let b: ArrayRef = Arc::new(StringArray::from_iter(vec![Some("b"), Some("d")])); + let b2 = RecordBatch::try_from_iter(vec![("a", a), ("b", b)]).unwrap(); + + let schema = b1.schema(); + let exec = MemoryExec::try_new(&[vec![b1], vec![b2]], schema, None).unwrap(); + let progressive = Arc::new(ProgressiveEvalExec::new(Arc::new(exec), None, None)); + + let collected = collect(Arc::::clone(&progressive), task_ctx) + .await + .unwrap(); + let expected = [ + "+----+---+", + "| a | b |", + "+----+---+", + "| 1 | a |", + "| 2 | c |", + "| 10 | b |", + "| 20 | d |", + "+----+---+", + ]; + assert_batches_eq!(expected, collected.as_slice()); + + // Now, validate metrics + let 
metrics = progressive.metrics().unwrap(); + + assert_eq!(metrics.output_rows().unwrap(), 4); + assert!(metrics.elapsed_compute().unwrap() > 0); + + let mut saw_start = false; + let mut saw_end = false; + metrics.iter().for_each(|m| match m.value() { + MetricValue::StartTimestamp(ts) => { + saw_start = true; + assert!(nanos_from_timestamp(ts) > 0); + } + MetricValue::EndTimestamp(ts) => { + saw_end = true; + assert!(nanos_from_timestamp(ts) > 0); + } + _ => {} + }); + + assert!(saw_start); + assert!(saw_end); + } + + fn nanos_from_timestamp(ts: &Timestamp) -> i64 { + ts.value().unwrap().timestamp_nanos_opt().unwrap() + } + + #[tokio::test] + async fn test_drop_cancel() -> Result<()> { + let task_ctx = Arc::new(TaskContext::default()); + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, true)])); + + let blocking_exec = Arc::new(BlockingExec::new(Arc::clone(&schema), 2)); + let refs = blocking_exec.refs(); + let progressive_exec = Arc::new(ProgressiveEvalExec::new(blocking_exec, None, None)); + + let fut = collect(progressive_exec, task_ctx); + let mut fut = fut.boxed(); + + assert_is_pending(&mut fut); + drop(fut); + assert_strong_count_converges_to_zero(refs).await; + + Ok(()) + } + + // todo: this is copied from DF. When we move ProgressiveEval to DF, this will be removed + /// Asserts that the strong count of the given [`Weak`] pointer converges to zero. + /// + /// This might take a while but has a timeout. + pub async fn assert_strong_count_converges_to_zero(refs: Weak) { + #![allow(clippy::future_not_send)] + tokio::time::timeout(std::time::Duration::from_secs(10), async { + loop { + if Weak::strong_count(&refs) == 0 { + break; + } + tokio::time::sleep(std::time::Duration::from_millis(10)).await; + } + }) + .await + .unwrap(); + } + + // todo: this is copied from DF. When we move ProgressiveEval to DF, this will be removed + /// Asserts that given future is pending. + pub fn assert_is_pending<'a, T>(fut: &mut Pin + Send + 'a>>) { + let waker = futures::task::noop_waker(); + let mut cx = futures::task::Context::from_waker(&waker); + let poll = fut.poll_unpin(&mut cx); + + assert!(poll.is_pending()); + } + + // todo: this is copied from DF. When we move ProgressiveEval to DF, this will be removed + /// Execution plan that emits streams that block forever. + /// + /// This is useful to test shutdown / cancelation behavior of certain execution plans. + #[derive(Debug)] + pub struct BlockingExec { + /// Schema that is mocked by this plan. + schema: SchemaRef, + + /// Number of output partitions. + n_partitions: usize, + + /// Ref-counting helper to check if the plan and the produced stream are still in memory. + refs: Arc<()>, + } + + impl BlockingExec { + /// Create new [`BlockingExec`] with a give schema and number of partitions. + pub fn new(schema: SchemaRef, n_partitions: usize) -> Self { + Self { + schema, + n_partitions, + refs: Default::default(), + } + } + + /// Weak pointer that can be used for ref-counting this execution plan and its streams. + /// + /// Use [`Weak::strong_count`] to determine if the plan itself and its streams are dropped (should be 0 in that + /// case). Note that tokio might take some time to cancel spawned tasks, so you need to wrap this check into a retry + /// loop. Use [`assert_strong_count_converges_to_zero`] to archive this. 
+ pub fn refs(&self) -> Weak<()> { + Arc::downgrade(&self.refs) + } + } + + impl DisplayAs for BlockingExec { + fn fmt_as( + &self, + t: DisplayFormatType, + f: &mut std::fmt::Formatter<'_>, + ) -> std::fmt::Result { + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!(f, "BlockingExec",) + } + } + } + } + + impl ExecutionPlan for BlockingExec { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + + fn children(&self) -> Vec> { + // this is a leaf node and has no children + vec![] + } + + fn output_partitioning(&self) -> Partitioning { + Partitioning::UnknownPartitioning(self.n_partitions) + } + + fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> { + None + } + + fn with_new_children( + self: Arc, + _: Vec>, + ) -> Result> { + internal_err!("Children cannot be replaced in {self:?}") + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> Result { + Ok(Box::pin(BlockingStream { + schema: Arc::clone(&self.schema), + _refs: Arc::clone(&self.refs), + })) + } + + fn statistics(&self) -> Result { + unimplemented!() + } + } + + /// A [`RecordBatchStream`] that is pending forever. + #[derive(Debug)] + pub struct BlockingStream { + /// Schema mocked by this stream. + schema: SchemaRef, + + /// Ref-counting helper to check if the stream are still in memory. + _refs: Arc<()>, + } + + impl Stream for BlockingStream { + type Item = Result; + + fn poll_next(self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll> { + Poll::Pending + } + } + + impl RecordBatchStream for BlockingStream { + fn schema(&self) -> SchemaRef { + Arc::clone(&self.schema) + } + } +} diff --git a/iox_query/src/provider/record_batch_exec.rs b/iox_query/src/provider/record_batch_exec.rs index 9b3c591edae..612228681bc 100644 --- a/iox_query/src/provider/record_batch_exec.rs +++ b/iox_query/src/provider/record_batch_exec.rs @@ -1,17 +1,19 @@ //! 
Implementation of a DataFusion PhysicalPlan node across partition chunks -use crate::{statistics::DFStatsAggregator, QueryChunk, CHUNK_ORDER_COLUMN_NAME}; +use crate::statistics::build_statistics_for_chunks; +use crate::{QueryChunk, CHUNK_ORDER_COLUMN_NAME}; use super::adapter::SchemaAdapterStream; -use arrow::datatypes::{Schema, SchemaRef}; +use arrow::datatypes::SchemaRef; +use datafusion::physical_plan::display::ProjectSchemaDisplay; use datafusion::{ error::DataFusionError, execution::context::TaskContext, physical_plan::{ expressions::{Column, PhysicalSortExpr}, metrics::{BaselineMetrics, ExecutionPlanMetricsSet, MetricsSet}, - ColumnStatistics, DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, - SendableRecordBatchStream, Statistics, + DisplayAs, DisplayFormatType, ExecutionPlan, Partitioning, SendableRecordBatchStream, + Statistics, }, scalar::ScalarValue, }; @@ -55,40 +57,10 @@ impl RecordBatchesExec { schema: SchemaRef, output_sort_key_memo: Option, ) -> Self { - let chunk_order_field = schema.field_with_name(CHUNK_ORDER_COLUMN_NAME).ok(); - let chunk_order_only_schema = - chunk_order_field.map(|field| Schema::new(vec![field.clone()])); - let chunks: Vec<_> = chunks.into_iter().collect(); + let statistics = build_statistics_for_chunks(&chunks, Arc::clone(&schema)); - let statistics = chunks - .iter() - .fold(DFStatsAggregator::new(&schema), |mut agg, chunk| { - agg.update(&chunk.stats(), chunk.schema().as_arrow().as_ref()); - - if let Some(schema) = chunk_order_only_schema.as_ref() { - let order = chunk.order().get(); - let order = ScalarValue::from(order); - agg.update( - &Statistics { - num_rows: Some(0), - total_byte_size: Some(0), - column_statistics: Some(vec![ColumnStatistics { - null_count: Some(0), - max_value: Some(order.clone()), - min_value: Some(order), - distinct_count: Some(1), - }]), - is_exact: true, - }, - schema, - ); - } - - agg - }) - .build(); - + let chunk_order_field = schema.field_with_name(CHUNK_ORDER_COLUMN_NAME).ok(); let output_ordering = if chunk_order_field.is_some() { Some(vec![ // every chunk gets its own partition, so we can claim that the output is ordered @@ -199,8 +171,8 @@ impl ExecutionPlan for RecordBatchesExec { Some(self.metrics.clone_inner()) } - fn statistics(&self) -> Statistics { - self.statistics.clone() + fn statistics(&self) -> Result { + Ok(self.statistics.clone()) } } @@ -208,7 +180,11 @@ impl DisplayAs for RecordBatchesExec { fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter<'_>) -> fmt::Result { match t { DisplayFormatType::Default | DisplayFormatType::Verbose => { - write!(f, "RecordBatchesExec: chunks={}", self.chunks.len(),) + write!(f, "RecordBatchesExec: chunks={}", self.chunks.len(),)?; + if !self.schema.fields().is_empty() { + write!(f, ", projection={}", ProjectSchemaDisplay(&self.schema))?; + } + Ok(()) } } } diff --git a/iox_query/src/pruning.rs b/iox_query/src/pruning.rs index 4885b2867e3..50f44f1baf3 100644 --- a/iox_query/src/pruning.rs +++ b/iox_query/src/pruning.rs @@ -2,20 +2,21 @@ use crate::QueryChunk; use arrow::{ - array::{ArrayRef, UInt64Array}, + array::{ArrayRef, BooleanArray, UInt64Array}, datatypes::{DataType, SchemaRef}, }; use datafusion::{ physical_expr::execution_props::ExecutionProps, physical_optimizer::pruning::PruningStatistics, physical_plan::{ColumnStatistics, Statistics}, - prelude::{col, lit_timestamp_nano, Column, Expr}, + prelude::{col, Column, Expr}, scalar::ScalarValue, }; -use datafusion_util::create_pruning_predicate; +use datafusion_util::{create_pruning_predicate, 
lit_timestamptz_nano}; use observability_deps::tracing::{debug, trace, warn}; use query_functions::group_by::Aggregate; use schema::{Schema, TIME_COLUMN_NAME}; +use std::collections::HashSet; use std::sync::Arc; /// Reason why a chunk could not be pruned. @@ -82,16 +83,7 @@ pub fn prune_chunks( .iter() .map(|c| (c.stats(), c.schema().as_arrow())) .collect(); - prune_summaries(table_schema, &summaries, filters) -} -/// Given a `Vec` of pruning summaries, return a `Vec` where `false` indicates that the -/// predicate can be proven to evaluate to `false` for every single row. -pub fn prune_summaries( - table_schema: &Schema, - summaries: &[(Arc, SchemaRef)], - filters: &[Expr], -) -> Result, NotPrunedReason> { let filter_expr = match filters.iter().cloned().reduce(|a, b| a.and(b)) { Some(expr) => expr, None => { @@ -99,12 +91,23 @@ pub fn prune_summaries( return Err(NotPrunedReason::NoExpressionOnPredicate); } }; + + prune_summaries(table_schema, &summaries, &filter_expr) +} + +/// Given a `Vec` of pruning summaries, return a `Vec` where `false` indicates that the +/// predicate can be proven to evaluate to `false` for every single row. +pub fn prune_summaries( + table_schema: &Schema, + summaries: &[(Arc, SchemaRef)], + filter_expr: &Expr, +) -> Result, NotPrunedReason> { trace!(%filter_expr, "Filter_expr of pruning chunks"); // no information about the queries here let props = ExecutionProps::new(); let pruning_predicate = - match create_pruning_predicate(&props, &filter_expr, &table_schema.as_arrow()) { + match create_pruning_predicate(&props, filter_expr, &table_schema.as_arrow()) { Ok(p) => p, Err(e) => { warn!(%e, ?filter_expr, "Can not create pruning predicate"); @@ -148,9 +151,8 @@ impl<'a> ChunkPruningStatistics<'a> { column: &'b Column, ) -> impl Iterator> + 'a { self.summaries.iter().map(|(stats, schema)| { - let stats = stats.column_statistics.as_ref()?; let idx = schema.index_of(&column.name).ok()?; - Some(&stats[idx]) + Some(&stats.column_statistics[idx]) }) } } @@ -175,10 +177,19 @@ impl<'a> PruningStatistics for ChunkPruningStatistics<'a> { fn null_counts(&self, column: &Column) -> Option { let null_counts = self .column_summaries(column) - .map(|x| x.and_then(|s| s.null_count.map(|x| x as u64))); + .map(|stats| stats.and_then(|stats| stats.null_count.get_value())) + .map(|x| x.map(|x| *x as u64)); Some(Arc::new(UInt64Array::from_iter(null_counts))) } + + fn contained( + &self, + _column: &datafusion::common::Column, + _values: &HashSet, + ) -> Option { + None + } } /// Collects an [`ArrayRef`] containing the aggregate statistic corresponding to @@ -201,15 +212,15 @@ fn collect_pruning_stats<'a>( /// Returns the aggregate statistic corresponding to `aggregate` from `stats` fn get_aggregate(stats: &ColumnStatistics, aggregate: Aggregate) -> Option<&ScalarValue> { match aggregate { - Aggregate::Min => stats.min_value.as_ref(), - Aggregate::Max => stats.max_value.as_ref(), + Aggregate::Min => stats.min_value.get_value(), + Aggregate::Max => stats.max_value.get_value(), _ => None, } } /// Retention time expression, "time > retention_time". pub fn retention_expr(retention_time: i64) -> Expr { - col(TIME_COLUMN_NAME).gt(lit_timestamp_nano(retention_time)) + col(TIME_COLUMN_NAME).gt(lit_timestamptz_nano(retention_time)) } #[cfg(test)] diff --git a/iox_query/src/query_log.rs b/iox_query/src/query_log.rs new file mode 100644 index 00000000000..e6ae929cb76 --- /dev/null +++ b/iox_query/src/query_log.rs @@ -0,0 +1,704 @@ +//! 
Ring buffer of queries that have been run with some brief information + +use data_types::NamespaceId; +use datafusion::physical_plan::ExecutionPlan; +use iox_time::{Time, TimeProvider}; +use observability_deps::tracing::{info, warn}; +use parking_lot::Mutex; +use std::{ + collections::VecDeque, + fmt::Debug, + sync::{ + atomic::{self, AtomicBool, AtomicI64, AtomicUsize, Ordering}, + Arc, + }, + time::Duration, +}; +use trace::ctx::TraceId; +use uuid::Uuid; + +/// The query duration used for queries still running. +const UNCOMPLETED_DURATION: i64 = -1; + +/// Information about a single query that was executed +pub struct QueryLogEntry { + /// Unique ID. + pub id: Uuid, + + /// Namespace ID. + pub namespace_id: NamespaceId, + + /// Namespace name. + pub namespace_name: Arc, + + /// The type of query + pub query_type: &'static str, + + /// The text of the query (SQL for sql queries, pbjson for storage rpc queries) + pub query_text: QueryText, + + /// The trace ID if any + pub trace_id: Option, + + /// Time at which the query was run + pub issue_time: Time, + + /// Duration it took to acquire a semaphore permit, relative to [`issue_time`](Self::issue_time). + permit_duration: AtomicDuration, + + /// Duration it took to plan the query, relative to [`issue_time`](Self::issue_time) + [`permit_duration`](Self::permit_duration). + plan_duration: AtomicDuration, + + /// Duration it took to execute the query, relative to [`issue_time`](Self::issue_time) + + /// [`permit_duration`](Self::permit_duration) + [`plan_duration`](Self::plan_duration). + execute_duration: AtomicDuration, + + /// Duration from [`issue_time`](Self::issue_time) til the query ended somehow. + end2end_duration: AtomicDuration, + + /// CPU duration spend for computation. + compute_duration: AtomicDuration, + + /// If the query completed successfully + success: AtomicBool, + + /// If the query is currently running (in any state). + running: AtomicBool, +} + +impl Debug for QueryLogEntry { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("QueryLogEntry") + .field("id", &self.id) + .field("namespace_id", &self.namespace_id) + .field("namespace_name", &self.namespace_name) + .field("query_type", &self.query_type) + .field("query_text", &self.query_text.to_string()) + .field("trace_id", &self.trace_id) + .field("issue_time", &self.issue_time) + .field("permit_duration", &self.permit_duration()) + .field("plan_duration", &self.plan_duration()) + .field("execute_duration", &self.execute_duration()) + .field("end2end_duration", &self.end2end_duration()) + .field("compute_duration", &self.compute_duration()) + .field("success", &self.success()) + .field("running", &self.running()) + .finish() + } +} + +impl QueryLogEntry { + /// Duration it took to acquire a semaphore permit, relative to [`issue_time`](Self::issue_time). + pub fn permit_duration(&self) -> Option { + self.permit_duration.get() + } + + /// Duration it took to plan the query, relative to [`issue_time`](Self::issue_time) + [`permit_duration`](Self::permit_duration). + pub fn plan_duration(&self) -> Option { + self.plan_duration.get() + } + + /// Duration it took to execute the query, relative to [`issue_time`](Self::issue_time) + + /// [`permit_duration`](Self::permit_duration) + [`plan_duration`](Self::plan_duration). + pub fn execute_duration(&self) -> Option { + self.execute_duration.get() + } + + /// Duration from [`issue_time`](Self::issue_time) til the query ended somehow. 
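+ // How the duration fields on this entry relate (as recorded by `QueryCompletedToken` below):
+ //   plan_duration    = planned() time        - issue_time
+ //   permit_duration  = permit() time         - (issue_time + plan_duration)
+ //   execute_duration = success()/fail() time - (issue_time + plan_duration + permit_duration)
+ //   end2end_duration = drop time             - issue_time (measured directly, not derived as a sum)
+ //   compute_duration = CPU time summed over the executed plan's `elapsed_compute` metrics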
+ pub fn end2end_duration(&self) -> Option { + self.end2end_duration.get() + } + + /// CPU duration spend for computation. + pub fn compute_duration(&self) -> Option { + self.compute_duration.get() + } + + /// Returns true if `set_completed` was called with `success=true` + pub fn success(&self) -> bool { + self.success.load(Ordering::SeqCst) + } + + /// If the query is currently running (in any state). + pub fn running(&self) -> bool { + self.running.load(Ordering::SeqCst) + } + + /// Log entry. + pub fn log(&self, when: &'static str) { + info!( + when, + id=%self.id, + namespace_id=self.namespace_id.get(), + namespace_name=self.namespace_name.as_ref(), + query_type=self.query_type, + query_text=%self.query_text, + trace_id=self.trace_id.map(|id| format!("{:x}", id.get())), + issue_time=%self.issue_time, + plan_duration_secs=self.plan_duration().map(|d| d.as_secs_f64()), + permit_duration_secs=self.permit_duration().map(|d| d.as_secs_f64()), + execute_duration_secs=self.execute_duration().map(|d| d.as_secs_f64()), + end2end_duration_secs=self.end2end_duration().map(|d| d.as_secs_f64()), + compute_duration_secs=self.compute_duration().map(|d| d.as_secs_f64()), + success=self.success(), + running=self.running(), + "query", + ) + } +} + +/// Snapshot of the entries the [`QueryLog`]. +#[derive(Debug)] +pub struct QueryLogEntries { + /// Entries. + pub entries: VecDeque>, + + /// Maximum number of entries + pub max_size: usize, + + /// Number of evicted entries due to the "max size" constraint. + pub evicted: usize, +} + +/// Stores a fixed number `QueryExecutions` -- handles locking +/// internally so can be shared across multiple +pub struct QueryLog { + log: Mutex>>, + max_size: usize, + evicted: AtomicUsize, + time_provider: Arc, + id_gen: IDGen, +} + +impl QueryLog { + /// Create a new QueryLog that can hold at most `size` items. + /// When the `size+1` item is added, item `0` is evicted. 
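+ // A hypothetical usage sketch of the log together with the completion token; `plan` and
+ // `time_provider` are placeholders here, not values defined by this change:
+ //     let log = QueryLog::new(1_000, time_provider);
+ //     let token = log.push(NamespaceId::new(1), Arc::from("ns"), "sql", Box::new("SELECT 1"), None);
+ //     let token = token.planned(Arc::clone(&plan)); // after the query is planned
+ //     let token = token.permit();                   // after the concurrency semaphore permit is acquired
+ //     token.success();                              // or token.fail(); dropping the token logs "end"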
+ pub fn new(max_size: usize, time_provider: Arc) -> Self { + Self::new_with_id_gen(max_size, time_provider, Box::new(Uuid::new_v4)) + } + + pub fn new_with_id_gen( + max_size: usize, + time_provider: Arc, + id_gen: IDGen, + ) -> Self { + Self { + log: Mutex::new(VecDeque::with_capacity(max_size)), + max_size, + evicted: AtomicUsize::new(0), + time_provider, + id_gen, + } + } + + pub fn push( + &self, + namespace_id: NamespaceId, + namespace_name: Arc, + query_type: &'static str, + query_text: QueryText, + trace_id: Option, + ) -> QueryCompletedToken { + let entry = Arc::new(QueryLogEntry { + id: (self.id_gen)(), + namespace_id, + namespace_name, + query_type, + query_text, + trace_id, + issue_time: self.time_provider.now(), + permit_duration: Default::default(), + plan_duration: Default::default(), + execute_duration: Default::default(), + end2end_duration: Default::default(), + compute_duration: Default::default(), + success: atomic::AtomicBool::new(false), + running: atomic::AtomicBool::new(true), + }); + entry.log("start"); + let token = QueryCompletedToken { + entry: Some(Arc::clone(&entry)), + time_provider: Arc::clone(&self.time_provider), + state: Default::default(), + }; + + if self.max_size == 0 { + return token; + } + + let mut log = self.log.lock(); + + // enforce limit + while log.len() > self.max_size { + log.pop_front(); + self.evicted.fetch_add(1, Ordering::SeqCst); + } + + log.push_back(Arc::clone(&entry)); + token + } + + pub fn entries(&self) -> QueryLogEntries { + let log = self.log.lock(); + QueryLogEntries { + entries: log.clone(), + max_size: self.max_size, + evicted: self.evicted.load(Ordering::SeqCst), + } + } +} + +impl Debug for QueryLog { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("QueryLog") + .field("log", &self.log) + .field("max_size", &self.max_size) + .field("evicted", &self.evicted) + .field("time_provider", &self.time_provider) + .field("id_gen", &"") + .finish() + } +} + +/// State of [`QueryCompletedToken`]. +/// +/// # Done +/// - The query has been received (and potentially authenticated) by the server. +/// +/// # To Do +/// - The concurrency-limiting semaphore has NOT yet issued a permit. +/// - The query is not planned. +/// - The query has not been executed. +#[derive(Debug, Clone, Copy, Default)] +pub struct StateReceived; + +/// State of [`QueryCompletedToken`]. +/// +/// # Done +/// - The query has been received (and potentially authenticated) by the server. +/// - The concurrency-limiting semaphore has issued a permit. +/// - The query was planned. +/// +/// # To Do +/// - The concurrency-limiting semaphore has NOT yet issued a permit. +/// - The query has not been executed. +#[derive(Debug)] +pub struct StatePlanned { + /// Physical execution plan. + plan: Arc, +} + +/// State of [`QueryCompletedToken`]. +/// +/// # Done +/// - The query has been received (and potentially authenticated) by the server. +/// - The concurrency-limiting semaphore has issued a permit. +/// +/// # To Do +/// - The query has not been executed. +#[derive(Debug)] +pub struct StatePermit { + /// Physical execution plan. + plan: Arc, +} + +/// A `QueryCompletedToken` is returned by `record_query` implementations of +/// a `QueryNamespace`. It is used to trigger side-effects (such as query timing) +/// on query completion. +#[derive(Debug)] +pub struct QueryCompletedToken { + /// Entry. + /// + /// This is optional so we can implement type state and [`Drop`] at the same time. 
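+ // Background for the `Option` wrapper: a type that implements `Drop` cannot have fields
+ // moved out of it, yet the state transitions below take `self` by value and hand the entry
+ // to the next state. Wrapping it in `Option` lets a transition `take()` the entry through
+ // `&mut`, so the consumed token's `drop` sees `None` and does nothing. A minimal sketch of
+ // the same pattern, with illustrative names only:
+ //     struct Token<S> { entry: Option<Entry>, state: S }
+ //     impl Token<Received> {
+ //         fn planned(mut self, plan: Plan) -> Token<Planned> {
+ //             let entry = self.entry.take().expect("valid state");
+ //             Token { entry: Some(entry), state: Planned { plan } }
+ //         }
+ //     }
+ //     impl<S> Drop for Token<S> {
+ //         fn drop(&mut self) { if let Some(entry) = self.entry.take() { /* finalize once */ } }
+ //     }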
+ entry: Option>, + + /// Time provider + time_provider: Arc, + + /// Current state. + state: S, +} + +impl QueryCompletedToken { + /// Underlying entry. + pub fn entry(&self) -> &Arc { + self.entry.as_ref().expect("valid state") + } +} + +impl QueryCompletedToken { + /// Record that this query got planned. + pub fn planned(mut self, plan: Arc) -> QueryCompletedToken { + let entry = self.entry.take().expect("valid state"); + + let now = self.time_provider.now(); + let origin = entry.issue_time; + entry.plan_duration.set_relative(origin, now); + + QueryCompletedToken { + entry: Some(entry), + time_provider: Arc::clone(&self.time_provider), + state: StatePlanned { plan }, + } + } +} + +impl QueryCompletedToken { + /// Record that this query got a semaphore permit. + pub fn permit(mut self) -> QueryCompletedToken { + let entry = self.entry.take().expect("valid state"); + + let now = self.time_provider.now(); + let origin = entry.issue_time + entry.plan_duration().expect("valid state"); + entry.permit_duration.set_relative(origin, now); + + QueryCompletedToken { + entry: Some(entry), + time_provider: Arc::clone(&self.time_provider), + state: StatePermit { + plan: Arc::clone(&self.state.plan), + }, + } + } +} + +impl QueryCompletedToken { + /// Record that this query completed successfully + pub fn success(self) { + let entry = self.entry.as_ref().expect("valid state"); + entry.success.store(true, Ordering::SeqCst); + + self.finish() + } + + /// Record that the query finished execution with an error. + pub fn fail(self) { + self.finish() + } + + fn finish(&self) { + let entry = self.entry.as_ref().expect("valid state"); + + let now = self.time_provider.now(); + let origin = entry.issue_time + + entry.permit_duration().expect("valid state") + + entry.plan_duration().expect("valid state"); + entry.execute_duration.set_relative(origin, now); + + entry + .compute_duration + .set_absolute(collect_compute_duration(self.state.plan.as_ref())); + } +} + +impl Drop for QueryCompletedToken { + fn drop(&mut self) { + if let Some(entry) = self.entry.take() { + let now = self.time_provider.now(); + entry.end2end_duration.set_relative(entry.issue_time, now); + entry.running.store(false, Ordering::SeqCst); + + entry.log("end"); + } + } +} + +/// Boxed description of a query that knows how to render to a string +/// +/// This avoids storing potentially large strings +pub type QueryText = Box; + +/// Method that generated [`Uuid`]s. +pub type IDGen = Box Uuid + Send + Sync>; + +struct AtomicDuration(AtomicI64); + +impl AtomicDuration { + fn get(&self) -> Option { + match self.0.load(Ordering::Relaxed) { + UNCOMPLETED_DURATION => None, + d => Some(Duration::from_nanos(d as u64)), + } + } + + fn set_relative(&self, origin: Time, now: Time) { + match now.checked_duration_since(origin) { + Some(dur) => { + self.0.store(dur.as_nanos() as i64, Ordering::Relaxed); + } + None => { + warn!("Clock went backwards, not query duration") + } + } + } + + fn set_absolute(&self, d: Duration) { + self.0.store(d.as_nanos() as i64, Ordering::Relaxed); + } +} + +impl Default for AtomicDuration { + fn default() -> Self { + Self(AtomicI64::new(UNCOMPLETED_DURATION)) + } +} + +/// Collect compute duration from [`ExecutionPlan`]. 
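Editorial sketch: taken together, the states above form a linear type-state machine. The function below shows the call order a query handler is expected to follow; the handler itself is illustrative only and assumes the types defined in this file plus DataFusion's `ExecutionPlan`.

fn drive_token_sketch(
    log: &QueryLog,
    namespace_id: NamespaceId,
    namespace_name: Arc<str>,
    query_text: QueryText,
    plan: Arc<dyn ExecutionPlan>,
) {
    // StateReceived: `push` records issue_time and logs "start".
    let token = log.push(namespace_id, namespace_name, "sql", query_text, None);

    // StatePlanned: plan_duration = now - issue_time.
    let token = token.planned(Arc::clone(&plan));

    // StatePermit: permit_duration = now - (issue_time + plan_duration).
    let token = token.permit();

    // ... run the plan to completion here ...

    // Records execute_duration and compute_duration; dropping the token then
    // records end2end_duration and logs "end". Calling `fail()` instead would
    // record the same durations but leave `success == false`.
    token.success();
}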
+fn collect_compute_duration(plan: &dyn ExecutionPlan) -> Duration { + let mut total = Duration::ZERO; + + if let Some(metrics) = plan.metrics() { + if let Some(nanos) = metrics.elapsed_compute() { + total += Duration::from_nanos(nanos as u64); + } + } + + for child in plan.children() { + total += collect_compute_duration(child.as_ref()); + } + + total +} + +#[cfg(test)] +mod test_super { + use datafusion::error::DataFusionError; + use std::sync::atomic::AtomicU64; + + use datafusion::physical_plan::{ + metrics::{MetricValue, MetricsSet}, + DisplayAs, Metric, + }; + use iox_time::MockProvider; + use test_helpers::tracing::TracingCapture; + + use super::*; + + #[test] + fn test_token_end2end_success() { + let capture = TracingCapture::new(); + + let Test { + time_provider, + token, + entry, + } = Test::default(); + + assert!(!entry.success()); + assert!(entry.running()); + assert_eq!(entry.permit_duration(), None,); + assert_eq!(entry.plan_duration(), None,); + assert_eq!(entry.execute_duration(), None,); + assert_eq!(entry.end2end_duration(), None,); + assert_eq!(entry.compute_duration(), None,); + + time_provider.inc(Duration::from_millis(1)); + let token = token.planned(plan()); + + assert!(!entry.success()); + assert!(entry.running()); + assert_eq!(entry.plan_duration(), Some(Duration::from_millis(1)),); + assert_eq!(entry.permit_duration(), None,); + assert_eq!(entry.execute_duration(), None,); + assert_eq!(entry.end2end_duration(), None,); + assert_eq!(entry.compute_duration(), None,); + + time_provider.inc(Duration::from_millis(10)); + let token = token.permit(); + + assert!(!entry.success()); + assert!(entry.running()); + assert_eq!(entry.plan_duration(), Some(Duration::from_millis(1)),); + assert_eq!(entry.permit_duration(), Some(Duration::from_millis(10)),); + assert_eq!(entry.execute_duration(), None,); + assert_eq!(entry.end2end_duration(), None,); + assert_eq!(entry.compute_duration(), None,); + + time_provider.inc(Duration::from_millis(100)); + token.success(); + + assert!(entry.success()); + assert!(!entry.running()); + assert_eq!(entry.plan_duration(), Some(Duration::from_millis(1)),); + assert_eq!(entry.permit_duration(), Some(Duration::from_millis(10)),); + assert_eq!(entry.execute_duration(), Some(Duration::from_millis(100)),); + assert_eq!(entry.end2end_duration(), Some(Duration::from_millis(111)),); + assert_eq!(entry.compute_duration(), Some(Duration::from_millis(1_337)),); + + assert_eq!( + capture.to_string().trim(), + [ + r#"level = INFO; message = query; when = "start"; id = 00000000-0000-0000-0000-000000000001; namespace_id = 1; namespace_name = "ns"; query_type = "sql"; query_text = SELECT 1; issue_time = 1970-01-01T00:00:00.100+00:00; success = false; running = true;"#, + r#"level = INFO; message = query; when = "end"; id = 00000000-0000-0000-0000-000000000001; namespace_id = 1; namespace_name = "ns"; query_type = "sql"; query_text = SELECT 1; issue_time = 1970-01-01T00:00:00.100+00:00; plan_duration_secs = 0.001; permit_duration_secs = 0.01; execute_duration_secs = 0.1; end2end_duration_secs = 0.111; compute_duration_secs = 1.337; success = true; running = false;"#, + ].join(" \n") + ); + } + + #[test] + fn test_token_execution_fail() { + let capture = TracingCapture::new(); + + let Test { + time_provider, + token, + entry, + } = Test::default(); + + time_provider.inc(Duration::from_millis(1)); + let token = token.planned(plan()); + time_provider.inc(Duration::from_millis(10)); + let token = token.permit(); + time_provider.inc(Duration::from_millis(100)); + 
token.fail(); + + assert!(!entry.success()); + assert!(!entry.running()); + assert_eq!(entry.plan_duration(), Some(Duration::from_millis(1)),); + assert_eq!(entry.permit_duration(), Some(Duration::from_millis(10)),); + assert_eq!(entry.execute_duration(), Some(Duration::from_millis(100)),); + assert_eq!(entry.end2end_duration(), Some(Duration::from_millis(111)),); + assert_eq!(entry.compute_duration(), Some(Duration::from_millis(1_337)),); + + assert_eq!( + capture.to_string().trim(), + [ + r#"level = INFO; message = query; when = "start"; id = 00000000-0000-0000-0000-000000000001; namespace_id = 1; namespace_name = "ns"; query_type = "sql"; query_text = SELECT 1; issue_time = 1970-01-01T00:00:00.100+00:00; success = false; running = true;"#, + r#"level = INFO; message = query; when = "end"; id = 00000000-0000-0000-0000-000000000001; namespace_id = 1; namespace_name = "ns"; query_type = "sql"; query_text = SELECT 1; issue_time = 1970-01-01T00:00:00.100+00:00; plan_duration_secs = 0.001; permit_duration_secs = 0.01; execute_duration_secs = 0.1; end2end_duration_secs = 0.111; compute_duration_secs = 1.337; success = false; running = false;"#, + ].join(" \n") + ); + } + + #[test] + fn test_token_drop_before_acquire() { + let capture = TracingCapture::new(); + + let Test { + time_provider, + token, + entry, + } = Test::default(); + + time_provider.inc(Duration::from_millis(100)); + drop(token); + + assert!(!entry.success()); + assert!(!entry.running()); + assert_eq!(entry.permit_duration(), None,); + assert_eq!(entry.plan_duration(), None,); + assert_eq!(entry.execute_duration(), None,); + assert_eq!(entry.end2end_duration(), Some(Duration::from_millis(100)),); + assert_eq!(entry.compute_duration(), None,); + + assert_eq!( + capture.to_string().trim(), + [ + r#"level = INFO; message = query; when = "start"; id = 00000000-0000-0000-0000-000000000001; namespace_id = 1; namespace_name = "ns"; query_type = "sql"; query_text = SELECT 1; issue_time = 1970-01-01T00:00:00.100+00:00; success = false; running = true;"#, + r#"level = INFO; message = query; when = "end"; id = 00000000-0000-0000-0000-000000000001; namespace_id = 1; namespace_name = "ns"; query_type = "sql"; query_text = SELECT 1; issue_time = 1970-01-01T00:00:00.100+00:00; end2end_duration_secs = 0.1; success = false; running = false;"#, + ].join(" \n") + ); + } + + struct Test { + time_provider: Arc, + token: QueryCompletedToken, + entry: Arc, + } + + impl Default for Test { + fn default() -> Self { + let time_provider = + Arc::new(MockProvider::new(Time::from_timestamp_millis(100).unwrap())); + let id_counter = AtomicU64::new(1); + let log = QueryLog::new_with_id_gen( + 1_000, + Arc::clone(&time_provider) as _, + Box::new(move || Uuid::from_u128(id_counter.fetch_add(1, Ordering::SeqCst) as _)), + ); + + let token = log.push( + NamespaceId::new(1), + Arc::from("ns"), + "sql", + Box::new("SELECT 1"), + None, + ); + + let entry = Arc::clone(token.entry()); + + Self { + time_provider, + token, + entry, + } + } + } + + fn plan() -> Arc { + Arc::new(TestExec) + } + + #[derive(Debug)] + struct TestExec; + + impl DisplayAs for TestExec { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + _f: &mut std::fmt::Formatter<'_>, + ) -> std::fmt::Result { + unimplemented!() + } + } + + impl ExecutionPlan for TestExec { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn schema(&self) -> arrow::datatypes::SchemaRef { + unimplemented!() + } + + fn output_partitioning(&self) -> datafusion::physical_plan::Partitioning { + 
unimplemented!() + } + + fn output_ordering(&self) -> Option<&[datafusion::physical_expr::PhysicalSortExpr]> { + unimplemented!() + } + + fn children(&self) -> Vec> { + vec![] + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> datafusion::error::Result> { + unimplemented!() + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> datafusion::error::Result + { + unimplemented!() + } + + fn statistics(&self) -> Result { + unimplemented!() + } + + fn metrics(&self) -> Option { + let mut metrics = MetricsSet::default(); + + let t = datafusion::physical_plan::metrics::Time::default(); + t.add_duration(Duration::from_millis(1_337)); + metrics.push(Arc::new(Metric::new(MetricValue::ElapsedCompute(t), None))); + + Some(metrics) + } + } +} diff --git a/iox_query/src/statistics.rs b/iox_query/src/statistics.rs index fd5f98cfaa9..3fc4d540543 100644 --- a/iox_query/src/statistics.rs +++ b/iox_query/src/statistics.rs @@ -1,20 +1,41 @@ //! Code to translate IOx statistics to DataFusion statistics -use std::{cmp::Ordering, collections::HashMap}; +use std::collections::{HashMap, VecDeque}; +use std::sync::Arc; -use arrow::datatypes::Schema; +use arrow::compute::rank; +use arrow::datatypes::{Schema, SchemaRef}; +use datafusion::common::stats::Precision; +use datafusion::datasource::physical_plan::ParquetExec; +use datafusion::error::DataFusionError; +use datafusion::physical_plan::coalesce_batches::CoalesceBatchesExec; +use datafusion::physical_plan::empty::EmptyExec; +use datafusion::physical_plan::expressions::Column; +use datafusion::physical_plan::filter::FilterExec; +use datafusion::physical_plan::placeholder_row::PlaceholderRowExec; +use datafusion::physical_plan::projection::ProjectionExec; +use datafusion::physical_plan::repartition::RepartitionExec; +use datafusion::physical_plan::sorts::sort::SortExec; +use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; +use datafusion::physical_plan::union::UnionExec; +use datafusion::physical_plan::{visit_execution_plan, ExecutionPlan, ExecutionPlanVisitor}; use datafusion::{ physical_plan::{ColumnStatistics, Statistics as DFStatistics}, scalar::ScalarValue, }; +use observability_deps::tracing::trace; + +use crate::provider::{DeduplicateExec, RecordBatchesExec}; +use crate::{QueryChunk, CHUNK_ORDER_COLUMN_NAME}; /// Aggregates DataFusion [statistics](DFStatistics). 
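// Editorial note on the shape change in this rewrite: DataFusion 49 models every
// statistic as `Precision<T>` (`Exact`, `Inexact`, or `Absent`) instead of
// `Option<T>` plus a plan-wide `is_exact` flag, so exactness now travels with each
// individual value and the aggregator below no longer tracks `is_exact` itself.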
#[derive(Debug)] pub struct DFStatsAggregator<'a> { - num_rows: Option, - total_byte_size: Option, - column_statistics: Option>, - is_exact: bool, + num_rows: Precision, + total_byte_size: Precision, + column_statistics: Vec, + // Maps column name to index in column_statistics for all columns we are + // aggregating col_idx_map: HashMap<&'a str, usize>, } @@ -39,18 +60,16 @@ impl<'a> DFStatsAggregator<'a> { .collect::>(); Self { - num_rows: Some(0), - total_byte_size: Some(0), - column_statistics: Some( - (0..col_idx_map.len()) - .map(|_| DFStatsAggregatorCol { - null_count: Some(0), - max_value: TriStateScalar::Uninit, - min_value: TriStateScalar::Uninit, - }) - .collect(), - ), - is_exact: true, + num_rows: Precision::Exact(0), + total_byte_size: Precision::Exact(0), + column_statistics: (0..col_idx_map.len()) + .map(|_| DFStatsAggregatorCol { + null_count: Precision::Exact(0), + max_value: None, + min_value: None, + }) + .collect(), + col_idx_map, } } @@ -70,81 +89,70 @@ impl<'a> DFStatsAggregator<'a> { num_rows: update_num_rows, total_byte_size: update_total_byte_size, column_statistics: update_column_statistics, - is_exact: update_is_exact, } = update_stats; - self.num_rows = self - .num_rows - .zip(*update_num_rows) - .map(|(base, update)| base + update); - self.total_byte_size = self - .total_byte_size - .zip(*update_total_byte_size) - .map(|(base, update)| base + update); - self.column_statistics = self - .column_statistics - .take() - .zip(update_column_statistics.as_ref()) - .map(|(mut base_cols, update_cols)| { - assert_eq!(base_cols.len(), self.col_idx_map.len()); - assert!( - update_cols.len() == update_schema.fields().len(), - "stats ({}) and schema ({}) have different column count", - update_cols.len(), - update_schema.fields().len(), - ); + self.num_rows = self.num_rows.add(update_num_rows); + self.total_byte_size = self.total_byte_size.add(update_total_byte_size); - let mut used_cols = vec![false; self.col_idx_map.len()]; - - for (update_field, update_col) in update_schema.fields().iter().zip(update_cols) { - let Some(idx) = self.col_idx_map.get(update_field.name().as_str()) else { - continue; - }; - let base_col = &mut base_cols[*idx]; - used_cols[*idx] = true; - - // decompose structs so we don't forget new fields - let DFStatsAggregatorCol { - null_count: base_null_count, - max_value: base_max_value, - min_value: base_min_value, - } = base_col; - let ColumnStatistics { - null_count: update_null_count, - max_value: update_max_value, - min_value: update_min_value, - distinct_count: _update_distinct_count, - } = update_col; - - *base_null_count = base_null_count - .zip(*update_null_count) - .map(|(base, update)| base + update); - base_max_value.update(update_max_value, |base, update| { - match base.partial_cmp(update) { - None => None, - Some(Ordering::Less) => Some(update.clone()), - Some(Ordering::Equal | Ordering::Greater) => Some(base), - } - }); - base_min_value.update(update_min_value, |base, update| { - match base.partial_cmp(update) { - None => None, - Some(Ordering::Less | Ordering::Equal) => Some(base), - Some(Ordering::Greater) => Some(update.clone()), - } - }); - } + assert_eq!(self.column_statistics.len(), self.col_idx_map.len()); + assert_eq!( + update_column_statistics.len(), + update_schema.fields().len(), + "stats ({}) and schema ({}) have different column count", + update_column_statistics.len(), + update_schema.fields().len(), + ); - // for unused cols, we need to assume all-NULL and hence invalidate the null counters - for (used, base_col) in 
used_cols.into_iter().zip(&mut base_cols) { - if !used { - base_col.null_count = None; - } - } + let mut used_cols = vec![false; self.col_idx_map.len()]; - base_cols - }); - self.is_exact &= update_is_exact; + for (update_field, update_col) in update_schema + .fields() + .iter() + .zip(update_column_statistics.iter()) + { + // Skip if not aggregating statitics for this field + let Some(idx) = self.col_idx_map.get(update_field.name().as_str()) else { + continue; + }; + let base_col = &mut self.column_statistics[*idx]; + used_cols[*idx] = true; + + // decompose structs so we don't forget new fields + let DFStatsAggregatorCol { + null_count: base_null_count, + max_value: base_max_value, + min_value: base_min_value, + } = base_col; + let ColumnStatistics { + null_count: update_null_count, + max_value: update_max_value, + min_value: update_min_value, + distinct_count: _update_distinct_count, + } = update_col; + + *base_null_count = base_null_count.add(update_null_count); + + *base_max_value = Some( + base_max_value + .take() + .map(|base_max_value| base_max_value.max(update_max_value)) + .unwrap_or(update_max_value.clone()), + ); + + *base_min_value = Some( + base_min_value + .take() + .map(|base_min_value| base_min_value.min(update_min_value)) + .unwrap_or(update_min_value.clone()), + ); + } + + // for unused cols, we need to assume all-NULL and hence invalidate the null counters + for (used, base_col) in used_cols.into_iter().zip(&mut self.column_statistics) { + if !used { + base_col.null_count = Precision::Absent; + } + } } /// Build aggregated statistics. @@ -152,87 +160,336 @@ impl<'a> DFStatsAggregator<'a> { DFStatistics { num_rows: self.num_rows, total_byte_size: self.total_byte_size, - column_statistics: self.column_statistics.map(|cols| { - cols.into_iter() - .map(|col| ColumnStatistics { - null_count: col.null_count, - max_value: col.max_value.collapse(), - min_value: col.min_value.collapse(), - distinct_count: None, - }) - .collect() - }), - is_exact: self.is_exact, + column_statistics: self + .column_statistics + .into_iter() + .map(|col| ColumnStatistics { + null_count: col.null_count, + max_value: col.max_value.unwrap_or(Precision::Absent), + min_value: col.min_value.unwrap_or(Precision::Absent), + distinct_count: Precision::Absent, + }) + .collect(), } } } -/// Similar to [`ColumnStatistics`] but has a tri-state for the min/max values so we can differentiate between -/// ["uninitialized"](TriStateScalar::Uninit) and ["invalid"](TriStateScalar::Invalid). +/// Similar to [`ColumnStatistics`] but uses `Option` to track min/max values so +/// we can differentiate between +/// +/// 1. "uninitialized" (`None`) +/// 1. "initialized" (`Some(Precision::Exact(...))`) +/// 2. "initialized but invalid" (`Some(Precision::Absent)`). /// /// It also does NOT contain a distinct count because we cannot aggregate these. #[derive(Debug)] struct DFStatsAggregatorCol { - null_count: Option, - max_value: TriStateScalar, - min_value: TriStateScalar, + null_count: Precision, + max_value: Option>, + min_value: Option>, } -#[derive(Debug)] -enum TriStateScalar { - /// Scalar has valid state. - Valid(ScalarValue), +/// build DF statitics for given chunks and a schema +pub fn build_statistics_for_chunks( + chunks: &[Arc], + schema: SchemaRef, +) -> DFStatistics { + let chunk_order_field = schema.field_with_name(CHUNK_ORDER_COLUMN_NAME).ok(); + let chunk_order_only_schema = chunk_order_field.map(|field| Schema::new(vec![field.clone()])); - /// Scalar was not yet initialized. 
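Editorial sketch: the aggregation above leans on `Precision`'s combinators rather than hand-rolled `Option` math. A small standalone sketch of the semantics it relies on (values are made up; only `datafusion::common::stats::Precision` is assumed):

use datafusion::common::stats::Precision;

fn precision_semantics_sketch() {
    // sums stay exact only while both sides are exact
    let a = Precision::Exact(10_usize);
    let b = Precision::Exact(32_usize);
    assert_eq!(a.add(&b), Precision::Exact(42_usize));
    assert_eq!(a.add(&Precision::Absent), Precision::Absent);

    // min/max combine the same way, which is what the per-column fold above uses
    let lo = Precision::Exact(3_i64);
    let hi = Precision::Exact(5_i64);
    assert_eq!(lo.max(&hi), Precision::Exact(5_i64));
    assert_eq!(lo.min(&hi), Precision::Exact(3_i64));
}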
- Uninit, + let chunks: Vec<_> = chunks.iter().collect(); - /// Scalar was poisoned and is invalid. - Invalid, + let statistics = chunks + .iter() + .fold(DFStatsAggregator::new(&schema), |mut agg, chunk| { + agg.update(&chunk.stats(), chunk.schema().as_arrow().as_ref()); + + if let Some(schema) = chunk_order_only_schema.as_ref() { + let order = chunk.order().get(); + let order = ScalarValue::from(order); + + agg.update( + &DFStatistics { + num_rows: Precision::Exact(0), + total_byte_size: Precision::Exact(0), + column_statistics: vec![ColumnStatistics { + null_count: Precision::Exact(0), + max_value: Precision::Exact(order.clone()), + min_value: Precision::Exact(order), + distinct_count: Precision::Exact(1), + }], + }, + schema, + ); + } + + agg + }) + .build(); + + statistics +} + +/// Traverse the execution plan and build statistics min max for the given column +pub fn compute_stats_column_min_max( + plan: &dyn ExecutionPlan, + column_name: &str, +) -> Result { + let mut visitor = StatisticsVisitor::new(column_name); + visit_execution_plan(plan, &mut visitor)?; + + // there must be only one statistics left in the stack + if visitor.statistics.len() != 1 { + return Err(DataFusionError::Internal(format!( + "There must be only one statistics left in the stack, but find {}", + visitor.statistics.len() + ))); + } + + Ok(visitor.statistics.pop_back().unwrap()) +} + +/// Traverse the physical plan and build statistics min max for the given column each node +/// Note: This is a temproray solution until DF's statistics is more mature +/// +struct StatisticsVisitor<'a> { + column_name: &'a str, //String, // todo: not sure enough + statistics: VecDeque, +} + +impl<'a> StatisticsVisitor<'a> { + fn new(column_name: &'a str) -> Self { + Self { + column_name, + statistics: VecDeque::new(), + } + } } -impl TriStateScalar { - fn update<'a, F>(&mut self, update: &'a Option, f: F) - where - F: FnOnce(ScalarValue, &'a ScalarValue) -> Option, - { - match (self, update.as_ref()) { - // invalid acts as a poison value - (Self::Invalid, _) => {} - // update w/o invalid invalidates aggregate - (this, None) => { - *this = Self::Invalid; +impl ExecutionPlanVisitor for StatisticsVisitor<'_> { + type Error = DataFusionError; + + fn pre_visit(&mut self, _plan: &dyn ExecutionPlan) -> Result { + Ok(false) + } + + fn post_visit(&mut self, plan: &dyn ExecutionPlan) -> Result { + // If this is an EmptyExec / PlaceholderRowExec, we don't know about it + if plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + { + self.statistics.push_back(ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Absent, + min_value: Precision::Absent, + distinct_count: Precision::Absent, + }); + } + // If this is leaf node (ParquetExec or RecordBatchExec), compute its statistics and push it to the stack + else if plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + { + // get index of the given column in the schema + let statistics = match plan.schema().index_of(self.column_name) { + Ok(col_index) => plan.statistics()?.column_statistics[col_index].clone(), + // This is the case of alias, do not optimize by returning no statistics + Err(_) => { + trace!( + " ------------------- No statistics for column {} in PQ/RB", + self.column_name + ); + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Absent, + min_value: Precision::Absent, + distinct_count: Precision::Absent, + } + } + }; + self.statistics.push_back(statistics); + } + // 
Non leaf node + else { + // These are cases the stats will be unioned of their children's + // Sort, Dediplicate, Filter, Repartition, Union, SortPreservingMerge, CoalesceBatches + let union_stats = if plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + || plan.as_any().downcast_ref::().is_some() + || plan + .as_any() + .downcast_ref::() + .is_some() + || plan + .as_any() + .downcast_ref::() + .is_some() + { + true + } else if plan.as_any().downcast_ref::().is_some() { + // ProjectionExec is a special case. Only union stats if it includes pure columns + projection_includes_pure_columns( + plan.as_any().downcast_ref::().unwrap(), + ) + } else { + false + }; + + // pop statistics of all inputs from the stack + let num_inputs = plan.children().len(); + // num_input must > 0. Pop the first one + let mut statistics = self + .statistics + .pop_back() + .expect("No statistics for input plan"); + // pop the rest and update the min and max + for _ in 1..num_inputs { + let input_statistics = self + .statistics + .pop_back() + .expect("No statistics for input plan"); + + if union_stats { + // Convervatively union min max + statistics.null_count = statistics.null_count.add(&input_statistics.null_count); + statistics.max_value = statistics.max_value.max(&input_statistics.max_value); + statistics.min_value = statistics.min_value.min(&input_statistics.min_value); + statistics.distinct_count = Precision::Absent; + }; } - // uninit w/ first value just clones the value - (this @ Self::Uninit, Some(update)) => { - *this = Self::Valid(update.clone()); + + if union_stats { + self.statistics.push_back(statistics); + } else { + trace!( + " ------ No statistics for column {} in non-leaf node", + self.column_name + ); + // Make them absent for other cases + self.statistics.push_back(ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Absent, + min_value: Precision::Absent, + distinct_count: Precision::Absent, + }); } - // updating a valid value with something requires a folding function - (this @ Self::Valid(_), Some(update)) => { - let mut base = Self::Invalid; - std::mem::swap(this, &mut base); - let Self::Valid(base) = base else { - unreachable!() - }; - *this = match f(base, update) { - Some(val) => Self::Valid(val), - None => Self::Invalid, + } + + Ok(true) + } +} + +fn projection_includes_pure_columns(projection: &ProjectionExec) -> bool { + projection + .expr() + .iter() + .all(|(expr, _col_name)| expr.as_any().downcast_ref::().is_some()) +} + +/// Return min max of a ColumnStatistics with precise values +pub fn column_statistics_min_max( + column_statistics: &ColumnStatistics, +) -> Option<(ScalarValue, ScalarValue)> { + match (&column_statistics.min_value, &column_statistics.max_value) { + (Precision::Exact(min), Precision::Exact(max)) => Some((min.clone(), max.clone())), + // the statistics values are absent or imprecise + _ => None, + } +} + +/// Get statsistics min max of given column name on given plans +/// Return None if one of the inputs does not have statistics or does not include the column +pub fn statistics_min_max( + plans: &[Arc], + column_name: &str, +) -> Option> { + // Get statistics for each plan + let plans_schema_and_stats = plans + .iter() + .map(|plan| Ok((Arc::clone(plan), plan.schema(), plan.statistics()?))) + .collect::, DataFusionError>>(); + + // If any without statistics, return none + let Ok(plans_schema_and_stats) = 
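Editorial sketch: the two helpers above are designed to compose -- `compute_stats_column_min_max` walks a plan, and `column_statistics_min_max` only yields a usable pair when both bounds are exact. The wrapper below is illustrative, not part of this patch.

fn exact_min_max_sketch(
    plan: &dyn ExecutionPlan,
    column_name: &str,
) -> Result<Option<(ScalarValue, ScalarValue)>, DataFusionError> {
    // post-order visit of the plan; errors out unless exactly one ColumnStatistics survives
    let stats = compute_stats_column_min_max(plan, column_name)?;
    // None whenever either bound is Inexact or Absent
    Ok(column_statistics_min_max(&stats))
}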
plans_schema_and_stats else { + return None; + }; + + // get value range of the sorted column for each input + let mut min_max_ranges = Vec::with_capacity(plans_schema_and_stats.len()); + for (input, input_schema, input_stats) in plans_schema_and_stats { + // get index of the sorted column in the schema + let Ok(sorted_col_index) = input_schema.index_of(column_name) else { + // panic that the sorted column is not in the schema + panic!("sorted column {} is not in the schema", column_name); + }; + + let column_stats = input_stats.column_statistics; + let sorted_col_stats = column_stats[sorted_col_index].clone(); + match (sorted_col_stats.min_value, sorted_col_stats.max_value) { + (Precision::Exact(min), Precision::Exact(max)) => { + min_max_ranges.push((min, max)); + } + // WARNING: this may produce incorrect results until we use more precision + // as `Inexact` is not guaranteed to cover the actual min and max values + // https://github.com/apache/arrow-datafusion/issues/8078 + (Precision::Inexact(min), Precision::Inexact(max)) => { + if let Some(_deduplicate_exec) = input.as_any().downcast_ref::() { + min_max_ranges.push((min, max)); + } else { + return None; }; } + // the statistics values are absent + _ => return None, } } - fn collapse(self) -> Option { - match self { - Self::Invalid | Self::Uninit => None, - Self::Valid(val) => Some(val), + Some(min_max_ranges) +} + +/// Return true if at least 2 min_max ranges in the given array overlap +pub fn overlap(value_ranges: &[(ScalarValue, ScalarValue)]) -> Result { + // interleave min and max into one iterator + let value_ranges_iter = value_ranges.iter().flat_map(|(min, max)| { + // panics if min > max + if min > max { + panic!("min ({:?}) > max ({:?})", min, max); + } + vec![min.clone(), max.clone()] + }); + + let value_ranges = ScalarValue::iter_to_array(value_ranges_iter)?; + + // rank it + let ranks = rank(&*value_ranges, None)?; + + // check overlap by checking if the max is rank right behind its corresponding min + // . non-overlap example: values of min-max pairs [3, 5, 9, 12, 1, 1, 6, 8] + // ranks: [3, 4, 7, 8, 2, 2, 5, 6] : max (even index) = its correspnding min (odd index) for same min max OR min + 1 + // . 
overlap example: [3, 5, 9, 12, 1, 1, 4, 6] : pair [3, 5] interleaves with pair [4, 6] + // ranks: [3, 5, 7, 8, 2, 2, 4, 6] + for i in (0..ranks.len()).step_by(2) { + if !((ranks[i] == ranks[i + 1]) || (ranks[i + 1] == ranks[i] + 1)) { + return Ok(true); } } + + Ok(false) } #[cfg(test)] mod test { + use crate::{ + provider::chunks_to_physical_nodes, + test::{format_execution_plan, TestChunk}, + }; + use super::*; use arrow::datatypes::{DataType, Field}; + use datafusion::{common::Statistics, error::DataFusionError}; + use itertools::Itertools; + use schema::{InfluxFieldType, SchemaBuilder}; #[test] fn test_df_stats_agg_no_cols_no_updates() { @@ -241,10 +498,9 @@ mod test { let actual = agg.build(); let expected = DFStatistics { - num_rows: Some(0), - total_byte_size: Some(0), - column_statistics: Some(vec![]), - is_exact: true, + num_rows: Precision::Exact(0), + total_byte_size: Precision::Exact(0), + column_statistics: Statistics::unknown_column(&schema), }; assert_eq!(actual, expected); } @@ -259,23 +515,22 @@ mod test { let actual = agg.build(); let expected = DFStatistics { - num_rows: Some(0), - total_byte_size: Some(0), - column_statistics: Some(vec![ + num_rows: Precision::Exact(0), + total_byte_size: Precision::Exact(0), + column_statistics: vec![ ColumnStatistics { - null_count: Some(0), - max_value: None, - min_value: None, - distinct_count: None, + null_count: Precision::Exact(0), + max_value: Precision::Absent, + min_value: Precision::Absent, + distinct_count: Precision::Absent, }, ColumnStatistics { - null_count: Some(0), - max_value: None, - min_value: None, - distinct_count: None, + null_count: Precision::Exact(0), + max_value: Precision::Absent, + min_value: Precision::Absent, + distinct_count: Precision::Absent, }, - ]), - is_exact: true, + ], }; assert_eq!(actual, expected); } @@ -293,59 +548,56 @@ mod test { Field::new("col2", DataType::Utf8, false), ]); let update_stats = DFStatistics { - num_rows: Some(1), - total_byte_size: Some(10), - column_statistics: Some(vec![ + num_rows: Precision::Exact(1), + total_byte_size: Precision::Exact(10), + column_statistics: vec![ ColumnStatistics { - null_count: Some(100), - max_value: Some(ScalarValue::UInt64(Some(100))), - min_value: Some(ScalarValue::UInt64(Some(50))), - distinct_count: Some(42), + null_count: Precision::Exact(100), + max_value: Precision::Exact(ScalarValue::UInt64(Some(100))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(50))), + distinct_count: Precision::Exact(42), }, ColumnStatistics { - null_count: Some(1_000), - max_value: Some(ScalarValue::Utf8(Some("e".to_owned()))), - min_value: Some(ScalarValue::Utf8(Some("b".to_owned()))), - distinct_count: Some(42), + null_count: Precision::Exact(1_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("e".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("b".to_owned()))), + distinct_count: Precision::Exact(42), }, - ]), - is_exact: true, + ], }; agg.update(&update_stats, &update_schema); let update_schema = Schema::new(vec![Field::new("col2", DataType::Utf8, false)]); let update_stats = DFStatistics { - num_rows: Some(10_000), - total_byte_size: Some(100_000), - column_statistics: Some(vec![ColumnStatistics { - null_count: Some(1_000_000), - max_value: Some(ScalarValue::Utf8(Some("g".to_owned()))), - min_value: Some(ScalarValue::Utf8(Some("c".to_owned()))), - distinct_count: Some(42), - }]), - is_exact: true, + num_rows: Precision::Exact(10_000), + total_byte_size: Precision::Exact(100_000), + column_statistics: vec![ColumnStatistics { 
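Editorial sketch: to make the rank trick above concrete, here is a small pure-Rust re-derivation of the same invariant (no arrow, illustrative only). With rank(v) defined as the number of interleaved boundary values <= v, the ranges are disjoint exactly when every max ranks at, or immediately after, its own min.

fn overlap_sketch(ranges: &[(i64, i64)]) -> bool {
    // interleave all mins and maxes, mirroring `overlap` above
    let values: Vec<i64> = ranges.iter().flat_map(|&(lo, hi)| [lo, hi]).collect();
    // "max" tie handling: rank(v) = how many values are <= v
    let rank = |v: i64| values.iter().filter(|&&x| x <= v).count();
    ranges
        .iter()
        .any(|&(lo, hi)| !(rank(hi) == rank(lo) || rank(hi) == rank(lo) + 1))
}

// overlap_sketch(&[(3, 5), (9, 12), (1, 1), (6, 8)]) == false  (disjoint, as in the comment above)
// overlap_sketch(&[(3, 5), (9, 12), (1, 1), (4, 6)]) == true   ([3, 5] and [4, 6] intersect)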
+ null_count: Precision::Exact(1_000_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("g".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("c".to_owned()))), + distinct_count: Precision::Exact(42), + }], }; agg.update(&update_stats, &update_schema); let actual = agg.build(); let expected = DFStatistics { - num_rows: Some(10_001), - total_byte_size: Some(100_010), - column_statistics: Some(vec![ + num_rows: Precision::Exact(10_001), + total_byte_size: Precision::Exact(100_010), + column_statistics: vec![ ColumnStatistics { - null_count: None, - max_value: Some(ScalarValue::UInt64(Some(100))), - min_value: Some(ScalarValue::UInt64(Some(50))), - distinct_count: None, + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::UInt64(Some(100))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(50))), + distinct_count: Precision::Absent, }, ColumnStatistics { - null_count: Some(1_001_000), - max_value: Some(ScalarValue::Utf8(Some("g".to_owned()))), - min_value: Some(ScalarValue::Utf8(Some("b".to_owned()))), - distinct_count: None, + null_count: Precision::Exact(1_001_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("g".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("b".to_owned()))), + distinct_count: Precision::Absent, }, - ]), - is_exact: true, + ], }; assert_eq!(actual, expected); } @@ -363,23 +615,22 @@ mod test { Field::new("col2", DataType::Utf8, false), ]); let update_stats = DFStatistics { - num_rows: Some(1), - total_byte_size: Some(10), - column_statistics: Some(vec![ + num_rows: Precision::Exact(1), + total_byte_size: Precision::Exact(10), + column_statistics: vec![ ColumnStatistics { - null_count: Some(100), - max_value: Some(ScalarValue::UInt64(Some(100))), - min_value: Some(ScalarValue::UInt64(Some(50))), - distinct_count: Some(42), + null_count: Precision::Exact(100), + max_value: Precision::Exact(ScalarValue::UInt64(Some(100))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(50))), + distinct_count: Precision::Exact(42), }, ColumnStatistics { - null_count: Some(1_000), - max_value: Some(ScalarValue::Utf8(Some("e".to_owned()))), - min_value: Some(ScalarValue::Utf8(Some("b".to_owned()))), - distinct_count: Some(42), + null_count: Precision::Exact(1_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("e".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("b".to_owned()))), + distinct_count: Precision::Exact(42), }, - ]), - is_exact: true, + ], }; agg.update(&update_stats, &update_schema); @@ -388,45 +639,43 @@ mod test { Field::new("col1", DataType::UInt64, true), ]); let update_stats = DFStatistics { - num_rows: Some(10_000), - total_byte_size: Some(100_000), - column_statistics: Some(vec![ + num_rows: Precision::Exact(10_000), + total_byte_size: Precision::Exact(100_000), + column_statistics: vec![ ColumnStatistics { - null_count: Some(1_000_000), - max_value: Some(ScalarValue::Utf8(Some("g".to_owned()))), - min_value: Some(ScalarValue::Utf8(Some("c".to_owned()))), - distinct_count: Some(42), + null_count: Precision::Exact(1_000_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("g".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("c".to_owned()))), + distinct_count: Precision::Exact(42), }, ColumnStatistics { - null_count: Some(10_000_000), - max_value: Some(ScalarValue::UInt64(Some(99))), - min_value: Some(ScalarValue::UInt64(Some(40))), - distinct_count: Some(42), + null_count: Precision::Exact(10_000_000), + max_value: 
Precision::Exact(ScalarValue::UInt64(Some(99))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(40))), + distinct_count: Precision::Exact(42), }, - ]), - is_exact: true, + ], }; agg.update(&update_stats, &update_schema); let actual = agg.build(); let expected = DFStatistics { - num_rows: Some(10_001), - total_byte_size: Some(100_010), - column_statistics: Some(vec![ + num_rows: Precision::Exact(10_001), + total_byte_size: Precision::Exact(100_010), + column_statistics: vec![ ColumnStatistics { - null_count: Some(10_000_100), - max_value: Some(ScalarValue::UInt64(Some(100))), - min_value: Some(ScalarValue::UInt64(Some(40))), - distinct_count: None, + null_count: Precision::Exact(10_000_100), + max_value: Precision::Exact(ScalarValue::UInt64(Some(100))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(40))), + distinct_count: Precision::Absent, }, ColumnStatistics { - null_count: Some(1_001_000), - max_value: Some(ScalarValue::Utf8(Some("g".to_owned()))), - min_value: Some(ScalarValue::Utf8(Some("b".to_owned()))), - distinct_count: None, + null_count: Precision::Exact(1_001_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("g".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("b".to_owned()))), + distinct_count: Precision::Absent, }, - ]), - is_exact: true, + ], }; assert_eq!(actual, expected); } @@ -444,45 +693,43 @@ mod test { Field::new("col3", DataType::Utf8, false), ]); let update_stats = DFStatistics { - num_rows: Some(1), - total_byte_size: Some(10), - column_statistics: Some(vec![ + num_rows: Precision::Exact(1), + total_byte_size: Precision::Exact(10), + column_statistics: vec![ ColumnStatistics { - null_count: Some(100), - max_value: Some(ScalarValue::UInt64(Some(100))), - min_value: Some(ScalarValue::UInt64(Some(50))), - distinct_count: Some(42), + null_count: Precision::Exact(100), + max_value: Precision::Exact(ScalarValue::UInt64(Some(100))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(50))), + distinct_count: Precision::Exact(42), }, ColumnStatistics { - null_count: Some(1_000), - max_value: Some(ScalarValue::Utf8(Some("e".to_owned()))), - min_value: Some(ScalarValue::Utf8(Some("b".to_owned()))), - distinct_count: Some(42), + null_count: Precision::Exact(1_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("e".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("b".to_owned()))), + distinct_count: Precision::Exact(42), }, - ]), - is_exact: true, + ], }; agg.update(&update_stats, &update_schema); let actual = agg.build(); let expected = DFStatistics { - num_rows: Some(1), - total_byte_size: Some(10), - column_statistics: Some(vec![ + num_rows: Precision::Exact(1), + total_byte_size: Precision::Exact(10), + column_statistics: vec![ ColumnStatistics { - null_count: Some(100), - max_value: Some(ScalarValue::UInt64(Some(100))), - min_value: Some(ScalarValue::UInt64(Some(50))), - distinct_count: None, + null_count: Precision::Exact(100), + max_value: Precision::Exact(ScalarValue::UInt64(Some(100))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(50))), + distinct_count: Precision::Absent, }, ColumnStatistics { - null_count: None, - max_value: None, - min_value: None, - distinct_count: None, + null_count: Precision::Absent, + max_value: Precision::Absent, + min_value: Precision::Absent, + distinct_count: Precision::Absent, }, - ]), - is_exact: true, + ], }; assert_eq!(actual, expected); } @@ -495,42 +742,40 @@ mod test { ]); let update_stats = DFStatistics { - num_rows: Some(1), - total_byte_size: 
Some(10), - column_statistics: Some(vec![ + num_rows: Precision::Exact(1), + total_byte_size: Precision::Exact(10), + column_statistics: vec![ ColumnStatistics { - null_count: Some(100), - max_value: Some(ScalarValue::UInt64(Some(100))), - min_value: Some(ScalarValue::UInt64(Some(50))), - distinct_count: Some(42), + null_count: Precision::Exact(100), + max_value: Precision::Exact(ScalarValue::UInt64(Some(100))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(50))), + distinct_count: Precision::Exact(42), }, ColumnStatistics { - null_count: Some(1_000), - max_value: Some(ScalarValue::Utf8(Some("e".to_owned()))), - min_value: Some(ScalarValue::Utf8(Some("b".to_owned()))), - distinct_count: Some(42), + null_count: Precision::Exact(1_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("e".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("b".to_owned()))), + distinct_count: Precision::Exact(42), }, - ]), - is_exact: true, + ], }; let agg_stats = DFStatistics { - num_rows: Some(2), - total_byte_size: Some(20), - column_statistics: Some(vec![ + num_rows: Precision::Exact(2), + total_byte_size: Precision::Exact(20), + column_statistics: vec![ ColumnStatistics { - null_count: Some(200), - max_value: Some(ScalarValue::UInt64(Some(100))), - min_value: Some(ScalarValue::UInt64(Some(50))), - distinct_count: None, + null_count: Precision::Exact(200), + max_value: Precision::Exact(ScalarValue::UInt64(Some(100))), + min_value: Precision::Exact(ScalarValue::UInt64(Some(50))), + distinct_count: Precision::Absent, }, ColumnStatistics { - null_count: Some(2_000), - max_value: Some(ScalarValue::Utf8(Some("e".to_owned()))), - min_value: Some(ScalarValue::Utf8(Some("b".to_owned()))), - distinct_count: None, + null_count: Precision::Exact(2_000), + max_value: Precision::Exact(ScalarValue::Utf8(Some("e".to_owned()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("b".to_owned()))), + distinct_count: Precision::Absent, }, - ]), - is_exact: true, + ], }; #[derive(Debug, Clone, Copy)] @@ -546,41 +791,36 @@ mod test { TotalByteSize, ColumnStatistics, Col(usize, ColMode), - IsExact, } impl Mode { fn mask(&self, mut stats: DFStatistics) -> DFStatistics { match self { Self::NumRows => { - stats.num_rows = None; + stats.num_rows = Precision::Absent; } Self::TotalByteSize => { - stats.total_byte_size = None; + stats.total_byte_size = Precision::Absent; } Self::ColumnStatistics => { - stats.column_statistics = None; + let num_cols = stats.column_statistics.len(); + stats.column_statistics = vec![ColumnStatistics::new_unknown(); num_cols] } Self::Col(idx, mode) => { - if let Some(stats) = stats.column_statistics.as_mut() { - let stats = &mut stats[*idx]; - - match mode { - ColMode::NullCount => { - stats.null_count = None; - } - ColMode::MaxValue => { - stats.max_value = None; - } - ColMode::MinValue => { - stats.min_value = None; - } + let stats = &mut stats.column_statistics[*idx]; + + match mode { + ColMode::NullCount => { + stats.null_count = Precision::Absent; + } + ColMode::MaxValue => { + stats.max_value = Precision::Absent; + } + ColMode::MinValue => { + stats.min_value = Precision::Absent; } } } - Self::IsExact => { - stats.is_exact = false; - } } stats } @@ -594,7 +834,6 @@ mod test { Mode::Col(0, ColMode::MaxValue), Mode::Col(0, ColMode::MinValue), Mode::Col(1, ColMode::NullCount), - Mode::IsExact, ] { println!("mode: {mode:?}"); @@ -626,11 +865,583 @@ mod test { let update_schema = Schema::new(vec![Field::new("col1", DataType::UInt64, true)]); let update_stats = 
DFStatistics { - num_rows: Some(1), - total_byte_size: Some(10), - column_statistics: Some(vec![]), - is_exact: true, + num_rows: Precision::Exact(1), + total_byte_size: Precision::Exact(10), + column_statistics: vec![], }; agg.update(&update_stats, &update_schema); } + + #[test] + fn test_stats_for_one_chunk() { + // schema with one tag, one field, time and CHUNK_ORDER_COLUMN_NAME + let schema: SchemaRef = SchemaBuilder::new() + .tag("tag") + .influx_field("field", InfluxFieldType::Float) + .timestamp() + .influx_field(CHUNK_ORDER_COLUMN_NAME, InfluxFieldType::Integer) + .build() + .unwrap() + .into(); + + // create a test chunk with one tag, one filed, time and CHUNK_ORDER_COLUMN_NAME + let record_batch_chunk = Arc::new( + TestChunk::new("t") + .with_tag_column_with_stats("tag", Some("AL"), Some("MT")) + .with_time_column_with_stats(Some(10), Some(20)) + .with_i64_field_column_with_stats("field", Some(0), Some(100)) + .with_i64_field_column_with_stats(CHUNK_ORDER_COLUMN_NAME, Some(5), Some(6)), + ); + + // create them same test chunk but with a parquet file + let parquet_chunk = Arc::new( + TestChunk::new("t") + .with_tag_column_with_stats("tag", Some("AL"), Some("MT")) + .with_i64_field_column_with_stats("field", Some(0), Some(100)) + .with_time_column_with_stats(Some(10), Some(20)) + .with_i64_field_column_with_stats(CHUNK_ORDER_COLUMN_NAME, Some(5), Some(6)) + .with_dummy_parquet_file(), + ); + + let expected_stats = [ + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Utf8(Some("MT".to_string()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("AL".to_string()))), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Int64(Some(100))), + min_value: Precision::Exact(ScalarValue::Int64(Some(0))), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::TimestampNanosecond(Some(20), None)), + min_value: Precision::Exact(ScalarValue::TimestampNanosecond(Some(10), None)), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Int64(Some(6))), + min_value: Precision::Exact(ScalarValue::Int64(Some(0))), + distinct_count: Precision::Absent, + }, + ]; + + let record_batch_stats = + build_statistics_for_chunks(&[record_batch_chunk], Arc::clone(&schema)); + assert_eq!(record_batch_stats.column_statistics, expected_stats); + + let parquet_stats = build_statistics_for_chunks(&[parquet_chunk], schema); + assert_eq!(parquet_stats.column_statistics, expected_stats); + } + + #[test] + fn test_stats_for_two_chunks() { + // schema with one tag, one field, time and CHUNK_ORDER_COLUMN_NAME + let schema: SchemaRef = SchemaBuilder::new() + .tag("tag") + .influx_field("field", InfluxFieldType::Float) + .timestamp() + .influx_field(CHUNK_ORDER_COLUMN_NAME, InfluxFieldType::Integer) + .build() + .unwrap() + .into(); + + // create a test chunk with one tag, one filed, time and CHUNK_ORDER_COLUMN_NAME + let record_batch_chunk_1 = Arc::new( + TestChunk::new("t1") + .with_tag_column_with_stats("tag", Some("AL"), Some("MT")) + .with_time_column_with_stats(Some(10), Some(20)) + .with_i64_field_column_with_stats("field", Some(0), Some(100)) + .with_i64_field_column_with_stats(CHUNK_ORDER_COLUMN_NAME, Some(5), Some(6)), + ); + + let record_batch_chunk_2 = Arc::new( + TestChunk::new("t2") + 
.with_tag_column_with_stats("tag", Some("MI"), Some("WA")) + .with_time_column_with_stats(Some(50), Some(80)) + .with_i64_field_column_with_stats("field", Some(0), Some(70)) + .with_i64_field_column_with_stats(CHUNK_ORDER_COLUMN_NAME, Some(7), Some(15)), + ); + + // create them same test chunk but with a parquet file + let parquet_chunk_1 = Arc::new( + TestChunk::new("t1") + .with_tag_column_with_stats("tag", Some("AL"), Some("MT")) + .with_i64_field_column_with_stats("field", Some(0), Some(100)) + .with_time_column_with_stats(Some(10), Some(20)) + .with_i64_field_column_with_stats(CHUNK_ORDER_COLUMN_NAME, Some(5), Some(6)) + .with_dummy_parquet_file(), + ); + + let parquet_chunk_2 = Arc::new( + TestChunk::new("t2") + .with_tag_column_with_stats("tag", Some("MI"), Some("WA")) + .with_i64_field_column_with_stats("field", Some(0), Some(70)) + .with_time_column_with_stats(Some(50), Some(80)) + .with_i64_field_column_with_stats(CHUNK_ORDER_COLUMN_NAME, Some(7), Some(15)) + .with_dummy_parquet_file(), + ); + + let expected_stats = [ + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Utf8(Some("WA".to_string()))), + min_value: Precision::Exact(ScalarValue::Utf8(Some("AL".to_string()))), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Int64(Some(100))), + min_value: Precision::Exact(ScalarValue::Int64(Some(0))), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::TimestampNanosecond(Some(80), None)), + min_value: Precision::Exact(ScalarValue::TimestampNanosecond(Some(10), None)), + distinct_count: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Int64(Some(15))), + min_value: Precision::Exact(ScalarValue::Int64(Some(0))), + distinct_count: Precision::Absent, + }, + ]; + + let record_batch_stats = build_statistics_for_chunks( + &[record_batch_chunk_1, record_batch_chunk_2], + Arc::clone(&schema), + ); + assert_eq!(record_batch_stats.column_statistics, expected_stats); + + let parquet_stats = + build_statistics_for_chunks(&[parquet_chunk_1, parquet_chunk_2], schema); + assert_eq!(parquet_stats.column_statistics, expected_stats); + } + + #[test] + fn test_compute_statistics_min_max() { + // schema with one tag, one field, time and CHUNK_ORDER_COLUMN_NAME + let schema: SchemaRef = SchemaBuilder::new() + .tag("tag") + .influx_field("float_field", InfluxFieldType::Float) + .influx_field("int_field", InfluxFieldType::Integer) + .influx_field("string_field", InfluxFieldType::String) + .tag("tag_no_val") // no chunks have values for this + .influx_field("field_no_val", InfluxFieldType::Integer) + .timestamp() + .build() + .unwrap() + .into(); + + let parquet_chunk = Arc::new( + TestChunk::new("t") + .with_time_column_with_stats(Some(10), Some(100)) + .with_tag_column_with_stats("tag", Some("MA"), Some("VT")) + .with_f64_field_column_with_stats("float_field", Some(10.1), Some(100.4)) + .with_i64_field_column_with_stats("int_field", Some(30), Some(50)) + .with_string_field_column_with_stats("string_field", Some("orange"), Some("plum")) + // only this chunk has value for this field + .with_i64_field_column_with_stats("field_no_val", Some(30), Some(50)) + .with_dummy_parquet_file(), + ) as Arc; + + let record_batch_chunk = Arc::new( + TestChunk::new("t") + .with_time_column_with_stats(Some(20), Some(200)) + 
.with_tag_column_with_stats("tag", Some("Boston"), Some("DC")) + .with_f64_field_column_with_stats("float_field", Some(15.6), Some(30.0)) + .with_i64_field_column_with_stats("int_field", Some(1), Some(50)) + .with_string_field_column_with_stats("string_field", Some("banana"), Some("plum")), + ) as Arc; + + let plan_pq = chunks_to_physical_nodes(&schema, None, vec![parquet_chunk], 1); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan_pq), + @r###" + --- + - " UnionExec" + - " ParquetExec: file_groups={1 group: [[0.parquet]]}, projection=[tag, float_field, int_field, string_field, tag_no_val, field_no_val, time]" + "### + ); + + let plan_rb = chunks_to_physical_nodes(&schema, None, vec![record_batch_chunk], 1); + insta::assert_yaml_snapshot!( + format_execution_plan(&plan_rb), + @r###" + --- + - " UnionExec" + - " RecordBatchesExec: chunks=1, projection=[tag, float_field, int_field, string_field, tag_no_val, field_no_val, time]" + "### + ); + + // Stats for time + // parquet + let time_stats = compute_stats_column_min_max(&*plan_pq, "time").unwrap(); + let min_max = column_statistics_min_max(&time_stats).unwrap(); + let expected_time_stats = ( + ScalarValue::TimestampNanosecond(Some(10), None), + ScalarValue::TimestampNanosecond(Some(100), None), + ); + assert_eq!(min_max, expected_time_stats); + // record batch + let time_stats = compute_stats_column_min_max(&*plan_rb, "time").unwrap(); + let min_max = column_statistics_min_max(&time_stats).unwrap(); + let expected_time_stats = ( + ScalarValue::TimestampNanosecond(Some(20), None), + ScalarValue::TimestampNanosecond(Some(200), None), + ); + assert_eq!(min_max, expected_time_stats); + + // Stats for tag + // parquet + let tag_stats = compute_stats_column_min_max(&*plan_pq, "tag").unwrap(); + let min_max = column_statistics_min_max(&tag_stats).unwrap(); + let expected_tag_stats = ( + ScalarValue::Utf8(Some("MA".to_string())), + ScalarValue::Utf8(Some("VT".to_string())), + ); + assert_eq!(min_max, expected_tag_stats); + // record batch + let tag_stats = compute_stats_column_min_max(&*plan_rb, "tag").unwrap(); + let min_max = column_statistics_min_max(&tag_stats).unwrap(); + let expected_tag_stats = ( + ScalarValue::Utf8(Some("Boston".to_string())), + ScalarValue::Utf8(Some("DC".to_string())), + ); + assert_eq!(min_max, expected_tag_stats); + + // Stats for field + // parquet + let float_stats = compute_stats_column_min_max(&*plan_pq, "float_field").unwrap(); + let min_max = column_statistics_min_max(&float_stats).unwrap(); + let expected_float_stats = ( + ScalarValue::Float64(Some(10.1)), + ScalarValue::Float64(Some(100.4)), + ); + assert_eq!(min_max, expected_float_stats); + // record batch + let float_stats = compute_stats_column_min_max(&*plan_rb, "float_field").unwrap(); + let min_max = column_statistics_min_max(&float_stats).unwrap(); + let expected_float_stats = ( + ScalarValue::Float64(Some(15.6)), + ScalarValue::Float64(Some(30.0)), + ); + assert_eq!(min_max, expected_float_stats); + + // Stats for int + // parquet + let int_stats = compute_stats_column_min_max(&*plan_pq, "int_field").unwrap(); + let min_max = column_statistics_min_max(&int_stats).unwrap(); + let expected_int_stats = (ScalarValue::Int64(Some(30)), ScalarValue::Int64(Some(50))); + assert_eq!(min_max, expected_int_stats); + // record batch + let int_stats = compute_stats_column_min_max(&*plan_rb, "int_field").unwrap(); + let min_max = column_statistics_min_max(&int_stats).unwrap(); + let expected_int_stats = (ScalarValue::Int64(Some(1)), 
ScalarValue::Int64(Some(50))); + assert_eq!(min_max, expected_int_stats); + + // Stats for string + // parquet + let string_stats = compute_stats_column_min_max(&*plan_pq, "string_field").unwrap(); + let min_max = column_statistics_min_max(&string_stats).unwrap(); + let expected_string_stats = ( + ScalarValue::Utf8(Some("orange".to_string())), + ScalarValue::Utf8(Some("plum".to_string())), + ); + assert_eq!(min_max, expected_string_stats); + // record batch + let string_stats = compute_stats_column_min_max(&*plan_rb, "string_field").unwrap(); + let min_max = column_statistics_min_max(&string_stats).unwrap(); + let expected_string_stats = ( + ScalarValue::Utf8(Some("banana".to_string())), + ScalarValue::Utf8(Some("plum".to_string())), + ); + assert_eq!(min_max, expected_string_stats); + + // no tats on parquet + let tag_no_stats = compute_stats_column_min_max(&*plan_pq, "tag_no_val").unwrap(); + let min_max = column_statistics_min_max(&tag_no_stats); + assert!(min_max.is_none()); + + // no stats on record batch + let field_no_stats = compute_stats_column_min_max(&*plan_rb, "field_no_val").unwrap(); + let min_max = column_statistics_min_max(&field_no_stats); + assert!(min_max.is_none()); + } + + #[test] + fn test_statistics_min_max() { + // schema with one tag, one field, time and CHUNK_ORDER_COLUMN_NAME + let schema: SchemaRef = SchemaBuilder::new() + .tag("tag") + .influx_field("float_field", InfluxFieldType::Float) + .influx_field("int_field", InfluxFieldType::Integer) + .influx_field("string_field", InfluxFieldType::String) + .tag("tag_no_val") // no chunks have values for this + .influx_field("field_no_val", InfluxFieldType::Integer) + .timestamp() + .build() + .unwrap() + .into(); + + let parquet_chunk = Arc::new( + TestChunk::new("t") + .with_time_column_with_stats(Some(10), Some(100)) + .with_tag_column_with_stats("tag", Some("MA"), Some("VT")) + .with_f64_field_column_with_stats("float_field", Some(10.1), Some(100.4)) + .with_i64_field_column_with_stats("int_field", Some(30), Some(50)) + .with_string_field_column_with_stats("string_field", Some("orange"), Some("plum")) + // only this chunk has value for this field + .with_i64_field_column_with_stats("field_no_val", Some(30), Some(50)) + .with_dummy_parquet_file(), + ) as Arc; + + let record_batch_chunk = Arc::new( + TestChunk::new("t") + .with_time_column_with_stats(Some(20), Some(200)) + .with_tag_column_with_stats("tag", Some("Boston"), Some("DC")) + .with_f64_field_column_with_stats("float_field", Some(15.6), Some(30.0)) + .with_i64_field_column_with_stats("int_field", Some(1), Some(50)) + .with_string_field_column_with_stats("string_field", Some("banana"), Some("plum")), + ) as Arc; + + let plan1 = chunks_to_physical_nodes(&schema, None, vec![parquet_chunk], 1); + let plan2 = chunks_to_physical_nodes(&schema, None, vec![record_batch_chunk], 1); + + let time_stats = + statistics_min_max(&[Arc::clone(&plan1), Arc::clone(&plan2)], "time").unwrap(); + let expected_time_stats = [ + ( + ScalarValue::TimestampNanosecond(Some(10), None), + ScalarValue::TimestampNanosecond(Some(100), None), + ), + ( + ScalarValue::TimestampNanosecond(Some(20), None), + ScalarValue::TimestampNanosecond(Some(200), None), + ), + ]; + assert_eq!(time_stats, expected_time_stats); + + let tag_stats = + statistics_min_max(&[Arc::clone(&plan1), Arc::clone(&plan2)], "tag").unwrap(); + let expected_tag_stats = [ + ( + ScalarValue::Utf8(Some("MA".to_string())), + ScalarValue::Utf8(Some("VT".to_string())), + ), + ( + ScalarValue::Utf8(Some("Boston".to_string())), + 
ScalarValue::Utf8(Some("DC".to_string())), + ), + ]; + assert_eq!(tag_stats, expected_tag_stats); + + let float_stats = + statistics_min_max(&[Arc::clone(&plan1), Arc::clone(&plan2)], "float_field").unwrap(); + let expected_float_stats = [ + ( + ScalarValue::Float64(Some(10.1)), + ScalarValue::Float64(Some(100.4)), + ), + ( + ScalarValue::Float64(Some(15.6)), + ScalarValue::Float64(Some(30.0)), + ), + ]; + assert_eq!(float_stats, expected_float_stats); + + let int_stats = + statistics_min_max(&[Arc::clone(&plan1), Arc::clone(&plan2)], "int_field").unwrap(); + let expected_int_stats = [ + (ScalarValue::Int64(Some(30)), ScalarValue::Int64(Some(50))), + (ScalarValue::Int64(Some(1)), ScalarValue::Int64(Some(50))), + ]; + assert_eq!(int_stats, expected_int_stats); + + let string_stats = + statistics_min_max(&[Arc::clone(&plan1), Arc::clone(&plan2)], "string_field").unwrap(); + let expected_string_stats = [ + ( + ScalarValue::Utf8(Some("orange".to_string())), + ScalarValue::Utf8(Some("plum".to_string())), + ), + ( + ScalarValue::Utf8(Some("banana".to_string())), + ScalarValue::Utf8(Some("plum".to_string())), + ), + ]; + assert_eq!(string_stats, expected_string_stats); + + let tag_no_stat = + statistics_min_max(&[Arc::clone(&plan1), Arc::clone(&plan2)], "tag_no_val"); + assert!(tag_no_stat.is_none()); + + let field_no_stat = + statistics_min_max(&[Arc::clone(&plan1), Arc::clone(&plan2)], "field_no_val"); + assert!(field_no_stat.is_none()); + } + + #[test] + fn test_non_overlap_time() { + let pair_1 = ( + ScalarValue::TimestampNanosecond(Some(10), None), + ScalarValue::TimestampNanosecond(Some(20), None), + ); + let pair_2 = ( + ScalarValue::TimestampNanosecond(Some(100), None), + ScalarValue::TimestampNanosecond(Some(150), None), + ); + let pair_3 = ( + ScalarValue::TimestampNanosecond(Some(60), None), + ScalarValue::TimestampNanosecond(Some(65), None), + ); + + let overlap = overlap_all(&vec![pair_1, pair_2, pair_3]).unwrap(); + assert!(!overlap); + } + + #[test] + fn test_overlap_time() { + let pair_1 = ( + ScalarValue::TimestampNanosecond(Some(10), None), + ScalarValue::TimestampNanosecond(Some(20), None), + ); + let pair_2 = ( + ScalarValue::TimestampNanosecond(Some(100), None), + ScalarValue::TimestampNanosecond(Some(150), None), + ); + let pair_3 = ( + ScalarValue::TimestampNanosecond(Some(8), None), + ScalarValue::TimestampNanosecond(Some(10), None), + ); + + let overlap = overlap_all(&vec![pair_1, pair_2, pair_3]).unwrap(); + assert!(overlap); + } + + #[test] + fn test_non_overlap_integer() { + // [3, 5, 9, 12, 1, 1, 6, 8] + let pair_1 = (ScalarValue::Int16(Some(3)), ScalarValue::Int16(Some(5))); + let pair_2 = (ScalarValue::Int16(Some(9)), ScalarValue::Int16(Some(12))); + let pair_3 = (ScalarValue::Int16(Some(1)), ScalarValue::Int16(Some(1))); + let pair_4 = (ScalarValue::Int16(Some(6)), ScalarValue::Int16(Some(8))); + + let overlap = overlap_all(&vec![pair_1, pair_2, pair_3, pair_4]).unwrap(); + assert!(!overlap); + } + + #[test] + fn test_overlap_integer() { + // [3, 5, 9, 12, 1, 1, 4, 6] + let pair_1 = (ScalarValue::Int16(Some(3)), ScalarValue::Int16(Some(5))); + let pair_2 = (ScalarValue::Int16(Some(9)), ScalarValue::Int16(Some(12))); + let pair_3 = (ScalarValue::Int16(Some(1)), ScalarValue::Int16(Some(1))); + let pair_4 = (ScalarValue::Int16(Some(4)), ScalarValue::Int16(Some(6))); + + let overlap = overlap_all(&vec![pair_1, pair_2, pair_3, pair_4]).unwrap(); + assert!(overlap); + } + + #[test] + fn test_non_overlap_integer_ascending_null_first() { + // [3, 5, null, null, 1, 1, 6, 8] 
+ let pair_1 = (ScalarValue::Int16(Some(3)), ScalarValue::Int16(Some(5))); + let pair_2 = (ScalarValue::Int16(None), ScalarValue::Int16(None)); + let pair_3 = (ScalarValue::Int16(Some(1)), ScalarValue::Int16(Some(2))); + let pair_4 = (ScalarValue::Int16(Some(6)), ScalarValue::Int16(Some(8))); + + let overlap = overlap_all(&vec![pair_1, pair_2, pair_3, pair_4]).unwrap(); + assert!(!overlap); + } + + #[test] + fn test_overlap_integer_ascending_null_first() { + // [3, 5, null, null, 1, 1, 4, 6] + let pair_1 = (ScalarValue::Int16(Some(3)), ScalarValue::Int16(Some(5))); + let pair_2 = (ScalarValue::Int16(None), ScalarValue::Int16(None)); + let pair_3 = (ScalarValue::Int16(Some(1)), ScalarValue::Int16(Some(2))); + let pair_4 = (ScalarValue::Int16(Some(4)), ScalarValue::Int16(Some(6))); + + let overlap = overlap_all(&vec![pair_1, pair_2, pair_3, pair_4]).unwrap(); + assert!(overlap); + } + + #[test] + fn test_non_overlap_string_ascending_null_first() { + // ['e', 'h', null, null, 'a', 'a', 'k', 'q'] + let pair_1 = ( + ScalarValue::Utf8(Some('e'.to_string())), + ScalarValue::Utf8(Some('h'.to_string())), + ); + let pair_2 = (ScalarValue::Utf8(None), ScalarValue::Utf8(None)); + let pair_3 = ( + ScalarValue::Utf8(Some('a'.to_string())), + ScalarValue::Utf8(Some('a'.to_string())), + ); + + let overlap = overlap_all(&vec![pair_1, pair_2, pair_3]).unwrap(); + assert!(!overlap); + } + + #[test] + fn test_overlap_string_ascending_null_first() { + // ['e', 'h', null, null, 'a', 'f', 'k', 'q'] + let pair_1 = ( + ScalarValue::Utf8(Some('e'.to_string())), + ScalarValue::Utf8(Some('h'.to_string())), + ); + let pair_2 = (ScalarValue::Utf8(None), ScalarValue::Utf8(None)); + let pair_3 = ( + ScalarValue::Utf8(Some('a'.to_string())), + ScalarValue::Utf8(Some('f'.to_string())), + ); + + let overlap = overlap_all(&vec![pair_1, pair_2, pair_3]).unwrap(); + assert!(overlap); + } + + #[test] + #[should_panic(expected = "Internal(\"Empty iterator passed to ScalarValue::iter_to_array\")")] + fn test_overlap_empty() { + let _overlap = overlap_all(&[]); + } + + #[should_panic(expected = "min (Int16(3)) > max (Int16(2))")] + #[test] + fn test_overlap_panic() { + // max < min + let pair_1 = (ScalarValue::Int16(Some(3)), ScalarValue::Int16(Some(2))); + let _overlap = overlap_all(&[pair_1]); + } + + /// Runs `overlap` on all permutations of the given `value_range`es and asserts that the result is + /// the same. 
Returns that result + fn overlap_all(value_ranges: &[(ScalarValue, ScalarValue)]) -> Result { + let n = value_ranges.len(); + + let mut overlaps_all_permutations = value_ranges + .iter() + .cloned() + .permutations(n) + .map(|v| overlap(&v)); + + let Some(first) = overlaps_all_permutations.next() else { + return overlap(value_ranges); + }; + + let first = first.unwrap(); + + for result in overlaps_all_permutations { + assert_eq!(&result.unwrap(), &first); + } + + Ok(first) + } } diff --git a/iox_query/src/test.rs b/iox_query/src/test.rs index 8b11291a601..e9697760f61 100644 --- a/iox_query/src/test.rs +++ b/iox_query/src/test.rs @@ -8,7 +8,9 @@ use crate::{ Executor, ExecutorType, IOxSessionContext, }, pruning::prune_chunks, - QueryChunk, QueryChunkData, QueryCompletedToken, QueryNamespace, QueryText, + query_log::{QueryLog, StateReceived}, + QueryChunk, QueryChunkData, QueryCompletedToken, QueryNamespace, QueryNamespaceProvider, + QueryText, }; use arrow::array::{BooleanArray, Float64Array}; use arrow::datatypes::SchemaRef; @@ -20,7 +22,8 @@ use arrow::{ record_batch::RecordBatch, }; use async_trait::async_trait; -use data_types::{ChunkId, ChunkOrder, PartitionKey, TableId, TransitionPartitionId}; +use data_types::{ChunkId, ChunkOrder, NamespaceId, PartitionKey, TableId, TransitionPartitionId}; +use datafusion::common::stats::Precision; use datafusion::error::DataFusionError; use datafusion::execution::context::SessionState; use datafusion::logical_expr::Expr; @@ -32,7 +35,8 @@ use datafusion::{ physical_plan::{ColumnStatistics, Statistics as DataFusionStatistics}, scalar::ScalarValue, }; -use datafusion_util::config::DEFAULT_SCHEMA; +use datafusion_util::{config::DEFAULT_SCHEMA, option_to_precision, timestamptz_nano}; +use iox_time::SystemProvider; use itertools::Itertools; use object_store::{path::Path, ObjectMeta}; use parking_lot::Mutex; @@ -47,7 +51,76 @@ use std::{ num::NonZeroU64, sync::Arc, }; -use trace::ctx::SpanContext; +use trace::{ctx::SpanContext, span::Span}; +use tracker::{AsyncSemaphoreMetrics, InstrumentedAsyncOwnedSemaphorePermit}; + +#[derive(Debug)] +pub struct TestDatabaseStore { + databases: Mutex>>, + executor: Arc, + pub metric_registry: Arc, + pub query_semaphore: Arc, +} + +impl TestDatabaseStore { + pub fn new() -> Self { + Self::default() + } + + pub fn new_with_semaphore_size(semaphore_size: usize) -> Self { + let metric_registry = Arc::new(metric::Registry::default()); + let semaphore_metrics = Arc::new(AsyncSemaphoreMetrics::new( + &metric_registry, + &[("semaphore", "query_execution")], + )); + Self { + databases: Mutex::new(BTreeMap::new()), + executor: Arc::new(Executor::new_testing()), + metric_registry, + query_semaphore: Arc::new(semaphore_metrics.new_semaphore(semaphore_size)), + } + } + + pub async fn db_or_create(&self, name: &str) -> Arc { + let mut databases = self.databases.lock(); + + if let Some(db) = databases.get(name) { + Arc::clone(db) + } else { + let new_db = Arc::new(TestDatabase::new(Arc::clone(&self.executor))); + databases.insert(name.to_string(), Arc::clone(&new_db)); + new_db + } + } +} + +impl Default for TestDatabaseStore { + fn default() -> Self { + Self::new_with_semaphore_size(u16::MAX as usize) + } +} + +#[async_trait] +impl QueryNamespaceProvider for TestDatabaseStore { + /// Retrieve the database specified name + async fn db( + &self, + name: &str, + _span: Option, + _include_debug_info_tables: bool, + ) -> Option> { + let databases = self.databases.lock(); + + databases.get(name).cloned().map(|ns| ns as _) + } + + async fn 
acquire_semaphore(&self, span: Option) -> InstrumentedAsyncOwnedSemaphorePermit { + Arc::clone(&self.query_semaphore) + .acquire_owned(span) + .await + .unwrap() + } +} #[derive(Debug)] pub struct TestDatabase { @@ -160,11 +233,17 @@ impl QueryNamespace for TestDatabase { fn record_query( &self, - _span_ctx: Option<&SpanContext>, - _query_type: &'static str, - _query_text: QueryText, - ) -> QueryCompletedToken { - QueryCompletedToken::new(|_| {}) + span_ctx: Option<&SpanContext>, + query_type: &'static str, + query_text: QueryText, + ) -> QueryCompletedToken { + QueryLog::new(0, Arc::new(SystemProvider::new())).push( + NamespaceId::new(1), + Arc::from("ns"), + query_type, + query_text, + span_ctx.map(|s| s.trace_id), + ) } fn new_query_context(&self, span_ctx: Option) -> IOxSessionContext { @@ -280,13 +359,13 @@ impl TableProvider for TestDatabaseTableProvider { } } -#[derive(Debug)] +#[derive(Debug, Clone)] enum TestChunkData { RecordBatches(Vec), Parquet(ParquetExecInput), } -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct TestChunk { /// Table name table_name: String, @@ -355,10 +434,10 @@ macro_rules! impl_with_column_with_stats { .unwrap(); let stats = ColumnStatistics { - null_count: None, - max_value: max.map(|s| ScalarValue::from(s)), - min_value: min.map(|s| ScalarValue::from(s)), - distinct_count: None, + null_count: Precision::Absent, + max_value: option_to_precision(max.map(|s| ScalarValue::from(s))), + min_value: option_to_precision(min.map(|s| ScalarValue::from(s))), + distinct_count: Precision::Absent, }; self.add_schema_to_table(new_column_schema, Some(stats)) @@ -405,7 +484,15 @@ impl TestChunk { self.with_dummy_parquet_file_and_store("iox://store") } + pub fn with_dummy_parquet_file_and_size(self, size: usize) -> Self { + self.with_dummy_parquet_file_and_store_and_size("iox://store", size) + } + pub fn with_dummy_parquet_file_and_store(self, store: &str) -> Self { + self.with_dummy_parquet_file_and_store_and_size(store, 1) + } + + pub fn with_dummy_parquet_file_and_store_and_size(self, store: &str, size: usize) -> Self { match self.table_data { TestChunkData::RecordBatches(batches) => { assert!(batches.is_empty(), "chunk already has record batches"); @@ -419,8 +506,9 @@ impl TestChunk { object_meta: ObjectMeta { location: Self::parquet_location(self.id), last_modified: Default::default(), - size: 1, + size, e_tag: None, + version: None, }, }), ..self @@ -546,10 +634,10 @@ impl TestChunk { // Construct stats let stats = ColumnStatistics { - null_count: Some(null_count as usize), - max_value: max.map(ScalarValue::from), - min_value: min.map(ScalarValue::from), - distinct_count: distinct_count.map(|c| c.get() as usize), + null_count: Precision::Exact(null_count as usize), + max_value: option_to_precision(max.map(ScalarValue::from)), + min_value: option_to_precision(min.map(ScalarValue::from)), + distinct_count: option_to_precision(distinct_count.map(|c| c.get() as usize)), }; self.update_count(count as usize); @@ -585,10 +673,10 @@ impl TestChunk { // Construct stats let stats = ColumnStatistics { - null_count: Some(null_count as usize), - max_value: max.map(|v| ScalarValue::TimestampNanosecond(Some(v), None)), - min_value: min.map(|v| ScalarValue::TimestampNanosecond(Some(v), None)), - distinct_count: distinct_count.map(|c| c.get() as usize), + null_count: Precision::Exact(null_count as usize), + max_value: option_to_precision(max.map(timestamptz_nano)), + min_value: option_to_precision(min.map(timestamptz_nano)), + distinct_count: 
option_to_precision(distinct_count.map(|c| c.get() as usize)), }; self.update_count(count as usize); @@ -601,8 +689,8 @@ impl TestChunk { .get_mut(TIME_COLUMN_NAME) .expect("stats in sync w/ columns"); - stats.min_value = Some(ScalarValue::TimestampNanosecond(Some(min), None)); - stats.max_value = Some(ScalarValue::TimestampNanosecond(Some(max), None)); + stats.min_value = Precision::Exact(timestamptz_nano(min)); + stats.max_value = Precision::Exact(timestamptz_nano(max)); self } @@ -638,10 +726,10 @@ impl TestChunk { // Construct stats let stats = ColumnStatistics { - null_count: None, - max_value: max.map(ScalarValue::from), - min_value: min.map(ScalarValue::from), - distinct_count: None, + null_count: Precision::Absent, + max_value: option_to_precision(max.map(ScalarValue::from)), + min_value: option_to_precision(min.map(ScalarValue::from)), + distinct_count: Precision::Absent, }; self.add_schema_to_table(new_column_schema, Some(stats)) @@ -682,9 +770,9 @@ impl TestChunk { DataType::Int64 => Arc::new(Int64Array::from(vec![1000])) as ArrayRef, DataType::UInt64 => Arc::new(UInt64Array::from(vec![1000])) as ArrayRef, DataType::Utf8 => Arc::new(StringArray::from(vec!["MA"])) as ArrayRef, - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - Arc::new(TimestampNanosecondArray::from(vec![1000])) as ArrayRef - } + DataType::Timestamp(TimeUnit::Nanosecond, tz) => Arc::new( + TimestampNanosecondArray::from(vec![1000]).with_timezone_opt(tz.clone()), + ) as ArrayRef, DataType::Dictionary(key, value) if key.as_ref() == &DataType::Int32 && value.as_ref() == &DataType::Utf8 => { @@ -723,9 +811,9 @@ impl TestChunk { .iter() .map(|(_influxdb_column_type, field)| match field.data_type() { DataType::Int64 => Arc::new(Int64Array::from(vec![field_val])) as ArrayRef, - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - Arc::new(TimestampNanosecondArray::from(vec![ts_val])) as ArrayRef - } + DataType::Timestamp(TimeUnit::Nanosecond, tz) => Arc::new( + TimestampNanosecondArray::from(vec![ts_val]).with_timezone_opt(tz.clone()), + ) as ArrayRef, DataType::Dictionary(key, value) if key.as_ref() == &DataType::Int32 && value.as_ref() == &DataType::Utf8 => { @@ -773,9 +861,10 @@ impl TestChunk { "tag2" => Arc::new(StringArray::from(vec!["SC", "NC", "RI"])) as ArrayRef, _ => Arc::new(StringArray::from(vec!["TX", "PR", "OR"])) as ArrayRef, }, - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - Arc::new(TimestampNanosecondArray::from(vec![8000, 10000, 20000])) as ArrayRef - } + DataType::Timestamp(TimeUnit::Nanosecond, tz) => Arc::new( + TimestampNanosecondArray::from(vec![8000, 10000, 20000]) + .with_timezone_opt(tz.clone()), + ) as ArrayRef, DataType::Dictionary(key, value) if key.as_ref() == &DataType::Int32 && value.as_ref() == &DataType::Utf8 => { @@ -834,11 +923,10 @@ impl TestChunk { "tag2" => Arc::new(StringArray::from(vec!["SC", "NC", "RI", "NC"])) as ArrayRef, _ => Arc::new(StringArray::from(vec!["TX", "PR", "OR", "AL"])) as ArrayRef, }, - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - Arc::new(TimestampNanosecondArray::from(vec![ - 28000, 210000, 220000, 210000, - ])) as ArrayRef - } + DataType::Timestamp(TimeUnit::Nanosecond, tz) => Arc::new( + TimestampNanosecondArray::from(vec![28000, 210000, 220000, 210000]) + .with_timezone_opt(tz.clone()), + ) as ArrayRef, DataType::Dictionary(key, value) if key.as_ref() == &DataType::Int32 && value.as_ref() == &DataType::Utf8 => { @@ -888,54 +976,54 @@ impl TestChunk { /// Stats(min, max) : tag1(AL, MT), tag2(AL, MA), time(5, 7000) pub fn 
with_five_rows_of_data(mut self) -> Self { // create arrays - let columns = - self.schema - .iter() - .map(|(_influxdb_column_type, field)| match field.data_type() { - DataType::Int64 => { - Arc::new(Int64Array::from(vec![1000, 10, 70, 100, 5])) as ArrayRef - } - DataType::Utf8 => match field.name().as_str() { + let columns = self + .schema + .iter() + .map(|(_influxdb_column_type, field)| match field.data_type() { + DataType::Int64 => { + Arc::new(Int64Array::from(vec![1000, 10, 70, 100, 5])) as ArrayRef + } + DataType::Utf8 => { + match field.name().as_str() { "tag1" => Arc::new(StringArray::from(vec!["MT", "MT", "CT", "AL", "MT"])) as ArrayRef, "tag2" => Arc::new(StringArray::from(vec!["CT", "AL", "CT", "MA", "AL"])) as ArrayRef, _ => Arc::new(StringArray::from(vec!["CT", "MT", "AL", "AL", "MT"])) as ArrayRef, - }, - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - Arc::new(TimestampNanosecondArray::from(vec![ - 1000, 7000, 100, 50, 5000, - ])) as ArrayRef } - DataType::Dictionary(key, value) - if key.as_ref() == &DataType::Int32 - && value.as_ref() == &DataType::Utf8 => - { - match field.name().as_str() { - "tag1" => Arc::new( - vec!["MT", "MT", "CT", "AL", "MT"] - .into_iter() - .collect::>(), - ) as ArrayRef, - "tag2" => Arc::new( - vec!["CT", "AL", "CT", "MA", "AL"] - .into_iter() - .collect::>(), - ) as ArrayRef, - _ => Arc::new( - vec!["CT", "MT", "AL", "AL", "MT"] - .into_iter() - .collect::>(), - ) as ArrayRef, - } + } + DataType::Timestamp(TimeUnit::Nanosecond, tz) => Arc::new( + TimestampNanosecondArray::from(vec![1000, 7000, 100, 50, 5000]) + .with_timezone_opt(tz.clone()), + ) as ArrayRef, + DataType::Dictionary(key, value) + if key.as_ref() == &DataType::Int32 && value.as_ref() == &DataType::Utf8 => + { + match field.name().as_str() { + "tag1" => Arc::new( + vec!["MT", "MT", "CT", "AL", "MT"] + .into_iter() + .collect::>(), + ) as ArrayRef, + "tag2" => Arc::new( + vec!["CT", "AL", "CT", "MA", "AL"] + .into_iter() + .collect::>(), + ) as ArrayRef, + _ => Arc::new( + vec!["CT", "MT", "AL", "AL", "MT"] + .into_iter() + .collect::>(), + ) as ArrayRef, } - _ => unimplemented!( - "Unimplemented data type for test database: {:?}", - field.data_type() - ), - }) - .collect::>(); + } + _ => unimplemented!( + "Unimplemented data type for test database: {:?}", + field.data_type() + ), + }) + .collect::>(); let batch = RecordBatch::try_new(self.schema.as_arrow(), columns).expect("made record batch"); @@ -981,11 +1069,12 @@ impl TestChunk { "CT", "MT", "AL", "AL", "MT", "CT", "MT", "AL", "AL", "MT", ])) as ArrayRef, }, - DataType::Timestamp(TimeUnit::Nanosecond, _) => { - Arc::new(TimestampNanosecondArray::from(vec![ + DataType::Timestamp(TimeUnit::Nanosecond, tz) => Arc::new( + TimestampNanosecondArray::from(vec![ 1000, 7000, 100, 50, 5, 2000, 7000, 500, 50, 5, - ])) as ArrayRef - } + ]) + .with_timezone_opt(tz.clone()), + ) as ArrayRef, DataType::Dictionary(key, value) if key.as_ref() == &DataType::Int32 && value.as_ref() == &DataType::Utf8 => { @@ -1045,17 +1134,15 @@ impl QueryChunk for TestChunk { self.check_error().unwrap(); Arc::new(DataFusionStatistics { - num_rows: self.num_rows, - total_byte_size: None, - column_statistics: Some( - self.schema - .inner() - .fields() - .iter() - .map(|f| self.column_stats.get(f.name()).cloned().unwrap_or_default()) - .collect(), - ), - is_exact: true, + num_rows: option_to_precision(self.num_rows), + total_byte_size: Precision::Absent, + column_statistics: self + .schema + .inner() + .fields() + .iter() + .map(|f| 
self.column_stats.get(f.name()).cloned().unwrap_or_default()) + .collect(), }) } diff --git a/iox_query/src/util.rs b/iox_query/src/util.rs index 28371db745c..7cd92a46c9d 100644 --- a/iox_query/src/util.rs +++ b/iox_query/src/util.rs @@ -6,31 +6,34 @@ use std::{ }; use arrow::{ - array::TimestampNanosecondArray, compute::SortOptions, datatypes::Schema as ArrowSchema, + array::TimestampNanosecondArray, + compute::SortOptions, + datatypes::{Schema as ArrowSchema, SchemaRef as ArrowSchemaRef}, record_batch::RecordBatch, }; use data_types::TimestampMinMax; +use datafusion::common::stats::Precision; +use datafusion::physical_expr::{analyze, AnalysisContext, ExprBoundaries}; use datafusion::{ self, common::ToDFSchema, datasource::{provider_as_source, MemTable}, error::DataFusionError, execution::context::ExecutionProps, - logical_expr::{LogicalPlan, LogicalPlanBuilder}, + logical_expr::{interval_arithmetic::Interval, LogicalPlan, LogicalPlanBuilder}, optimizer::simplify_expressions::{ExprSimplifier, SimplifyContext}, physical_expr::create_physical_expr, physical_plan::{ expressions::{col as physical_col, PhysicalSortExpr}, - ColumnStatistics, ExecutionPlan, PhysicalExpr, Statistics, + PhysicalExpr, }, prelude::{Column, Expr}, - scalar::ScalarValue, }; use itertools::Itertools; use observability_deps::tracing::trace; -use schema::{sort::SortKey, InfluxColumnType, Schema, TIME_COLUMN_NAME}; +use schema::{sort::SortKey, TIME_COLUMN_NAME}; use snafu::{ensure, OptionExt, ResultExt, Snafu}; #[derive(Debug, Snafu)] @@ -68,26 +71,6 @@ pub fn make_scan_plan(batch: RecordBatch) -> std::result::Result, - input_schema: &ArrowSchema, -) -> Vec { - let mut sort_exprs = vec![]; - for key in key_columns { - let expr = physical_col(key, input_schema).expect("pk in schema"); - sort_exprs.push(PhysicalSortExpr { - expr, - options: SortOptions { - descending: false, - nulls_first: false, - }, - }); - } - - sort_exprs -} - pub fn logical_sort_key_exprs(sort_key: &SortKey) -> Vec { sort_key .iter() @@ -120,11 +103,9 @@ pub fn arrow_sort_key_exprs( /// Build a datafusion physical expression from a logical one pub fn df_physical_expr( - input: &dyn ExecutionPlan, + schema: ArrowSchemaRef, expr: Expr, ) -> std::result::Result, DataFusionError> { - let schema = input.schema(); - let df_schema = Arc::clone(&schema).to_dfschema_ref()?; let props = ExecutionProps::new(); @@ -139,7 +120,8 @@ pub fn df_physical_expr( create_physical_expr(&expr, df_schema.as_ref(), schema.as_ref(), &props) } -/// Return min and max for column `time` of the given set of record batches +/// Return min and max for column `time` of the given set of record batches by +/// performing an `O(n)` scan of all provided batches. pub fn compute_timenanosecond_min_max<'a, I>(batches: I) -> Result where I: IntoIterator, @@ -157,7 +139,8 @@ where }) } -/// Return min and max for column `time` in the given record batch +/// Return min and max for column `time` in the given record batch by performing +/// an `O(n)` scan of `batch`. pub fn compute_timenanosecond_min_max_for_one_record_batch( batch: &RecordBatch, ) -> Result<(i64, i64)> { @@ -188,136 +171,155 @@ pub fn compute_timenanosecond_min_max_for_one_record_batch( Ok((min, max)) } -/// Create basic table summary. 
-///
-/// This contains:
-/// - correct column types
-/// - [total count](Statistics::num_rows)
-/// - [min](ColumnStatistics::min_value)/[max](ColumnStatistics::max_value) for the timestamp column
-pub fn create_basic_summary(
-    row_count: u64,
-    schema: &Schema,
-    ts_min_max: Option<TimestampMinMax>,
-) -> Statistics {
-    let mut columns = Vec::with_capacity(schema.len());
-
-    for (t, _field) in schema.iter() {
-        let stats = match t {
-            InfluxColumnType::Timestamp => ColumnStatistics {
-                null_count: Some(0),
-                max_value: Some(ScalarValue::TimestampNanosecond(
-                    ts_min_max.map(|v| v.max),
-                    None,
-                )),
-                min_value: Some(ScalarValue::TimestampNanosecond(
-                    ts_min_max.map(|v| v.min),
-                    None,
-                )),
-                distinct_count: None,
-            },
-            _ => ColumnStatistics::default(),
-        };
-        columns.push(stats)
-    }
+/// Determine the possible maximum range for each of the fields in an
+/// [`ArrowSchema`] once the [`Expr`] has been applied. The returned
+/// Vec includes an Interval for every field in the schema in the same
+/// order. Any fields that are not constrained by the expression will
+/// have an unbounded interval.
+pub fn calculate_field_intervals(
+    schema: ArrowSchemaRef,
+    expr: Expr,
+) -> Result<Vec<Interval>, DataFusionError> {
+    // make unknown boundaries for each column
+    // TODO use upstream code when https://github.com/apache/arrow-datafusion/pull/8377 is merged
+    let fields = schema.fields();
+    let boundaries = fields
+        .iter()
+        .enumerate()
+        .map(|(i, field)| {
+            let column = datafusion::physical_expr::expressions::Column::new(field.name(), i);
+            let interval = Interval::make_unbounded(field.data_type())?;
+            Ok(ExprBoundaries {
+                column,
+                interval,
+                distinct_count: Precision::Absent,
+            })
+        })
+        .collect::<Result<Vec<_>, DataFusionError>>()?;
+
+    let context = AnalysisContext::new(boundaries);
+    let analysis_result = analyze(
+        &df_physical_expr(Arc::clone(&schema), expr)?,
+        context,
+        &schema,
+    )?;
+
+    let intervals = analysis_result
+        .boundaries
+        .into_iter()
+        .map(|b| b.interval)
+        .collect::<Vec<_>>();
+
+    Ok(intervals)
+}
-    Statistics {
-        num_rows: Some(row_count as usize),
-        total_byte_size: None,
-        column_statistics: Some(columns),
-        is_exact: true,
-    }
+/// Determine the possible maximum range for the named field in the
+/// [`ArrowSchema`] once the [`Expr`] has been applied.
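+///
+/// A rough usage sketch mirroring `test_calculate_field_interval` below; the
+/// `schema` value is assumed to be an `ArrowSchemaRef` that contains a float
+/// field named `"a"`:
+///
+/// ```ignore
+/// use datafusion::logical_expr::{col, lit};
+///
+/// // Constrain "a" to the half-open range [1.0, 10.0) and ask for its interval.
+/// let expr = col("a").gt_eq(lit(1.0)).and(col("a").lt(lit(10.0)));
+/// let interval = calculate_field_interval(Arc::clone(&schema), expr, "a")?;
+/// ```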
+pub fn calculate_field_interval( + schema: ArrowSchemaRef, + expr: Expr, + name: &str, +) -> Result { + let idx = schema.index_of(name)?; + let mut intervals = calculate_field_intervals(Arc::clone(&schema), expr)?; + Ok(intervals.swap_remove(idx)) } #[cfg(test)] mod tests { - use datafusion::scalar::ScalarValue; - use schema::{builder::SchemaBuilder, InfluxFieldType}; + use datafusion::common::rounding::next_down; + use datafusion::common::ScalarValue; + use datafusion::logical_expr::{col, lit}; + use schema::{builder::SchemaBuilder, InfluxFieldType, TIME_DATA_TIMEZONE}; use super::*; - #[test] - fn test_create_basic_summary_no_columns_no_rows() { - let schema = SchemaBuilder::new().build().unwrap(); - let row_count = 0; - - let actual = create_basic_summary(row_count, &schema, None); - let expected = Statistics { - num_rows: Some(row_count as usize), - total_byte_size: None, - column_statistics: Some(vec![]), - is_exact: true, - }; - assert_eq!(actual, expected); + fn time_interval(lower: Option, upper: Option) -> Interval { + let lower = ScalarValue::TimestampNanosecond(lower, TIME_DATA_TIMEZONE()); + let upper = ScalarValue::TimestampNanosecond(upper, TIME_DATA_TIMEZONE()); + Interval::try_new(lower, upper).unwrap() + } + + fn f64_interval(lower: Option, upper: Option) -> Interval { + let lower = ScalarValue::Float64(lower); + let upper = ScalarValue::Float64(upper); + Interval::try_new(lower, upper).unwrap() } #[test] - fn test_create_basic_summary_no_rows() { - let schema = full_schema(); - let row_count = 0; - let ts_min_max = TimestampMinMax { min: 10, max: 20 }; - - let actual = create_basic_summary(row_count, &schema, Some(ts_min_max)); - let expected = Statistics { - num_rows: Some(0), - total_byte_size: None, - column_statistics: Some(vec![ - ColumnStatistics::default(), - ColumnStatistics::default(), - ColumnStatistics::default(), - ColumnStatistics::default(), - ColumnStatistics::default(), - ColumnStatistics::default(), - ColumnStatistics { - null_count: Some(0), - min_value: Some(ScalarValue::TimestampNanosecond(Some(10), None)), - max_value: Some(ScalarValue::TimestampNanosecond(Some(20), None)), - distinct_count: None, - }, - ]), - is_exact: true, - }; - assert_eq!(actual, expected); + fn test_calculate_field_intervals() { + let schema = SchemaBuilder::new() + .timestamp() + .influx_field("a", InfluxFieldType::Float) + .build() + .unwrap() + .as_arrow(); + let expr = col("time") + .gt_eq(lit("2020-01-01T00:00:00Z")) + .and(col("time").lt(lit("2020-01-02T00:00:00Z"))) + .and(col("a").gt_eq(lit(1000000.0))) + .and(col("a").lt(lit(2000000.0))); + let intervals = calculate_field_intervals(schema, expr).unwrap(); + // 2020-01-01T00:00:00Z == 1577836800000000000 + // 2020-01-02T00:00:00Z == 1577923200000000000 + assert_eq!( + vec![ + time_interval(Some(1577836800000000000), Some(1577923200000000000i64 - 1),), + f64_interval(Some(1000000.0), Some(next_down(2000000.0))) + ], + intervals + ); } #[test] - fn test_create_basic_summary() { - let schema = full_schema(); - let row_count = 3; - let ts_min_max = TimestampMinMax { min: 42, max: 42 }; - - let actual = create_basic_summary(row_count, &schema, Some(ts_min_max)); - let expected = Statistics { - num_rows: Some(3), - total_byte_size: None, - column_statistics: Some(vec![ - ColumnStatistics::default(), - ColumnStatistics::default(), - ColumnStatistics::default(), - ColumnStatistics::default(), - ColumnStatistics::default(), - ColumnStatistics::default(), - ColumnStatistics { - null_count: Some(0), - min_value: 
Some(ScalarValue::TimestampNanosecond(Some(42), None)), - max_value: Some(ScalarValue::TimestampNanosecond(Some(42), None)), - distinct_count: None, - }, - ]), - is_exact: true, - }; - assert_eq!(actual, expected); + fn test_calculate_field_intervals_no_constraints() { + let schema = SchemaBuilder::new() + .timestamp() + .influx_field("a", InfluxFieldType::Float) + .build() + .unwrap() + .as_arrow(); + // must be a predicate (boolean expression) + let expr = lit("test").eq(lit("foo")); + let intervals = calculate_field_intervals(schema, expr).unwrap(); + assert_eq!( + vec![time_interval(None, None), f64_interval(None, None)], + intervals + ); } - fn full_schema() -> Schema { - SchemaBuilder::new() - .tag("tag") - .influx_field("field_bool", InfluxFieldType::Boolean) - .influx_field("field_float", InfluxFieldType::Float) - .influx_field("field_integer", InfluxFieldType::Integer) - .influx_field("field_string", InfluxFieldType::String) - .influx_field("field_uinteger", InfluxFieldType::UInteger) + #[test] + fn test_calculate_field_interval() { + let schema = SchemaBuilder::new() .timestamp() + .influx_field("a", InfluxFieldType::Float) .build() .unwrap() + .as_arrow(); + let expr = col("time") + .gt_eq(lit("2020-01-01T00:00:00Z")) + .and(col("time").lt(lit("2020-01-02T00:00:00Z"))) + .and(col("a").gt_eq(lit(1000000.0))) + .and(col("a").lt(lit(2000000.0))); + + // Note + // 2020-01-01T00:00:00Z == 1577836800000000000 + // 2020-01-02T00:00:00Z == 1577923200000000000 + let interval = calculate_field_interval(Arc::clone(&schema), expr.clone(), "time").unwrap(); + assert_eq!( + time_interval(Some(1577836800000000000), Some(1577923200000000000 - 1),), + interval + ); + + let interval = calculate_field_interval(Arc::clone(&schema), expr.clone(), "a").unwrap(); + assert_eq!( + f64_interval(Some(1000000.0), Some(next_down(2000000.0))), + interval + ); + + assert_eq!( + "Arrow error: Schema error: Unable to get field named \"b\". 
Valid fields: [\"time\", \"a\"]", + calculate_field_interval(Arc::clone(&schema), expr.clone(), "b").unwrap_err().to_string(), + ); } } diff --git a/iox_query_influxql/Cargo.toml b/iox_query_influxql/Cargo.toml index 50a96373dd4..0c116125510 100644 --- a/iox_query_influxql/Cargo.toml +++ b/iox_query_influxql/Cargo.toml @@ -5,22 +5,25 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] -arrow = { workspace = true, features = ["prettyprint"] } +arrow = { workspace = true } chrono-tz = { version = "0.8" } datafusion = { workspace = true } datafusion_util = { path = "../datafusion_util" } generated_types = { path = "../generated_types" } influxdb_influxql_parser = { path = "../influxdb_influxql_parser" } iox_query = { path = "../iox_query" } -itertools = "0.11.0" +itertools = "0.12.0" observability_deps = { path = "../observability_deps" } once_cell = "1" predicate = { path = "../predicate" } query_functions = { path = "../query_functions" } regex = "1" schema = { path = "../schema" } -serde_json = "1.0.107" +serde_json = "1.0.111" thiserror = "1.0" workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/iox_query_influxql/src/aggregate/percentile.rs b/iox_query_influxql/src/aggregate/percentile.rs index 4b88e9eb967..dda8659e45c 100644 --- a/iox_query_influxql/src/aggregate/percentile.rs +++ b/iox_query_influxql/src/aggregate/percentile.rs @@ -120,8 +120,9 @@ impl Accumulator for PercentileAccumulator { } fn state(&self) -> Result> { + let arr = ScalarValue::new_list(&self.data, &self.data_type); Ok(vec![ - ScalarValue::new_list(Some(self.data.clone()), self.data_type.clone()), + ScalarValue::List(arr), ScalarValue::Float64(self.percentile), ]) } diff --git a/iox_query_influxql/src/frontend/planner.rs b/iox_query_influxql/src/frontend/planner.rs index e8a311bdb6d..f8f6ff019fd 100644 --- a/iox_query_influxql/src/frontend/planner.rs +++ b/iox_query_influxql/src/frontend/planner.rs @@ -1,4 +1,5 @@ use arrow::datatypes::SchemaRef; +use datafusion::common::ParamValues; use datafusion::physical_expr::execution_props::ExecutionProps; use influxdb_influxql_parser::show_field_keys::ShowFieldKeysStatement; use influxdb_influxql_parser::show_measurements::ShowMeasurementsStatement; @@ -12,7 +13,6 @@ use std::ops::Deref; use std::sync::Arc; use crate::plan::{parse_regex, InfluxQLToLogicalPlan, SchemaProvider}; -use datafusion::common::Statistics; use datafusion::datasource::provider_as_source; use datafusion::execution::context::{SessionState, TaskContext}; use datafusion::logical_expr::{AggregateUDF, LogicalPlan, ScalarUDF, TableSource}; @@ -120,8 +120,10 @@ impl ExecutionPlan for SchemaExec { self.input.execute(partition, context) } - fn statistics(&self) -> Statistics { - self.input.statistics() + fn statistics(&self) -> Result { + Ok(datafusion::physical_plan::Statistics::new_unknown( + &self.schema(), + )) } } @@ -136,7 +138,7 @@ impl DisplayAs for SchemaExec { } /// Create plans for running InfluxQL queries against databases -#[derive(Debug, Default)] +#[derive(Debug, Default, Copy, Clone)] pub struct InfluxQLQueryPlanner {} impl InfluxQLQueryPlanner { @@ -149,13 +151,20 @@ impl InfluxQLQueryPlanner { pub async fn query( &self, query: &str, + params: impl Into + Send, ctx: &IOxSessionContext, ) -> Result> { + let ctx = ctx.child_ctx("InfluxQLQueryPlanner::query"); debug!(text=%query, "planning InfluxQL query"); let statement = self.query_to_statement(query)?; - let logical_plan = 
self.statement_to_plan(statement, ctx).await?; - + let logical_plan = self.statement_to_plan(statement, &ctx).await?; + // add params to plan only when they're non-empty + let logical_plan = match params.into() { + ParamValues::List(v) if !v.is_empty() => logical_plan.with_param_values(v)?, + ParamValues::Map(m) if !m.is_empty() => logical_plan.with_param_values(m)?, + _ => logical_plan, + }; let input = ctx.create_physical_plan(&logical_plan).await?; // Merge schema-level metadata from the logical plan with the @@ -179,6 +188,7 @@ impl InfluxQLQueryPlanner { ) -> Result { use std::collections::hash_map::Entry; + let ctx = ctx.child_ctx("statement_to_plan"); let session_cfg = ctx.inner().copied_config(); let cfg = session_cfg.options(); let schema = ctx @@ -207,6 +217,9 @@ impl InfluxQLQueryPlanner { for table_name in &query_tables { if let Entry::Vacant(v) = sp.tables.entry(table_name.to_string()) { + let mut ctx = ctx.child_ctx("get table schema"); + ctx.set_metadata("table", table_name.to_owned()); + if let Some(table) = schema.table(table_name).await { let schema = Schema::try_from(table.schema()) .map_err(|err| { @@ -217,7 +230,7 @@ impl InfluxQLQueryPlanner { } } - let planner = InfluxQLToLogicalPlan::new(&sp, ctx); + let planner = InfluxQLToLogicalPlan::new(&sp, &ctx); let logical_plan = planner.statement_to_plan(statement)?; debug!(plan=%logical_plan.display_graphviz(), "logical plan"); Ok(logical_plan) diff --git a/iox_query_influxql/src/plan/ir.rs b/iox_query_influxql/src/plan/ir.rs index 336bf4675fa..7ee811d154e 100644 --- a/iox_query_influxql/src/plan/ir.rs +++ b/iox_query_influxql/src/plan/ir.rs @@ -228,8 +228,8 @@ impl Display for Field { #[derive(Debug, Clone, Copy)] pub(super) struct Interval { /// The nanosecond duration of the interval - pub duration: i64, + pub(super) duration: i64, /// The nanosecond offset of the interval. 
- pub offset: Option, + pub(super) offset: Option, } diff --git a/iox_query_influxql/src/plan/planner.rs b/iox_query_influxql/src/plan/planner.rs index cf8d72f1802..14ff70a3d4e 100644 --- a/iox_query_influxql/src/plan/planner.rs +++ b/iox_query_influxql/src/plan/planner.rs @@ -31,22 +31,22 @@ use datafusion::catalog::TableReference; use datafusion::common::tree_node::{Transformed, TreeNode, VisitRecursion}; use datafusion::common::{DFSchema, DFSchemaRef, DataFusionError, Result, ScalarValue, ToDFSchema}; use datafusion::datasource::{provider_as_source, MemTable}; -use datafusion::logical_expr::expr::{Alias, ScalarFunction}; +use datafusion::logical_expr::expr::{AggregateFunctionDefinition, Alias, ScalarFunction}; use datafusion::logical_expr::expr_rewriter::normalize_col; use datafusion::logical_expr::logical_plan::builder::project; use datafusion::logical_expr::logical_plan::Analyze; use datafusion::logical_expr::utils::{expr_as_column_expr, find_aggregate_exprs}; use datafusion::logical_expr::{ - binary_expr, col, date_bin, expr, expr::WindowFunction, lit, lit_timestamp_nano, now, union, - window_function, AggregateFunction, AggregateUDF, Between, BuiltInWindowFunction, - BuiltinScalarFunction, EmptyRelation, Explain, Expr, ExprSchemable, Extension, LogicalPlan, - LogicalPlanBuilder, Operator, PlanType, Projection, ScalarUDF, TableSource, ToStringifiedPlan, - WindowFrame, WindowFrameBound, WindowFrameUnits, + binary_expr, col, date_bin, expr, expr::WindowFunction, lit, now, union, utils::conjunction, + AggregateFunction, AggregateUDF, Between, BuiltInWindowFunction, BuiltinScalarFunction, + Distinct, EmptyRelation, Explain, Expr, ExprSchemable, Extension, LogicalPlan, + LogicalPlanBuilder, Operator, PlanType, Projection, ScalarFunctionDefinition, ScalarUDF, + TableSource, ToStringifiedPlan, WindowFrame, WindowFrameBound, WindowFrameUnits, + WindowFunctionDefinition, }; -use datafusion::optimizer::utils::conjunction; use datafusion::physical_expr::execution_props::ExecutionProps; use datafusion::prelude::{cast, sum, when, Column}; -use datafusion_util::{lit_dict, AsExpr}; +use datafusion_util::{lit_dict, lit_timestamptz_nano, AsExpr}; use generated_types::influxdata::iox::querier::v1::InfluxQlMetadata; use influxdb_influxql_parser::common::{LimitClause, OffsetClause, OrderByClause}; use influxdb_influxql_parser::explain::{ExplainOption, ExplainStatement}; @@ -433,12 +433,15 @@ impl<'a> Context<'a> { /// InfluxQL query planner pub struct InfluxQLToLogicalPlan<'a> { s: &'a dyn SchemaProvider, - iox_ctx: &'a IOxSessionContext, + iox_ctx: IOxSessionContext, } impl<'a> InfluxQLToLogicalPlan<'a> { pub fn new(s: &'a dyn SchemaProvider, iox_ctx: &'a IOxSessionContext) -> Self { - Self { s, iox_ctx } + Self { + s, + iox_ctx: iox_ctx.child_ctx("InfluxQLToLogicalPlan"), + } } pub fn statement_to_plan(&self, statement: Statement) -> Result { @@ -447,9 +450,11 @@ impl<'a> InfluxQLToLogicalPlan<'a> { Statement::Delete(_) => error::not_implemented("DELETE"), Statement::DropMeasurement(_) => error::not_implemented("DROP MEASUREMENT"), Statement::Explain(explain) => self.explain_statement_to_plan(*explain), - Statement::Select(select) => { - self.select_query_to_plan(&self.rewrite_select_statement(*select)?) 
- } + Statement::Select(select) => self.select_query_to_plan( + &self + .rewrite_select_statement(*select) + .map_err(|e| e.context("rewriting statement"))?, + ), Statement::ShowDatabases(_) => error::not_implemented("SHOW DATABASES"), Statement::ShowMeasurements(show_measurements) => { self.show_measurements_to_plan(*show_measurements) @@ -468,7 +473,7 @@ impl<'a> InfluxQLToLogicalPlan<'a> { } fn explain_statement_to_plan(&self, explain: ExplainStatement) -> Result { - let plan = self.select_query_to_plan(&self.rewrite_select_statement(*explain.select)?)?; + let plan = self.statement_to_plan(*explain.statement)?; let plan = Arc::new(plan); let schema = LogicalPlan::explain_schema(); let schema = schema.to_dfschema_ref()?; @@ -783,13 +788,25 @@ impl<'a> InfluxQLToLogicalPlan<'a> { ) -> Result { match ctx.projection_type { ProjectionType::Raw => self.project_select_raw(input, fields), - ProjectionType::RawDistinct => self.project_select_raw_distinct(input, fields), - ProjectionType::Aggregate => self.project_select_aggregate(ctx, input, fields, group_by_tag_set), - ProjectionType::Window => self.project_select_window(ctx, input, fields, group_by_tag_set), - ProjectionType::WindowAggregate => self.project_select_window_aggregate(ctx, input, fields, group_by_tag_set), - ProjectionType::WindowAggregateMixed => error::not_implemented("mixed window-aggregate and aggregate columns, such as DIFFERENCE(MEAN(col)), MEAN(col)"), - ProjectionType::Selector{..} => self.project_select_selector(ctx, input, fields, group_by_tag_set), - ProjectionType::TopBottomSelector => self.project_select_top_bottom_selector(ctx, input, fields, group_by_tag_set), + ProjectionType::RawDistinct => self.project_select_raw_distinct(ctx, input, fields), + ProjectionType::Aggregate => { + self.project_select_aggregate(ctx, input, fields, group_by_tag_set) + } + ProjectionType::Window => { + self.project_select_window(ctx, input, fields, group_by_tag_set) + } + ProjectionType::WindowAggregate => { + self.project_select_window_aggregate(ctx, input, fields, group_by_tag_set) + } + ProjectionType::WindowAggregateMixed => { + self.project_select_window_aggregate_mixed(ctx, input, fields, group_by_tag_set) + } + ProjectionType::Selector { .. } => { + self.project_select_selector(ctx, input, fields, group_by_tag_set) + } + ProjectionType::TopBottomSelector => { + self.project_select_top_bottom_selector(ctx, input, fields, group_by_tag_set) + } } } @@ -809,6 +826,7 @@ impl<'a> InfluxQLToLogicalPlan<'a> { /// and call only scalar functions, but output only distinct rows. fn project_select_raw_distinct( &self, + ctx: &Context<'_>, input: LogicalPlan, fields: &[Field], ) -> Result { @@ -834,10 +852,32 @@ impl<'a> InfluxQLToLogicalPlan<'a> { return error::internal("time column is not an alias"); }; - select_exprs[time_column_index] = lit_timestamp_nano(0).alias(alias); + select_exprs[time_column_index] = if let Some(i) = ctx.interval { + let stride = lit(ScalarValue::new_interval_mdn(0, 0, i.duration)); + let offset = i.offset.unwrap_or_default(); + + date_bin(stride, "time".as_expr(), lit_timestamptz_nano(offset)).alias(alias) + } else { + lit_timestamptz_nano(0).alias(alias) + }; // Wrap the plan in a `LogicalPlan::Projection` from the select expressions - let plan = project(input, select_exprs)?; + let mut plan = project(input, select_exprs)?; + + // generate a predicate to filter out all rows where all field values are `NULL`, + // like: + // + // NOT (field1 IS NULL AND field2 IS NULL AND ...) 
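+        //
+        // InfluxQL only returns a row when at least one of its fields is non-NULL,
+        // so with two (hypothetical) fields `f0` and `f1` the plan gains:
+        //
+        //   Filter: NOT (f0 IS NULL AND f1 IS NULL)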
+ plan = match conjunction(fields.iter().filter_map(|f| { + if matches!(f.data_type, Some(InfluxColumnType::Field(_))) { + Some(f.name.as_expr().is_null()) + } else { + None + } + })) { + Some(expr) => LogicalPlanBuilder::from(plan).filter(expr.not())?.build()?, + None => plan, + }; LogicalPlanBuilder::from(plan).distinct()?.build() } @@ -852,7 +892,6 @@ impl<'a> InfluxQLToLogicalPlan<'a> { group_by_tag_set: &[&str], ) -> Result { let schema = IQLSchema::new_from_fields(input.schema(), fields)?; - // Transform InfluxQL AST field expressions to a list of DataFusion expressions. let select_exprs = self.field_list_to_exprs(&input, fields, &schema)?; @@ -923,8 +962,6 @@ impl<'a> InfluxQLToLogicalPlan<'a> { // Wrap the plan in a `LogicalPlan::Projection` from the select expressions let plan = project(plan, select_exprs)?; - // InfluxQL OG physical operators for - // generate a predicate to filter rows where all field values of the row are `NULL`, // like: // @@ -941,6 +978,34 @@ impl<'a> InfluxQLToLogicalPlan<'a> { } } + /// Plan "WindowAggregateMixed" SELECT queries. These are queries that use + /// a combination of window and nested aggregate functions, along with + /// additional aggregate functions. + /// + /// N.B. The plans produced here can output incorrect results when using the + /// `FILL(0)` directive. See [#9706](https://github.com/influxdata/influxdb_iox/issues/9706) + /// for details. + fn project_select_window_aggregate_mixed( + &self, + ctx: &Context<'_>, + input: LogicalPlan, + fields: &[Field], + group_by_tag_set: &[&str], + ) -> Result { + let schema = IQLSchema::new_from_fields(input.schema(), fields)?; + + // Transform InfluxQL AST field expressions to a list of DataFusion expressions. + let select_exprs = self.field_list_to_exprs(&input, fields, &schema)?; + + let (plan, select_exprs) = + self.select_aggregate(ctx, input, fields, select_exprs, group_by_tag_set)?; + + let (plan, select_exprs) = self.select_window(ctx, plan, select_exprs, group_by_tag_set)?; + + // Wrap the plan in a `LogicalPlan::Projection` from the select expressions + project(plan, select_exprs) + } + /// Plan the execution of SELECT queries that have the Selector projection /// type. 
These a queries that include a single FIRST, LAST, MAX, MIN, /// PERCENTILE, or SAMPLE function call, possibly requesting additional @@ -980,8 +1045,8 @@ impl<'a> InfluxQLToLogicalPlan<'a> { let perc_row_column_name = window_perc_row.display_name()?; let window_row = Expr::WindowFunction(WindowFunction::new( - window_function::WindowFunction::BuiltInWindowFunction( - window_function::BuiltInWindowFunction::RowNumber, + WindowFunctionDefinition::BuiltInWindowFunction( + BuiltInWindowFunction::RowNumber, ), vec![], window_partition_by(ctx, input.schema(), group_by_tag_set), @@ -1148,8 +1213,8 @@ impl<'a> InfluxQLToLogicalPlan<'a> { if aggr_exprs.len() == 1 { let selector = aggr_exprs[0].clone(); - if let Expr::AggregateUDF(mut udf) = selector.clone() { - if udf.fun.name.starts_with("selector_") { + if let Expr::AggregateFunction(mut agg) = selector.clone() { + if agg.func_def.name().starts_with("selector_") { let selector_index = select_exprs .iter() .enumerate() @@ -1168,6 +1233,7 @@ impl<'a> InfluxQLToLogicalPlan<'a> { let (expr, out_name) = match expr.clone() { Expr::Alias(Alias { expr, + relation: None, name: out_name, }) => (*expr, out_name), _ => { @@ -1185,8 +1251,8 @@ impl<'a> InfluxQLToLogicalPlan<'a> { )); } - udf.args.append(&mut additional_args); - let selector_new = Expr::AggregateUDF(udf); + agg.args.append(&mut additional_args); + let selector_new = Expr::AggregateFunction(agg); select_exprs[selector_index] = select_exprs[selector_index] .clone() .transform_up(&|expr| { @@ -1229,11 +1295,7 @@ impl<'a> InfluxQLToLogicalPlan<'a> { let stride = lit(ScalarValue::new_interval_mdn(0, 0, i.duration)); let offset = i.offset.unwrap_or_default(); - date_bin( - stride, - "time".as_expr(), - lit(ScalarValue::TimestampNanosecond(Some(offset), None)), - ) + date_bin(stride, "time".as_expr(), lit_timestamptz_nano(offset)) } else if let ProjectionType::Selector { has_fields: _ } = ctx.projection_type { let selector = match aggr_exprs.len() { 1 => aggr_exprs[0].clone(), @@ -1247,7 +1309,7 @@ impl<'a> InfluxQLToLogicalPlan<'a> { selector.field("time") } else { - lit_timestamp_nano(0) + lit_timestamptz_nano(0) } .alias(alias); @@ -1308,7 +1370,7 @@ impl<'a> InfluxQLToLogicalPlan<'a> { FillClause::None => unreachable!(), }; - build_gap_fill_node(plan, time_column, fill_strategy)? + build_gap_fill_node(plan, time_column, fill_strategy, &ctx.projection_type)? 
} else { plan }; @@ -1418,9 +1480,7 @@ impl<'a> InfluxQLToLogicalPlan<'a> { }; let window_expr = Expr::WindowFunction(WindowFunction::new( - window_function::WindowFunction::BuiltInWindowFunction( - window_function::BuiltInWindowFunction::RowNumber, - ), + WindowFunctionDefinition::BuiltInWindowFunction(BuiltInWindowFunction::RowNumber), Vec::::new(), window_partition_by(ctx, input.schema(), group_by_tags), order_by_exprs, @@ -1451,9 +1511,12 @@ impl<'a> InfluxQLToLogicalPlan<'a> { // neither of which should be passed to udf_to_expr .map_err(|err| error::map::internal(format!("display_name: {err}")))?; - let Expr::ScalarUDF(expr::ScalarUDF { fun, args }) = e else { + let Expr::ScalarFunction(ScalarFunction { func_def, args }) = e else { return error::internal(format!("udf_to_expr: unexpected expression: {e}")); }; + let ScalarFunctionDefinition::UDF(udf) = func_def else { + return error::internal(format!("udf_to_expr: unexpected function: {func_def:?}")); + }; fn derivative_unit(ctx: &Context<'_>, args: &Vec) -> Result { if args.len() > 1 { @@ -1469,7 +1532,7 @@ impl<'a> InfluxQLToLogicalPlan<'a> { } } - match udf::WindowFunction::try_from_scalar_udf(Arc::clone(&fun)) { + match udf::WindowFunction::try_from_scalar_udf(Arc::clone(&udf)) { Some(udf::WindowFunction::MovingAverage) => Ok(Expr::WindowFunction(WindowFunction { fun: MOVING_AVERAGE.clone(), args, @@ -1556,7 +1619,7 @@ impl<'a> InfluxQLToLogicalPlan<'a> { .alias(alias)), None => error::internal(format!( "unexpected user-defined window function: {}", - fun.name + udf.name() )), } } @@ -1622,19 +1685,17 @@ impl<'a> InfluxQLToLogicalPlan<'a> { fields_to_exprs_no_nulls(input.schema(), group_by_tag_set).collect::>() }; - let window_func_exprs = vec![Expr::WindowFunction(WindowFunction { - fun: window_function::WindowFunction::BuiltInWindowFunction( - BuiltInWindowFunction::RowNumber, - ), - args: vec![], + let window_func_exprs = vec![Expr::WindowFunction(WindowFunction::new( + WindowFunctionDefinition::BuiltInWindowFunction(BuiltInWindowFunction::RowNumber), + vec![], partition_by, order_by, - window_frame: WindowFrame { + WindowFrame { units: WindowFrameUnits::Rows, start_bound: WindowFrameBound::Preceding(ScalarValue::Null), end_bound: WindowFrameBound::CurrentRow, }, - }) + )) .alias(IOX_ROW_ALIAS)]; // Prepare new projection. 
@@ -1846,7 +1907,7 @@ impl<'a> InfluxQLToLogicalPlan<'a> { Literal::Timestamp(v) => v .timestamp_nanos_opt() .ok_or_else(|| error::map::query("timestamp out of range")) - .map(|ts| lit(ScalarValue::TimestampNanosecond(Some(ts), None))), + .map(lit_timestamptz_nano), Literal::Duration(v) => { Ok(lit(ScalarValue::IntervalMonthDayNano(Some((**v).into())))) } @@ -1989,12 +2050,13 @@ impl<'a> InfluxQLToLogicalPlan<'a> { check_arg_count(name, args, 2)?; let nexpr = self.expr_to_df_expr(scope, &args[1], schema)?; - Ok(Expr::AggregateUDF(expr::AggregateUDF::new( - PERCENTILE.clone(), - vec![expr, nexpr], - None, - None, - ))) + Ok(Expr::AggregateFunction(expr::AggregateFunction { + func_def: AggregateFunctionDefinition::UDF(PERCENTILE.clone()), + args: vec![expr, nexpr], + distinct: false, + filter: None, + order_by: None, + })) } name @ ("first" | "last" | "min" | "max") => { let expr = self.expr_to_df_expr(scope, &args[0], schema)?; @@ -2118,7 +2180,7 @@ impl<'a> InfluxQLToLogicalPlan<'a> { call: &Call, schema: &IQLSchema<'a>, ) -> Result { - let args = call + let mut args = call .args .iter() .map(|e| self.expr_to_df_expr(scope, e, schema)) @@ -2129,13 +2191,16 @@ impl<'a> InfluxQLToLogicalPlan<'a> { if args.len() != 2 { error::query("invalid number of arguments for log, expected 2, got 1") } else { - Ok(Expr::ScalarFunction(ScalarFunction { - fun: BuiltinScalarFunction::Log, - args: args.into_iter().rev().collect(), - })) + let arg1 = args.pop().unwrap(); + let arg0 = args.pop().unwrap(); + // reverse args + Ok(datafusion::prelude::log(arg1, arg0)) } } - fun => Ok(Expr::ScalarFunction(ScalarFunction { fun, args })), + fun => Ok(Expr::ScalarFunction(ScalarFunction { + func_def: ScalarFunctionDefinition::BuiltIn(fun), + args, + })), } } @@ -2437,17 +2502,15 @@ impl<'a> InfluxQLToLogicalPlan<'a> { // - not null if it had any non-null values // // note that since we only have a single row, this is efficient - .project([Expr::ScalarFunction(ScalarFunction { - fun: BuiltinScalarFunction::MakeArray, - args: tags - .iter() + .project([datafusion::prelude::array( + tags.iter() .map(|tag| { let tag_col = Expr::Column(Column::from_name(*tag)); when(tag_col.gt(lit(0)), lit(*tag)).end() }) .collect::, _>>()?, - }) + ) .alias(tag_key_col)])? // roll our single array row into one row per tag key .unnest_column(tag_key_df_col)? @@ -2941,15 +3004,20 @@ fn build_gap_fill_node( input: LogicalPlan, time_column: &Expr, fill_strategy: FillStrategy, + projection_type: &ProjectionType, ) -> Result { let (expr, alias) = match time_column { - Expr::Alias(Alias { expr, name: alias }) => (expr.as_ref(), alias), + Expr::Alias(Alias { + expr, + relation: None, + name: alias, + }) => (expr.as_ref(), alias), _ => return error::internal("expected time column to have an alias function"), }; let date_bin_args = match expr { Expr::ScalarFunction(ScalarFunction { - fun: BuiltinScalarFunction::DateBin, + func_def: ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::DateBin), args, }) => args, _ => { @@ -2996,6 +3064,16 @@ fn build_gap_fill_node( } _ => Ok(VisitRecursion::Continue), }); + time_range = if projection_type == &ProjectionType::WindowAggregateMixed { + // For WindowAggregateMixed queries do not gap fill before the first + // iterator value. 
+ time_range.map(|Range { start: _, end }| Range { + start: Bound::Unbounded, + end, + }) + } else { + time_range + }; time_range .ok_or_else(|| error::map::internal("expected to find a Filter or TableScan")) }?; @@ -3029,7 +3107,7 @@ fn build_gap_fill_node( let fill_strategy = aggr_expr .iter() .cloned() - .map(|e| (e, fill_strategy.clone())) + .map(|e| (e, fill_strategy)) .collect(); let time_column = col(input @@ -3148,8 +3226,13 @@ fn plan_with_metadata(plan: LogicalPlan, metadata: &InfluxQlMetadata) -> Result< LogicalPlan::Analyze(v) } LogicalPlan::Distinct(src) => { - let mut v = src.clone(); - v.input = Arc::new(set_schema(&src.input, metadata)?); + let v = match src.clone() { + Distinct::All(input) => Distinct::All(Arc::new(set_schema(&input, metadata)?)), + Distinct::On(mut on) => { + on.input = Arc::new(set_schema(&on.input, metadata)?); + Distinct::On(on) + } + }; LogicalPlan::Distinct(v) } LogicalPlan::Unnest(src) => { @@ -3309,7 +3392,7 @@ fn window_partition_by( parition_by.push(date_bin( stride, "time".as_expr(), - lit(ScalarValue::TimestampNanosecond(Some(offset), None)), + lit_timestamptz_nano(offset), )); } parition_by @@ -3482,7 +3565,7 @@ mod test { } #[test] - fn test_snow_measurements() { + fn test_show_measurements() { assert_snapshot!(plan("SHOW MEASUREMENTS"), @"TableScan: measurements [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)]"); assert_snapshot!(plan("SHOW MEASUREMENTS LIMIT 1 OFFSET 2"), @r###" Sort: measurements.iox::measurement ASC NULLS LAST, measurements.name ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] @@ -3494,46 +3577,55 @@ mod test { assert_snapshot!(plan("SHOW MEASUREMENTS WHERE foo = 'some_foo'"), @r###" Sort: iox::measurement ASC NULLS LAST, name ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("all_types")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] - Filter: all_types.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] - TableScan: all_types [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("cpu")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - Filter: cpu.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, 
usage_system:Float64;N, usage_user:Float64;N] - TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("data")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - Filter: data.time >= TimestampNanosecond(1672444800000000000, None) AND data.foo = Dictionary(Int32, Utf8("some_foo")) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("disk")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Filter: disk.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: disk [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("diskio")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] - Filter: diskio.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] - TableScan: diskio [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("merge_00")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, 
time:Timestamp(Nanosecond, None)] - Filter: merge_00.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] - TableScan: merge_00 [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("merge_01")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] - Filter: merge_01.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] - TableScan: merge_01 [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("name_clash")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Filter: name_clash.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: name_clash [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("temp_01")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Filter: temp_01.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: temp_01 [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("temp_02")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Filter: temp_02.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: temp_02 [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), 
name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("all_types")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] + Filter: all_types.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] + TableScan: all_types [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("cpu")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Filter: cpu.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("data")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + Filter: data.time >= TimestampNanosecond(1672444800000000000, None) AND data.foo = Dictionary(Int32, Utf8("some_foo")) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("disk")) AS name 
[iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Filter: disk.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: disk [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("diskio")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] + Filter: diskio.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] + TableScan: diskio [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("merge_00")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] + Filter: merge_00.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] + TableScan: merge_00 [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("merge_01")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] + Filter: merge_01.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] + TableScan: merge_01 [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("name_clash")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Filter: name_clash.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) 
[f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: name_clash [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("temp_01")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Filter: temp_01.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: temp_01 [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("temp_02")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Filter: temp_02.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: temp_02 [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("temp_03")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] Limit: skip=0, fetch=1 [shared_field0:Utf8;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] Filter: temp_03.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [shared_field0:Utf8;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] @@ -3542,46 +3634,55 @@ mod test { assert_snapshot!(plan("SHOW MEASUREMENTS WHERE time > 1337"), @r###" Sort: iox::measurement ASC NULLS LAST, name ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("all_types")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] - Filter: all_types.time >= TimestampNanosecond(1338, None) [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] - TableScan: all_types [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, 
str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("cpu")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - Filter: cpu.time >= TimestampNanosecond(1338, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("data")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - Filter: data.time >= TimestampNanosecond(1338, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("disk")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Filter: disk.time >= TimestampNanosecond(1338, None) [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: disk [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("diskio")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] - Filter: diskio.time >= TimestampNanosecond(1338, None) [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, 
time:Timestamp(Nanosecond, None), write_utilization:Float64;N] - TableScan: diskio [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("merge_00")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] - Filter: merge_00.time >= TimestampNanosecond(1338, None) [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] - TableScan: merge_00 [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("merge_01")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] - Filter: merge_01.time >= TimestampNanosecond(1338, None) [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] - TableScan: merge_01 [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("name_clash")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Filter: name_clash.time >= TimestampNanosecond(1338, None) [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: name_clash [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("temp_01")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Filter: temp_01.time >= TimestampNanosecond(1338, None) [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: temp_01 [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("temp_02")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] - Limit: skip=0, fetch=1 [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Filter: temp_02.time >= TimestampNanosecond(1338, None) [shared_field0:Int64;N, 
shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: temp_02 [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Union [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("all_types")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] + Filter: all_types.time >= TimestampNanosecond(1338, None) [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] + TableScan: all_types [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("cpu")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Filter: cpu.time >= TimestampNanosecond(1338, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("data")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + Filter: data.time >= TimestampNanosecond(1338, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + TableScan: data [TIME:Boolean;N, 
bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("disk")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Filter: disk.time >= TimestampNanosecond(1338, None) [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: disk [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("diskio")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] + Filter: diskio.time >= TimestampNanosecond(1338, None) [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] + TableScan: diskio [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("merge_00")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] + Filter: merge_00.time >= TimestampNanosecond(1338, None) [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] + TableScan: merge_00 [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("merge_01")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] + Filter: merge_01.time >= TimestampNanosecond(1338, None) [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] + TableScan: merge_01 [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("name_clash")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + 
Limit: skip=0, fetch=1 [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Filter: name_clash.time >= TimestampNanosecond(1338, None) [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: name_clash [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("temp_01")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Filter: temp_01.time >= TimestampNanosecond(1338, None) [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: temp_01 [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("temp_02")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + Limit: skip=0, fetch=1 [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Filter: temp_02.time >= TimestampNanosecond(1338, None) [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: temp_02 [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] Projection: Dictionary(Int32, Utf8("measurements")) AS iox::measurement, Dictionary(Int32, Utf8("temp_03")) AS name [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] Limit: skip=0, fetch=1 [shared_field0:Utf8;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] Filter: temp_03.time >= TimestampNanosecond(1338, None) [shared_field0:Utf8;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] @@ -3606,76 +3707,85 @@ mod test { assert_snapshot!(plan("SHOW TAG KEYS WHERE foo = 'some_foo'"), @r###" Sort: iox::measurement ASC NULLS LAST, tagKey ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Projection: Dictionary(Int32, Utf8("all_types")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN tag0 > Int32(0) THEN Utf8("tag0") END, CASE WHEN tag1 > Int32(0) THEN Utf8("tag1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(all_types.tag0 IS NOT NULL AS UInt64)) AS tag0, SUM(CAST(all_types.tag1 IS NOT NULL AS UInt64)) AS tag1]] [tag0:UInt64;N, tag1:UInt64;N] - Filter: all_types.time >= TimestampNanosecond(1672444800000000000, None) AND 
Boolean(false) [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] - TableScan: all_types [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] - Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN cpu > Int32(0) THEN Utf8("cpu") END, CASE WHEN host > Int32(0) THEN Utf8("host") END, CASE WHEN region > Int32(0) THEN Utf8("region") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(cpu.cpu IS NOT NULL AS UInt64)) AS cpu, SUM(CAST(cpu.host IS NOT NULL AS UInt64)) AS host, SUM(CAST(cpu.region IS NOT NULL AS UInt64)) AS region]] [cpu:UInt64;N, host:UInt64;N, region:UInt64;N] - Filter: cpu.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN bar > Int32(0) THEN Utf8("bar") END, CASE WHEN foo > Int32(0) THEN Utf8("foo") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(data.bar IS NOT NULL AS UInt64)) AS bar, SUM(CAST(data.foo IS NOT NULL AS UInt64)) AS foo]] [bar:UInt64;N, foo:UInt64;N] - Filter: data.time >= TimestampNanosecond(1672444800000000000, None) AND data.foo = Dictionary(Int32, Utf8("some_foo")) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - Projection: Dictionary(Int32, Utf8("disk")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN device > Int32(0) THEN Utf8("device") END, CASE WHEN host > Int32(0) THEN Utf8("host") END, CASE WHEN region > Int32(0) THEN Utf8("region") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(disk.device IS NOT NULL AS UInt64)) AS device, 
SUM(CAST(disk.host IS NOT NULL AS UInt64)) AS host, SUM(CAST(disk.region IS NOT NULL AS UInt64)) AS region]] [device:UInt64;N, host:UInt64;N, region:UInt64;N] - Filter: disk.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: disk [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("diskio")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN host > Int32(0) THEN Utf8("host") END, CASE WHEN region > Int32(0) THEN Utf8("region") END, CASE WHEN status > Int32(0) THEN Utf8("status") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(diskio.host IS NOT NULL AS UInt64)) AS host, SUM(CAST(diskio.region IS NOT NULL AS UInt64)) AS region, SUM(CAST(diskio.status IS NOT NULL AS UInt64)) AS status]] [host:UInt64;N, region:UInt64;N, status:UInt64;N] - Filter: diskio.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] - TableScan: diskio [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] - Projection: Dictionary(Int32, Utf8("merge_00")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN col0 > Int32(0) THEN Utf8("col0") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(merge_00.col0 IS NOT NULL AS UInt64)) AS col0]] [col0:UInt64;N] - Filter: merge_00.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] - TableScan: merge_00 [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("merge_01")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN col1 > Int32(0) THEN Utf8("col1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(merge_01.col1 IS NOT NULL AS UInt64)) AS col1]] [col1:UInt64;N] - Filter: merge_01.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, 
col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] - TableScan: merge_01 [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("name_clash")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN first > Int32(0) THEN Utf8("first") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(name_clash.first IS NOT NULL AS UInt64)) AS first]] [first:UInt64;N] - Filter: name_clash.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: name_clash [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("temp_01")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN shared_tag0 > Int32(0) THEN Utf8("shared_tag0") END, CASE WHEN shared_tag1 > Int32(0) THEN Utf8("shared_tag1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(temp_01.shared_tag0 IS NOT NULL AS UInt64)) AS shared_tag0, SUM(CAST(temp_01.shared_tag1 IS NOT NULL AS UInt64)) AS shared_tag1]] [shared_tag0:UInt64;N, shared_tag1:UInt64;N] - Filter: temp_01.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: temp_01 [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("temp_02")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN shared_tag0 > Int32(0) THEN Utf8("shared_tag0") END, CASE WHEN shared_tag1 > Int32(0) THEN Utf8("shared_tag1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(temp_02.shared_tag0 IS NOT NULL AS UInt64)) AS shared_tag0, SUM(CAST(temp_02.shared_tag1 IS NOT NULL AS UInt64)) AS shared_tag1]] [shared_tag0:UInt64;N, shared_tag1:UInt64;N] - Filter: temp_02.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: temp_02 [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, 
Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Projection: Dictionary(Int32, Utf8("all_types")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN tag0 > Int32(0) THEN Utf8("tag0") END, CASE WHEN tag1 > Int32(0) THEN Utf8("tag1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(all_types.tag0 IS NOT NULL AS UInt64)) AS tag0, SUM(CAST(all_types.tag1 IS NOT NULL AS UInt64)) AS tag1]] [tag0:UInt64;N, tag1:UInt64;N] + Filter: all_types.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] + TableScan: all_types [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] + Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN cpu > Int32(0) THEN Utf8("cpu") END, CASE WHEN host > Int32(0) THEN Utf8("host") END, CASE WHEN region > Int32(0) THEN Utf8("region") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(cpu.cpu IS NOT NULL AS UInt64)) AS cpu, SUM(CAST(cpu.host IS NOT NULL AS UInt64)) AS host, SUM(CAST(cpu.region IS NOT NULL AS UInt64)) AS region]] [cpu:UInt64;N, host:UInt64;N, region:UInt64;N] + Filter: cpu.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN bar > Int32(0) THEN Utf8("bar") END, CASE WHEN foo > Int32(0) THEN Utf8("foo") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(data.bar IS NOT NULL AS UInt64)) AS bar, SUM(CAST(data.foo IS NOT NULL AS UInt64)) AS foo]] [bar:UInt64;N, foo:UInt64;N] + Filter: data.time >= TimestampNanosecond(1672444800000000000, None) AND data.foo = 
Dictionary(Int32, Utf8("some_foo")) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + Projection: Dictionary(Int32, Utf8("disk")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN device > Int32(0) THEN Utf8("device") END, CASE WHEN host > Int32(0) THEN Utf8("host") END, CASE WHEN region > Int32(0) THEN Utf8("region") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(disk.device IS NOT NULL AS UInt64)) AS device, SUM(CAST(disk.host IS NOT NULL AS UInt64)) AS host, SUM(CAST(disk.region IS NOT NULL AS UInt64)) AS region]] [device:UInt64;N, host:UInt64;N, region:UInt64;N] + Filter: disk.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: disk [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("diskio")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN host > Int32(0) THEN Utf8("host") END, CASE WHEN region > Int32(0) THEN Utf8("region") END, CASE WHEN status > Int32(0) THEN Utf8("status") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(diskio.host IS NOT NULL AS UInt64)) AS host, SUM(CAST(diskio.region IS NOT NULL AS UInt64)) AS region, SUM(CAST(diskio.status IS NOT NULL AS UInt64)) AS status]] [host:UInt64;N, region:UInt64;N, status:UInt64;N] + Filter: diskio.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] + TableScan: diskio [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] + Projection: Dictionary(Int32, Utf8("merge_00")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN col0 > Int32(0) THEN Utf8("col0") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: 
false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(merge_00.col0 IS NOT NULL AS UInt64)) AS col0]] [col0:UInt64;N] + Filter: merge_00.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] + TableScan: merge_00 [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("merge_01")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN col1 > Int32(0) THEN Utf8("col1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(merge_01.col1 IS NOT NULL AS UInt64)) AS col1]] [col1:UInt64;N] + Filter: merge_01.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] + TableScan: merge_01 [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("name_clash")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN first > Int32(0) THEN Utf8("first") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(name_clash.first IS NOT NULL AS UInt64)) AS first]] [first:UInt64;N] + Filter: name_clash.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: name_clash [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("temp_01")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN shared_tag0 > Int32(0) THEN Utf8("shared_tag0") END, CASE WHEN shared_tag1 > Int32(0) THEN Utf8("shared_tag1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(temp_01.shared_tag0 IS NOT NULL AS UInt64)) AS shared_tag0, SUM(CAST(temp_01.shared_tag1 IS NOT NULL AS UInt64)) AS shared_tag1]] [shared_tag0:UInt64;N, shared_tag1:UInt64;N] + Filter: temp_01.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: temp_01 [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("temp_02")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + 
Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN shared_tag0 > Int32(0) THEN Utf8("shared_tag0") END, CASE WHEN shared_tag1 > Int32(0) THEN Utf8("shared_tag1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(temp_02.shared_tag0 IS NOT NULL AS UInt64)) AS shared_tag0, SUM(CAST(temp_02.shared_tag1 IS NOT NULL AS UInt64)) AS shared_tag1]] [shared_tag0:UInt64;N, shared_tag1:UInt64;N] + Filter: temp_02.time >= TimestampNanosecond(1672444800000000000, None) AND Boolean(false) [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: temp_02 [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] Projection: Dictionary(Int32, Utf8("temp_03")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] Filter: tagKey IS NOT NULL [tagKey:Utf8;N] Unnest: tagKey [tagKey:Utf8;N] @@ -3691,76 +3801,85 @@ mod test { assert_snapshot!(plan("SHOW TAG KEYS WHERE time > 1337"), @r###" Sort: iox::measurement ASC NULLS LAST, tagKey ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Projection: Dictionary(Int32, Utf8("all_types")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN tag0 > Int32(0) THEN Utf8("tag0") END, CASE WHEN tag1 > Int32(0) THEN Utf8("tag1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(all_types.tag0 IS NOT NULL AS UInt64)) AS tag0, SUM(CAST(all_types.tag1 IS NOT NULL AS UInt64)) AS tag1]] [tag0:UInt64;N, tag1:UInt64;N] - Filter: all_types.time >= TimestampNanosecond(1338, None) [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] - TableScan: all_types [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] - Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN cpu > Int32(0) THEN Utf8("cpu") END, CASE WHEN host > Int32(0) THEN Utf8("host") END, CASE WHEN region > Int32(0) THEN Utf8("region") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(cpu.cpu IS NOT NULL AS UInt64)) AS cpu, SUM(CAST(cpu.host IS NOT NULL AS UInt64)) AS host, SUM(CAST(cpu.region IS NOT NULL AS UInt64)) AS region]] [cpu:UInt64;N, host:UInt64;N, region:UInt64;N] - Filter: cpu.time >= TimestampNanosecond(1338, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, 
usage_system:Float64;N, usage_user:Float64;N] - TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN bar > Int32(0) THEN Utf8("bar") END, CASE WHEN foo > Int32(0) THEN Utf8("foo") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(data.bar IS NOT NULL AS UInt64)) AS bar, SUM(CAST(data.foo IS NOT NULL AS UInt64)) AS foo]] [bar:UInt64;N, foo:UInt64;N] - Filter: data.time >= TimestampNanosecond(1338, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - Projection: Dictionary(Int32, Utf8("disk")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN device > Int32(0) THEN Utf8("device") END, CASE WHEN host > Int32(0) THEN Utf8("host") END, CASE WHEN region > Int32(0) THEN Utf8("region") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(disk.device IS NOT NULL AS UInt64)) AS device, SUM(CAST(disk.host IS NOT NULL AS UInt64)) AS host, SUM(CAST(disk.region IS NOT NULL AS UInt64)) AS region]] [device:UInt64;N, host:UInt64;N, region:UInt64;N] - Filter: disk.time >= TimestampNanosecond(1338, None) [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: disk [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("diskio")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN host > Int32(0) THEN Utf8("host") END, CASE WHEN region > Int32(0) THEN Utf8("region") END, CASE WHEN status > Int32(0) THEN Utf8("status") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(diskio.host IS NOT NULL AS UInt64)) AS host, SUM(CAST(diskio.region IS NOT NULL AS UInt64)) AS region, SUM(CAST(diskio.status IS NOT NULL AS UInt64)) AS status]] [host:UInt64;N, region:UInt64;N, status:UInt64;N] - Filter: diskio.time >= TimestampNanosecond(1338, None) [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, 
is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] - TableScan: diskio [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] - Projection: Dictionary(Int32, Utf8("merge_00")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN col0 > Int32(0) THEN Utf8("col0") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(merge_00.col0 IS NOT NULL AS UInt64)) AS col0]] [col0:UInt64;N] - Filter: merge_00.time >= TimestampNanosecond(1338, None) [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] - TableScan: merge_00 [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("merge_01")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN col1 > Int32(0) THEN Utf8("col1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(merge_01.col1 IS NOT NULL AS UInt64)) AS col1]] [col1:UInt64;N] - Filter: merge_01.time >= TimestampNanosecond(1338, None) [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] - TableScan: merge_01 [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("name_clash")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN first > Int32(0) THEN Utf8("first") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(name_clash.first IS NOT NULL AS UInt64)) AS first]] [first:UInt64;N] - Filter: name_clash.time >= TimestampNanosecond(1338, None) [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: name_clash [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("temp_01")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN shared_tag0 > Int32(0) THEN Utf8("shared_tag0") END, CASE WHEN shared_tag1 > Int32(0) THEN Utf8("shared_tag1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(temp_01.shared_tag0 IS NOT NULL AS UInt64)) AS shared_tag0, SUM(CAST(temp_01.shared_tag1 IS NOT NULL AS 
UInt64)) AS shared_tag1]] [shared_tag0:UInt64;N, shared_tag1:UInt64;N] - Filter: temp_01.time >= TimestampNanosecond(1338, None) [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: temp_01 [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - Projection: Dictionary(Int32, Utf8("temp_02")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] - Filter: tagKey IS NOT NULL [tagKey:Utf8;N] - Unnest: tagKey [tagKey:Utf8;N] - Projection: make_array(CASE WHEN shared_tag0 > Int32(0) THEN Utf8("shared_tag0") END, CASE WHEN shared_tag1 > Int32(0) THEN Utf8("shared_tag1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] - Aggregate: groupBy=[[]], aggr=[[SUM(CAST(temp_02.shared_tag0 IS NOT NULL AS UInt64)) AS shared_tag0, SUM(CAST(temp_02.shared_tag1 IS NOT NULL AS UInt64)) AS shared_tag1]] [shared_tag0:UInt64;N, shared_tag1:UInt64;N] - Filter: temp_02.time >= TimestampNanosecond(1338, None) [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] - TableScan: temp_02 [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Union [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Projection: Dictionary(Int32, Utf8("all_types")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN tag0 > Int32(0) THEN Utf8("tag0") END, CASE WHEN tag1 > Int32(0) THEN Utf8("tag1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(all_types.tag0 IS NOT NULL AS UInt64)) AS tag0, SUM(CAST(all_types.tag1 IS NOT NULL AS UInt64)) AS tag1]] [tag0:UInt64;N, tag1:UInt64;N] + Filter: all_types.time >= TimestampNanosecond(1338, None) [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] + TableScan: all_types [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] + Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + 
Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN cpu > Int32(0) THEN Utf8("cpu") END, CASE WHEN host > Int32(0) THEN Utf8("host") END, CASE WHEN region > Int32(0) THEN Utf8("region") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(cpu.cpu IS NOT NULL AS UInt64)) AS cpu, SUM(CAST(cpu.host IS NOT NULL AS UInt64)) AS host, SUM(CAST(cpu.region IS NOT NULL AS UInt64)) AS region]] [cpu:UInt64;N, host:UInt64;N, region:UInt64;N] + Filter: cpu.time >= TimestampNanosecond(1338, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN bar > Int32(0) THEN Utf8("bar") END, CASE WHEN foo > Int32(0) THEN Utf8("foo") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(data.bar IS NOT NULL AS UInt64)) AS bar, SUM(CAST(data.foo IS NOT NULL AS UInt64)) AS foo]] [bar:UInt64;N, foo:UInt64;N] + Filter: data.time >= TimestampNanosecond(1338, None) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + Projection: Dictionary(Int32, Utf8("disk")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN device > Int32(0) THEN Utf8("device") END, CASE WHEN host > Int32(0) THEN Utf8("host") END, CASE WHEN region > Int32(0) THEN Utf8("region") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(disk.device IS NOT NULL AS UInt64)) AS device, SUM(CAST(disk.host IS NOT NULL AS UInt64)) AS host, SUM(CAST(disk.region IS NOT NULL AS UInt64)) AS region]] [device:UInt64;N, host:UInt64;N, region:UInt64;N] + Filter: disk.time >= TimestampNanosecond(1338, None) [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: disk [bytes_free:Int64;N, bytes_used:Int64;N, device:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("diskio")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + 
Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN host > Int32(0) THEN Utf8("host") END, CASE WHEN region > Int32(0) THEN Utf8("region") END, CASE WHEN status > Int32(0) THEN Utf8("status") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(diskio.host IS NOT NULL AS UInt64)) AS host, SUM(CAST(diskio.region IS NOT NULL AS UInt64)) AS region, SUM(CAST(diskio.status IS NOT NULL AS UInt64)) AS status]] [host:UInt64;N, region:UInt64;N, status:UInt64;N] + Filter: diskio.time >= TimestampNanosecond(1338, None) [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] + TableScan: diskio [bytes_read:Int64;N, bytes_written:Int64;N, host:Dictionary(Int32, Utf8);N, is_local:Boolean;N, read_utilization:Float64;N, region:Dictionary(Int32, Utf8);N, status:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), write_utilization:Float64;N] + Projection: Dictionary(Int32, Utf8("merge_00")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN col0 > Int32(0) THEN Utf8("col0") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(merge_00.col0 IS NOT NULL AS UInt64)) AS col0]] [col0:UInt64;N] + Filter: merge_00.time >= TimestampNanosecond(1338, None) [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] + TableScan: merge_00 [col0:Dictionary(Int32, Utf8);N, col1:Float64;N, col2:Boolean;N, col3:Utf8;N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("merge_01")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN col1 > Int32(0) THEN Utf8("col1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(merge_01.col1 IS NOT NULL AS UInt64)) AS col1]] [col1:UInt64;N] + Filter: merge_01.time >= TimestampNanosecond(1338, None) [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] + TableScan: merge_01 [col0:Float64;N, col1:Dictionary(Int32, Utf8);N, col2:Utf8;N, col3:Boolean;N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("name_clash")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN first > Int32(0) THEN Utf8("first") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(name_clash.first IS NOT NULL AS UInt64)) AS first]] [first:UInt64;N] + Filter: name_clash.time >= TimestampNanosecond(1338, None) [f:Float64;N, first:Dictionary(Int32, Utf8);N, 
time:Timestamp(Nanosecond, None)] + TableScan: name_clash [f:Float64;N, first:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("temp_01")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN shared_tag0 > Int32(0) THEN Utf8("shared_tag0") END, CASE WHEN shared_tag1 > Int32(0) THEN Utf8("shared_tag1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(temp_01.shared_tag0 IS NOT NULL AS UInt64)) AS shared_tag0, SUM(CAST(temp_01.shared_tag1 IS NOT NULL AS UInt64)) AS shared_tag1]] [shared_tag0:UInt64;N, shared_tag1:UInt64;N] + Filter: temp_01.time >= TimestampNanosecond(1338, None) [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: temp_01 [field_f64:Float64;N, field_i64:Int64;N, field_str:Utf8;N, field_u64:UInt64;N, shared_field0:Float64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + Projection: Dictionary(Int32, Utf8("temp_02")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] + Filter: tagKey IS NOT NULL [tagKey:Utf8;N] + Unnest: tagKey [tagKey:Utf8;N] + Projection: make_array(CASE WHEN shared_tag0 > Int32(0) THEN Utf8("shared_tag0") END, CASE WHEN shared_tag1 > Int32(0) THEN Utf8("shared_tag1") END) AS tagKey [tagKey:List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} });N] + Aggregate: groupBy=[[]], aggr=[[SUM(CAST(temp_02.shared_tag0 IS NOT NULL AS UInt64)) AS shared_tag0, SUM(CAST(temp_02.shared_tag1 IS NOT NULL AS UInt64)) AS shared_tag1]] [shared_tag0:UInt64;N, shared_tag1:UInt64;N] + Filter: temp_02.time >= TimestampNanosecond(1338, None) [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] + TableScan: temp_02 [shared_field0:Int64;N, shared_tag0:Dictionary(Int32, Utf8);N, shared_tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None)] Projection: Dictionary(Int32, Utf8("temp_03")) AS iox::measurement, tagKey [iox::measurement:Dictionary(Int32, Utf8), tagKey:Utf8;N] Filter: tagKey IS NOT NULL [tagKey:Utf8;N] Unnest: tagKey [tagKey:Utf8;N] @@ -3897,8 +4016,9 @@ mod test { Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time AS time, value AS value [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), value:Float64;N] Sort: time ASC NULLS LAST [time:Timestamp(Nanosecond, None), value:Float64;N] Distinct: [time:Timestamp(Nanosecond, None), value:Float64;N] - Projection: TimestampNanosecond(0, None) AS time, cpu.usage_idle AS value [time:Timestamp(Nanosecond, None), value:Float64;N] - TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Filter: NOT value IS NULL [time:Timestamp(Nanosecond, None), value:Float64;N] + Projection: TimestampNanosecond(0, None) AS time, cpu.usage_idle AS value [time:Timestamp(Nanosecond, None), value:Float64;N] + 
TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] "###); // Outer query projects subquery with binary expressions @@ -3907,8 +4027,9 @@ mod test { Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time AS time, value * Float64(0.99) AS value [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), value:Float64;N] Sort: time ASC NULLS LAST [time:Timestamp(Nanosecond, None), value:Float64;N] Distinct: [time:Timestamp(Nanosecond, None), value:Float64;N] - Projection: TimestampNanosecond(0, None) AS time, cpu.usage_idle AS value [time:Timestamp(Nanosecond, None), value:Float64;N] - TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Filter: NOT value IS NULL [time:Timestamp(Nanosecond, None), value:Float64;N] + Projection: TimestampNanosecond(0, None) AS time, cpu.usage_idle AS value [time:Timestamp(Nanosecond, None), value:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] "###); // Outer query groups by the `cpu` tag, which should be pushed all the way to inner-most subquery @@ -3920,8 +4041,9 @@ mod test { Aggregate: groupBy=[[cpu]], aggr=[[selector_max(value, time)]] [cpu:Dictionary(Int32, Utf8);N, selector_max(value,time):Struct([Field { name: "value", data_type: Float64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "time", data_type: Timestamp(Nanosecond, None), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]);N] Sort: cpu ASC NULLS LAST, time ASC NULLS LAST [time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, value:Float64;N] Distinct: [time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, value:Float64;N] - Projection: TimestampNanosecond(0, None) AS time, cpu.cpu AS cpu, cpu.usage_idle AS value [time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, value:Float64;N] - TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Filter: NOT value IS NULL [time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, value:Float64;N] + Projection: TimestampNanosecond(0, None) AS time, cpu.cpu AS cpu, cpu.usage_idle AS value [time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, value:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] "###); } } @@ -4083,7 +4205,13 @@ mod test { "###); // Invariant: second argument is always a constant - assert_snapshot!(plan("SELECT MOVING_AVERAGE(MEAN(usage_idle), usage_system) FROM cpu GROUP BY TIME(10s)"), @"Error during planning: expected integer argument in moving_average()"); + assert_snapshot!(plan("SELECT MOVING_AVERAGE(MEAN(usage_idle), usage_system) FROM cpu GROUP BY TIME(10s)"), @r###" + rewriting statement + caused by + gather information about select statement + caused by + Error 
during planning: expected integer argument in moving_average() + "###); } #[test] @@ -4178,8 +4306,16 @@ mod test { } #[test] - fn test_not_implemented() { - assert_snapshot!(plan("SELECT DIFFERENCE(MEAN(usage_idle)), MEAN(usage_idle) FROM cpu GROUP BY TIME(10s)"), @"This feature is not implemented: mixed window-aggregate and aggregate columns, such as DIFFERENCE(MEAN(col)), MEAN(col)"); + fn test_mixed_aggregate() { + assert_snapshot!(plan("SELECT DIFFERENCE(MEAN(usage_idle)), MEAN(usage_idle) FROM cpu GROUP BY TIME(10s)"), @r###" + Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, difference:Float64;N, mean:Float64;N] + Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, difference(AVG(cpu.usage_idle)) AS difference, AVG(cpu.usage_idle) AS mean [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, difference:Float64;N, mean:Float64;N] + WindowAggr: windowExpr=[[difference(AVG(cpu.usage_idle)) ORDER BY [time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING AS difference(AVG(cpu.usage_idle))]] [time:Timestamp(Nanosecond, None);N, AVG(cpu.usage_idle):Float64;N, difference(AVG(cpu.usage_idle)):Float64;N] + GapFill: groupBy=[time], aggr=[[AVG(cpu.usage_idle)]], time_column=time, stride=IntervalMonthDayNano("10000000000"), range=Unbounded..Included(Literal(TimestampNanosecond(1672531200000000000, None))) [time:Timestamp(Nanosecond, None);N, AVG(cpu.usage_idle):Float64;N] + Aggregate: groupBy=[[date_bin(IntervalMonthDayNano("10000000000"), cpu.time, TimestampNanosecond(0, None)) AS time]], aggr=[[AVG(cpu.usage_idle)]] [time:Timestamp(Nanosecond, None);N, AVG(cpu.usage_idle):Float64;N] + Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + "###); } } @@ -4190,22 +4326,25 @@ mod test { Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), distinct:Float64;N] Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, distinct [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), distinct:Float64;N] Distinct: [time:Timestamp(Nanosecond, None), distinct:Float64;N] - Projection: TimestampNanosecond(0, None) AS time, cpu.usage_idle AS distinct [time:Timestamp(Nanosecond, None), distinct:Float64;N] - TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Filter: NOT distinct IS NULL [time:Timestamp(Nanosecond, None), distinct:Float64;N] + Projection: TimestampNanosecond(0, None) AS time, cpu.usage_idle AS distinct [time:Timestamp(Nanosecond, None), distinct:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] "###); assert_snapshot!(plan("SELECT DISTINCT(usage_idle) FROM cpu"), @r###" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, 
Utf8), time:Timestamp(Nanosecond, None), distinct:Float64;N] Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, distinct [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), distinct:Float64;N] Distinct: [time:Timestamp(Nanosecond, None), distinct:Float64;N] - Projection: TimestampNanosecond(0, None) AS time, cpu.usage_idle AS distinct [time:Timestamp(Nanosecond, None), distinct:Float64;N] - TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Filter: NOT distinct IS NULL [time:Timestamp(Nanosecond, None), distinct:Float64;N] + Projection: TimestampNanosecond(0, None) AS time, cpu.usage_idle AS distinct [time:Timestamp(Nanosecond, None), distinct:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] "###); assert_snapshot!(plan("SELECT DISTINCT usage_idle FROM cpu GROUP BY cpu"), @r###" Sort: cpu ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, distinct:Float64;N] Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, cpu, distinct [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, distinct:Float64;N] Distinct: [time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, distinct:Float64;N] - Projection: TimestampNanosecond(0, None) AS time, cpu.cpu AS cpu, cpu.usage_idle AS distinct [time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, distinct:Float64;N] - TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + Filter: NOT distinct IS NULL [time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, distinct:Float64;N] + Projection: TimestampNanosecond(0, None) AS time, cpu.cpu AS cpu, cpu.usage_idle AS distinct [time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, distinct:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] "###); assert_snapshot!(plan("SELECT COUNT(DISTINCT usage_idle) FROM cpu"), @r###" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), count:Int64;N] @@ -4213,10 +4352,40 @@ mod test { Aggregate: groupBy=[[]], aggr=[[COUNT(DISTINCT cpu.usage_idle)]] [COUNT(DISTINCT cpu.usage_idle):Int64;N] TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] "###); + assert_snapshot!(plan("SELECT DISTINCT(usage_idle) FROM cpu GROUP BY time(1s)"), @r###" + Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, distinct:Float64;N] + Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, distinct [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, distinct:Float64;N] + Distinct: 
[time:Timestamp(Nanosecond, None);N, distinct:Float64;N] + Filter: NOT distinct IS NULL [time:Timestamp(Nanosecond, None);N, distinct:Float64;N] + Projection: date_bin(IntervalMonthDayNano("1000000000"), cpu.time, TimestampNanosecond(0, None)) AS time, cpu.usage_idle AS distinct [time:Timestamp(Nanosecond, None);N, distinct:Float64;N] + Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + "###); + assert_snapshot!(plan("SELECT DISTINCT(usage_idle) FROM cpu GROUP BY time(1s), cpu"), @r###" + Sort: cpu ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, cpu:Dictionary(Int32, Utf8);N, distinct:Float64;N] + Projection: Dictionary(Int32, Utf8("cpu")) AS iox::measurement, time, cpu, distinct [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None);N, cpu:Dictionary(Int32, Utf8);N, distinct:Float64;N] + Distinct: [time:Timestamp(Nanosecond, None);N, cpu:Dictionary(Int32, Utf8);N, distinct:Float64;N] + Filter: NOT distinct IS NULL [time:Timestamp(Nanosecond, None);N, cpu:Dictionary(Int32, Utf8);N, distinct:Float64;N] + Projection: date_bin(IntervalMonthDayNano("1000000000"), cpu.time, TimestampNanosecond(0, None)) AS time, cpu.cpu AS cpu, cpu.usage_idle AS distinct [time:Timestamp(Nanosecond, None);N, cpu:Dictionary(Int32, Utf8);N, distinct:Float64;N] + Filter: cpu.time <= TimestampNanosecond(1672531200000000000, None) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] + "###); // fallible - assert_snapshot!(plan("SELECT DISTINCT(usage_idle), DISTINCT(usage_system) FROM cpu"), @"Error during planning: aggregate function distinct() cannot be combined with other functions or fields"); - assert_snapshot!(plan("SELECT DISTINCT(usage_idle), usage_system FROM cpu"), @"Error during planning: aggregate function distinct() cannot be combined with other functions or fields"); + assert_snapshot!(plan("SELECT DISTINCT(usage_idle), DISTINCT(usage_system) FROM cpu"), @r###" + rewriting statement + caused by + gather information about select statement + caused by + Error during planning: aggregate function distinct() cannot be combined with other functions or fields + "###); + assert_snapshot!(plan("SELECT DISTINCT(usage_idle), usage_system FROM cpu"), @r###" + rewriting statement + caused by + gather information about select statement + caused by + Error during planning: aggregate function distinct() cannot be combined with other functions or fields + "###); } mod functions { @@ -4358,7 +4527,13 @@ mod test { #[test] fn test_selectors_invalid_arguments_3() { // Invalid number of arguments - assert_snapshot!(plan("SELECT MIN(usage_idle, usage_idle) FROM cpu"), @"Error during planning: invalid number of arguments for min, expected 1, got 2"); + 
assert_snapshot!(plan("SELECT MIN(usage_idle, usage_idle) FROM cpu"), @r###" + rewriting statement + caused by + gather information about select statement + caused by + Error during planning: invalid number of arguments for min, expected 1, got 2 + "###); } } @@ -4422,7 +4597,7 @@ mod test { Filter: ROW_NUMBER() ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW <= Int64(10) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] WindowAggr: windowExpr=[[ROW_NUMBER() ORDER BY [cpu.usage_idle DESC NULLS LAST, cpu.time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS ROW_NUMBER() ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - "###); + "###); assert_snapshot!(plan("SELECT top(usage_idle,10),cpu FROM cpu"), @r###" Sort: time ASC NULLS LAST, cpu ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), top:Float64;N, cpu:Dictionary(Int32, Utf8);N] @@ -4430,7 +4605,7 @@ mod test { Filter: ROW_NUMBER() ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW <= Int64(10) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] WindowAggr: windowExpr=[[ROW_NUMBER() ORDER BY [cpu.usage_idle DESC NULLS LAST, cpu.time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS ROW_NUMBER() ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - "###); + "###); assert_snapshot!(plan("SELECT top(usage_idle,10) FROM cpu GROUP BY cpu"), @r###" Sort: cpu ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, top:Float64;N] @@ -4438,7 +4613,7 @@ mod test { Filter: ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle DESC 
NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW <= Int64(10) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] WindowAggr: windowExpr=[[ROW_NUMBER() PARTITION BY [cpu.cpu] ORDER BY [cpu.usage_idle DESC NULLS LAST, cpu.time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - "###); + "###); assert_snapshot!(plan("SELECT top(usage_idle,cpu,10) FROM cpu"), @r###" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), top:Float64;N, cpu:Dictionary(Int32, Utf8);N] @@ -4448,7 +4623,7 @@ mod test { Filter: ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW <= Int64(1) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] WindowAggr: windowExpr=[[ROW_NUMBER() PARTITION BY [cpu.cpu] ORDER BY [cpu.usage_idle DESC NULLS LAST, cpu.time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle DESC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - "###); + "###); } #[test] @@ -4459,7 +4634,7 @@ mod test { Filter: ROW_NUMBER() ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW <= Int64(10) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] WindowAggr: 
windowExpr=[[ROW_NUMBER() ORDER BY [cpu.usage_idle ASC NULLS LAST, cpu.time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS ROW_NUMBER() ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - "###); + "###); assert_snapshot!(plan("SELECT bottom(usage_idle,10),cpu FROM cpu"), @r###" Sort: time ASC NULLS LAST, cpu ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), bottom:Float64;N, cpu:Dictionary(Int32, Utf8);N] @@ -4467,7 +4642,7 @@ mod test { Filter: ROW_NUMBER() ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW <= Int64(10) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] WindowAggr: windowExpr=[[ROW_NUMBER() ORDER BY [cpu.usage_idle ASC NULLS LAST, cpu.time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS ROW_NUMBER() ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - "###); + "###); assert_snapshot!(plan("SELECT bottom(usage_idle,10) FROM cpu GROUP BY cpu"), @r###" Sort: cpu ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), cpu:Dictionary(Int32, Utf8);N, bottom:Float64;N] @@ -4475,7 +4650,7 @@ mod test { Filter: ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW <= Int64(10) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] WindowAggr: windowExpr=[[ROW_NUMBER() PARTITION BY [cpu.cpu] ORDER BY [cpu.usage_idle ASC NULLS LAST, cpu.time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] 
[cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - "###); + "###); assert_snapshot!(plan("SELECT bottom(usage_idle,cpu,10) FROM cpu"), @r###" Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), bottom:Float64;N, cpu:Dictionary(Int32, Utf8);N] @@ -4485,7 +4660,7 @@ mod test { Filter: ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW <= Int64(1) [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] WindowAggr: windowExpr=[[ROW_NUMBER() PARTITION BY [cpu.cpu] ORDER BY [cpu.usage_idle ASC NULLS LAST, cpu.time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N, ROW_NUMBER() PARTITION BY [cpu] ORDER BY [usage_idle ASC NULLS LAST, time ASC NULLS LAST] ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW:UInt64;N] TableScan: cpu [cpu:Dictionary(Int32, Utf8);N, host:Dictionary(Int32, Utf8);N, region:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), usage_idle:Float64;N, usage_system:Float64;N, usage_user:Float64;N] - "###); + "###); } /// Test InfluxQL-specific behaviour of scalar functions that differ @@ -4502,7 +4677,13 @@ mod test { // Fallible // LOG requires two arguments - assert_snapshot!(plan("SELECT LOG(usage_idle) FROM cpu"), @"Error during planning: invalid number of arguments for log, expected 2, got 1"); + assert_snapshot!(plan("SELECT LOG(usage_idle) FROM cpu"), @r###" + rewriting statement + caused by + gather information about select statement + caused by + Error during planning: invalid number of arguments for log, expected 2, got 1 + "###); } /// Validate the metadata is correctly encoded in the schema. 
@@ -4631,7 +4812,13 @@ mod test { "### ); assert_snapshot!( - plan("SELECT foo, f64_field FROM data where time > '2004-04-09T'"), @r###"Error during planning: invalid expression "'2004-04-09T'": '2004-04-09T' is not a valid timestamp"### + plan("SELECT foo, f64_field FROM data where time > '2004-04-09T'"), @r###" + rewriting statement + caused by + split condition + caused by + Error during planning: invalid expression "'2004-04-09T'": '2004-04-09T' is not a valid timestamp + "### ); // time on the right-hand side @@ -4647,7 +4834,13 @@ mod test { // fallible // Unsupported operator - assert_snapshot!(plan("SELECT foo, f64_field FROM data where time != 0"), @"Error during planning: invalid time comparison operator: !=") + assert_snapshot!(plan("SELECT foo, f64_field FROM data where time != 0"), @r###" + rewriting statement + caused by + split condition + caused by + Error during planning: invalid time comparison operator: != + "###) } #[test] @@ -4777,6 +4970,33 @@ mod test { Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, data.time AS time, data.foo AS foo, data.f64_field AS f64_field [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, f64_field:Float64;N] TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] "###); + assert_snapshot!(plan("EXPLAIN SHOW MEASUREMENTS"), @r###" + Explain [plan_type:Utf8, plan:Utf8] + TableScan: measurements [iox::measurement:Dictionary(Int32, Utf8), name:Dictionary(Int32, Utf8)] + "###); + assert_snapshot!(plan("EXPLAIN SHOW TAG KEYS"), @r###" + Explain [plan_type:Utf8, plan:Utf8] + TableScan: tag_keys [iox::measurement:Dictionary(Int32, Utf8), tagKey:Dictionary(Int32, Utf8)] + "###); + + assert_snapshot!(plan("EXPLAIN SHOW FIELD KEYS"), @r###" + Explain [plan_type:Utf8, plan:Utf8] + TableScan: field_keys [iox::measurement:Utf8, fieldKey:Utf8, fieldType:Utf8] + "###); + + assert_snapshot!(plan("EXPLAIN SHOW RETENTION POLICIES"), @r###" + Explain [plan_type:Utf8, plan:Utf8] + TableScan: retention policies [iox::measurement:Dictionary(Int32, Utf8), name:Utf8, duration:Utf8, shardGroupDuration:Utf8, replicaN:Int64, default:Boolean] + "###); + + assert_snapshot!(plan("EXPLAIN SHOW DATABASES"), @"This feature is not implemented: SHOW DATABASES"); + assert_snapshot!(plan("EXPLAIN EXPLAIN SELECT f64_field::string FROM data"), @r###" + Explain [plan_type:Utf8, plan:Utf8] + Explain [plan_type:Utf8, plan:Utf8] + Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), f64_field:Null;N] + Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, data.time AS time, NULL AS f64_field [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), f64_field:Null;N] + TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + "###); } #[test] @@ -4946,6 +5166,17 @@ mod test { TableScan: all_types [bool_field:Boolean;N, f64_field:Float64;N, i64_field:Int64;N, str_field:Utf8;N, tag0:Dictionary(Int32, Utf8);N, tag1:Dictionary(Int32, Utf8);N, time:Timestamp(Nanosecond, None), u64_field:UInt64;N] "###); } + + /// See + #[test] + fn 
test_true_and_time_pred() { + assert_snapshot!(plan("SELECT f64_field FROM data WHERE true AND time < '2022-10-31T02:02:00Z'"), @r###" + Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), f64_field:Float64;N] + Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, data.time AS time, data.f64_field AS f64_field [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), f64_field:Float64;N] + Filter: data.time <= TimestampNanosecond(1667181719999999999, None) AND Boolean(true) [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + "###); + } } /// Tests to validate InfluxQL `SELECT` statements that project aggregate functions, such as `COUNT` or `SUM`. @@ -4958,24 +5189,24 @@ mod test { #[test] fn no_group_by() { assert_snapshot!(plan("SELECT COUNT(f64_field) FROM data"), @r###" - Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), count:Int64;N] - Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), count:Int64;N] - Aggregate: groupBy=[[]], aggr=[[COUNT(data.f64_field)]] [COUNT(data.f64_field):Int64;N] - TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - "###); + Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), count:Int64;N] + Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), count:Int64;N] + Aggregate: groupBy=[[]], aggr=[[COUNT(data.f64_field)]] [COUNT(data.f64_field):Int64;N] + TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + "###); assert_snapshot!(plan("SELECT COUNT(f64_field) FROM data GROUP BY non_existent"), @r###" - Sort: time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), non_existent:Null;N, count:Int64;N] - Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, NULL AS non_existent, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), non_existent:Null;N, count:Int64;N] - Aggregate: groupBy=[[]], aggr=[[COUNT(data.f64_field)]] [COUNT(data.f64_field):Int64;N] - TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, 
mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - "###); + Sort: foo ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, count:Int64;N] + Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, data.foo AS foo, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, count:Int64;N] + Aggregate: groupBy=[[data.foo]], aggr=[[COUNT(data.f64_field)]] [foo:Dictionary(Int32, Utf8);N, COUNT(data.f64_field):Int64;N] + TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + "###); // The `COUNT(f64_field)` aggregate is only projected once in the Aggregate and reused in the projection assert_snapshot!(plan("SELECT COUNT(f64_field), COUNT(f64_field) + COUNT(f64_field), COUNT(f64_field) * 3 FROM data"), @r###" @@ -4987,11 +5218,11 @@ mod test { // non-existent tags are excluded from the Aggregate groupBy and Sort operators assert_snapshot!(plan("SELECT COUNT(f64_field) FROM data GROUP BY foo, non_existent"), @r###" - Sort: foo ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, non_existent:Null;N, count:Int64;N] - Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, data.foo AS foo, NULL AS non_existent, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count 
[iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, non_existent:Null;N, count:Int64;N] - Aggregate: groupBy=[[data.foo]], aggr=[[COUNT(data.f64_field)]] [foo:Dictionary(Int32, Utf8);N, COUNT(data.f64_field):Int64;N] - TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] - "###); + Sort: foo ASC NULLS LAST, time ASC NULLS LAST [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, non_existent:Null;N, count:Int64;N] + Projection: Dictionary(Int32, Utf8("data")) AS iox::measurement, TimestampNanosecond(0, None) AS time, data.foo AS foo, NULL AS non_existent, coalesce_struct(COUNT(data.f64_field), Int64(0)) AS count [iox::measurement:Dictionary(Int32, Utf8), time:Timestamp(Nanosecond, None), foo:Dictionary(Int32, Utf8);N, non_existent:Null;N, count:Int64;N] + Aggregate: groupBy=[[data.foo]], aggr=[[COUNT(data.f64_field)]] [foo:Dictionary(Int32, Utf8);N, COUNT(data.f64_field):Int64;N] + TableScan: data [TIME:Boolean;N, bar:Dictionary(Int32, Utf8);N, bool_field:Boolean;N, f64_field:Float64;N, foo:Dictionary(Int32, Utf8);N, i64_field:Int64;N, mixedCase:Float64;N, str_field:Utf8;N, time:Timestamp(Nanosecond, None), with space:Float64;N] + "###); // Aggregate expression is projected once and reused in final projection assert_snapshot!(plan("SELECT COUNT(f64_field), COUNT(f64_field) * 2 FROM data"), @r###" @@ -5027,8 +5258,20 @@ mod test { // Fallible // Cannot combine aggregate and non-aggregate columns in the projection - assert_snapshot!(plan("SELECT COUNT(f64_field), f64_field FROM data"), @"Error during planning: mixing aggregate and non-aggregate columns is not supported"); - assert_snapshot!(plan("SELECT COUNT(f64_field) + f64_field FROM data"), @"Error during planning: mixing aggregate and non-aggregate columns is not supported"); + assert_snapshot!(plan("SELECT COUNT(f64_field), f64_field FROM data"), @r###" + rewriting statement + caused by + gather information about select statement + caused by + Error during planning: mixing aggregate and non-aggregate columns is not supported + "###); + assert_snapshot!(plan("SELECT COUNT(f64_field) + f64_field FROM data"), @r###" + rewriting statement + caused by + gather information about select statement + caused by + Error during planning: mixing aggregate and non-aggregate columns is not supported + "###); } #[test] diff --git a/iox_query_influxql/src/plan/planner/select.rs b/iox_query_influxql/src/plan/planner/select.rs index bfafb455424..97f1fdb08a3 100644 --- a/iox_query_influxql/src/plan/planner/select.rs +++ b/iox_query_influxql/src/plan/planner/select.rs @@ -288,7 +288,7 @@ impl<'a> Selector<'a> { )); } Ok(Self::First { - field_key: Self::identifier(call.args.first().unwrap())?, + field_key: Self::identifier(&call.args[0])?, }) } @@ -300,7 +300,7 @@ impl<'a> Selector<'a> { )); } Ok(Self::Last { - field_key: Self::identifier(call.args.first().unwrap())?, + field_key: Self::identifier(&call.args[0])?, }) } @@ -312,7 +312,7 @@ impl<'a> Selector<'a> { )); } Ok(Self::Max { - field_key: Self::identifier(call.args.first().unwrap())?, + field_key: Self::identifier(&call.args[0])?, }) } @@ -324,7 +324,7 @@ impl<'a> Selector<'a> { )); } Ok(Self::Min { - field_key: Self::identifier(call.args.first().unwrap())?, + field_key: 
Self::identifier(&call.args[0])?, }) } @@ -336,8 +336,8 @@ impl<'a> Selector<'a> { )); } Ok(Self::Percentile { - field_key: Self::identifier(call.args.first().unwrap())?, - n: Self::literal_num(call.args.get(1).unwrap())?, + field_key: Self::identifier(&call.args[0])?, + n: Self::literal_num(&call.args[1])?, }) } @@ -349,8 +349,8 @@ impl<'a> Selector<'a> { )); } Ok(Self::Sample { - field_key: Self::identifier(call.args.first().unwrap())?, - n: Self::literal_int(call.args.get(1).unwrap())?, + field_key: Self::identifier(&call.args[0])?, + n: Self::literal_int(&call.args[1])?, }) } diff --git a/iox_query_influxql/src/plan/planner_rewrite_expression.rs b/iox_query_influxql/src/plan/planner_rewrite_expression.rs index e9afe3f5a99..9fd50d5f995 100644 --- a/iox_query_influxql/src/plan/planner_rewrite_expression.rs +++ b/iox_query_influxql/src/plan/planner_rewrite_expression.rs @@ -127,7 +127,7 @@ use crate::plan::util::IQLSchema; use arrow::datatypes::DataType; use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRewriter}; use datafusion::common::{Result, ScalarValue}; -use datafusion::logical_expr::expr::{AggregateFunction, AggregateUDF, WindowFunction}; +use datafusion::logical_expr::expr::{AggregateFunction, WindowFunction}; use datafusion::logical_expr::{ binary_expr, cast, coalesce, lit, BinaryExpr, Expr, ExprSchemable, GetIndexedField, Operator, }; @@ -424,7 +424,6 @@ fn rewrite_expr(expr: Expr, schema: &IQLSchema<'_>) -> Result { // Invoking an aggregate or window function on a tag column should return `NULL` // to be consistent with OG. Expr::AggregateFunction(AggregateFunction { ref args, .. } ) - | Expr::AggregateUDF(AggregateUDF { ref args, .. } ) | Expr::WindowFunction(WindowFunction { ref args, .. } ) => match &args[0] { Expr::Column(Column { ref name, .. 
}) if schema.is_tag_field(name) => yes(lit(ScalarValue::Null)), _ => no(expr), @@ -546,9 +545,8 @@ mod test { use crate::plan::ir::DataSourceSchema; use super::*; - use datafusion::logical_expr::lit_timestamp_nano; use datafusion::prelude::col; - use datafusion_util::AsExpr; + use datafusion_util::{lit_timestamptz_nano, AsExpr}; use chrono::{DateTime, NaiveDate, Utc}; use datafusion::common::{DFSchemaRef, ToDFSchema}; @@ -796,15 +794,15 @@ mod test { let schemas = new_schema(); let rewrite = |expr| rewrite_expr(expr, &schemas).unwrap().to_string(); - let expr = "time".as_expr().gt_eq(lit_timestamp_nano(1000)); + let expr = "time".as_expr().gt_eq(lit_timestamptz_nano(1000)); assert_eq!(rewrite(expr), "time >= TimestampNanosecond(1000, None)"); - let expr = lit_timestamp_nano(1000).lt_eq("time".as_expr()); + let expr = lit_timestamptz_nano(1000).lt_eq("time".as_expr()); assert_eq!(rewrite(expr), "TimestampNanosecond(1000, None) <= time"); let expr = "time" .as_expr() - .gt_eq(lit_timestamp_nano(1000)) + .gt_eq(lit_timestamptz_nano(1000)) .and("tag0".as_expr().eq(lit("foo"))); assert_eq!( rewrite(expr), @@ -813,7 +811,7 @@ mod test { let expr = "time" .as_expr() - .gt_eq(lit_timestamp_nano(1000)) + .gt_eq(lit_timestamptz_nano(1000)) .and("float_field".as_expr().eq(lit(false))); assert_eq!( rewrite(expr), diff --git a/iox_query_influxql/src/plan/rewriter.rs b/iox_query_influxql/src/plan/rewriter.rs index be451229590..dc4fcc7b37b 100644 --- a/iox_query_influxql/src/plan/rewriter.rs +++ b/iox_query_influxql/src/plan/rewriter.rs @@ -100,8 +100,12 @@ impl RewriteSelect { let from = self.expand_from(s, stmt)?; let tag_set = from_tag_set(s, &from); - let (fields, group_by) = self.expand_projection(s, stmt, &from, &tag_set)?; - let condition = self.condition_resolve_types(s, stmt, &from)?; + let (fields, group_by) = self + .expand_projection(s, stmt, &from, &tag_set) + .map_err(|e| e.context("expand projection"))?; + let condition = self + .condition_resolve_types(s, stmt, &from) + .map_err(|e| e.context("resolve types in condition"))?; let now = Timestamp::from(s.execution_props().query_execution_start_time); let rc = ReduceContext { @@ -109,10 +113,14 @@ impl RewriteSelect { tz: stmt.timezone.map(|tz| *tz), }; - let interval = self.find_interval_offset(&rc, group_by.as_ref())?; + let interval = self + .find_interval_offset(&rc, group_by.as_ref()) + .map_err(|e| e.context("find interval offset"))?; let (condition, time_range) = match condition { - Some(where_clause) => split_cond(&rc, &where_clause).map_err(error::map::expr_error)?, + Some(where_clause) => split_cond(&rc, &where_clause) + .map_err(error::map::expr_error) + .map_err(|e| e.context("split condition"))?, None => (None, TimeRange::default()), }; @@ -131,7 +139,8 @@ impl RewriteSelect { let SelectStatementInfo { projection_type, extra_intervals, - } = select_statement_info(&fields, &group_by, stmt.fill)?; + } = select_statement_info(&fields, &group_by, stmt.fill) + .map_err(|e| e.context("gather information about select statement"))?; // Following InfluxQL OG behaviour, if this is a subquery, and the fill strategy equates // to `FILL(null)`, switch to `FILL(none)`. @@ -1042,6 +1051,8 @@ impl FieldChecker { } else { ProjectionType::WindowAggregateMixed } + } else if self.has_distinct { + ProjectionType::RawDistinct } else { ProjectionType::Aggregate } @@ -1566,7 +1577,7 @@ pub(crate) enum ProjectionType { /// A query that projects no aggregate or selector functions. 
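The rewriter changes in this hunk attach a context message at each planning stage ("expand projection", "resolve types in condition", "find interval offset", "split condition", "gather information about select statement"), which is what produces the layered "caused by" output in the updated error snapshots earlier in this patch, and why the wildcard and count() tests just below switch from assert_eq! to assert_contains!. As a rough illustration of how that chaining renders, here is a minimal stand-in error type; it is not the crate's actual error module, only a sketch of the behaviour.

// Simplified stand-in, not the planner's real error type: shows how wrapping
// an error with successive contexts yields the "caused by" chains captured in
// the snapshot tests above.
#[derive(Debug)]
struct PlanError {
    msg: String,
    source: Option<Box<PlanError>>,
}

impl PlanError {
    fn new(msg: impl Into<String>) -> Self {
        Self { msg: msg.into(), source: None }
    }

    // Mirrors the `e.context("...")` calls added in the rewrite method above.
    fn context(self, msg: impl Into<String>) -> Self {
        Self { msg: msg.into(), source: Some(Box::new(self)) }
    }
}

impl std::fmt::Display for PlanError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.msg)?;
        if let Some(source) = &self.source {
            write!(f, "\ncaused by\n{source}")?;
        }
        Ok(())
    }
}

fn main() {
    let err = PlanError::new(
        "Error during planning: mixing aggregate and non-aggregate columns is not supported",
    )
    .context("gather information about select statement")
    .context("rewriting statement");
    // Prints the same three-stage chain shown in the snapshots above.
    println!("{err}");
}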
#[default] Raw, - /// A query that projects a single DISTINCT(field) + /// A query that projects a single DISTINCT(field). RawDistinct, /// A query that projects one or more aggregate functions or /// two or more selector functions. @@ -2432,21 +2443,21 @@ mod test { let stmt = parse_select("SELECT *::field + *::tag FROM cpu"); let err = rewrite_select_statement(&namespace, &stmt).unwrap_err(); - assert_eq!( + assert_contains!( err.to_string(), "Error during planning: unsupported binary expression: contains a wildcard or regular expression" ); let stmt = parse_select("SELECT COUNT(*) + SUM(usage_idle) FROM cpu"); let err = rewrite_select_statement(&namespace, &stmt).unwrap_err(); - assert_eq!( + assert_contains!( err.to_string(), "Error during planning: unsupported binary expression: contains a wildcard or regular expression" ); let stmt = parse_select("SELECT COUNT(*::tag) FROM cpu"); let err = rewrite_select_statement(&namespace, &stmt).unwrap_err(); - assert_eq!( + assert_contains!( err.to_string(), "Error during planning: unable to use tag as wildcard in count()" ); diff --git a/iox_query_influxql/src/plan/udf.rs b/iox_query_influxql/src/plan/udf.rs index 437bfda68e7..fdf8a2b5c1a 100644 --- a/iox_query_influxql/src/plan/udf.rs +++ b/iox_query_influxql/src/plan/udf.rs @@ -8,12 +8,16 @@ use crate::plan::util::find_exprs_in_exprs; use crate::{error, NUMERICS}; use arrow::datatypes::{DataType, TimeUnit}; -use datafusion::logical_expr::{ - Expr, ReturnTypeFunction, ScalarFunctionImplementation, ScalarUDF, Signature, TypeSignature, - Volatility, +use datafusion::{ + error::{DataFusionError, Result}, + logical_expr::{ + Expr, ScalarFunctionDefinition, ScalarUDF, ScalarUDFImpl, Signature, TypeSignature, + Volatility, + }, + physical_plan::ColumnarValue, }; use once_cell::sync::Lazy; -use std::sync::Arc; +use std::{any::Any, sync::Arc}; pub(super) enum WindowFunction { MovingAverage, @@ -27,7 +31,7 @@ pub(super) enum WindowFunction { impl WindowFunction { /// Try to return the equivalent [`WindowFunction`] for `fun`. pub(super) fn try_from_scalar_udf(fun: Arc) -> Option { - match fun.name.as_str() { + match fun.name() { MOVING_AVERAGE_UDF_NAME => Some(Self::MovingAverage), DIFFERENCE_UDF_NAME => Some(Self::Difference), NON_NEGATIVE_DIFFERENCE_UDF_NAME => Some(Self::NonNegativeDifference), @@ -39,17 +43,51 @@ impl WindowFunction { } } -/// Find all [`Expr::ScalarUDF`] expressions that match one of the supported +/// Find all [`ScalarUDF`] expressions that match one of the supported /// window UDF functions. 
pub(super) fn find_window_udfs(exprs: &[Expr]) -> Vec { - find_exprs_in_exprs( - exprs, - &|nested_expr| matches!(nested_expr, Expr::ScalarUDF(s) if WindowFunction::try_from_scalar_udf(Arc::clone(&s.fun)).is_some()), - ) + find_exprs_in_exprs(exprs, &|nested_expr| { + let Expr::ScalarFunction(fun) = nested_expr else { + return false; + }; + let ScalarFunctionDefinition::UDF(udf) = &fun.func_def else { + return false; + }; + WindowFunction::try_from_scalar_udf(Arc::clone(udf)).is_some() + }) } const MOVING_AVERAGE_UDF_NAME: &str = "moving_average"; +#[derive(Debug)] +struct MovingAverageUDF { + signature: Signature, +} + +impl ScalarUDFImpl for MovingAverageUDF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + MOVING_AVERAGE_UDF_NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Float64) + } + + fn invoke(&self, _args: &[ColumnarValue]) -> Result { + error::internal(format!( + "{MOVING_AVERAGE_UDF_NAME} should not exist in the final logical plan" + )) + } +} + /// Create an expression to represent the `MOVING_AVERAGE` function. pub(crate) fn moving_average(args: Vec) -> Expr { MOVING_AVERAGE.call(args) @@ -57,25 +95,53 @@ pub(crate) fn moving_average(args: Vec) -> Expr { /// Definition of the `MOVING_AVERAGE` function. static MOVING_AVERAGE: Lazy> = Lazy::new(|| { - static RETURN_TYPE: Lazy> = Lazy::new(|| Arc::new(DataType::Float64)); - - let return_type_fn: ReturnTypeFunction = Arc::new(|_| Ok(RETURN_TYPE.clone())); - Arc::new(ScalarUDF::new( - MOVING_AVERAGE_UDF_NAME, - &Signature::one_of( + Arc::new(ScalarUDF::from(MovingAverageUDF { + signature: Signature::one_of( NUMERICS .iter() .map(|dt| TypeSignature::Exact(vec![dt.clone(), DataType::Int64])) .collect(), Volatility::Immutable, ), - &return_type_fn, - &stand_in_impl(MOVING_AVERAGE_UDF_NAME), - )) + })) }); const DIFFERENCE_UDF_NAME: &str = "difference"; +#[derive(Debug)] +struct DifferenceUDF { + signature: Signature, +} + +impl ScalarUDFImpl for DifferenceUDF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + DIFFERENCE_UDF_NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + if arg_types.is_empty() { + return Err(DataFusionError::Plan(format!( + "{DIFFERENCE_UDF_NAME} expects at least 1 argument" + ))); + } + Ok(arg_types[0].clone()) + } + + fn invoke(&self, _args: &[ColumnarValue]) -> Result { + error::internal(format!( + "{DIFFERENCE_UDF_NAME} should not exist in the final logical plan" + )) + } +} + /// Create an expression to represent the `DIFFERENCE` function. pub(crate) fn difference(args: Vec) -> Expr { DIFFERENCE.call(args) @@ -83,23 +149,53 @@ pub(crate) fn difference(args: Vec) -> Expr { /// Definition of the `DIFFERENCE` function. 
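With the upgraded datafusion API there is no dedicated Expr::ScalarUDF variant any more: a UDF call is an Expr::ScalarFunction whose func_def is ScalarFunctionDefinition::UDF(Arc<ScalarUDF>), which is why find_window_udfs above now destructures in two steps. The same pattern in isolation, as a small sketch (the helper name is illustrative only):

use std::sync::Arc;

use datafusion::logical_expr::{Expr, ScalarFunctionDefinition, ScalarUDF};

/// Return the user-defined function invoked by `expr`, if any.
fn as_udf_call(expr: &Expr) -> Option<Arc<ScalarUDF>> {
    let Expr::ScalarFunction(fun) = expr else {
        return None;
    };
    let ScalarFunctionDefinition::UDF(udf) = &fun.func_def else {
        return None;
    };
    Some(Arc::clone(udf))
}

For example, as_udf_call applied to an expression built by the moving_average(...) helper below should yield the MOVING_AVERAGE definition, while a plain column reference yields None.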
static DIFFERENCE: Lazy> = Lazy::new(|| { - let return_type_fn: ReturnTypeFunction = Arc::new(|args| Ok(Arc::new(args[0].clone()))); - Arc::new(ScalarUDF::new( - DIFFERENCE_UDF_NAME, - &Signature::one_of( + Arc::new(ScalarUDF::from(DifferenceUDF { + signature: Signature::one_of( NUMERICS .iter() .map(|dt| TypeSignature::Exact(vec![dt.clone()])) .collect(), Volatility::Immutable, ), - &return_type_fn, - &stand_in_impl(DIFFERENCE_UDF_NAME), - )) + })) }); const NON_NEGATIVE_DIFFERENCE_UDF_NAME: &str = "non_negative_difference"; +#[derive(Debug)] +struct NonNegativeDifferenceUDF { + signature: Signature, +} + +impl ScalarUDFImpl for NonNegativeDifferenceUDF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + NON_NEGATIVE_DIFFERENCE_UDF_NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + if arg_types.is_empty() { + return Err(DataFusionError::Plan(format!( + "{NON_NEGATIVE_DIFFERENCE_UDF_NAME} expects at least 1 argument" + ))); + } + Ok(arg_types[0].clone()) + } + + fn invoke(&self, _args: &[ColumnarValue]) -> Result { + error::internal(format!( + "{NON_NEGATIVE_DIFFERENCE_UDF_NAME} should not exist in the final logical plan" + )) + } +} + /// Create an expression to represent the `NON_NEGATIVE_DIFFERENCE` function. pub(crate) fn non_negative_difference(args: Vec) -> Expr { NON_NEGATIVE_DIFFERENCE.call(args) @@ -107,23 +203,48 @@ pub(crate) fn non_negative_difference(args: Vec) -> Expr { /// Definition of the `NON_NEGATIVE_DIFFERENCE` function. static NON_NEGATIVE_DIFFERENCE: Lazy> = Lazy::new(|| { - let return_type_fn: ReturnTypeFunction = Arc::new(|args| Ok(Arc::new(args[0].clone()))); - Arc::new(ScalarUDF::new( - NON_NEGATIVE_DIFFERENCE_UDF_NAME, - &Signature::one_of( + Arc::new(ScalarUDF::from(NonNegativeDifferenceUDF { + signature: Signature::one_of( NUMERICS .iter() .map(|dt| TypeSignature::Exact(vec![dt.clone()])) .collect(), Volatility::Immutable, ), - &return_type_fn, - &stand_in_impl(NON_NEGATIVE_DIFFERENCE_UDF_NAME), - )) + })) }); const DERIVATIVE_UDF_NAME: &str = "derivative"; +#[derive(Debug)] +struct DerivativeUDF { + signature: Signature, +} + +impl ScalarUDFImpl for DerivativeUDF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + DERIVATIVE_UDF_NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Float64) + } + + fn invoke(&self, _args: &[ColumnarValue]) -> Result { + error::internal(format!( + "{DERIVATIVE_UDF_NAME} should not exist in the final logical plan" + )) + } +} + /// Create an expression to represent the `DERIVATIVE` function. pub(crate) fn derivative(args: Vec) -> Expr { DERIVATIVE.call(args) @@ -131,10 +252,8 @@ pub(crate) fn derivative(args: Vec) -> Expr { /// Definition of the `DERIVATIVE` function. 
static DERIVATIVE: Lazy> = Lazy::new(|| { - let return_type_fn: ReturnTypeFunction = Arc::new(|_| Ok(Arc::new(DataType::Float64))); - Arc::new(ScalarUDF::new( - DERIVATIVE_UDF_NAME, - &Signature::one_of( + Arc::new(ScalarUDF::from(DerivativeUDF { + signature: Signature::one_of( NUMERICS .iter() .flat_map(|dt| { @@ -149,13 +268,39 @@ static DERIVATIVE: Lazy> = Lazy::new(|| { .collect(), Volatility::Immutable, ), - &return_type_fn, - &stand_in_impl(DERIVATIVE_UDF_NAME), - )) + })) }); const NON_NEGATIVE_DERIVATIVE_UDF_NAME: &str = "non_negative_derivative"; +#[derive(Debug)] +struct NonNegativeDerivativeUDF { + signature: Signature, +} + +impl ScalarUDFImpl for NonNegativeDerivativeUDF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + NON_NEGATIVE_DERIVATIVE_UDF_NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Float64) + } + + fn invoke(&self, _args: &[ColumnarValue]) -> Result { + error::internal(format!( + "{NON_NEGATIVE_DERIVATIVE_UDF_NAME} should not exist in the final logical plan" + )) + } +} /// Create an expression to represent the `NON_NEGATIVE_DERIVATIVE` function. pub(crate) fn non_negative_derivative(args: Vec) -> Expr { NON_NEGATIVE_DERIVATIVE.call(args) @@ -163,10 +308,8 @@ pub(crate) fn non_negative_derivative(args: Vec) -> Expr { /// Definition of the `NON_NEGATIVE_DERIVATIVE` function. static NON_NEGATIVE_DERIVATIVE: Lazy> = Lazy::new(|| { - let return_type_fn: ReturnTypeFunction = Arc::new(|_| Ok(Arc::new(DataType::Float64))); - Arc::new(ScalarUDF::new( - NON_NEGATIVE_DERIVATIVE_UDF_NAME, - &Signature::one_of( + Arc::new(ScalarUDF::from(NonNegativeDerivativeUDF { + signature: Signature::one_of( NUMERICS .iter() .flat_map(|dt| { @@ -181,35 +324,58 @@ static NON_NEGATIVE_DERIVATIVE: Lazy> = Lazy::new(|| { .collect(), Volatility::Immutable, ), - &return_type_fn, - &stand_in_impl(NON_NEGATIVE_DERIVATIVE_UDF_NAME), - )) + })) }); const CUMULATIVE_SUM_UDF_NAME: &str = "cumulative_sum"; +#[derive(Debug)] +struct CumulativeSumUDF { + signature: Signature, +} + +impl ScalarUDFImpl for CumulativeSumUDF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + CUMULATIVE_SUM_UDF_NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + if arg_types.is_empty() { + return Err(DataFusionError::Plan(format!( + "{CUMULATIVE_SUM_UDF_NAME} expects at least 1 argument" + ))); + } + Ok(arg_types[0].clone()) + } + + fn invoke(&self, _args: &[ColumnarValue]) -> Result { + error::internal(format!( + "{CUMULATIVE_SUM_UDF_NAME} should not exist in the final logical plan" + )) + } +} + /// Create an expression to represent the `CUMULATIVE_SUM` function. pub(crate) fn cumulative_sum(args: Vec) -> Expr { CUMULATIVE_SUM.call(args) } /// Definition of the `CUMULATIVE_SUM` function. static CUMULATIVE_SUM: Lazy> = Lazy::new(|| { - let return_type_fn: ReturnTypeFunction = Arc::new(|args| Ok(Arc::new(args[0].clone()))); - Arc::new(ScalarUDF::new( - CUMULATIVE_SUM_UDF_NAME, - &Signature::one_of( + Arc::new(ScalarUDF::from(CumulativeSumUDF { + signature: Signature::one_of( NUMERICS .iter() .map(|dt| TypeSignature::Exact(vec![dt.clone()])) .collect(), Volatility::Immutable, ), - &return_type_fn, - &stand_in_impl(CUMULATIVE_SUM_UDF_NAME), - )) + })) }); - -/// Returns an implementation that always returns an error. 
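The pattern in this file is the same for every function: the old ScalarUDF::new(name, signature, return-type closure, stand-in implementation) constructor is replaced by a small struct implementing ScalarUDFImpl, wrapped with ScalarUDF::from. For readers unfamiliar with the trait form, a minimal self-contained sketch of the same shape follows; the always_zero function is hypothetical and not part of this patch.

use std::any::Any;

use arrow::datatypes::DataType;
use datafusion::error::Result;
use datafusion::logical_expr::{lit, Expr, ScalarUDF, ScalarUDFImpl, Signature, Volatility};
use datafusion::physical_plan::ColumnarValue;
use datafusion::scalar::ScalarValue;

/// Hypothetical UDF used only to illustrate the trait-based definition style.
#[derive(Debug)]
struct AlwaysZeroUDF {
    signature: Signature,
}

impl ScalarUDFImpl for AlwaysZeroUDF {
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn name(&self) -> &str {
        "always_zero"
    }

    fn signature(&self) -> &Signature {
        &self.signature
    }

    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
        Ok(DataType::Float64)
    }

    fn invoke(&self, _args: &[ColumnarValue]) -> Result<ColumnarValue> {
        // Unlike the planner-only stand-ins above, this one actually evaluates.
        Ok(ColumnarValue::Scalar(ScalarValue::Float64(Some(0.0))))
    }
}

fn main() {
    let udf = ScalarUDF::from(AlwaysZeroUDF {
        signature: Signature::exact(vec![DataType::Float64], Volatility::Immutable),
    });
    // `call` builds the logical expression, as moving_average(...) etc. do above.
    let expr: Expr = udf.call(vec![lit(1.0_f64)]);
    println!("{expr}");
}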
-fn stand_in_impl(name: &'static str) -> ScalarFunctionImplementation { - Arc::new(move |_| error::internal(format!("{name} should not exist in the final logical plan"))) -} diff --git a/iox_query_influxql/src/plan/util.rs b/iox_query_influxql/src/plan/util.rs index 719fcab1eb8..d63f8919f54 100644 --- a/iox_query_influxql/src/plan/util.rs +++ b/iox_query_influxql/src/plan/util.rs @@ -61,7 +61,7 @@ impl<'a> IQLSchema<'a> { } /// Returns `true` if the schema contains a tag column with the specified name. - pub fn is_tag_field(&self, name: &str) -> bool { + pub(crate) fn is_tag_field(&self, name: &str) -> bool { match self.tag_info { TagInfo::DataSourceSchema(ref ds_schema) => ds_schema.is_tag_field(name), TagInfo::FieldList(fields) => fields @@ -73,7 +73,7 @@ impl<'a> IQLSchema<'a> { /// Returns `true` if the schema contains a tag column with the specified name. /// If the underlying data source is a subquery, it will apply any aliases in the /// projection that represents the SELECT list. - pub fn is_projected_tag_field(&self, name: &str) -> bool { + pub(crate) fn is_projected_tag_field(&self, name: &str) -> bool { match self.tag_info { TagInfo::DataSourceSchema(ref ds_schema) => ds_schema.is_projected_tag_field(name), _ => self.is_tag_field(name), diff --git a/iox_query_influxql/src/window.rs b/iox_query_influxql/src/window.rs index ced7f04ce1b..32d9586b5e9 100644 --- a/iox_query_influxql/src/window.rs +++ b/iox_query_influxql/src/window.rs @@ -1,8 +1,6 @@ //! User defined window functions implementing influxQL features. -use datafusion::logical_expr::{ - PartitionEvaluatorFactory, ReturnTypeFunction, WindowFunction, WindowUDF, -}; +use datafusion::logical_expr::{WindowFunctionDefinition, WindowUDF}; use once_cell::sync::Lazy; use std::sync::Arc; @@ -14,109 +12,55 @@ mod non_negative; mod percent_row_number; /// Definition of the `CUMULATIVE_SUM` user-defined window function. -pub(crate) static CUMULATIVE_SUM: Lazy = Lazy::new(|| { - let return_type: ReturnTypeFunction = Arc::new(cumulative_sum::return_type); - let partition_evaluator_factory: PartitionEvaluatorFactory = - Arc::new(cumulative_sum::partition_evaluator_factory); - - WindowFunction::WindowUDF(Arc::new(WindowUDF::new( - cumulative_sum::NAME, - &cumulative_sum::SIGNATURE, - &return_type, - &partition_evaluator_factory, +pub(crate) static CUMULATIVE_SUM: Lazy = Lazy::new(|| { + WindowFunctionDefinition::WindowUDF(Arc::new(WindowUDF::new_from_impl( + cumulative_sum::CumulativeSumUDWF::new(), ))) }); /// Definition of the `DERIVATIVE` user-defined window function. -pub(crate) static DERIVATIVE: Lazy = Lazy::new(|| { - let return_type: ReturnTypeFunction = Arc::new(derivative::return_type); - let partition_evaluator_factory: PartitionEvaluatorFactory = - Arc::new(derivative::partition_evaluator_factory); - - WindowFunction::WindowUDF(Arc::new(WindowUDF::new( - derivative::NAME, - &derivative::SIGNATURE, - &return_type, - &partition_evaluator_factory, +pub(crate) static DERIVATIVE: Lazy = Lazy::new(|| { + WindowFunctionDefinition::WindowUDF(Arc::new(WindowUDF::new_from_impl( + derivative::DerivativeUDWF::new(), ))) }); /// Definition of the `DIFFERENCE` user-defined window function. 
-pub(crate) static DIFFERENCE: Lazy = Lazy::new(|| { - let return_type: ReturnTypeFunction = Arc::new(difference::return_type); - let partition_evaluator_factory: PartitionEvaluatorFactory = - Arc::new(difference::partition_evaluator_factory); - - WindowFunction::WindowUDF(Arc::new(WindowUDF::new( - difference::NAME, - &difference::SIGNATURE, - &return_type, - &partition_evaluator_factory, +pub(crate) static DIFFERENCE: Lazy = Lazy::new(|| { + WindowFunctionDefinition::WindowUDF(Arc::new(WindowUDF::new_from_impl( + difference::DifferenceUDWF::new(), ))) }); /// Definition of the `MOVING_AVERAGE` user-defined window function. -pub(crate) static MOVING_AVERAGE: Lazy = Lazy::new(|| { - let return_type: ReturnTypeFunction = Arc::new(moving_average::return_type); - let partition_evaluator_factory: PartitionEvaluatorFactory = - Arc::new(moving_average::partition_evaluator_factory); - - WindowFunction::WindowUDF(Arc::new(WindowUDF::new( - moving_average::NAME, - &moving_average::SIGNATURE, - &return_type, - &partition_evaluator_factory, +pub(crate) static MOVING_AVERAGE: Lazy = Lazy::new(|| { + WindowFunctionDefinition::WindowUDF(Arc::new(WindowUDF::new_from_impl( + moving_average::MovingAverageUDWF::new(), ))) }); -const NON_NEGATIVE_DERIVATIVE_NAME: &str = "non_negative_derivative"; - /// Definition of the `NON_NEGATIVE_DERIVATIVE` user-defined window function. -pub(crate) static NON_NEGATIVE_DERIVATIVE: Lazy = Lazy::new(|| { - let return_type: ReturnTypeFunction = Arc::new(derivative::return_type); - let partition_evaluator_factory: PartitionEvaluatorFactory = Arc::new(|| { - Ok(non_negative::wrapper( - derivative::partition_evaluator_factory()?, - )) - }); - - WindowFunction::WindowUDF(Arc::new(WindowUDF::new( - NON_NEGATIVE_DERIVATIVE_NAME, - &derivative::SIGNATURE, - &return_type, - &partition_evaluator_factory, +pub(crate) static NON_NEGATIVE_DERIVATIVE: Lazy = Lazy::new(|| { + WindowFunctionDefinition::WindowUDF(Arc::new(WindowUDF::new_from_impl( + non_negative::NonNegativeUDWF::new( + "non_negative_derivative", + derivative::DerivativeUDWF::new(), + ), ))) }); - -const NON_NEGATIVE_DIFFERENCE_NAME: &str = "non_negative_difference"; - /// Definition of the `NON_NEGATIVE_DIFFERENCE` user-defined window function. -pub(crate) static NON_NEGATIVE_DIFFERENCE: Lazy = Lazy::new(|| { - let return_type: ReturnTypeFunction = Arc::new(difference::return_type); - let partition_evaluator_factory: PartitionEvaluatorFactory = Arc::new(|| { - Ok(non_negative::wrapper( - difference::partition_evaluator_factory()?, - )) - }); - - WindowFunction::WindowUDF(Arc::new(WindowUDF::new( - NON_NEGATIVE_DIFFERENCE_NAME, - &difference::SIGNATURE, - &return_type, - &partition_evaluator_factory, +pub(crate) static NON_NEGATIVE_DIFFERENCE: Lazy = Lazy::new(|| { + WindowFunctionDefinition::WindowUDF(Arc::new(WindowUDF::new_from_impl( + non_negative::NonNegativeUDWF::new( + "non_negative_difference", + difference::DifferenceUDWF::new(), + ), ))) }); /// Definition of the `PERCENT_ROW_NUMBER` user-defined window function. 
-pub(crate) static PERCENT_ROW_NUMBER: Lazy = Lazy::new(|| { - let return_type: ReturnTypeFunction = Arc::new(percent_row_number::return_type); - let partition_evaluator_factory: PartitionEvaluatorFactory = - Arc::new(percent_row_number::partition_evaluator_factory); - - WindowFunction::WindowUDF(Arc::new(WindowUDF::new( - percent_row_number::NAME, - &percent_row_number::SIGNATURE, - &return_type, - &partition_evaluator_factory, +pub(crate) static PERCENT_ROW_NUMBER: Lazy = Lazy::new(|| { + WindowFunctionDefinition::WindowUDF(Arc::new(WindowUDF::new_from_impl( + percent_row_number::PercentRowNumberUDWF::new(), ))) }); diff --git a/iox_query_influxql/src/window/cumulative_sum.rs b/iox_query_influxql/src/window/cumulative_sum.rs index b6acc4c3097..8153a9246aa 100644 --- a/iox_query_influxql/src/window/cumulative_sum.rs +++ b/iox_query_influxql/src/window/cumulative_sum.rs @@ -2,32 +2,51 @@ use crate::NUMERICS; use arrow::array::{Array, ArrayRef}; use arrow::datatypes::DataType; use datafusion::common::{Result, ScalarValue}; -use datafusion::logical_expr::{PartitionEvaluator, Signature, TypeSignature, Volatility}; -use once_cell::sync::Lazy; +use datafusion::logical_expr::{ + PartitionEvaluator, Signature, TypeSignature, Volatility, WindowUDFImpl, +}; +use std::any::Any; use std::sync::Arc; -/// The name of the cumulative_sum window function. -pub(super) const NAME: &str = "cumumlative_sum"; - -/// Valid signatures for the cumulative_sum window function. -pub(super) static SIGNATURE: Lazy = Lazy::new(|| { - Signature::one_of( - NUMERICS - .iter() - .map(|dt| TypeSignature::Exact(vec![dt.clone()])) - .collect(), - Volatility::Immutable, - ) -}); +#[derive(Debug)] +pub(super) struct CumulativeSumUDWF { + signature: Signature, +} -/// Calculate the return type given the function signature. -pub(super) fn return_type(sig: &[DataType]) -> Result> { - Ok(Arc::new(sig[0].clone())) +impl CumulativeSumUDWF { + pub(super) fn new() -> Self { + Self { + signature: Signature::one_of( + NUMERICS + .iter() + .map(|dt| TypeSignature::Exact(vec![dt.clone()])) + .collect(), + Volatility::Immutable, + ), + } + } } -/// Create a new partition_evaluator_factory. -pub(super) fn partition_evaluator_factory() -> Result> { - Ok(Box::new(CumulativeSumPartitionEvaluator {})) +impl WindowUDFImpl for CumulativeSumUDWF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "cumumlative_sum" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + Ok(arg_types[0].clone()) + } + + fn partition_evaluator(&self) -> Result> { + Ok(Box::new(CumulativeSumPartitionEvaluator {})) + } } /// PartitionEvaluator which returns the cumulative sum of the input. diff --git a/iox_query_influxql/src/window/derivative.rs b/iox_query_influxql/src/window/derivative.rs index 42730d532c5..019bc4ab23a 100644 --- a/iox_query_influxql/src/window/derivative.rs +++ b/iox_query_influxql/src/window/derivative.rs @@ -2,39 +2,66 @@ use crate::{error, NUMERICS}; use arrow::array::{Array, ArrayRef}; use arrow::datatypes::{DataType, TimeUnit}; use datafusion::common::{Result, ScalarValue}; -use datafusion::logical_expr::{PartitionEvaluator, Signature, TypeSignature, Volatility}; -use once_cell::sync::Lazy; - +use datafusion::logical_expr::{ + PartitionEvaluator, Signature, TypeSignature, Volatility, WindowUDFImpl, TIMEZONE_WILDCARD, +}; use std::sync::Arc; -/// The name of the derivative window function. 
-pub(super) const NAME: &str = "derivative"; - -/// Valid signatures for the derivative window function. -pub(super) static SIGNATURE: Lazy = Lazy::new(|| { - Signature::one_of( - NUMERICS - .iter() - .map(|dt| { - TypeSignature::Exact(vec![ - dt.clone(), - DataType::Duration(TimeUnit::Nanosecond), - DataType::Timestamp(TimeUnit::Nanosecond, None), - ]) - }) - .collect(), - Volatility::Immutable, - ) -}); - -/// Calculate the return type given the function signature. -pub(super) fn return_type(_: &[DataType]) -> Result> { - Ok(Arc::new(DataType::Float64)) +#[derive(Debug)] +pub(super) struct DerivativeUDWF { + signature: Signature, +} + +impl DerivativeUDWF { + pub(super) fn new() -> Self { + Self { + signature: Signature::one_of( + NUMERICS + .iter() + .flat_map(|dt| { + [ + TypeSignature::Exact(vec![ + dt.clone(), + DataType::Duration(TimeUnit::Nanosecond), + DataType::Timestamp(TimeUnit::Nanosecond, None), + ]), + TypeSignature::Exact(vec![ + dt.clone(), + DataType::Duration(TimeUnit::Nanosecond), + DataType::Timestamp( + TimeUnit::Nanosecond, + Some(TIMEZONE_WILDCARD.into()), + ), + ]), + ] + }) + .collect(), + Volatility::Immutable, + ), + } + } } -/// Create a new partition_evaluator_factory. -pub(super) fn partition_evaluator_factory() -> Result> { - Ok(Box::new(DifferencePartitionEvaluator {})) +impl WindowUDFImpl for DerivativeUDWF { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn name(&self) -> &str { + "derivative" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Float64) + } + + fn partition_evaluator(&self) -> Result> { + Ok(Box::new(DifferencePartitionEvaluator {})) + } } /// PartitionEvaluator which returns the derivative between input values, diff --git a/iox_query_influxql/src/window/difference.rs b/iox_query_influxql/src/window/difference.rs index d4c8adbb9fa..1618d72d07c 100644 --- a/iox_query_influxql/src/window/difference.rs +++ b/iox_query_influxql/src/window/difference.rs @@ -4,32 +4,50 @@ use arrow::compute::kernels::numeric::sub_wrapping; use arrow::compute::shift; use arrow::datatypes::DataType; use datafusion::common::{Result, ScalarValue}; -use datafusion::logical_expr::{PartitionEvaluator, Signature, TypeSignature, Volatility}; -use once_cell::sync::Lazy; +use datafusion::logical_expr::{ + PartitionEvaluator, Signature, TypeSignature, Volatility, WindowUDFImpl, +}; use std::sync::Arc; -/// The name of the difference window function. -pub(super) const NAME: &str = "difference"; - -/// Valid signatures for the difference window function. -pub(super) static SIGNATURE: Lazy = Lazy::new(|| { - Signature::one_of( - NUMERICS - .iter() - .map(|dt| TypeSignature::Exact(vec![dt.clone()])) - .collect(), - Volatility::Immutable, - ) -}); +#[derive(Debug)] +pub(super) struct DifferenceUDWF { + signature: Signature, +} -/// Calculate the return type given the function signature. -pub(super) fn return_type(sig: &[DataType]) -> Result> { - Ok(Arc::new(sig[0].clone())) +impl DifferenceUDWF { + pub(super) fn new() -> Self { + Self { + signature: Signature::one_of( + NUMERICS + .iter() + .map(|dt| TypeSignature::Exact(vec![dt.clone()])) + .collect(), + Volatility::Immutable, + ), + } + } } -/// Create a new partition_evaluator_factory. 
-pub(super) fn partition_evaluator_factory() -> Result> { - Ok(Box::new(DifferencePartitionEvaluator {})) +impl WindowUDFImpl for DifferenceUDWF { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn name(&self) -> &str { + "difference" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + Ok(arg_types[0].clone()) + } + + fn partition_evaluator(&self) -> Result> { + Ok(Box::new(DifferencePartitionEvaluator {})) + } } /// PartitionEvaluator which returns the difference between input values. diff --git a/iox_query_influxql/src/window/moving_average.rs b/iox_query_influxql/src/window/moving_average.rs index 3702e691f48..e61129158e8 100644 --- a/iox_query_influxql/src/window/moving_average.rs +++ b/iox_query_influxql/src/window/moving_average.rs @@ -2,33 +2,51 @@ use crate::{error, NUMERICS}; use arrow::array::{Array, ArrayRef, Int64Array}; use arrow::datatypes::DataType; use datafusion::common::{downcast_value, DataFusionError, Result, ScalarValue}; -use datafusion::logical_expr::{PartitionEvaluator, Signature, TypeSignature, Volatility}; -use once_cell::sync::Lazy; +use datafusion::logical_expr::{ + PartitionEvaluator, Signature, TypeSignature, Volatility, WindowUDFImpl, +}; use std::collections::VecDeque; use std::sync::Arc; -/// The name of the moving average window function. -pub(super) const NAME: &str = "moving_average"; - -/// Valid signatures for the moving average window function. -pub(super) static SIGNATURE: Lazy = Lazy::new(|| { - Signature::one_of( - NUMERICS - .iter() - .map(|dt| TypeSignature::Exact(vec![dt.clone(), DataType::Int64])) - .collect(), - Volatility::Immutable, - ) -}); +#[derive(Debug)] +pub(super) struct MovingAverageUDWF { + signature: Signature, +} -/// Calculate the return type given the function signature. -pub(super) fn return_type(_: &[DataType]) -> Result> { - Ok(Arc::new(DataType::Float64)) +impl MovingAverageUDWF { + pub(super) fn new() -> Self { + Self { + signature: Signature::one_of( + NUMERICS + .iter() + .map(|dt| TypeSignature::Exact(vec![dt.clone(), DataType::Int64])) + .collect(), + Volatility::Immutable, + ), + } + } } -/// Create a new partition_evaluator_factory. -pub(super) fn partition_evaluator_factory() -> Result> { - Ok(Box::new(AvgNPartitionEvaluator {})) +impl WindowUDFImpl for MovingAverageUDWF { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn name(&self) -> &str { + "moving_average" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Float64) + } + + fn partition_evaluator(&self) -> Result> { + Ok(Box::new(AvgNPartitionEvaluator {})) + } } /// PartitionEvaluator which returns a moving average of the input data.. diff --git a/iox_query_influxql/src/window/non_negative.rs b/iox_query_influxql/src/window/non_negative.rs index 504b4b743cb..d97187963dc 100644 --- a/iox_query_influxql/src/window/non_negative.rs +++ b/iox_query_influxql/src/window/non_negative.rs @@ -1,26 +1,61 @@ use arrow::array::Array; use arrow::compute::kernels::cmp::lt; use arrow::compute::nullif; +use arrow::datatypes::DataType; use datafusion::common::{Result, ScalarValue}; use datafusion::logical_expr::window_state::WindowAggState; -use datafusion::logical_expr::PartitionEvaluator; +use datafusion::logical_expr::{PartitionEvaluator, Signature, WindowUDFImpl}; +use std::any::Any; use std::ops::Range; use std::sync::Arc; -/// Wrap a PartitionEvaluator in a non-negative filter. 
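The rewritten non_negative module below wraps an inner WindowUDFImpl instead of a bare evaluator factory, but the evaluator-level behaviour is unchanged: results below zero are masked to NULL with arrow's lt and nullif kernels (note that to_scalar() is now fallible, hence the added ?). A standalone sketch of that masking step, with illustrative data:

use arrow::array::{ArrayRef, Float64Array, Scalar};
use arrow::compute::kernels::cmp::lt;
use arrow::compute::nullif;
use arrow::error::ArrowError;

/// Replace negative values with NULL, the same lt + nullif combination the
/// non-negative partition evaluator applies to its inner evaluator's output.
fn mask_negative(values: &Float64Array) -> Result<ArrayRef, ArrowError> {
    let zero = Scalar::new(Float64Array::from(vec![0.0_f64]));
    let is_negative = lt(values, &zero)?;
    Ok(nullif(values, &is_negative)?)
}

fn main() -> Result<(), ArrowError> {
    let input = Float64Array::from(vec![Some(1.5), Some(-2.0), None, Some(0.0)]);
    let masked = mask_negative(&input)?;
    // -2.0 is masked; the existing NULL and the non-negative values pass through.
    assert_eq!(masked.null_count(), 2);
    Ok(())
}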
-pub(super) fn wrapper( - partition_evaluator: Box, -) -> Box { - Box::new(NonNegative { - partition_evaluator, - }) +/// Wrap a WindowUDF so that all values are non-negative. + +#[derive(Debug)] +pub(super) struct NonNegativeUDWF { + name: String, + inner: U, +} + +impl NonNegativeUDWF { + pub(super) fn new(name: impl Into, inner: U) -> Self { + Self { + name: name.into(), + inner, + } + } +} + +impl WindowUDFImpl for NonNegativeUDWF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + &self.name + } + + fn signature(&self) -> &Signature { + self.inner.signature() + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + self.inner.return_type(arg_types) + } + + fn partition_evaluator(&self) -> Result> { + Ok(Box::new(NonNegative { + partition_evaluator: self.inner.partition_evaluator()?, + })) + } } +/// Wraps an existing [`PartitionEvaluator`] and ensures that all values are +/// non-negative. #[derive(Debug)] struct NonNegative { partition_evaluator: Box, } - impl PartitionEvaluator for NonNegative { fn memoize(&mut self, state: &mut WindowAggState) -> Result<()> { self.partition_evaluator.memoize(state) @@ -37,7 +72,7 @@ impl PartitionEvaluator for NonNegative { ) -> Result> { let array = self.partition_evaluator.evaluate_all(values, num_rows)?; let zero = ScalarValue::new_zero(array.data_type())?; - let predicate = lt(&array, &zero.to_scalar())?; + let predicate = lt(&array, &zero.to_scalar()?)?; Ok(nullif(&array, &predicate)?) } @@ -60,7 +95,7 @@ impl PartitionEvaluator for NonNegative { .evaluate_all_with_rank(num_rows, ranks_in_partition)?; let zero = ScalarValue::new_zero(array.data_type())?; - let predicate = lt(&array, &zero.to_scalar())?; + let predicate = lt(&array, &zero.to_scalar()?)?; Ok(nullif(&array, &predicate)?) } diff --git a/iox_query_influxql/src/window/percent_row_number.rs b/iox_query_influxql/src/window/percent_row_number.rs index 91df0587ae0..7d1714e1225 100644 --- a/iox_query_influxql/src/window/percent_row_number.rs +++ b/iox_query_influxql/src/window/percent_row_number.rs @@ -2,33 +2,50 @@ use crate::error; use arrow::array::{Array, ArrayRef, Float64Array, Int64Array, UInt64Array}; use arrow::datatypes::DataType; use datafusion::common::{downcast_value, DataFusionError, Result}; -use datafusion::logical_expr::{PartitionEvaluator, Signature, TypeSignature, Volatility}; -use once_cell::sync::Lazy; +use datafusion::logical_expr::{ + PartitionEvaluator, Signature, TypeSignature, Volatility, WindowUDFImpl, +}; use std::sync::Arc; -/// The name of the percent_row_number window function. -pub(super) const NAME: &str = "percent_row_number"; - -/// Valid signatures for the percent_row_number window function. -pub(super) static SIGNATURE: Lazy = Lazy::new(|| { - Signature::one_of( - vec![ - TypeSignature::Exact(vec![DataType::Int64]), - TypeSignature::Exact(vec![DataType::Float64]), - ], - Volatility::Immutable, - ) -}); +#[derive(Debug)] +pub(super) struct PercentRowNumberUDWF { + signature: Signature, +} -/// Calculate the return type given the function signature. Percent_row_number -/// always returns a UInt64. -pub(super) fn return_type(_: &[DataType]) -> Result> { - Ok(Arc::new(DataType::UInt64)) +impl PercentRowNumberUDWF { + pub(super) fn new() -> Self { + Self { + signature: Signature::one_of( + vec![ + TypeSignature::Exact(vec![DataType::Int64]), + TypeSignature::Exact(vec![DataType::Float64]), + ], + Volatility::Immutable, + ), + } + } } -/// Create a new partition_evaluator_factory. 
-pub(super) fn partition_evaluator_factory() -> Result> { - Ok(Box::new(PercentRowNumberPartitionEvaluator {})) +impl WindowUDFImpl for PercentRowNumberUDWF { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn name(&self) -> &str { + "percent_row_number" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::UInt64) + } + + fn partition_evaluator(&self) -> Result> { + Ok(Box::new(PercentRowNumberPartitionEvaluator {})) + } } /// PartitionEvaluator which returns the row number at which the nth diff --git a/iox_query_influxrpc/Cargo.toml b/iox_query_influxrpc/Cargo.toml index bf8cd300c8b..73996fdf444 100644 --- a/iox_query_influxrpc/Cargo.toml +++ b/iox_query_influxrpc/Cargo.toml @@ -5,8 +5,11 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] -arrow = { workspace = true, features = ["prettyprint"] } +arrow = { workspace = true } data_types = { path = "../data_types" } datafusion = { workspace = true } datafusion_util = { path = "../datafusion_util" } @@ -17,11 +20,11 @@ observability_deps = { path = "../observability_deps" } query_functions = { path = "../query_functions"} schema = { path = "../schema" } predicate = { path = "../predicate" } -snafu = "0.7" +snafu = "0.8" workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] # In alphabetical order arrow_util = { path = "../arrow_util" } test_helpers = { path = "../test_helpers" } insta = { version = "1", features = ["yaml"] } -tokio = { version = "1.32", features = ["macros", "parking_lot"] } +tokio = { version = "1.35", features = ["macros", "parking_lot"] } diff --git a/iox_query_influxrpc/src/lib.rs b/iox_query_influxrpc/src/lib.rs index 261df170774..ea4808fc24c 100644 --- a/iox_query_influxrpc/src/lib.rs +++ b/iox_query_influxrpc/src/lib.rs @@ -1602,7 +1602,7 @@ fn filtered_fields_iter<'a>( impl AggExprs { /// Create the appropriate aggregate expressions, based on the type of the /// field for a `read_group` plan. - pub fn try_new_for_read_group( + pub(crate) fn try_new_for_read_group( agg: Aggregate, schema: &Schema, predicate: &Predicate, @@ -1620,7 +1620,7 @@ impl AggExprs { /// Create the appropriate aggregate expressions, based on the type of the /// field for a `read_window_aggregate` plan. - pub fn try_new_for_read_window_aggregate( + pub(crate) fn try_new_for_read_window_aggregate( agg: Aggregate, schema: &Schema, predicate: &Predicate, @@ -1816,7 +1816,7 @@ fn prune_chunks( } fn chunk_column_names( - chunk: &dyn QueryChunk, + chunk: &Arc, predicate: &Predicate, columns: Projection<'_>, ) -> Option { @@ -2575,6 +2575,7 @@ mod tests { TestChunk::new("h2o") .with_id(0) .with_tag_column("foo") + .with_f64_field_column("my_field") .with_time_column(), ); diff --git a/iox_query_influxrpc/src/missing_columns.rs b/iox_query_influxrpc/src/missing_columns.rs index 0254e6e4b55..d79d5601482 100644 --- a/iox_query_influxrpc/src/missing_columns.rs +++ b/iox_query_influxrpc/src/missing_columns.rs @@ -27,13 +27,13 @@ use schema::Schema; /// parts of the predicate make sense. 
/// See comments on 'is_null_column' #[derive(Debug)] -pub struct MissingColumnsToNull<'a> { +pub(crate) struct MissingColumnsToNull<'a> { schema: &'a Schema, df_schema: DFSchema, } impl<'a> MissingColumnsToNull<'a> { - pub fn new(schema: &'a Schema) -> Self { + pub(crate) fn new(schema: &'a Schema) -> Self { let df_schema: DFSchema = schema .as_arrow() .as_ref() diff --git a/iox_query_influxrpc/src/scan_plan.rs b/iox_query_influxrpc/src/scan_plan.rs index 661a4ee94c7..d2d58393b4e 100644 --- a/iox_query_influxrpc/src/scan_plan.rs +++ b/iox_query_influxrpc/src/scan_plan.rs @@ -44,9 +44,9 @@ pub enum Error { pub(crate) type Result = std::result::Result; /// Represents scanning one or more [`QueryChunk`]s. -pub struct ScanPlan { - pub plan_builder: LogicalPlanBuilder, - pub provider: Arc, +pub(crate) struct ScanPlan { + pub(crate) plan_builder: LogicalPlanBuilder, + pub(crate) provider: Arc, } impl std::fmt::Debug for ScanPlan { @@ -60,7 +60,7 @@ impl std::fmt::Debug for ScanPlan { impl ScanPlan { /// Return the schema of the source (the merged schema across all tables) - pub fn schema(&self) -> &Schema { + pub(crate) fn schema(&self) -> &Schema { self.provider.iox_schema() } } @@ -82,7 +82,7 @@ impl ScanPlan { /// (and thus prune) their own chunklist. #[derive(Debug)] -pub struct ScanPlanBuilder<'a> { +pub(crate) struct ScanPlanBuilder<'a> { table_name: Arc, /// The schema of the resulting table (any chunks that don't have /// all the necessary columns will be extended appropriately) @@ -92,7 +92,7 @@ pub struct ScanPlanBuilder<'a> { } impl<'a> ScanPlanBuilder<'a> { - pub fn new(table_name: Arc, table_schema: &'a Schema) -> Self { + pub(crate) fn new(table_name: Arc, table_schema: &'a Schema) -> Self { Self { table_name, table_schema, @@ -102,20 +102,23 @@ impl<'a> ScanPlanBuilder<'a> { } /// Adds `chunks` to the list of Chunks to scan - pub fn with_chunks(mut self, chunks: impl IntoIterator>) -> Self { + pub(crate) fn with_chunks( + mut self, + chunks: impl IntoIterator>, + ) -> Self { self.chunks.extend(chunks); self } /// Sets the predicate - pub fn with_predicate(mut self, predicate: &'a Predicate) -> Self { + pub(crate) fn with_predicate(mut self, predicate: &'a Predicate) -> Self { assert!(self.predicate.is_none()); self.predicate = Some(predicate); self } /// Creates a `ScanPlan` from the specified chunks - pub fn build(self) -> Result { + pub(crate) fn build(self) -> Result { let Self { table_name, chunks, @@ -212,7 +215,7 @@ mod tests { - " DeduplicateExec: [tag1@3 ASC,time@4 ASC]" - " SortPreservingMergeExec: [tag1@3 ASC,time@4 ASC,__chunk_order@0 ASC]" - " SortExec: expr=[tag1@3 ASC,time@4 ASC,__chunk_order@0 ASC]" - - " RecordBatchesExec: chunks=2" + - " RecordBatchesExec: chunks=2, projection=[__chunk_order, field_int, field_int2, tag1, time]" "### ); diff --git a/iox_query_params/Cargo.toml b/iox_query_params/Cargo.toml new file mode 100644 index 00000000000..3c0eeb91607 --- /dev/null +++ b/iox_query_params/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "iox_query_params" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] +datafusion = { workspace = true } +generated_types = { path = "../generated_types" } +observability_deps = { path = "../observability_deps" } +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +thiserror = "1.0" +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] +assert_matches = "1" + diff --git 
a/iox_query_params/src/lib.rs b/iox_query_params/src/lib.rs new file mode 100644 index 00000000000..501493565e5 --- /dev/null +++ b/iox_query_params/src/lib.rs @@ -0,0 +1,21 @@ +//! Crate for common types and utilities related to InfluxDB +//! query/statement parameters. +#![deny(rustdoc::broken_intra_doc_links, rustdoc::bare_urls, rust_2018_idioms)] +#![warn( + clippy::explicit_iter_loop, + clippy::use_self, + clippy::clone_on_ref_ptr, + // See https://github.com/influxdata/influxdb_iox/pull/1671 + clippy::future_not_send, + clippy::todo, + clippy::dbg_macro, + unused_crate_dependencies, + missing_debug_implementations, + unreachable_pub +)] + +mod params; + +pub use params::*; + +use workspace_hack as _; diff --git a/iox_query_params/src/params.rs b/iox_query_params/src/params.rs new file mode 100644 index 00000000000..ce2526c105f --- /dev/null +++ b/iox_query_params/src/params.rs @@ -0,0 +1,675 @@ +//! General-purpose data type and utilities for working with +//! values that can be supplied as an InfluxDB bind parameter. +use std::{borrow::Cow, collections::HashMap}; + +use datafusion::scalar::ScalarValue; +use observability_deps::tracing::warn; +use serde::{Deserialize, Serialize}; +use thiserror::Error; + +// remap protobuf types for convenience +mod proto { + pub(super) use generated_types::influxdata::iox::querier::v1::read_info::{ + query_param::{NullValue, Value}, + QueryParam, + }; +} + +#[derive(Debug, Error)] +/// Parameter errors +pub enum Error { + /// Data conversion error + #[error("{}", msg)] + Conversion { msg: String }, +} + +/// A helper macro to construct a `HashMap` over `(String, StatementParam)` pairs. +#[macro_export] +macro_rules! params { + () => ( + std::collections::HashMap::new() + ); + ($($key:expr => $val:expr),+ $(,)?) => ( + std::collections::HashMap::from([$((String::from($key), $crate::StatementParam::from($val))),+]) + ); +} + +/// A collection of statement parameter (name,value) pairs. +/// +/// This is a newtype wrapper to facillitate data conversions. +/// [From] instances can be used to convert to/from protobuf and JSON +/// protocol formats. +/// +/// There is also a [From] instance to convert to +/// [datafusion::common::ParamValues] which makes it possible to pass +/// parameters into a [datafusion::logical_expr::LogicalPlan] +#[repr(transparent)] +#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct StatementParams(HashMap); + +impl StatementParams { + /// Convert to internal representation. 
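Since the new crate's pieces are spread across several impl blocks below, a short usage sketch may help tie them together: build a parameter map with the params! macro, then hand it to DataFusion as ParamValues. The plan variable is a placeholder; everything else is the API as added in this patch.

use iox_query_params::{params, StatementParam, StatementParams};

fn bind_example() {
    // Named parameters, coerced through the From impls defined below.
    let params: StatementParams = params! {
        "min_temp" => 25.0_f64,
        "city" => "Boston",
        "maybe" => StatementParam::Null,
    }
    .into();

    // Convert for execution against a parameterized logical plan.
    let _values = params.into_df_param_values();
    // e.g. (placeholder plan): let plan = logical_plan.with_param_values(_values)?;
}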
+ pub fn into_inner(&self) -> &HashMap { + &self.0 + } + + /// Convert into a HashMap of (name, value) pairs + pub fn into_hashmap>(self) -> HashMap { + self.0 + .into_iter() + .map(|(key, value)| (key, value.into())) + .collect::>() + } + + /// Convert to [datafusion::common::ParamValues] used by [datafusion::logical_expr::LogicalPlan]::with_param_values + pub fn into_df_param_values(self) -> datafusion::common::ParamValues { + self.into() + } +} + +/// From HashMap +impl From> for StatementParams { + fn from(value: HashMap) -> Self { + Self(value) + } +} + +/// To HashMap +impl From for HashMap { + fn from(value: StatementParams) -> Self { + value.0 + } +} + +/// Converting to [datafusion::common::ParamValues] allows for +/// parameters to be passed to DataFusion +impl From for datafusion::common::ParamValues { + fn from(params: StatementParams) -> Self { + Self::Map(params.into_hashmap()) + } +} + +/// Convert from protobuf +impl TryFrom> for StatementParams { + type Error = self::Error; + fn try_from(proto: Vec) -> Result { + let params = proto + .into_iter() + .map(|param| { + match param.value { + Some(value) => Ok((param.name, StatementParam::from(value))), + None => Err(Error::Conversion { + msg: format!( + "Missing value for parameter \"{}\" when decoding query parameters in Flight gRPC ticket.", + param.name) + }) + } + }).collect::, _>>()?; + Ok(Self(params)) + } +} + +/// Convert into protobuf +impl From for Vec { + fn from(params: StatementParams) -> Self { + params + .0 + .into_iter() + .map(|(name, value)| proto::QueryParam { + name, + value: Some(value.into()), + }) + .collect() + } +} + +/// Enum of possible data types that can be used as parameters in an InfluxQL query. +/// +/// # creating values +/// +/// [From] implementations for many builtin types are provided to make creation of parameter values +/// easier from the influxdb client. +/// +/// # protocol formats +/// +/// There are [From]/[TryFrom] implementations to convert to/from +/// protobuf and JSON. These are used for deserialization/serialization of +/// protocol messages across gRPC and the legacy REST API +/// +/// # planning/execution +/// +/// There is a [From] implementation to convert to DataFusion [ScalarValue]s. 
This +/// allows params to be passed into the DataFusion [datafusion::logical_expr::LogicalPlan]
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+#[serde(try_from = "serde_json::Value", into = "serde_json::Value")]
+pub enum StatementParam {
+    /// a NULL value
+    #[default]
+    Null,
+    /// a boolean value
+    Boolean(bool),
+    /// an unsigned integer value
+    UInt64(u64),
+    /// a signed integer value
+    Int64(i64),
+    /// a floating point value
+    Float64(f64),
+    /// a UTF-8 string value
+    String(String),
+}
+
+/// Display as "SQL-like" literals
+impl std::fmt::Display for StatementParam {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Self::Null => write!(f, "NULL"),
+            Self::Boolean(b) => write!(f, "{}", b.to_string().to_uppercase()),
+            Self::UInt64(u) => write!(f, "{}", u),
+            Self::Int64(i) => write!(f, "{}", i),
+            Self::Float64(fl) => write!(f, "{}", fl),
+            Self::String(s) => write!(f, "'{}'", s.replace('\'', "''")),
+        }
+    }
+}
+
+impl PartialEq for StatementParam {
+    fn eq(&self, other: &Self) -> bool {
+        match (self, other) {
+            (Self::Null, Self::Null) => true,
+            (Self::Boolean(b1), Self::Boolean(b2)) => b1 == b2,
+            (Self::UInt64(u1), Self::UInt64(u2)) => u1 == u2,
+            (Self::Int64(i1), Self::Int64(i2)) => i1 == i2,
+            (Self::Float64(f1), Self::Float64(f2)) => f1 == f2,
+            (Self::String(s1), Self::String(s2)) => s1 == s2,
+            // do not use a `_` pattern here because we want the exhaustiveness
+            // check to fail if a new param variant is added
+            (
+                Self::Null
+                | Self::Boolean(_)
+                | Self::UInt64(_)
+                | Self::Int64(_)
+                | Self::Float64(_)
+                | Self::String(_),
+                _,
+            ) => false,
+        }
+    }
+}
+
+impl Eq for StatementParam {}
+
+/// Convert into protobuf representation
+impl From<StatementParam> for proto::Value {
+    fn from(value: StatementParam) -> Self {
+        use proto::NullValue;
+        match value {
+            StatementParam::Null => Self::Null(NullValue::Unspecified.into()),
+            StatementParam::Boolean(b) => Self::Boolean(b),
+            StatementParam::UInt64(u) => Self::UInt64(u),
+            StatementParam::Int64(i) => Self::Int64(i),
+            StatementParam::Float64(f) => Self::Float64(f),
+            StatementParam::String(s) => Self::String(s),
+        }
+    }
+}
+
+/// Convert into JSON representation
+impl From<StatementParam> for serde_json::Value {
+    fn from(param: StatementParam) -> Self {
+        match param {
+            StatementParam::Null => Self::Null,
+            StatementParam::Boolean(b) => Self::Bool(b),
+            StatementParam::Float64(f) => Self::from(f),
+            StatementParam::UInt64(u) => Self::from(u),
+            StatementParam::Int64(i) => Self::from(i),
+            StatementParam::String(s) => Self::String(s),
+        }
+    }
+}
+
+/// Convert to DataFusion [ScalarValue]. This makes it possible to pass parameters
+/// into a datafusion [datafusion::logical_expr::LogicalPlan]
+impl From<StatementParam> for ScalarValue {
+    fn from(value: StatementParam) -> Self {
+        match value {
+            StatementParam::Null => Self::Null,
+            StatementParam::Boolean(b) => Self::Boolean(Some(b)),
+            StatementParam::UInt64(u) => Self::UInt64(Some(u)),
+            StatementParam::Int64(i) => Self::Int64(Some(i)),
+            StatementParam::Float64(f) => Self::Float64(Some(f)),
+            StatementParam::String(s) => Self::Utf8(Some(s)),
+        }
+    }
+}
+
+/// Convert from protobuf representation
+impl From<proto::Value> for StatementParam {
+    fn from(value: proto::Value) -> Self {
+        match value {
+            proto::Value::Null(n) => {
+                const UNSPECIFIED: i32 = proto::NullValue::Unspecified as i32;
+                if n != UNSPECIFIED {
+                    warn!(
+                        "Malformed Null in protobuf when decoding parameter \
+                        value into StatementParam. Expected Null({UNSPECIFIED}) \
+                        but found Null({n}). Possibly mismatched protobuf \
+                        versions.
+                        "
+                    );
+                }
+                Self::Null
+            }
+            proto::Value::Boolean(b) => Self::Boolean(b),
+            proto::Value::Float64(f) => Self::from(f),
+            proto::Value::UInt64(u) => Self::from(u),
+            proto::Value::Int64(i) => Self::from(i),
+            proto::Value::String(s) => Self::String(s),
+        }
+    }
+}
+
+/// Convert from JSON representation
+impl TryFrom<serde_json::Value> for StatementParam {
+    type Error = self::Error;
+    fn try_from(value: serde_json::Value) -> Result<Self, Self::Error> {
+        use serde_json::Value;
+        match value {
+            Value::Null => Ok(Self::Null),
+            Value::Bool(b) => Ok(Self::Boolean(b)),
+            Value::Number(n) => {
+                if let Some(u) = n.as_u64() {
+                    Ok(Self::UInt64(u))
+                } else if let Some(i) = n.as_i64() {
+                    Ok(Self::Int64(i))
+                } else if let Some(f) = n.as_f64() {
+                    Ok(Self::Float64(f))
+                } else {
+                    // NOTE: without the "arbitrary_precision" feature enabled on serde_json,
+                    // deserialization will never encounter this branch
+                    Err(Error::Conversion {
+                        msg: format!("Could not convert JSON number to i64 or f64: {n}"),
+                    })
+                }
+            }
+            Value::String(s) => Ok(Self::String(s)),
+            Value::Array(_) => Err(Error::Conversion {
+                msg: "JSON arrays are not supported as query parameters. Expected null, boolean, number, or string.".to_string(),
+            }),
+            Value::Object(_) => Err(Error::Conversion {
+                msg: "JSON objects are not supported as query parameters. Expected null, boolean, number, or string".to_string(),
+            }),
+        }
+    }
+}
+
+/// [`Option`] values are unwrapped and [`None`] values are converted to NULL
+impl<T> From<Option<T>> for StatementParam
+where
+    Self: From<T>,
+{
+    fn from(value: Option<T>) -> Self {
+        match value {
+            None => Self::Null,
+            Some(value) => value.into(),
+        }
+    }
+}
+
+/// Unit type is converted to NULL
+impl From<()> for StatementParam {
+    fn from(_value: ()) -> Self {
+        Self::Null
+    }
+}
+
+impl From<bool> for StatementParam {
+    fn from(value: bool) -> Self {
+        Self::Boolean(value)
+    }
+}
+
+impl From<u8> for StatementParam {
+    fn from(value: u8) -> Self {
+        Self::UInt64(value as u64)
+    }
+}
+
+impl From<u16> for StatementParam {
+    fn from(value: u16) -> Self {
+        Self::UInt64(value as u64)
+    }
+}
+
+impl From<u32> for StatementParam {
+    fn from(value: u32) -> Self {
+        Self::UInt64(value as u64)
+    }
+}
+
+impl From<u64> for StatementParam {
+    fn from(value: u64) -> Self {
+        Self::UInt64(value)
+    }
+}
+
+impl From<usize> for StatementParam {
+    fn from(value: usize) -> Self {
+        Self::UInt64(value.try_into().unwrap())
+    }
+}
+
+impl From<i8> for StatementParam {
+    fn from(value: i8) -> Self {
+        Self::Int64(value as i64)
+    }
+}
+
+impl From<i16> for StatementParam {
+    fn from(value: i16) -> Self {
+        Self::Int64(value as i64)
+    }
+}
+
+impl From<i32> for StatementParam {
+    fn from(value: i32) -> Self {
+        Self::Int64(value.into())
+    }
+}
+
+impl From<i64> for StatementParam {
+    fn from(value: i64) -> Self {
+        Self::Int64(value)
+    }
+}
+
+impl From<isize> for StatementParam {
+    fn from(value: isize) -> Self {
+        Self::Int64(value.try_into().unwrap())
+    }
+}
+
+impl From<f32> for StatementParam {
+    fn from(value: f32) -> Self {
+        Self::Float64(value.into())
+    }
+}
+
+impl From<f64> for StatementParam {
+    fn from(value: f64) -> Self {
+        Self::Float64(value)
+    }
+}
+
+impl From<&str> for StatementParam {
+    fn from(value: &str) -> Self {
+        Self::String(value.to_string())
+    }
+}
+
+impl From<String> for StatementParam {
+    fn from(value: String) -> Self {
+        Self::String(value)
+    }
+}
+
+impl<'a> From<Cow<'a, str>> for StatementParam {
+    fn from(value: Cow<'a, str>) -> Self {
+        Self::String(value.into_owned())
+    }
+}
+
+#[cfg(test)]
+#[allow(clippy::approx_constant)] // allow 3.14 >:)
+mod tests {
+    use assert_matches::assert_matches;
+    use serde_json::json;
+
+    use super::*;
+
+    #[test]
+    fn params_from_protobuf_value() {
+        // empty case
+        assert_matches!(StatementParams::try_from(vec![]), Ok(StatementParams(hm)) if hm.is_empty());
+
+        // test happy path with all value types
+        let proto: Vec<proto::QueryParam> = [
+            ("foo", proto::Value::String("Test String".to_string())),
+            ("bar", proto::Value::Float64(3.14)),
+            ("baz", proto::Value::UInt64(1234)),
+            ("int", proto::Value::Int64(-1234)),
+            ("1", proto::Value::Boolean(false)),
+            ("2", proto::Value::Null(0)),
+        ]
+        .map(|(key, value)| proto::QueryParam {
+            name: key.to_string(),
+            value: Some(value),
+        })
+        .into();
+        let result = StatementParams::try_from(proto);
+        let params = result.unwrap().0;
+        assert_eq!(
+            params,
+            params! {
+                "foo" => "Test String",
+                "bar" => 3.14_f64,
+                "baz" => 1234_u64,
+                "int" => -1234_i64,
+                "1" => false,
+                "2" => StatementParam::Null,
+            }
+        );
+    }
+
+    #[test]
+    fn params_from_json_values() {
+        use serde_json::Value;
+        assert_matches!(
+            StatementParam::try_from(Value::from("Test String")),
+            Ok(StatementParam::String(s)) if s == "Test String");
+        assert_matches!(
+            StatementParam::try_from(Value::from(3.14)),
+            Ok(StatementParam::Float64(n)) if n == 3.14
+        );
+        assert_matches!(
+            StatementParam::try_from(Value::from(1234)),
+            Ok(StatementParam::UInt64(1234))
+        );
+        assert_matches!(
+            StatementParam::try_from(Value::from(-1234)),
+            Ok(StatementParam::Int64(-1234))
+        );
+        assert_matches!(
+            StatementParam::try_from(Value::from(false)),
+            Ok(StatementParam::Boolean(false))
+        );
+        assert_matches!(
+            StatementParam::try_from(Value::Null),
+            Ok(StatementParam::Null)
+        );
+        // invalid values
+        assert_matches!(
+            StatementParam::try_from(json!([1, 2, 3])),
+            Err(Error::Conversion { .. })
+        );
+        assert_matches!(
+            StatementParam::try_from(json!({ "a": 1, "b": 2, "c": 3})),
+            Err(Error::Conversion { .. })
+        );
+    }
+
+    #[test]
+    fn params_from_json_str() {
+        let json = r#"
+        {
+            "foo": "Test String",
+            "bar": 3.14,
+            "baz": 1234,
+            "int": -1234,
+            "1": false,
+            "2": null
+        }
+        "#;
+        let result = serde_json::from_str::<StatementParams>(json);
+        let params = result.unwrap().0;
+        assert_eq!(
+            params,
+            params! {
+                "foo" => "Test String",
+                "bar" => 3.14_f64,
+                "baz" => 1234_u64,
+                "int" => -1234_i64,
+                "1" => false,
+                "2" => StatementParam::Null,
+            }
+        );
+    }
+
+    #[test]
+    fn params_from_json_str_invalid() {
+        // invalid top-level values
+        assert_matches!(serde_json::from_str::<StatementParams>("null"), Err(_));
+        assert_matches!(serde_json::from_str::<StatementParams>("100"), Err(_));
+        assert_matches!(serde_json::from_str::<StatementParams>("3.14"), Err(_));
+        assert_matches!(serde_json::from_str::<StatementParams>("true"), Err(_));
+        assert_matches!(serde_json::from_str::<StatementParams>("[\"foo\"]"), Err(_));
+
+        // nested lists are invalid
+        let json = r#"
+        {
+            "foo": [],
+        }
+        "#;
+        let result = serde_json::from_str::<StatementParams>(json);
+        assert_matches!(result, Err(serde_json::Error { .. }));
+
+        // nested objects are invalid
+        let json = r#"
+        {
+            "foo": {},
+        }
+        "#;
+        let result = serde_json::from_str::<StatementParams>(json);
+        assert_matches!(result, Err(serde_json::Error { .. }));
+
+        // nested list with contents
+        let json = r#"
+        {
+            "foo bar": [1, 2, "3", "4 5 6", [null], [[]], {}],
+            "baz": null
+        }
+        "#;
+        let result = serde_json::from_str::<StatementParams>(json);
+        assert_matches!(result, Err(serde_json::Error { .. }));
+
+        // nested object with contents
+        let json = r#"
+        {
+            "fazbar": {
+                "a": 1,
+                "b": 2,
+                "c": null
+            },
+            "baz": null
+        }
+        "#;
+        let result = serde_json::from_str::<StatementParams>(json);
+        assert_matches!(result, Err(serde_json::Error { .. }));
+    }
+
+    // Tests what happens when an integer or float is out of bounds.
+    //
+    // Without the `arbitrary_precision` feature, `serde_json` will always deserialize numbers to
+    // either i64 or f64.
+    //
+    // One potential edge case to be aware of is what happens when `serde_json::Value` deserializes
+    // an integer number that's out-of-bounds for i64, but in-bounds for f64. In this case
+    // it will be interpreted as a float, and rounding errors can be introduced. This case
+    // is unlikely to occur as long as clients are properly validating that their integers
+    // are within 64-bit bounds, but it's possible that a client serializing a bigdecimal could
+    // encounter this case. This has not been tested with the `arbitrary_precision` feature of
+    // `serde_json` enabled, so it's possible that adding that feature would prevent rounding
+    // errors in this case. Supporting bigdecimal parameters would also fix this edge case.
+    #[test]
+    fn params_from_json_str_bignum() {
+        let json = format! {" {{ \"abc\" : {}999 }} ", f64::MAX};
+        let result = serde_json::from_str::<StatementParams>(&json);
+        // NOTE: without the "arbitrary_precision" feature enabled on serde_json, deserialization will never encounter
+        // our out-of-bounds guard
+        let err = result.unwrap_err();
+        assert!(err.to_string().contains("number out of range"));
+    }
+
+    #[test]
+    fn params_conversions() {
+        assert_matches!(StatementParam::from(true), StatementParam::Boolean(true));
+        assert_matches!(StatementParam::from(123_u32), StatementParam::UInt64(123));
+        assert_matches!(StatementParam::from(-123), StatementParam::Int64(-123));
+        assert_matches!(StatementParam::from(1.23), StatementParam::Float64(f) if f == 1.23);
+        assert_matches!(StatementParam::from("a string"), StatementParam::String(s) if s == "a string");
+        assert_matches!(StatementParam::from("a string".to_owned()), StatementParam::String(s) if s == "a string");
+        assert_matches!(StatementParam::from(Cow::from("a string")), StatementParam::String(s) if s == "a string");
+        assert_matches!(StatementParam::from(()), StatementParam::Null);
+        assert_matches!(
+            StatementParam::from(None::>),
+            StatementParam::Null
+        );
+        assert_matches!(
+            StatementParam::from(Some(true)),
+            StatementParam::Boolean(true)
+        );
+        assert_matches!(
+            StatementParam::from(Some(123_u32)),
+            StatementParam::UInt64(123)
+        );
+        assert_matches!(
+            StatementParam::from(Some(-123)),
+            StatementParam::Int64(-123)
+        );
+        assert_matches!(StatementParam::from(Some(1.23)), StatementParam::Float64(f) if f == 1.23);
+        assert_matches!(StatementParam::from(Some("a string")), StatementParam::String(s) if s == "a string");
+        assert_matches!(StatementParam::from(Some("a string".to_owned())), StatementParam::String(s) if s == "a string");
+        assert_matches!(StatementParam::from(Some(Cow::from("a string"))), StatementParam::String(s) if s == "a string");
+        assert_matches!(StatementParam::from(Some(())), StatementParam::Null);
+        assert_matches!(
+            StatementParam::from(Some(None::>)),
+            StatementParam::Null
+        );
+        assert_matches!(
+            StatementParam::from(Some(Some(true))),
+            StatementParam::Boolean(true)
+        );
+    }
+
+    // test equality comparisons for StatementParams
+    #[test]
+    fn params_equality() {
+        let values = [
+            StatementParam::Null,
+            StatementParam::from(true),
+            StatementParam::from(32_u32),
+            StatementParam::from(-23),
+            StatementParam::from(32.23),
+            StatementParam::from("a string"),
+        ];
+        for (i, value1) in values.iter().enumerate() {
+            for (j, value2) in values.iter().enumerate() {
+                if i == j {
+                    assert_eq!(value1, value2);
+                } else {
assert_ne!(value1, value2); + } + } + } + assert_ne!(StatementParam::from(true), StatementParam::from(false)); + assert_ne!( + StatementParam::from(1984_u32), + StatementParam::from(2077_u32) + ); + assert_ne!(StatementParam::from(-100), StatementParam::from(100)); + assert_ne!(StatementParam::from(-1.23), StatementParam::from(1.23)); + assert_ne!( + StatementParam::from("string1"), + StatementParam::from("string2") + ); + } +} diff --git a/iox_tests/Cargo.toml b/iox_tests/Cargo.toml index 8fd924838bb..ae46c10d98b 100644 --- a/iox_tests/Cargo.toml +++ b/iox_tests/Cargo.toml @@ -6,6 +6,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] arrow = { workspace = true } data_types = { path = "../data_types" } diff --git a/iox_tests/src/builders.rs b/iox_tests/src/builders.rs index 9e86366e5f7..f1eb5baf728 100644 --- a/iox_tests/src/builders.rs +++ b/iox_tests/src/builders.rs @@ -1,7 +1,7 @@ use data_types::{ - Column, ColumnId, ColumnSet, ColumnType, CompactionLevel, NamespaceId, ParquetFile, - ParquetFileId, Partition, PartitionId, PartitionKey, SkippedCompaction, Table, TableId, - Timestamp, TransitionPartitionId, + Column, ColumnId, ColumnSet, ColumnType, CompactionLevel, NamespaceId, ObjectStoreId, + ParquetFile, ParquetFileId, ParquetFileParams, Partition, PartitionHashId, PartitionId, + PartitionKey, SkippedCompaction, Table, TableId, Timestamp, }; use uuid::Uuid; @@ -21,11 +21,14 @@ impl ParquetFileBuilder { id: ParquetFileId::new(id), namespace_id: NamespaceId::new(0), table_id, - partition_id: TransitionPartitionId::new( + partition_id: PartitionId::new(0), + partition_hash_id: Some(PartitionHashId::new( table_id, &PartitionKey::from("arbitrary"), - ), - object_store_id: Uuid::from_u128(id.try_into().expect("invalid id")), + )), + object_store_id: ObjectStoreId::from_uuid(Uuid::from_u128( + id.try_into().expect("invalid id"), + )), min_time: Timestamp::new(0), max_time: Timestamp::new(0), to_delete: None, @@ -40,7 +43,7 @@ impl ParquetFileBuilder { } /// Set the partition identifier - pub fn with_partition(self, partition_id: TransitionPartitionId) -> Self { + pub fn with_partition(self, partition_id: PartitionId) -> Self { Self { file: ParquetFile { partition_id, @@ -104,6 +107,27 @@ impl ParquetFileBuilder { pub fn build(self) -> ParquetFile { self.file } + + /// Construct [`ParquetFileParams`] and the corresponding [`ParquetFile`] + pub fn params(self) -> (ParquetFileParams, ParquetFile) { + let file = self.clone().build(); + let params = ParquetFileParams { + partition_id: self.file.partition_id, + partition_hash_id: self.file.partition_hash_id, + namespace_id: self.file.namespace_id, + table_id: self.file.table_id, + object_store_id: self.file.object_store_id, + min_time: self.file.min_time, + max_time: self.file.max_time, + file_size_bytes: self.file.file_size_bytes, + row_count: self.file.row_count, + compaction_level: self.file.compaction_level, + created_at: self.file.created_at, + column_set: self.file.column_set, + max_l0_created_at: self.file.max_l0_created_at, + }; + (params, file) + } } impl From for ParquetFileBuilder { @@ -201,12 +225,16 @@ pub struct PartitionBuilder { impl PartitionBuilder { /// Create a builder to create a partition with `partition_id` `id` pub fn new(id: i64) -> Self { + let table_id = TableId::new(0); + let key = PartitionKey::from("key"); + let hash_id = PartitionHashId::new(table_id, &key); + Self { - partition: Partition::new_in_memory_only( + partition: 
Partition::new_catalog_only( PartitionId::new(id), - TableId::new(0), - PartitionKey::from("key"), - vec![], + Some(hash_id), + table_id, + key, Default::default(), None, ), diff --git a/iox_tests/src/catalog.rs b/iox_tests/src/catalog.rs index 88ab9b6695b..507ef090bb9 100644 --- a/iox_tests/src/catalog.rs +++ b/iox_tests/src/catalog.rs @@ -7,19 +7,18 @@ use arrow::{ use data_types::{ partition_template::TablePartitionTemplateOverride, Column, ColumnSet, ColumnType, ColumnsByName, CompactionLevel, MaxColumnsPerTable, MaxTables, Namespace, NamespaceName, - NamespaceSchema, ParquetFile, ParquetFileParams, Partition, PartitionId, SortedColumnSet, - Table, TableId, TableSchema, Timestamp, TransitionPartitionId, + NamespaceSchema, ObjectStoreId, ParquetFile, ParquetFileParams, Partition, PartitionId, + SortKeyIds, Table, TableSchema, Timestamp, TransitionPartitionId, }; use datafusion::physical_plan::metrics::Count; use datafusion_util::{unbounded_memory_pool, MemoryStream}; use generated_types::influxdata::iox::partition_template::v1::PartitionTemplate; +use iox_catalog::interface::PartitionRepoExt; use iox_catalog::{ - interface::{ - get_schema_by_id, get_table_columns_by_id, Catalog, RepoCollection, SoftDeletedRows, - }, + interface::{Catalog, ParquetFileRepoExt, RepoCollection, SoftDeletedRows}, mem::MemCatalog, - partition_lookup, test_helpers::arbitrary_table, + util::{get_schema_by_id, get_table_columns_by_id}, }; use iox_query::{ exec::{DedicatedExecutors, Executor, ExecutorConfig}, @@ -40,10 +39,9 @@ use schema::{ Projection, Schema, }; use std::{collections::HashMap, num::NonZeroUsize, sync::Arc}; -use uuid::Uuid; /// Common retention period used throughout tests -pub const TEST_RETENTION_PERIOD_NS: Option = Some(3_600 * 1_000_000_000); +pub(crate) const TEST_RETENTION_PERIOD_NS: Option = Some(3_600 * 1_000_000_000); /// Catalog for tests #[derive(Debug)] @@ -79,11 +77,14 @@ impl TestCatalog { target_query_partitions: NonZeroUsize, ) -> Arc { let metric_registry = Arc::new(metric::Registry::new()); - let catalog: Arc = Arc::new(MemCatalog::new(Arc::clone(&metric_registry))); + let time_provider = Arc::new(MockProvider::new(Time::from_timestamp(0, 0).unwrap())); + let catalog: Arc = Arc::new(MemCatalog::new( + Arc::clone(&metric_registry), + Arc::clone(&time_provider) as _, + )); let object_store = Arc::new(InMemory::new()); let parquet_store = ParquetStorage::new(Arc::clone(&object_store) as _, StorageId::from("iox")); - let time_provider = Arc::new(MockProvider::new(Time::from_timestamp(0, 0).unwrap())); let exec = Arc::new(Executor::new_with_config_and_executors( ExecutorConfig { num_threads: exec.num_threads(), @@ -148,7 +149,7 @@ impl TestCatalog { name: &str, retention_period_ns: Option, ) -> Arc { - let mut repos = self.catalog.repositories().await; + let mut repos = self.catalog.repositories(); let namespace_name = NamespaceName::new(name).unwrap(); let namespace = repos .namespaces() @@ -171,27 +172,13 @@ impl TestCatalog { .await } - /// List all non-deleted files - pub async fn list_by_table_not_to_delete( - self: &Arc, - table_id: TableId, - ) -> Vec { - self.catalog - .repositories() - .await - .parquet_files() - .list_by_table_not_to_delete(table_id) - .await - .unwrap() - } - /// Add a partition into skipped compaction pub async fn add_to_skipped_compaction( self: &Arc, partition_id: PartitionId, reason: &str, ) { - let mut repos = self.catalog.repositories().await; + let mut repos = self.catalog.repositories(); repos .partitions() @@ -212,7 +199,7 @@ pub struct 
TestNamespace { impl TestNamespace { /// Create a table in this namespace pub async fn create_table(self: &Arc, name: &str) -> Arc { - let mut repos = self.catalog.catalog.repositories().await; + let mut repos = self.catalog.catalog.repositories(); let table = arbitrary_table(&mut *repos, name, &self.namespace).await; @@ -229,7 +216,7 @@ impl TestNamespace { name: &str, template: Option, ) -> Arc { - let mut repos = self.catalog.catalog.repositories().await; + let mut repos = self.catalog.catalog.repositories(); let table = repos .tables() @@ -254,32 +241,36 @@ impl TestNamespace { /// Get namespace schema for this namespace. pub async fn schema(&self) -> NamespaceSchema { - let mut repos = self.catalog.catalog.repositories().await; + let mut repos = self.catalog.catalog.repositories(); get_schema_by_id( self.namespace.id, repos.as_mut(), SoftDeletedRows::ExcludeDeleted, ) .await - .unwrap() + .expect("no catalog error") + .expect("namespace exists") } /// Set the number of tables allowed in this namespace. - pub async fn update_table_limit(&self, new_max: i32) { - let mut repos = self.catalog.catalog.repositories().await; + pub async fn update_table_limit(&self, new_max: usize) { + let mut repos = self.catalog.catalog.repositories(); repos .namespaces() - .update_table_limit(&self.namespace.name, MaxTables::new(new_max)) + .update_table_limit(&self.namespace.name, MaxTables::try_from(new_max).unwrap()) .await .unwrap(); } /// Set the number of columns per table allowed in this namespace. - pub async fn update_column_limit(&self, new_max: i32) { - let mut repos = self.catalog.catalog.repositories().await; + pub async fn update_column_limit(&self, new_max: usize) { + let mut repos = self.catalog.catalog.repositories(); repos .namespaces() - .update_column_limit(&self.namespace.name, MaxColumnsPerTable::new(new_max)) + .update_column_limit( + &self.namespace.name, + MaxColumnsPerTable::try_from(new_max).unwrap(), + ) .await .unwrap(); } @@ -297,7 +288,7 @@ pub struct TestTable { impl TestTable { /// Creat a partition for the table pub async fn create_partition(self: &Arc, key: &str) -> Arc { - let mut repos = self.catalog.catalog.repositories().await; + let mut repos = self.catalog.catalog.repositories(); let partition = repos .partitions() @@ -317,10 +308,9 @@ impl TestTable { pub async fn create_partition_with_sort_key( self: &Arc, key: &str, - sort_key: &[&str], sort_key_ids: &[i64], ) -> Arc { - let mut repos = self.catalog.catalog.repositories().await; + let mut repos = self.catalog.catalog.repositories(); let partition = repos .partitions() @@ -331,11 +321,9 @@ impl TestTable { let partition = repos .partitions() .cas_sort_key( - &TransitionPartitionId::Deprecated(partition.id), - None, + partition.id, None, - sort_key, - &SortedColumnSet::from(sort_key_ids.iter().cloned()), + &SortKeyIds::from(sort_key_ids.iter().cloned()), ) .await .unwrap(); @@ -354,7 +342,7 @@ impl TestTable { name: &str, column_type: ColumnType, ) -> Arc { - let mut repos = self.catalog.catalog.repositories().await; + let mut repos = self.catalog.catalog.repositories(); let column = repos .columns() @@ -381,7 +369,7 @@ impl TestTable { /// Get columns from the catalog. 
pub async fn catalog_columns(&self) -> ColumnsByName { - let mut repos = self.catalog.catalog.repositories().await; + let mut repos = self.catalog.catalog.repositories(); get_table_columns_by_id(self.table.id, repos.as_mut()) .await @@ -402,9 +390,9 @@ impl TestTable { let selection: Vec<_> = file .column_set .iter() - .map(|id| *column_id_lookup.get(id).unwrap()) + .map(|id| column_id_lookup.get(id).unwrap().as_ref()) .collect(); - let schema = table_schema.select_by_names(&selection).unwrap(); + let schema = table_schema.select_by_names(&selection[..]).unwrap(); let chunk = ParquetChunk::new(Arc::new(file), schema, self.catalog.parquet_store.clone()); chunk @@ -421,6 +409,7 @@ impl TestTable { /// A test column. #[allow(missing_docs)] +#[derive(Debug)] pub struct TestColumn { pub catalog: Arc, pub namespace: Arc, @@ -446,35 +435,20 @@ pub struct TestPartition { impl TestPartition { /// Update sort key. - pub async fn update_sort_key( - self: &Arc, - sort_key: SortKey, - sort_key_ids: &SortedColumnSet, - ) -> Arc { - let partition = partition_lookup( - self.catalog.catalog.repositories().await.as_mut(), - &self.partition.transition_partition_id(), - ) - .await - .unwrap() - .unwrap(); + pub async fn update_sort_key(self: &Arc, sort_key_ids: &SortKeyIds) -> Arc { + let mut repos = self.catalog.catalog.repositories(); + let partition = repos + .partitions() + .get_by_id(self.partition.id) + .await + .unwrap() + .unwrap(); - let old_sort_key = partition.sort_key; - let old_sort_key_ids = partition.sort_key_ids; + let old_sort_key_ids = partition.sort_key_ids(); - let partition = self - .catalog - .catalog - .repositories() - .await + let partition = repos .partitions() - .cas_sort_key( - &self.partition.transition_partition_id(), - Some(old_sort_key), - Some(old_sort_key_ids), - &sort_key.to_columns().collect::>(), - sort_key_ids, - ) + .cas_sort_key(self.partition.id, old_sort_key_ids, sort_key_ids) .await .unwrap(); @@ -525,7 +499,7 @@ impl TestPartition { let (record_batch, sort_key) = sort_batch(record_batch, &schema); let record_batch = dedup_batch(record_batch, &sort_key); - let object_store_id = object_store_id.unwrap_or_else(Uuid::new_v4); + let object_store_id = object_store_id.unwrap_or_else(ObjectStoreId::new); let metadata = IoxMetadata { object_store_id, @@ -567,13 +541,8 @@ impl TestPartition { }; let result = self.create_parquet_file_catalog_record(builder).await; - let mut repos = self.catalog.catalog.repositories().await; - update_catalog_sort_key_if_needed( - repos.as_mut(), - &self.partition.transition_partition_id(), - sort_key, - ) - .await; + let mut repos = self.catalog.catalog.repositories(); + update_catalog_sort_key_if_needed(repos.as_mut(), self.partition.id, sort_key).await; result } @@ -622,8 +591,9 @@ impl TestPartition { let parquet_file_params = ParquetFileParams { namespace_id: self.namespace.namespace.id, table_id: self.table.table.id, - partition_id: self.partition.transition_partition_id(), - object_store_id: object_store_id.unwrap_or_else(Uuid::new_v4), + partition_id: self.partition.id, + partition_hash_id: self.partition.hash_id().cloned(), + object_store_id: object_store_id.unwrap_or_else(ObjectStoreId::new), min_time: Timestamp::new(min_time), max_time: Timestamp::new(max_time), file_size_bytes: file_size_bytes.unwrap_or(0) as i64, @@ -634,7 +604,7 @@ impl TestPartition { max_l0_created_at: Timestamp::new(max_l0_created_at), }; - let mut repos = self.catalog.catalog.repositories().await; + let mut repos = self.catalog.catalog.repositories(); let 
parquet_file = repos .parquet_files() .create(parquet_file_params) @@ -644,7 +614,13 @@ impl TestPartition { if to_delete { repos .parquet_files() - .create_upgrade_delete(&[parquet_file.id], &[], &[], CompactionLevel::Initial) + .create_upgrade_delete( + parquet_file.partition_id, + &[parquet_file.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) .await .unwrap(); } @@ -673,7 +649,7 @@ pub struct TestParquetFileBuilder { creation_time: i64, compaction_level: CompactionLevel, to_delete: bool, - object_store_id: Option, + object_store_id: Option, row_count: Option, max_l0_created_at: i64, } @@ -711,6 +687,12 @@ impl TestParquetFileBuilder { .with_schema(schema) } + /// Specify an object store id for this parquet file. + pub fn with_object_store_id(mut self, object_store_id: ObjectStoreId) -> Self { + self.object_store_id = Some(object_store_id); + self + } + fn with_record_batch(mut self, record_batch: RecordBatch) -> Self { self.record_batch = Some(record_batch); self @@ -782,15 +764,12 @@ impl TestParquetFileBuilder { } } -async fn update_catalog_sort_key_if_needed( - repos: &mut R, - id: &TransitionPartitionId, - sort_key: SortKey, -) where +async fn update_catalog_sort_key_if_needed(repos: &mut R, id: PartitionId, sort_key: SortKey) +where R: RepoCollection + ?Sized, { // Fetch the latest partition info from the catalog - let partition = partition_lookup(repos, id).await.unwrap().unwrap(); + let partition = repos.partitions().get_by_id(id).await.unwrap().unwrap(); // fecth column ids from catalog let columns = get_table_columns_by_id(partition.table_id, repos) @@ -799,9 +778,8 @@ async fn update_catalog_sort_key_if_needed( // Similarly to what the ingester does, if there's an existing sort key in the catalog, add new // columns onto the end - - match (partition.sort_key(), partition.sort_key_ids_none_if_empty()) { - (Some(catalog_sort_key), Some(catalog_sort_key_ids)) => { + match partition.sort_key(&columns) { + Some(catalog_sort_key) => { let new_sort_key = sort_key.to_columns().collect::>(); let (_metadata, update) = adjust_sort_key_columns(&catalog_sort_key, &new_sort_key); if let Some(new_sort_key) = update { @@ -811,44 +789,28 @@ async fn update_catalog_sort_key_if_needed( debug!( "Updating (sort_key, sort_key_ids) from ({:?}, {:?}) to ({:?}, {:?})", catalog_sort_key.to_columns().collect::>(), - catalog_sort_key_ids, + partition.sort_key_ids(), &new_sort_key, &new_sort_key_ids, ); repos .partitions() - .cas_sort_key( - id, - Some( - catalog_sort_key - .to_columns() - .map(ToString::to_string) - .collect::>(), - ), - Some(partition.sort_key_ids), - &new_sort_key, - &new_sort_key_ids, - ) + .cas_sort_key(partition.id, partition.sort_key_ids(), &new_sort_key_ids) .await .unwrap(); } } - (None, None) => { + None => { let new_columns = sort_key.to_columns().collect::>(); debug!("Updating sort key from None to {:?}", &new_columns); let column_ids = columns.ids_for_names(&new_columns); repos .partitions() - .cas_sort_key(id, None, None, &new_columns, &column_ids) + .cas_sort_key(partition.id, None, &column_ids) .await .unwrap(); } - _ => panic!( - "sort_key {:?} and sort_key_ids {:?} should be both None or both Some", - partition.sort_key(), - partition.sort_key_ids_none_if_empty() - ), } } @@ -869,6 +831,7 @@ async fn create_parquet_file( /// A test parquet file of the catalog #[allow(missing_docs)] +#[derive(Debug)] pub struct TestParquetFile { pub catalog: Arc, pub namespace: Arc, @@ -889,11 +852,17 @@ impl From for ParquetFile { impl TestParquetFile { /// Make the 
parquet file deletable pub async fn flag_for_delete(&self) { - let mut repos = self.catalog.catalog.repositories().await; + let mut repos = self.catalog.catalog.repositories(); repos .parquet_files() - .create_upgrade_delete(&[self.parquet_file.id], &[], &[], CompactionLevel::Initial) + .create_upgrade_delete( + self.parquet_file.partition_id, + &[self.parquet_file.object_store_id], + &[], + &[], + CompactionLevel::Initial, + ) .await .unwrap(); } @@ -906,15 +875,15 @@ impl TestParquetFile { .parquet_file .column_set .iter() - .map(|id| *column_id_lookup.get(id).unwrap()) + .map(|id| column_id_lookup.get(id).unwrap().as_ref()) .collect(); let table_schema: Schema = table_columns.clone().try_into().unwrap(); - table_schema.select_by_names(&selection).unwrap() + table_schema.select_by_names(&selection[..]).unwrap() } } /// Return the current time -pub fn now() -> Time { +pub(crate) fn now() -> Time { Time::from_timestamp(0, 0).unwrap() } diff --git a/iox_tests/src/lib.rs b/iox_tests/src/lib.rs index 98f65aefa1c..b5e4a283872 100644 --- a/iox_tests/src/lib.rs +++ b/iox_tests/src/lib.rs @@ -17,8 +17,6 @@ // Workaround for "unused crate" lint false positives. use workspace_hack as _; -use data_types::{PartitionKey, TableId, TransitionPartitionId}; - mod catalog; pub use catalog::{ TestCatalog, TestNamespace, TestParquetFile, TestParquetFileBuilder, TestPartition, TestTable, @@ -28,11 +26,3 @@ mod builders; pub use builders::{ ColumnBuilder, ParquetFileBuilder, PartitionBuilder, SkippedCompactionBuilder, TableBuilder, }; - -/// Create a partition identifier from an int (which gets used as the table ID) and a partition key -/// with the string "arbitrary". Most useful in cases where there isn't any actual catalog -/// interaction (that is, in mocks) and when the important property of the partition identifiers is -/// that they're either the same or different than other partition identifiers. 
-pub fn partition_identifier(table_id: i64) -> TransitionPartitionId { - TransitionPartitionId::new(TableId::new(table_id), &PartitionKey::from("arbitrary")) -} diff --git a/iox_time/Cargo.toml b/iox_time/Cargo.toml index d8ef58e829c..c8a8398d0f2 100644 --- a/iox_time/Cargo.toml +++ b/iox_time/Cargo.toml @@ -6,10 +6,13 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] chrono = { version = "0.4.31", default-features = false, features = ["clock", "std"] } parking_lot = "0.12" -tokio = { version = "1.32", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } +tokio = { version = "1.35", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] diff --git a/iox_time/src/lib.rs b/iox_time/src/lib.rs index 853155d2312..3a2cf836029 100644 --- a/iox_time/src/lib.rs +++ b/iox_time/src/lib.rs @@ -204,7 +204,7 @@ pub trait TimeProvider: Debug + Display + Send + Sync + 'static { } /// A [`TimeProvider`] that uses [`Utc::now`] as a clock source -#[derive(Debug, Default, Clone)] +#[derive(Debug, Default, Clone, Copy)] pub struct SystemProvider {} impl SystemProvider { diff --git a/ioxd_common/Cargo.toml b/ioxd_common/Cargo.toml index 0fce9b58904..c52b49ffd89 100644 --- a/ioxd_common/Cargo.toml +++ b/ioxd_common/Cargo.toml @@ -5,6 +5,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + # Optional feature 'pprof' enables http://localhost:8080/debug/pprof/profile support support [dependencies] @@ -12,15 +15,16 @@ license.workspace = true authz = { path = "../authz", features = ["http"] } clap_blocks = { path = "../clap_blocks" } generated_types = { path = "../generated_types" } -heappy = { git = "https://github.com/mkmik/heappy", rev = "1de977a241cdd768acc5b6c82c0728b30c7db7b4", features = ["enable_heap_profiler", "jemalloc_shim", "measure_free"], optional = true } +heappy = { git = "https://github.com/mkmik/heappy", rev = "01a1f88e1b404c5894f89eb1a57f813f713d7ad1", features = ["enable_heap_profiler", "jemalloc_shim", "measure_free"], optional = true } metric = { path = "../metric" } metric_exporters = { path = "../metric_exporters" } observability_deps = { path = "../observability_deps" } # NOTE: we may not notice that we need the "backtrace-rs" feature if we also build with the heappy feature, which depends on backtrace-rs. 
# (honestly I thought that cargo dependencies were isolated on a per crate basis so I'm a bit surprised that pprof accidentally builds # successfully just because another crate happens to depend on backtrace-rs) -pprof = { version = "0.12", default-features = false, features = ["flamegraph", "prost-codec"], optional = true } +pprof = { version = "0.13", default-features = false, features = ["flamegraph", "prost-codec"], optional = true } service_grpc_testing = { path = "../service_grpc_testing" } +tower_trailer = { path = "../tower_trailer" } trace = { path = "../trace" } trace_exporters = { path = "../trace_exporters" } trace_http = { path = "../trace_http" } @@ -32,18 +36,18 @@ clap = { version = "4", features = ["derive", "env"] } flate2 = "1.0" futures = "0.3" hashbrown = { workspace = true } -http = "0.2.9" +http = "0.2.11" hyper = "0.14" log = "0.4" parking_lot = "0.12" -reqwest = { version = "0.11", default-features = false, features = ["json", "rustls-tls"] } +reqwest = { version = "0.11", default-features = false, features = ["json", "rustls-tls-native-roots"] } serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.107" +serde_json = "1.0.111" serde_urlencoded = "0.7.0" -snafu = "0.7" -tokio = { version = "1.32", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] } +snafu = "0.8" +tokio = { version = "1.35", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] } tokio-stream = { version = "0.1", features = ["net"] } -tokio-util = { version = "0.7.9" } +tokio-util = { version = "0.7.10" } tonic = { workspace = true } tonic-health = { workspace = true } tonic-reflection = { workspace = true } @@ -51,7 +55,6 @@ tower = "0.4" tower-http = { version = "0.4", features = ["catch-panic"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } - [dev-dependencies] # Workspace dependencies, in alphabetical order # Crates.io dependencies, in alphabetical order diff --git a/ioxd_common/src/http/error.rs b/ioxd_common/src/http/error.rs index 64538477676..c08146c677c 100644 --- a/ioxd_common/src/http/error.rs +++ b/ioxd_common/src/http/error.rs @@ -1,5 +1,6 @@ use hyper::{Body, Response, StatusCode}; use observability_deps::tracing::warn; +use serde::Serialize; /// Constants used in API error codes. 
/// @@ -13,6 +14,7 @@ pub enum HttpApiErrorCode { Invalid, UnprocessableEntity, EmptyValue, + PartialWrite, Unavailable, Forbidden, TooManyRequests, @@ -32,6 +34,7 @@ impl HttpApiErrorCode { Self::Invalid => "invalid", Self::UnprocessableEntity => "unprocessable entity", Self::EmptyValue => "empty value", + Self::PartialWrite => "created with partial errors found", Self::Unavailable => "unavailable", Self::Forbidden => "forbidden", Self::TooManyRequests => "too many requests", @@ -51,6 +54,7 @@ impl HttpApiErrorCode { Self::Invalid => StatusCode::BAD_REQUEST, Self::UnprocessableEntity => StatusCode::UNPROCESSABLE_ENTITY, Self::EmptyValue => StatusCode::NO_CONTENT, + Self::PartialWrite => StatusCode::CREATED, Self::Unavailable => StatusCode::SERVICE_UNAVAILABLE, Self::Forbidden => StatusCode::FORBIDDEN, Self::TooManyRequests => StatusCode::TOO_MANY_REQUESTS, @@ -76,6 +80,7 @@ impl From for HttpApiErrorCode { StatusCode::BAD_REQUEST => Self::Invalid, StatusCode::UNPROCESSABLE_ENTITY => Self::UnprocessableEntity, StatusCode::NO_CONTENT => Self::EmptyValue, + StatusCode::CREATED => Self::PartialWrite, StatusCode::SERVICE_UNAVAILABLE => Self::Unavailable, StatusCode::FORBIDDEN => Self::Forbidden, StatusCode::TOO_MANY_REQUESTS => Self::TooManyRequests, @@ -91,16 +96,27 @@ impl From for HttpApiErrorCode { } } +impl Serialize for HttpApiErrorCode { + fn serialize(&self, serializer: S) -> Result { + serializer.serialize_str(self.as_text()) + } +} + /// Error that is compatible with the Influxdata Cloud 2 HTTP API. /// /// See . -#[derive(Debug)] +#[derive(Debug, Serialize)] pub struct HttpApiError { /// Machine-readable error code. code: HttpApiErrorCode, /// Human-readable message. + #[serde(rename = "message")] msg: String, + + /// Optional error line (for line protocol errors). + #[serde(skip_serializing_if = "Option::is_none")] + line: Option, } impl HttpApiError { @@ -109,18 +125,18 @@ impl HttpApiError { Self { code: code.into(), msg: msg.into(), + line: None, } } + /// Add body to error. + pub fn with_line(self, line: Option) -> Self { + Self { line, ..self } + } + /// Generate response body for this error. fn body(&self) -> Body { - let json = serde_json::json!({ - "code": self.code.as_text().to_string(), - "message": self.msg.clone(), - }) - .to_string(); - - Body::from(json) + Body::from(serde_json::to_string(&self).expect("must serialise to json")) } /// Generate response for this error. 
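Note on the `HttpApiError` change above: deriving `Serialize` means the response body is now produced directly from the struct rather than a hand-built `serde_json::json!` map. As an illustrative sketch (the message text here is made up, not from the diff), an error built with `HttpApiError::new(HttpApiErrorCode::PartialWrite, "partial write of line protocol occurred").with_line(Some(2))` would serialize to `{"code":"created with partial errors found","message":"partial write of line protocol occurred","line":2}`, with `line` omitted entirely when it is `None`.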
diff --git a/ioxd_common/src/http/mod.rs b/ioxd_common/src/http/mod.rs index 6b59c416c71..21a770d3d3a 100644 --- a/ioxd_common/src/http/mod.rs +++ b/ioxd_common/src/http/mod.rs @@ -1,3 +1,4 @@ +use http::StatusCode; use std::{convert::Infallible, num::NonZeroI32, sync::Arc}; use authz::http::AuthorizationHeaderExtension; @@ -94,14 +95,11 @@ pub async fn serve( shutdown: CancellationToken, trace_header_parser: TraceHeaderParser, ) -> Result<(), hyper::Error> { - let metric_registry = server_type.metric_registry(); let trace_collector = server_type.trace_collector(); - let trace_layer = TraceLayer::new( trace_header_parser, - metric_registry, + Arc::new(server_type.http_request_metrics()), trace_collector, - false, server_type.name(), ); @@ -136,7 +134,7 @@ async fn route_request( let content_length = req.headers().get("content-length").cloned(); let response = match (method.clone(), uri.path()) { - (Method::GET, "/health") => health(), + (Method::GET, "/health") => Ok(health(server_type.as_ref())), (Method::GET, "/metrics") => handle_metrics(server_type.as_ref()), (Method::GET, "/debug/pprof") => pprof_home(req).await, (Method::GET, "/debug/pprof/profile") => pprof_profile(req).await, @@ -165,9 +163,18 @@ async fn route_request( } } -fn health() -> Result, ApplicationError> { - let response_body = "OK"; - Ok(Response::new(Body::from(response_body.to_string()))) +fn health(server_type: &dyn ServerType) -> Response { + match server_type.is_healthy() { + true => { + let response_body = "OK"; + Response::new(Body::from(response_body.to_string())) + } + false => { + let mut resp = Response::new(Body::empty()); + *resp.status_mut() = StatusCode::SERVICE_UNAVAILABLE; + resp + } + } } fn handle_metrics(server_type: &dyn ServerType) -> Result, ApplicationError> { diff --git a/ioxd_common/src/http/pprof.rs b/ioxd_common/src/http/pprof.rs index c49992f596f..c15a62da04b 100644 --- a/ioxd_common/src/http/pprof.rs +++ b/ioxd_common/src/http/pprof.rs @@ -1,7 +1,7 @@ use observability_deps::tracing::info; use tokio::time::Duration; -pub async fn dump_rsprof(seconds: u64, frequency: i32) -> pprof::Result { +pub(crate) async fn dump_rsprof(seconds: u64, frequency: i32) -> pprof::Result { let guard = pprof::ProfilerGuard::new(frequency)?; info!( "start profiling {} seconds with frequency {} /s", diff --git a/ioxd_common/src/lib.rs b/ioxd_common/src/lib.rs index cbc82c63dc5..4326c322d7e 100644 --- a/ioxd_common/src/lib.rs +++ b/ioxd_common/src/lib.rs @@ -29,6 +29,7 @@ pub mod reexport { pub use tonic_health; pub use tonic_reflection; pub use tower_http; + pub use tower_trailer; pub use trace_http; } @@ -45,6 +46,9 @@ use trace_http::ctx::TraceHeaderParser; #[derive(Debug, Snafu)] pub enum Error { + #[snafu(display("Neither grpc nor http listeners are available"))] + MissingListener, + #[snafu(display("Unable to bind to listen for HTTP requests on {}: {}", addr, source))] StartListeningHttp { addr: SocketAddr, @@ -121,10 +125,14 @@ pub async fn http_listener(addr: SocketAddr) -> Result { pub async fn serve( common_state: CommonServerState, frontend_shutdown: CancellationToken, - grpc_listener: tokio::net::TcpListener, + grpc_listener: Option, http_listener: Option, server_type: Arc, ) -> Result<()> { + if grpc_listener.is_none() && http_listener.is_none() { + return Err(Error::MissingListener); + } + let trace_header_parser = TraceHeaderParser::new() .with_jaeger_trace_context_header_name( &common_state @@ -140,14 +148,26 @@ pub async fn serve( ); // Construct and start up gRPC server - let grpc_server = 
rpc::serve( - grpc_listener, - Arc::clone(&server_type), - trace_header_parser.clone(), - frontend_shutdown.clone(), - ) + let captured_server_type = Arc::clone(&server_type); + let captured_shutdown = frontend_shutdown.clone(); + let captured_trace_header_parser = trace_header_parser.clone(); + let grpc_server = async move { + if let Some(grpc_listener) = grpc_listener { + info!(?captured_server_type, "gRPC server listening"); + rpc::serve( + grpc_listener, + captured_server_type, + captured_trace_header_parser, + captured_shutdown, + ) + .await? + } else { + // don't resolve otherwise will cause server to shutdown + captured_shutdown.cancelled().await + } + Ok(()) + } .fuse(); - info!(?server_type, "gRPC server listening"); let captured_server_type = Arc::clone(&server_type); let captured_shutdown = frontend_shutdown.clone(); @@ -218,7 +238,7 @@ pub async fn serve( // // This is important to ensure background tasks, such as polling the tracker // registry, don't exit before HTTP and gRPC requests dependent on them - while !grpc_server.is_terminated() && !http_server.is_terminated() { + while !grpc_server.is_terminated() || !http_server.is_terminated() { futures::select! { _ = signal => info!(?server_type, "shutdown requested"), _ = server_handle => { diff --git a/ioxd_common/src/rpc.rs b/ioxd_common/src/rpc.rs index 8f25f326c45..1185e5b6363 100644 --- a/ioxd_common/src/rpc.rs +++ b/ioxd_common/src/rpc.rs @@ -34,6 +34,9 @@ pub struct RpcBuilder { #[macro_export] macro_rules! add_service { ($builder:ident, $svc:expr) => { + $crate::add_service!($builder, $svc, Serving) + }; + ($builder:ident, $svc:expr, $status:ident) => { let $builder = { // `inner` might be required to be `mut` or not depending if we're acting on: // - a `Server`, no service added yet, no `mut` required @@ -50,7 +53,7 @@ macro_rules! add_service { } = $builder; let service = $svc; - let status = $crate::reexport::tonic_health::ServingStatus::Serving; + let status = $crate::reexport::tonic_health::ServingStatus::$status; health_reporter .set_service_status(service_name(&service), status) .await; @@ -97,16 +100,19 @@ macro_rules! 
setup_builder { let builder = builder .layer($crate::reexport::trace_http::tower::TraceLayer::new( trace_header_parser, - $server_type.metric_registry(), + Arc::new($crate::reexport::trace_http::metrics::RequestMetrics::new( + $server_type.metric_registry(), + $crate::reexport::trace_http::metrics::MetricFamily::GrpcServer, + )), $server_type.trace_collector(), - true, $server_type.name(), )) .layer( $crate::reexport::tower_http::catch_panic::CatchPanicLayer::custom( $crate::rpc::handle_panic, ), - ); + ) + .layer($crate::reexport::tower_trailer::TrailerLayer::default()); let builder = RpcBuilder { inner: builder, diff --git a/ioxd_common/src/server_type.rs b/ioxd_common/src/server_type.rs index 767dacd9ea4..519b2c443f2 100644 --- a/ioxd_common/src/server_type.rs +++ b/ioxd_common/src/server_type.rs @@ -10,6 +10,7 @@ use tokio_util::sync::CancellationToken; use trace::TraceCollector; pub use common_state::{CommonServerState, CommonServerStateError}; +use trace_http::metrics::{MetricFamily, RequestMetrics}; use crate::{http::error::HttpApiErrorSource, rpc::RpcBuilderInput}; @@ -20,6 +21,9 @@ pub enum RpcError { source: tonic::transport::Error, details: String, }, + + #[snafu(display("gRPC endpoint is not implemented"))] + UnImplemented, } // Custom impl to include underlying source (not included in tonic @@ -47,6 +51,11 @@ pub trait ServerType: std::fmt::Debug + Send + Sync + 'static { /// Trace collector associated with the server, if any. fn trace_collector(&self) -> Option>; + /// Returns the `RequestMetrics` for instrumenting HTTP requests + fn http_request_metrics(&self) -> RequestMetrics { + RequestMetrics::new(self.metric_registry(), MetricFamily::HttpServer) + } + /// Route given HTTP request. /// /// Note that this is only called if none of the shared, common routes (e.g. `/health`) match. @@ -69,4 +78,9 @@ pub trait ServerType: std::fmt::Debug + Send + Sync + 'static { /// to shutdown the "frontend" (HTTP & RPC servers) when appropriate - this /// should happen before [`Self::join()`] returns. 
fn shutdown(&self, frontend: CancellationToken); + + /// Return `true` if the service is healthy + fn is_healthy(&self) -> bool { + true + } } diff --git a/ioxd_common/src/service.rs b/ioxd_common/src/service.rs index 6f1bf840357..b7513030853 100644 --- a/ioxd_common/src/service.rs +++ b/ioxd_common/src/service.rs @@ -8,7 +8,7 @@ use crate::server_type::ServerType; #[derive(Debug)] pub struct Service { pub http_bind_address: Option, - pub grpc_bind_address: SocketAddr, + pub grpc_bind_address: Option, pub server_type: Arc, } @@ -16,7 +16,7 @@ impl Service { pub fn create(server_type: Arc, run_config: &RunConfig) -> Self { Self { http_bind_address: Some(run_config.http_bind_address), - grpc_bind_address: run_config.grpc_bind_address, + grpc_bind_address: Some(run_config.grpc_bind_address), server_type, } } @@ -24,7 +24,15 @@ impl Service { pub fn create_grpc_only(server_type: Arc, run_config: &RunConfig) -> Self { Self { http_bind_address: None, - grpc_bind_address: run_config.grpc_bind_address, + grpc_bind_address: Some(run_config.grpc_bind_address), + server_type, + } + } + + pub fn create_http_only(server_type: Arc, run_config: &RunConfig) -> Self { + Self { + http_bind_address: Some(run_config.http_bind_address), + grpc_bind_address: None, server_type, } } diff --git a/ioxd_test/Cargo.toml b/ioxd_test/Cargo.toml index 7483189b2be..488efafd9e7 100644 --- a/ioxd_test/Cargo.toml +++ b/ioxd_test/Cargo.toml @@ -5,6 +5,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # Workspace dependencies, in alphabetical order ioxd_common = { path = "../ioxd_common" } @@ -15,6 +18,6 @@ trace = { path = "../trace" } async-trait = "0.1" clap = { version = "4", features = ["derive", "env"] } hyper = "0.14" -snafu = "0.7" -tokio-util = "0.7.9" +snafu = "0.8" +tokio-util = "0.7.10" workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/ioxd_test/src/lib.rs b/ioxd_test/src/lib.rs index fe652e5dc93..b32dcdabbf3 100644 --- a/ioxd_test/src/lib.rs +++ b/ioxd_test/src/lib.rs @@ -44,7 +44,7 @@ impl HttpApiErrorSource for ApplicationError { } } -#[derive(Debug, Clone, PartialEq, Eq, clap::ValueEnum)] +#[derive(Debug, Clone, PartialEq, Eq, clap::ValueEnum, Copy)] pub enum TestAction { None, EarlyReturnFromGrpcWorker, diff --git a/kube_test/Cargo.toml b/kube_test/Cargo.toml new file mode 100644 index 00000000000..f7da1fe14de --- /dev/null +++ b/kube_test/Cargo.toml @@ -0,0 +1,22 @@ +[package] +name = "kube_test" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] +http = "0.2.9" +hyper = "0.14.27" +kube-core = "0.87.1" +k8s-openapi = { version = "0.20.0", features = ["earliest"] } +rand = "0.8.5" +serde = "1.0.195" +serde_json = "1.0.111" +serde_yaml = "0.9.30" +tower = "0.4.13" +workspace-hack = { version = "0.1", path = "../workspace-hack" } + diff --git a/kube_test/src/call.rs b/kube_test/src/call.rs new file mode 100644 index 00000000000..6ed31a03bd8 --- /dev/null +++ b/kube_test/src/call.rs @@ -0,0 +1,70 @@ +use super::{request::Request, Handler, Result}; +use http::{HeaderMap, Response, StatusCode}; +use hyper::body::HttpBody; +use hyper::Body; +use std::future::Future; +use std::mem; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{ready, Context, Poll}; + +#[derive(Debug)] +pub struct Call { + handler: Option>, + request: Request, + header: HeaderMap, + body: Body, + buf: Vec, +} + +impl Call { + pub(crate) fn 
new( + handler: Option>, + request: Request, + header: HeaderMap, + body: Body, + ) -> Self { + Self { + handler, + request, + header, + body, + buf: vec![], + } + } +} + +impl Future for Call { + type Output = Result>; + + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let this = self.get_mut(); + match &this.handler { + None => { + let data = serde_json::to_vec(&super::status::resource_not_found( + &this.request.api_plural(), + )) + .unwrap(); + Poll::Ready( + Response::builder() + .status(StatusCode::NOT_FOUND) + .body(data.into()) + .map_err(super::Error::from), + ) + } + Some(handler) => { + while !&this.body.is_end_stream() { + match ready!(Pin::new(&mut this.body).poll_data(cx)).transpose()? { + Some(buf) => this.buf.extend_from_slice(buf.as_ref()), + None => break, + } + } + Poll::Ready(handler.handle( + mem::take(&mut this.request), + mem::take(&mut this.header), + mem::take(&mut this.buf), + )) + } + } + } +} diff --git a/kube_test/src/error.rs b/kube_test/src/error.rs new file mode 100644 index 00000000000..e3c4f9139ba --- /dev/null +++ b/kube_test/src/error.rs @@ -0,0 +1,57 @@ +use std::fmt::{Display, Formatter}; + +#[derive(Debug)] +pub enum Error { + Serialization(serde_json::Error), + Yaml(serde_yaml::Error), + Http(http::Error), + Hyper(hyper::Error), +} + +pub type Result = std::result::Result; + +impl Display for Error { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Serialization(e) => e.fmt(f), + Self::Yaml(e) => e.fmt(f), + Self::Http(e) => e.fmt(f), + Self::Hyper(e) => e.fmt(f), + } + } +} + +impl std::error::Error for Error { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + Self::Serialization(e) => Some(e), + Self::Yaml(e) => Some(e), + Self::Http(e) => Some(e), + Self::Hyper(e) => Some(e), + } + } +} + +impl From for Error { + fn from(value: serde_json::Error) -> Self { + Self::Serialization(value) + } +} + +impl From for Error { + fn from(value: serde_yaml::Error) -> Self { + Self::Yaml(value) + } +} + +impl From for Error { + fn from(value: http::Error) -> Self { + Self::Http(value) + } +} + +impl From for Error { + fn from(value: hyper::Error) -> Self { + Self::Hyper(value) + } +} diff --git a/kube_test/src/handler.rs b/kube_test/src/handler.rs new file mode 100644 index 00000000000..464797baa25 --- /dev/null +++ b/kube_test/src/handler.rs @@ -0,0 +1,25 @@ +use super::{request::Request, Result}; +use http::{HeaderMap, Response}; +use hyper::Body; +use kube_core::ApiResource; +use std::fmt::Debug; +use std::sync::Arc; + +pub trait Handler: Debug { + fn api_resource(&self) -> ApiResource; + + fn handle(&self, req: Request, header: HeaderMap, body: Vec) -> Result>; +} + +pub trait AsHandler { + fn as_handler(self: &Arc) -> Arc; +} + +impl AsHandler for T +where + T: Handler + Send + Sync + 'static, +{ + fn as_handler(self: &Arc) -> Arc { + Arc::clone(self) as Arc + } +} diff --git a/kube_test/src/lib.rs b/kube_test/src/lib.rs new file mode 100644 index 00000000000..3c72ce2c1f3 --- /dev/null +++ b/kube_test/src/lib.rs @@ -0,0 +1,31 @@ +//! Kube_test provides a fake kubernetes service that can be used to test a kubernetes controller. +//! The Service class provides a [tower::Service] that can be used with a kubernetes Client to +//! behave sufficiently like a kubernetes controller to simplify testing controller reconcile loops. 
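+//!
+//! # Example
+//!
+//! A minimal sketch of the intended usage, using only the APIs added in this crate. The
+//! resource type is illustrative; any `k8s-openapi` type that implements
+//! [`kube_core::Resource`] with the default dynamic type should work the same way.
+//!
+//! ```ignore
+//! use std::sync::Arc;
+//!
+//! use k8s_openapi::api::core::v1::ConfigMap;
+//! use kube_test::{AsHandler, ResourceHandler, Service};
+//!
+//! // A fake API server plus a handler that stores ConfigMap objects in memory.
+//! let service = Service::new();
+//! let config_maps = Arc::new(ResourceHandler::<ConfigMap>::new());
+//! service.add_handler(config_maps.as_handler());
+//!
+//! // Seed state directly through the handler, then read it back after the
+//! // controller under test has run against `service`.
+//! config_maps.set("default", "my-config", ConfigMap::default());
+//! assert!(config_maps.get("default", "my-config").is_some());
+//! ```
+//!
+//! The resulting `Service` is what the Kubernetes client under test uses as its HTTP transport.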
+#![deny(rustdoc::broken_intra_doc_links, rustdoc::bare_urls, rust_2018_idioms)] +#![warn( +missing_debug_implementations, +clippy::explicit_iter_loop, +clippy::use_self, +clippy::clone_on_ref_ptr, +// See https://github.com/influxdata/influxdb_iox/pull/1671 +clippy::future_not_send +)] +#![allow(unreachable_pub)] + +// Workaround for "unused crate" lint false positives. +use workspace_hack as _; + +mod call; +mod error; +mod handler; +mod object_map; +mod request; +mod resource_handler; +mod service; +mod status; + +pub use call::Call; +pub use error::{Error, Result}; +pub use handler::{AsHandler, Handler}; +pub use resource_handler::ResourceHandler; +pub use service::Service; diff --git a/kube_test/src/object_map.rs b/kube_test/src/object_map.rs new file mode 100644 index 00000000000..55807b8edb5 --- /dev/null +++ b/kube_test/src/object_map.rs @@ -0,0 +1,178 @@ +use super::status; +use kube_core::{ApiResource, DynamicObject, Status}; +use std::collections::{hash_map, HashMap}; +use std::mem; + +#[derive(Debug)] +pub struct ObjectMap { + api_resource: ApiResource, + objects: HashMap, +} + +#[derive(Debug, Clone, Hash, PartialEq, Eq)] +struct Key { + ns: Option, + name: String, +} + +impl ObjectMap { + pub fn new(api_resource: ApiResource) -> Self { + Self { + api_resource, + objects: HashMap::new(), + } + } + + pub fn entry(&mut self, ns: Option, name: String) -> Entry<'_> { + let key = Key { ns, name }; + let inner = self.objects.entry(key); + Entry { + api_resource: &self.api_resource, + inner, + } + } + + pub fn values(&self, ns: Option) -> Values<'_> { + Values { + ns, + inner: self.objects.values(), + } + } +} + +#[derive(Debug)] +pub struct Entry<'a> { + api_resource: &'a ApiResource, + inner: hash_map::Entry<'a, Key, DynamicObject>, +} + +impl<'a> Entry<'a> { + pub fn create(self, obj: DynamicObject) -> Result<&'a DynamicObject, Box> { + match self.inner { + hash_map::Entry::Occupied(entry) => Err(Box::new(status::already_exists( + self.api_resource, + Some(entry.key().name.as_str()), + ))), + hash_map::Entry::Vacant(entry) => { + let Key { ns, name } = entry.key().clone(); + let obj = entry.insert(obj); + obj.metadata.namespace = ns; + obj.metadata.name = Some(name); + if obj.metadata.uid.is_none() { + obj.metadata.uid = Some(format!("{}", rand::random::())); + } + Ok(obj) + } + } + } + + pub fn get(&mut self) -> Result<&DynamicObject, Box> { + match &self.inner { + hash_map::Entry::Occupied(entry) => Ok(entry.get()), + hash_map::Entry::Vacant(entry) => { + let name = entry.key().name.as_str(); + Err(Box::new(status::not_found(self.api_resource, Some(name)))) + } + } + } + + pub fn delete(self) -> Result> { + match self.inner { + hash_map::Entry::Occupied(entry) => { + let obj = entry.remove(); + Ok(obj) + } + hash_map::Entry::Vacant(entry) => { + let name = entry.key().name.as_str(); + Err(Box::new(status::not_found(self.api_resource, Some(name)))) + } + } + } + + pub fn update(self, mut obj: DynamicObject) -> Result<(bool, DynamicObject), Box> { + match self.inner { + hash_map::Entry::Occupied(mut entry) => { + let Key { ns, name } = entry.key().clone(); + obj.metadata.namespace = ns; + obj.metadata.name = Some(name); + let _ = entry.insert(obj.clone()); + Ok((false, obj)) + } + hash_map::Entry::Vacant(entry) => { + let Key { ns, name } = entry.key().clone(); + let obj = entry.insert(obj); + obj.metadata.namespace = ns; + obj.metadata.name = Some(name); + if obj.metadata.uid.is_none() { + obj.metadata.uid = Some(format!("{}", rand::random::())); + } + Ok((true, obj.clone())) + } 
+ } + } + + pub fn apply(self, patch: DynamicObject) -> Result> { + let Key { ns, name } = self.inner.key().clone(); + + let obj = self.inner.or_insert_with(|| { + let obj = DynamicObject::new(name.as_str(), self.api_resource); + if let Some(ns) = ns { + obj.within(ns.as_str()) + } else { + obj + } + }); + let _ = mem::replace(&mut obj.data, patch.data); + Ok(obj.clone()) + } + + pub fn update_subresource( + self, + subresource: String, + obj: DynamicObject, + ) -> Result<(bool, DynamicObject), Box> { + match self.inner { + hash_map::Entry::Occupied(mut entry) => { + if let Some(value) = obj.data.as_object().and_then(|v| v.get(&subresource)) { + if let Some(data) = entry.get_mut().data.as_object_mut() { + data.insert(subresource, value.clone()); + } + } + Ok((false, entry.get().clone())) + } + hash_map::Entry::Vacant(entry) => { + let Key { ns, name } = entry.key().clone(); + let obj = entry.insert(obj); + obj.metadata.namespace = ns; + obj.metadata.name = Some(name); + if obj.metadata.uid.is_none() { + obj.metadata.uid = Some(format!("{}", rand::random::())); + } + Ok((true, obj.clone())) + } + } + } +} + +pub struct Values<'a> { + ns: Option, + inner: hash_map::Values<'a, Key, DynamicObject>, +} + +impl<'a> Iterator for Values<'a> { + type Item = &'a DynamicObject; + fn next(&mut self) -> Option { + match &self.ns { + None => self.inner.next(), + Some(ns) => loop { + match self.inner.next() { + None => return None, + Some(v) => match &v.metadata.namespace { + Some(ns2) if ns2 == ns => return Some(v), + _ => continue, + }, + }; + }, + } + } +} diff --git a/kube_test/src/request.rs b/kube_test/src/request.rs new file mode 100644 index 00000000000..6a741ab6b07 --- /dev/null +++ b/kube_test/src/request.rs @@ -0,0 +1,115 @@ +use http::request::Parts; +use kube_core::ApiResource; +use std::fmt::{Display, Formatter}; + +#[derive(Debug, Default, Clone)] +pub struct Request { + pub verb: String, + pub group: String, + pub version: String, + pub plural: String, + pub ns: Option, + pub name: Option, + pub subresource: Option, +} + +impl Request { + pub(crate) fn parse(parts: &Parts) -> Self { + let verb = parts.method.as_str().to_lowercase(); + let (group, version, plural, ns, name, subresource) = match parts + .uri + .path() + .split('/') + .skip(1) + .collect::>() + .as_slice() + { + ["api", "v1", plural] => ("", "v1", *plural, "", "", ""), + ["api", "v1", plural, name] => ("", "v1", *plural, "", *name, ""), + ["api", "v1", "namespaces", ns, plural] => ("", "v1", *plural, *ns, "", ""), + ["api", "v1", "namespaces", ns, plural, name] => ("", "v1", *plural, *ns, *name, ""), + ["api", "v1", "namespaces", ns, plural, name, subresource] => { + ("", "v1", *plural, *ns, *name, *subresource) + } + ["api", "v1", plural, name, subresource] => { + ("", "v1", *plural, "", *name, *subresource) + } + ["apis", group, version, "namespaces", ns, plural] => { + (*group, *version, *plural, *ns, "", "") + } + ["apis", group, version, "namespaces", ns, plural, name] => { + (*group, *version, *plural, *ns, *name, "") + } + ["apis", group, version, "namespaces", ns, plural, name, subresource] => { + (*group, *version, *plural, *ns, *name, *subresource) + } + ["apis", group, version, plural] => (*group, *version, *plural, "", "", ""), + ["apis", group, version, plural, name] => (*group, *version, *plural, "", *name, ""), + ["apis", group, version, plural, name, subresource] => { + (*group, *version, *plural, "", *name, *subresource) + } + _ => ("", "", "", "", "", ""), + }; + + let verb = match (verb.as_str(), 
name.len()) { + ("get", 0) => String::from("list"), + ("delete", 0) => String::from("deletecollection"), + ("post", _) => String::from("create"), + ("put", _) => String::from("update"), + _ => verb, + }; + + Self { + verb, + group: String::from(group), + version: String::from(version), + plural: String::from(plural), + ns: if ns.is_empty() { + None + } else { + Some(String::from(ns)) + }, + name: if name.is_empty() { + None + } else { + Some(String::from(name)) + }, + subresource: if subresource.is_empty() { + None + } else { + Some(String::from(subresource)) + }, + } + } + + pub fn api_plural(&self) -> ApiPlural { + ApiPlural::new(self.group.clone(), self.plural.clone()) + } +} +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct ApiPlural { + group: String, + plural: String, +} + +impl ApiPlural { + pub fn new(group: String, plural: String) -> Self { + Self { group, plural } + } +} + +impl Display for ApiPlural { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + if self.group.is_empty() { + self.plural.fmt(f) + } else { + write!(f, "{}/{}", self.group, self.plural) + } + } +} + +impl From for ApiPlural { + fn from(value: ApiResource) -> Self { + Self::new(value.group, value.plural) + } +} diff --git a/kube_test/src/resource_handler.rs b/kube_test/src/resource_handler.rs new file mode 100644 index 00000000000..9b76ad8aa6e --- /dev/null +++ b/kube_test/src/resource_handler.rs @@ -0,0 +1,267 @@ +use super::{object_map::ObjectMap, request::Request, status, Handler, Result}; +use http::{HeaderMap, HeaderValue, Response, StatusCode}; +use hyper::Body; +use kube_core::{ApiResource, DynamicObject, ObjectList, ObjectMeta, Resource}; +use serde::de::DeserializeOwned; +use serde::Serialize; +use std::fmt::Debug; +use std::marker::PhantomData; +use std::sync::atomic::{AtomicI16, Ordering}; +use std::sync::{Arc, Mutex}; + +#[derive(Debug)] +pub struct ResourceHandler { + api_resource: ApiResource, + objects: Arc>, + gen_id: AtomicI16, + phantom: PhantomData, +} + +impl ResourceHandler +where + R: Resource + DeserializeOwned + Serialize, +{ + /// Create a new handler for a kubernetes resource type. + pub fn new() -> Self { + let api_resource = ApiResource::erase::(&()); + Self { + api_resource: api_resource.clone(), + objects: Arc::new(Mutex::new(ObjectMap::new(api_resource))), + gen_id: AtomicI16::new(0), + phantom: Default::default(), + } + } + + /// Retrieve a stored kubernetes resource, if available. + pub fn get(&self, ns: impl Into, name: impl Into) -> Option { + let ns = ns.into(); + let ns = if ns.is_empty() { None } else { Some(ns) }; + let name = name.into(); + match Arc::clone(&self.objects) + .lock() + .unwrap() + .entry(ns, name) + .get() + { + Ok(obj) => obj.clone().try_parse::().ok(), + _ => None, + } + } + + /// Store, or overwrite, the resource with the given name. + pub fn set(&self, ns: impl Into, name: impl Into, resource: R) -> R { + let ns = ns.into(); + let ns = if ns.is_empty() { None } else { Some(ns) }; + let name = name.into(); + let obj = serde_json::from_value::(serde_json::to_value(resource).unwrap()) + .unwrap(); + let (_, obj) = Arc::clone(&self.objects) + .lock() + .unwrap() + .entry(ns, name) + .update(obj) + .unwrap(); + obj.try_parse::().unwrap() + } + + /// Retrieve all the stored resources. if the resource is namespaced and ns is not None then + /// only resources in that namespace will be returned. 
+ pub fn all(&self, ns: impl Into) -> Vec { + let ns = ns.into(); + let ns = if ns.is_empty() { None } else { Some(ns) }; + Arc::clone(&self.objects) + .lock() + .unwrap() + .values(ns) + .cloned() + .filter_map(|v| v.try_parse::().ok()) + .collect::>() + } +} + +impl Default for ResourceHandler +where + R: Resource + DeserializeOwned + Serialize, +{ + fn default() -> Self { + Self::new() + } +} + +impl ResourceHandler { + fn maybe_generate_name(&self, meta: &mut ObjectMeta) { + if meta.name.is_none() { + if let Some(prefix) = &meta.generate_name { + meta.name = Some(format!( + "{prefix}{:05}", + self.gen_id.fetch_add(1, Ordering::SeqCst) + )); + } + } + } + + fn create(&self, body: Vec) -> Result> { + let mut obj = serde_json::from_reader::<&[u8], DynamicObject>(body.as_ref())?; + self.maybe_generate_name(&mut obj.metadata); + let ns = obj.metadata.namespace.clone(); + let name = obj.metadata.name.clone().unwrap(); + match Arc::clone(&self.objects) + .lock() + .unwrap() + .entry(ns, name) + .create(obj) + { + Ok(obj) => response(StatusCode::CREATED, obj), + Err(status) => response(StatusCode::from_u16(status.code).unwrap(), &status), + } + } + + fn retrieve(&self, ns: Option, name: String) -> Result> { + match Arc::clone(&self.objects) + .lock() + .unwrap() + .entry(ns, name) + .get() + { + Ok(obj) => response(StatusCode::OK, obj), + Err(status) => response(StatusCode::from_u16(status.code).unwrap(), &status), + } + } + + fn list(&self, ns: Option) -> Result> { + let list = ObjectList { + metadata: Default::default(), + items: Arc::clone(&self.objects) + .lock() + .unwrap() + .values(ns) + .cloned() + .collect(), + }; + response(StatusCode::OK, &list) + } + + fn update(&self, ns: Option, name: String, body: Vec) -> Result> { + let obj = serde_json::from_reader::<&[u8], DynamicObject>(body.as_ref())?; + match Arc::clone(&self.objects) + .lock() + .unwrap() + .entry(ns, name) + .update(obj) + { + Ok((true, obj)) => response(StatusCode::CREATED, &obj), + Ok((false, obj)) => response(StatusCode::OK, &obj), + Err(status) => response(StatusCode::from_u16(status.code).unwrap(), &status), + } + } + + fn update_subresource( + &self, + ns: Option, + name: String, + subresource: String, + body: Vec, + ) -> Result> { + let obj = serde_json::from_reader::<&[u8], DynamicObject>(body.as_ref())?; + match Arc::clone(&self.objects) + .lock() + .unwrap() + .entry(ns, name) + .update_subresource(subresource, obj) + { + Ok((true, obj)) => response(StatusCode::CREATED, &obj), + Ok((false, obj)) => response(StatusCode::OK, &obj), + Err(status) => response(StatusCode::from_u16(status.code).unwrap(), &status), + } + } + + fn delete(&self, ns: Option, name: String) -> Result> { + match Arc::clone(&self.objects) + .lock() + .unwrap() + .entry(ns, name) + .delete() + { + Ok(obj) => response(StatusCode::OK, &obj), + Err(status) => response(StatusCode::from_u16(status.code).unwrap(), &status), + } + } + + fn patch( + &self, + ns: Option, + name: String, + header: HeaderMap, + body: Vec, + ) -> Result> { + let content_type = match header.get("Content-Type") { + Some(v) => v.to_str().unwrap(), + None => "", + }; + match content_type { + "application/apply-patch+yaml" => { + let obj = serde_yaml::from_reader::<&[u8], DynamicObject>(body.as_ref())?; + match Arc::clone(&self.objects) + .lock() + .unwrap() + .entry(ns, name) + .apply(obj) + { + Ok(obj) => response(StatusCode::OK, &obj), + Err(status) => response(StatusCode::from_u16(status.code).unwrap(), &status), + } + } + ct => { + let status = 
status::invalid(&format!("unsupported patch type \"{ct}\"")); + response(StatusCode::from_u16(status.code).unwrap(), &status) + } + } + } +} + +fn response(status: StatusCode, data: &T) -> Result> { + let buf = serde_json::to_vec(data)?; + Ok(Response::builder().status(status).body(buf.into())?) +} + +impl Handler for ResourceHandler +where + R: Debug, +{ + fn api_resource(&self) -> ApiResource { + self.api_resource.clone() + } + + fn handle( + &self, + req: Request, + header: HeaderMap, + body: Vec, + ) -> Result> { + let Request { + verb, + ns, + name, + subresource, + .. + } = req; + match verb.as_str() { + "create" => self.create(body), + "delete" => self.delete(ns, name.unwrap()), + "get" => self.retrieve(ns, name.unwrap()), + "list" => self.list(ns), + "patch" => self.patch(ns, name.unwrap(), header, body), + "update" => { + if let Some(subresource) = subresource { + self.update_subresource(ns, name.unwrap(), subresource, body) + } else { + self.update(ns, name.unwrap(), body) + } + } + v => { + let api_resource = self.api_resource(); + super::status::method_not_allowed(&api_resource, name, v) + } + } + } +} diff --git a/kube_test/src/service.rs b/kube_test/src/service.rs new file mode 100644 index 00000000000..ffc4ef8641a --- /dev/null +++ b/kube_test/src/service.rs @@ -0,0 +1,54 @@ +use super::{request::ApiPlural, Call, Handler, Result}; +use http::{Request, Response}; +use hyper::Body; +use std::collections::HashMap; +use std::ops::DerefMut; +use std::sync::{Arc, Mutex}; +use std::task::{Context, Poll}; + +/// Service provides a [tower::Service] that acts like a kubernetes API server. +#[derive(Debug)] +pub struct Service { + handlers: Arc>>>, +} + +impl Service { + pub fn new() -> Self { + let handlers = Arc::new(Mutex::new(HashMap::new())); + Self { handlers } + } + + pub fn add_handler(&self, handler: Arc) { + let key = handler.api_resource().into(); + self.handlers + .lock() + .unwrap() + .deref_mut() + .insert(key, handler); + } +} + +impl Default for Service { + fn default() -> Self { + Self::new() + } +} + +impl tower::Service> for Service { + type Response = Response; + type Error = super::Error; + type Future = Call; + + fn poll_ready(&mut self, _cx: &mut Context<'_>) -> Poll> { + Poll::Ready(Ok(())) + } + + fn call(&mut self, req: Request) -> Self::Future { + let (parts, body) = req.into_parts(); + let req = super::request::Request::parse(&parts); + match self.handlers.lock().unwrap().get(&req.api_plural()) { + Some(handler) => Call::new(Some(Arc::clone(handler)), req, parts.headers, body), + None => Call::new(None, req, parts.headers, body), + } + } +} diff --git a/kube_test/src/status.rs b/kube_test/src/status.rs new file mode 100644 index 00000000000..cca92b63a78 --- /dev/null +++ b/kube_test/src/status.rs @@ -0,0 +1,61 @@ +use super::{request::ApiPlural, Result}; +use http::{Response, StatusCode}; +use hyper::Body; +use kube_core::{ApiResource, Status}; + +/// Generate an "Invalid" kubernetes status response. +pub(crate) fn invalid(message: &str) -> Status { + Status::failure(message, "Invalid").with_code(422) +} + +/// Generate an "AlreadyExists" kubernetes status response. +pub(crate) fn already_exists(api_resource: &ApiResource, name: Option<&str>) -> Status { + let resource_id = resource_id(&api_resource.group, &api_resource.kind, name); + Status::failure( + format!("{resource_id} already exists",).as_str(), + "AlreadyExists", + ) + .with_code(StatusCode::CONFLICT.as_u16()) +} + +/// Generate a "NotFound" kubernetes status response for a resource. 
+pub(crate) fn resource_not_found(api_plural: &ApiPlural) -> Status { + Status::failure(&format!("resource {api_plural} not found"), "NotFound") + .with_code(StatusCode::NOT_FOUND.as_u16()) +} + +/// Generate a "NotFound" kubernetes status response. +pub(crate) fn not_found(api_resource: &ApiResource, name: Option<&str>) -> Status { + let resource_id = resource_id(&api_resource.group, &api_resource.kind, name); + Status::failure(&format!("{resource_id} not found"), "NotFound") + .with_code(StatusCode::NOT_FOUND.as_u16()) +} + +/// Generate a "MethodNotAllowed" kubernetes status response. +pub(crate) fn method_not_allowed( + api_resource: &ApiResource, + name: Option, + method: &str, +) -> Result> { + let resource_id = resource_id(&api_resource.group, &api_resource.kind, name.as_deref()); + let status = Status::failure( + format!("method {method} not allowed for {resource_id}").as_str(), + "MethodNotAllowed", + ) + .with_code(StatusCode::METHOD_NOT_ALLOWED.as_u16()); + response(&status) +} + +fn response(status: &Status) -> Result> { + let buf = serde_json::to_vec(status)?; + Ok(Response::builder().status(status.code).body(buf.into())?) +} + +fn resource_id(group: &str, kind: &str, name: Option<&str>) -> String { + match (name, group.is_empty()) { + (None, true) => format!("resource {kind}"), + (None, false) => format!("resource {group}.{kind}"), + (Some(name), true) => format!("{kind} {name}"), + (Some(name), false) => format!("{group}.{kind} {name}"), + } +} diff --git a/logfmt/Cargo.toml b/logfmt/Cargo.toml index e7eceb04764..c194cba7f84 100644 --- a/logfmt/Cargo.toml +++ b/logfmt/Cargo.toml @@ -6,13 +6,16 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # In alphabetical order observability_deps = { path = "../observability_deps" } tracing-subscriber = "0.3" workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] # In alphabetical order -once_cell = { version = "1.18", features = ["parking_lot"] } +once_cell = { version = "1.19", features = ["parking_lot"] } parking_lot = "0.12" regex = "1" tracing-subscriber = { version = "0.3", features = ["env-filter"] } diff --git a/metric/Cargo.toml b/metric/Cargo.toml index d7ced7e0e67..b177d090e5a 100644 --- a/metric/Cargo.toml +++ b/metric/Cargo.toml @@ -5,6 +5,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # In alphabetical order parking_lot = "0.12" workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/metric/src/counter.rs b/metric/src/counter.rs index 6e31585f4e8..b8c1cd41442 100644 --- a/metric/src/counter.rs +++ b/metric/src/counter.rs @@ -2,7 +2,10 @@ use crate::{MetricKind, MetricObserver, Observation}; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::Arc; -/// A monotonic counter +/// A monotonic counter. +/// +/// A [`U64Counter`]` is an internally reference counted type, and all mutations +/// to cloned instances mutate the same underlying counter. 
#[derive(Debug, Clone, Default)] pub struct U64Counter { state: Arc, diff --git a/metric/src/duration.rs b/metric/src/duration.rs index 7a0728e1e37..6fd9750ffbc 100644 --- a/metric/src/duration.rs +++ b/metric/src/duration.rs @@ -126,6 +126,14 @@ impl DurationHistogram { count, ) } + + pub fn reset(&self) { + self.inner.reset(); + } + + pub fn percentile(&self, percentile: u64) -> Duration { + Duration::from_nanos(self.inner.percentile(percentile)) + } } /// `DurationHistogramOptions` allows configuring the buckets used by `DurationHistogram` diff --git a/metric/src/histogram.rs b/metric/src/histogram.rs index 4416c849780..099ef66b494 100644 --- a/metric/src/histogram.rs +++ b/metric/src/histogram.rs @@ -65,6 +65,41 @@ impl U64Histogram { state.total = state.total.wrapping_add(value * count); } } + + pub fn reset(&self) { + let mut state = self.shared.lock(); + for bucket in &mut state.buckets { + bucket.count = 0; + } + state.total = 0; + } + + /// percentile returns the bucket threshold for the given percentile. + /// For example, if you want the median value, percentile(50) will return the 'le' threshold + /// for the histogram bucket that contains the median sample. + /// + /// A use case for for this function is: + /// Use a histogram tracks the load placed on a system. + /// Set the buckets so they represent load levels of idle/low/medium/high/overloaded. + /// Then use percentile to determine how much of the time is spent at various load levels. + /// e.g. if percentile(50) comes come back with the low load threshold, the median load on the system is low + pub fn percentile(&self, percentile: u64) -> u64 { + let state = self.shared.lock(); + + // we need the total quantity of samples, not the sum of samples. + let total: u64 = state.buckets.iter().map(|bucket| bucket.count).sum(); + + let target = total * percentile / 100; + + let mut sum = 0; + for bucket in &state.buckets { + sum += bucket.count; + if sum >= target { + return bucket.le; + } + } + 0 + } } impl MakeMetricObserver for U64Histogram { @@ -162,5 +197,61 @@ mod tests { histogram.record(0); assert_eq!(histogram.observe(), buckets(&[2, 1, 1], 80)); + + // Now test the percentile reporting function + let options = U64HistogramOptions::new(vec![0, 1, 2, 4, 8, 16, 32, u64::MAX]); + let histogram = U64Histogram::create(&options); + + histogram.record(0); // bucket 0, le 0 + histogram.record(2); // bucket 2, le 2 + histogram.record(3); // bucket 3, le 4 + histogram.record(3); // bucket 3, le 4 + histogram.record(20); // bucket 6, le 32 + histogram.record(20000); // bucket 7, le u64::MAX + histogram.record(20000); // bucket 7, le u64::MAX + histogram.record(20000); // bucket 7, le u64::MAX + histogram.record(20000); // bucket 7, le u64::MAX + histogram.record(20000); // bucket 7, le u64::MAX + + // Of the 10 samples above: + // 1 (10%) is in bucket 0, le 0 + // 1 (10%) is in bucket 2, le 2 + // 2 (20%) are in bucket 3, le 4 + // 1 (10%) is in bucket 6, le 32 + // 5 (50%) are in bucket 7, le u64::MAX + + // request percentiles falling in bucket 0, le 0 + assert_eq!(histogram.percentile(3), 0); + assert_eq!(histogram.percentile(10), 0); + assert_eq!(histogram.percentile(19), 0); + + // request percentiles falling in bucket 2, le 2 + assert_eq!(histogram.percentile(20), 2); + assert_eq!(histogram.percentile(29), 2); + + // requests percentiles falling in bucket 3, le 4 + assert_eq!(histogram.percentile(30), 4); + assert_eq!(histogram.percentile(49), 4); + + // requests percentiles falling in bucket 6, le 32 + 
assert_eq!(histogram.percentile(50), 32); + assert_eq!(histogram.percentile(59), 32); + + // requests percentiles falling in bucket 6, le 32 + assert_eq!(histogram.percentile(60), u64::MAX); + assert_eq!(histogram.percentile(80), u64::MAX); + assert_eq!(histogram.percentile(100), u64::MAX); + + // test reset + histogram.reset(); + assert_eq!(histogram.percentile(100), 0); + histogram.record(1); // bucket 1, le 1 + histogram.record(2); // bucket 2, le 2 + histogram.record(3); // bucket 3, le 4 + histogram.record(3); // bucket 3, le 4 + assert_eq!(histogram.percentile(0), 0); + assert_eq!(histogram.percentile(25), 1); + assert_eq!(histogram.percentile(49), 1); + assert_eq!(histogram.percentile(50), 2); } } diff --git a/metric/src/lib.rs b/metric/src/lib.rs index ccd0ddceb33..23b085eef8c 100644 --- a/metric/src/lib.rs +++ b/metric/src/lib.rs @@ -280,8 +280,7 @@ pub trait Instrument: std::fmt::Debug + Send + Sync { /// - call finish_metric once complete fn report(&self, reporter: &mut dyn Reporter); - /// Returns the type as [`Any`] so that it can be downcast to - /// it underlying type + /// Returns the type as [`Any`] so that it can be downcast to its underlying type fn as_any(&self) -> &dyn Any; } diff --git a/metric/src/metric.rs b/metric/src/metric.rs index cd0582c7b48..04b0a20e7db 100644 --- a/metric/src/metric.rs +++ b/metric/src/metric.rs @@ -265,6 +265,7 @@ pub struct ResultMetric { pub ok: T, pub client_error: T, pub server_error: T, + pub unexpected_response: T, } impl ResultMetric @@ -279,12 +280,16 @@ where let client_error = metric.recorder(attributes.clone()); attributes.insert("status", "server_error"); - let server_error = metric.recorder(attributes); + let server_error = metric.recorder(attributes.clone()); + + attributes.insert("status", "unexpected_response"); + let unexpected_response = metric.recorder(attributes); Self { ok, client_error, server_error, + unexpected_response, } } } diff --git a/metric_exporters/Cargo.toml b/metric_exporters/Cargo.toml index e1edd6fe9a4..dc70a674b8b 100644 --- a/metric_exporters/Cargo.toml +++ b/metric_exporters/Cargo.toml @@ -5,6 +5,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # In alphabetical order observability_deps = { path = "../observability_deps" } metric = { path = "../metric" } diff --git a/mutable_batch/Cargo.toml b/mutable_batch/Cargo.toml index 251c83f85db..21bbf5277dd 100644 --- a/mutable_batch/Cargo.toml +++ b/mutable_batch/Cargo.toml @@ -6,24 +6,24 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] -arrow = { workspace = true, features = ["prettyprint"] } +arrow = { workspace = true } arrow_util = { path = "../arrow_util" } -chrono = { version = "0.4", default-features = false } data_types = { path = "../data_types" } +hashbrown = { workspace = true } iox_time = { path = "../iox_time" } +itertools = "0.12" schema = { path = "../schema" } -snafu = "0.7" -hashbrown = { workspace = true } -itertools = "0.11" +snafu = "0.8" workspace-hack = { version = "0.1", path = "../workspace-hack" } -percent-encoding = "2.2.0" -thiserror = "1.0.48" -unicode-segmentation = "1.10.1" [dev-dependencies] assert_matches = "1.5.0" mutable_batch_lp = { path = "../mutable_batch_lp" } -paste = "1.0.14" -proptest = { version = "1.2.0", default-features = false } +partition = { path = "../partition" } +pretty_assertions = "1.4.0" +proptest = { version = "1.4.0", default-features = false } rand = "0.8" 
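Note on the `percentile()` helper introduced above: the load-level use case sketched in its doc comment is easiest to see in a short, self-contained example. The following is illustrative only and not part of the patch; it assumes `U64Histogram`, `U64HistogramOptions`, and the `MakeMetricObserver` trait are importable from the `metric` crate root, and it constructs the histogram the same way the patch's own tests do (via `U64Histogram::create`). The bucket thresholds and sample values are made up for the example.

use metric::{MakeMetricObserver, U64Histogram, U64HistogramOptions};

fn load_level_sketch() {
    // Hypothetical `le` thresholds modelling load levels:
    // idle (le 0), low (le 10), medium (le 50), high (le 90), overloaded (le u64::MAX).
    let options = U64HistogramOptions::new(vec![0, 10, 50, 90, u64::MAX]);
    let histogram = U64Histogram::create(&options);

    // Record one load sample per observation interval.
    for sample in [0u64, 5, 7, 12, 45, 60, 95, 100] {
        histogram.record(sample);
    }

    // percentile(50) returns the `le` threshold of the bucket containing the
    // median sample; with the samples above that is the "medium" bucket.
    assert_eq!(histogram.percentile(50), 50);
}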
diff --git a/mutable_batch/src/column.rs b/mutable_batch/src/column.rs index 665ee9e7a84..17d43053a26 100644 --- a/mutable_batch/src/column.rs +++ b/mutable_batch/src/column.rs @@ -13,7 +13,7 @@ use arrow_util::{bitset::BitSet, string::PackedStringArray}; use data_types::{StatValues, Statistics}; use schema::{InfluxColumnType, InfluxFieldType, TIME_DATA_TYPE}; use snafu::{ResultExt, Snafu}; -use std::{fmt::Formatter, mem, sync::Arc}; +use std::{fmt::Formatter, iter, mem, num::NonZeroU64, sync::Arc}; /// A "dictionary ID" (DID) is a compact numeric representation of an interned /// string in the dictionary. The same string always maps the same DID. @@ -25,11 +25,38 @@ use std::{fmt::Formatter, mem, sync::Arc}; pub(crate) type DID = i32; /// An invalid DID used for NULL rows -pub(crate) const INVALID_DID: DID = -1; +pub(crate) const NULL_DID: DID = -1; /// The type of the dictionary used type Dictionary = arrow_util::dictionary::StringDictionary; +/// A type-agnostic way of splitting the various [`ColumnData`] arrays. +/// +/// This macro is required because it's not possible to write a generic function +/// that operates on all "data" types across [`ColumnData`] variants.` +macro_rules! split_off_column { + ($self:expr, $data:expr, $n:expr, $stats:expr, $right_nulls:expr, $($ty:tt)+) => {{ + // Compute the new number of nulls in the left side of the split. + let left_nulls = $stats.null_count.map(|v| v - $right_nulls); + + // Update the stats for the left side of the split. + *$stats = StatValues::new(None, None, $self.valid.len() as u64, left_nulls); + + // Generate the right side of the split (with minimal stats). + let right_data = $data.split_off($n); + let right_len = right_data.len(); + $($ty)+( + right_data, + StatValues::new( + None, + None, + right_len as _, + Some($right_nulls), + ), + ) + }}; +} + #[derive(Debug, Snafu)] #[allow(missing_copy_implementations, missing_docs)] pub enum Error { @@ -63,11 +90,28 @@ pub struct Column { #[derive(Debug, Clone)] #[allow(missing_docs)] pub enum ColumnData { - F64(Vec, StatValues), + /// These types contain arrays that contain an element for every logical row + /// (including nulls). + /// + /// Null values are padded with an arbitrary dummy value. + F64(Vec, StatValues), // NaN is ignored when computing statistics. I64(Vec, StatValues), U64(Vec, StatValues), - String(PackedStringArray, StatValues), Bool(BitSet, StatValues), + + /// The String encoding contains an entry for every logical row, and + /// explicitly stores an empty string in the PackedStringArray for NULL + /// values. + String(PackedStringArray, StatValues), + + /// Whereas the dictionary encoding does not store an explicit empty string + /// in the internal PackedStringArray, nor does it create an entry in the + /// dedupe map. A NULL entry is padded into the data vec using the + /// [`NULL_DID`] value. + /// + /// Every distinct, non-null value is stored in the dictionary exactly once, + /// and the data arrays contains the dictionary ID for every logical row + /// (including nulls as described above). Tag(Vec, Dictionary, StatValues), } @@ -97,6 +141,9 @@ impl Column { // Keep track of how many total rows there are let total_count = row_count as u64; + // If there are no values, there are no distinct values. 
+ let distinct_count = if row_count > 0 { Some(1) } else { None }; + let data = match column_type { InfluxColumnType::Field(InfluxFieldType::Boolean) => { let mut data = BitSet::new(); @@ -119,12 +166,12 @@ impl Column { } InfluxColumnType::Field(InfluxFieldType::String) => ColumnData::String( PackedStringArray::new_empty(row_count), - StatValues::new_all_null(total_count, Some(1)), + StatValues::new_all_null(total_count, distinct_count), ), InfluxColumnType::Tag => ColumnData::Tag( - vec![INVALID_DID; row_count], + vec![NULL_DID; row_count], Default::default(), - StatValues::new_all_null(total_count, Some(1)), + StatValues::new_all_null(total_count, distinct_count), ), }; @@ -182,7 +229,7 @@ impl Column { stats.update_for_nulls(delta as u64); } ColumnData::Tag(data, _dict, stats) => { - data.resize(len, INVALID_DID); + data.resize(len, NULL_DID); stats.update_for_nulls(delta as u64); } } @@ -323,4 +370,819 @@ impl Column { Ok(data) } + + /// Split this [`Column`] at the specified row boundary, such that after + /// this call, `self` contains the range of rows indexed from `[0, n)` and + /// the returned value contains `[n, len)`. + /// + /// # Statistics + /// + /// For performance reasons, this operation leaves `self` and the returned + /// [`Column`] with reduced summary statistics available. + /// + /// This allows the caller to selectively reconstruct the statistics that + /// will be useful to the caller, instead of always paying the price of + /// recomputing statistics, even if unused. + /// + /// For the following column types: + /// + /// - [`ColumnData::F64`] + /// - [`ColumnData::I64`] + /// - [`ColumnData::U64`] + /// - [`ColumnData::Bool`] + /// - [`ColumnData::String`] + /// + /// The statistics for both [`Column`] contain only: + /// + /// - Total count + /// - NULL count (see below) + /// + /// The NULL count is always present in the returned [`Column`], and only + /// present in `self` if it had a NULL count statistic prior to the split. + /// + /// For [`ColumnData::Tag`] all the statistics above are included, with the + /// addition of the distinct count. + /// + /// # Performance + /// + /// This call is `O(n)` where `n` is the number of elements in the right + /// side of the split (the `[n, len)` interval) due to the need to copy + /// and process these elements only. + /// + /// The size of the left-side interval (the [0, n) interval) does not affect + /// performance of this call. + pub fn split_off(&mut self, n: usize) -> Self { + if n > self.len() { + return Self::new(0, self.influx_type); + } + + // Split the null mask into [0, n) and [n, len). + let right_bitmap = self.valid.split_off(n); + + // Compute the null count for the right side. + let right_nulls = right_bitmap.count_zeros() as u64; + + // Split the actual data and update/compute the statistics. 
+ let right_data = match &mut self.data { + ColumnData::F64(data, left_stats) => { + split_off_column!(self, data, n, left_stats, right_nulls, ColumnData::F64) + } + ColumnData::I64(data, left_stats) => { + split_off_column!(self, data, n, left_stats, right_nulls, ColumnData::I64) + } + ColumnData::U64(data, left_stats) => { + split_off_column!(self, data, n, left_stats, right_nulls, ColumnData::U64) + } + ColumnData::String(data, left_stats) => { + split_off_column!(self, data, n, left_stats, right_nulls, ColumnData::String) + } + ColumnData::Bool(data, left_stats) => { + split_off_column!(self, data, n, left_stats, right_nulls, ColumnData::Bool) + } + ColumnData::Tag(data, dict, left_stats) => { + // Split the tag data at the value index. + let mut new_data = data.split_off(n); + + // "new_data" now contains values [n, len), and likely no longer + // references all the values in the current dictionary. + // + // Generate a dictionary for "new_data" that contains only the + // values that appear in "new_data", and rewrite the dictionary + // IDs in "new_data" to reflect this new mapping. + let new_dict = rebuild_dictionary(dict, &mut new_data); + + // The original "dict" may now contain references to keys that + // appear only in "new_data", and never in the "data" that + // remains. + // + // Rewrite this dictionary, to shrink it to contain only entries + // that appear in "data". + // + // Note: this may not be required if Arrow can tolerate a + // dictionary with more keys than necessary, but it optimises + // for memory utilisation. + *dict = rebuild_dictionary(dict, data); + + // Compute how many NULLs are left in the left side. + let left_nulls = left_stats.null_count.map(|v| v - right_nulls); + + // It's effectively free to compute the distinct count of a + // column using dictionary encoding - it's simply the length of + // the dictionary, and plus one if a NULL exists - maintain + // distinct counts in the returned statistics. + let make_distinct_count = |dict: &Dictionary, has_null| { + let mut count = dict.values().len(); + if has_null { + count += 1; + } + NonZeroU64::try_from(count as u64).ok() + }; + + let left_distinct = make_distinct_count(dict, left_nulls.unwrap_or_default() > 0); + let right_distinct = make_distinct_count(&new_dict, right_nulls > 0); + + // Update the stats for the left side of the split. + *left_stats = StatValues::new_with_distinct( + None, + None, + self.valid.len() as _, + left_nulls, + left_distinct, + ); + + // Generate the right side of the split. + let new_len = new_data.len(); + ColumnData::Tag( + new_data, + new_dict, + StatValues::new_with_distinct( + None, + None, + new_len as _, + Some(right_nulls), + right_distinct, + ), + ) + } + }; + + Self { + influx_type: self.influx_type, + valid: right_bitmap, + data: right_data, + } + } +} + +/// Constructs a new, minimal dictionary for `data`, rewriting the dictionary +/// IDs in `data` to use the new returned dictionary. +fn rebuild_dictionary(original: &Dictionary, data: &mut [DID]) -> Dictionary { + let mut dict = Dictionary::new(); + + for id in data.iter_mut() { + if *id == NULL_DID { + continue; + } + let value = original + .lookup_id(*id) + .expect("original dictionary does not contain value"); + *id = dict.lookup_value_or_insert(value); + } + + dict +} + +/// Recompute the min/max values for the given [`Column`]. 
+/// +/// This is an `O(n)` operation for: +/// +/// - [`ColumnData::F64`] +/// - [`ColumnData::I64`] +/// - [`ColumnData::U64`] +/// - [`ColumnData::Bool`] +/// - [`ColumnData::String`] +/// +/// This is an `O(distinct(n))` operation for [`ColumnData::Tag`]. +pub fn recompute_min_max(c: &mut Column) { + match &mut c.data { + // A specialised implementation for floats is required to filter out NaN + // values in order to match the behaviour of `StatValues::update()`. + ColumnData::F64(data, stats) => { + data.iter() + .zip(c.valid.iter()) + .filter_map(|(v, valid)| { + if !valid || v.is_nan() { + // NaN are completely ignored in stats. + return None; + } + Some(*v) + }) + .for_each(|v| { + stats.min = Some(stats.min.unwrap_or(v).min(v)); + stats.max = Some(stats.max.unwrap_or(v).max(v)); + }); + } + + // A specialised implementation for boolean values for significantly + // improved performance. + ColumnData::Bool(data, stats) => { + // Process 8 values at a time by evaluating against the underlying + // bytes directly in both the validity and value bitsets. + // + // Invariant: the excess bits beyond "bitset.len()" are always 0. + let iter = c.valid.bytes().iter().zip(data.bytes().iter()); + + let mut contains_false = false; + let mut contains_true = false; + + for (valid, data) in iter { + // Set bits only if they're non-null and 1. + contains_true |= valid & data > 0; + + // Set bits only if they're non-null and 0. + contains_false |= valid & !data > 0; + + // Short circuit if both have been observed. + if contains_false && contains_true { + break; + } + } + + // If all values are NULL, no real values were observed, and the + // stats should be cleared (as the stats ignore NULLs). + if !contains_false && !contains_true { + stats.min = None; + stats.max = None; + return; + } + + stats.min = Some(!contains_false); + stats.max = Some(contains_true); + } + + // The rest of the data types use `recompute_min_max_for()`. + ColumnData::I64(data, stats) => { + if let Some((min, max)) = recompute_min_max_for(data.iter(), c.valid.iter()) { + stats.min = Some(*min); + stats.max = Some(*max); + } + } + ColumnData::U64(data, stats) => { + if let Some((min, max)) = recompute_min_max_for(data.iter(), c.valid.iter()) { + stats.min = Some(*min); + stats.max = Some(*max); + } + } + + // Optimised to avoid cloning the string for every change in min/max + // value, instead this clones the strings at most once for each of + // min/max. + // + // This applies to both the String and Tag data types. + ColumnData::String(data, stats) => { + if let Some((min, max)) = recompute_min_max_for(data.iter(), c.valid.iter()) { + stats.min = Some(min.to_string()); + stats.max = Some(max.to_string()); + } + } + ColumnData::Tag(_, dict, stats) => { + // The dictionary does not store a representation of NULL, so all + // the values in the dictionary are candidates for min/max. + if let Some((min, max)) = + recompute_min_max_for(dict.values().iter(), iter::repeat(true)) + { + stats.min = Some(min.to_string()); + stats.max = Some(max.to_string()); + } + } + } +} + +/// Compute the min/max values of `data`, filtering out any values with +/// corresponding positions in `valid` that are `false`. 
+fn recompute_min_max_for<'a, T>( + data: impl IntoIterator, + valid: impl IntoIterator, +) -> Option<(&'a T, &'a T)> +where + T: Ord + ?Sized, +{ + let (min, max) = data + .into_iter() + .zip(valid.into_iter()) + .filter_map(|(v, valid)| if valid { Some(v) } else { None }) + .fold((None, None), |acc, v| { + ( + Some(acc.0.unwrap_or(v).min(v)), + Some(acc.1.unwrap_or(v).max(v)), + ) + }); + + min.zip(max) +} + +#[cfg(test)] +mod tests { + use std::{borrow::Borrow, collections::HashSet, fmt::Debug, mem::discriminant}; + + use arrow::record_batch::RecordBatch; + use arrow_util::assert_batches_eq; + use assert_matches::assert_matches; + use data_types::IsNan; + use proptest::prelude::*; + + use super::*; + + fn hydrate(dict: &Dictionary, data: &[DID]) -> Vec { + data.iter() + .map(|&id| dict.lookup_id(id).unwrap().to_string()) + .collect::>() + } + + /// Take an iterator of nullable `T`, and convert it into a vector of + /// non-optional values and a null mask compatible with [`ColumnData`]. + /// + /// Returns the number of nulls in `data`. + fn densify(data: impl IntoIterator>) -> (Vec, BitSet, usize) + where + U: ToOwned, + T: Default, + { + let mut out = Vec::new(); + let mut bitmap = BitSet::new(); + let mut nulls = 0; + for v in data.into_iter() { + match v { + Some(v) => { + bitmap.append_set(1); + out.push(v.to_owned()); + } + None => { + out.push(Default::default()); + bitmap.append_unset(1); + nulls += 1; + } + } + } + + (out, bitmap, nulls) + } + + #[test] + #[allow(clippy::bool_assert_comparison)] + fn test_densify() { + let input = [None, Some(42), None, None, Some(24)]; + + let (got, nulls, count) = densify(input); + assert_eq!(got, [0, 42, 0, 0, 24]); // NULLS are populated with 0 (not sparse representation) + assert_eq!(nulls.get(0), false); + assert_eq!(nulls.get(1), true); + assert_eq!(nulls.get(2), false); + assert_eq!(nulls.get(3), false); + assert_eq!(nulls.get(4), true); + assert_eq!(nulls.len(), 5); + assert_eq!(count, 3); + } + + #[test] + fn test_rewrite_dictionary() { + let mut original = Dictionary::new(); + let mut data = vec![]; + + // Input strings to be dictionary encoded. 
+ let input = [ + "bananas", "platanos", "bananas", "platanos", "ananas", "ananas", "ananas", + ]; + + for v in input { + data.push(original.lookup_value_or_insert(v)); + } + + assert_eq!(data.len(), input.len()); + assert_eq!(original.values().len(), 3); // 3 distinct values + + let mut new_data = data.split_off(3); + let new_dict = rebuild_dictionary(&original, &mut new_data); + let old_dict = rebuild_dictionary(&original, &mut data); + + let new_data_hydrated = hydrate(&new_dict, &new_data); + let old_data_hydrated = hydrate(&old_dict, &data); + + assert_eq!( + new_data_hydrated, + ["platanos", "ananas", "ananas", "ananas"] + ); + assert_eq!(old_data_hydrated, ["bananas", "platanos", "bananas"]); + + assert_eq!(new_dict.values().len(), 2); // 2 distinct values + assert_eq!(old_dict.values().len(), 2); // 2 distinct values + } + + #[test] + fn test_split_off() { + let (data, valid, _) = densify([Some(42), None, None, Some(24)]); + valid.to_arrow(); + + let mut col = Column { + influx_type: InfluxColumnType::Field(InfluxFieldType::UInteger), + valid, + data: ColumnData::U64(data, StatValues::new(None, None, 4, Some(2))), + }; + + let mut schema = schema::SchemaBuilder::new(); + schema.influx_column("bananas", col.influx_type()); + let schema = schema.build().unwrap(); + + // Before the split + let batch = RecordBatch::try_new( + schema.clone().into(), + vec![col.to_arrow().expect("failed to covert column to arrow")], + ) + .expect("failed to build record batch"); + assert_batches_eq!( + [ + "+---------+", + "| bananas |", + "+---------+", + "| 42 |", + "| |", + "| |", + "| 24 |", + "+---------+", + ], + &[batch] + ); + + let col2 = col.split_off(2); + + // After the split, the input column + let batch = RecordBatch::try_new( + schema.clone().into(), + vec![col.to_arrow().expect("failed to covert column to arrow")], + ) + .expect("failed to build record batch"); + assert_batches_eq!( + [ + "+---------+", + "| bananas |", + "+---------+", + "| 42 |", + "| |", + "+---------+", + ], + &[batch] + ); + + // After the split, the split off column + let batch = RecordBatch::try_new( + schema.into(), + vec![col2.to_arrow().expect("failed to covert column to arrow")], + ) + .expect("failed to build record batch"); + assert_batches_eq!( + [ + "+---------+", + "| bananas |", + "+---------+", + "| |", + "| 24 |", + "+---------+", + ], + &[batch] + ); + } + + const MAX_ROWS: usize = 20; + + /// Returns a vector of `Option`. + fn sparse_array(s: impl Strategy) -> impl Strategy>> + where + T: Debug, + { + prop::collection::vec(prop::option::of(s), 0..MAX_ROWS) + } + + /// Produces a valid [`Column`]` of an arbitrary data type and data. + /// + /// The embedded statistics do not contain min/max values but otherwise + /// model a column within a [`MutableBatch`] produced by a [`Writer`]. 
+ /// + /// [`MutableBatch`]: crate::MutableBatch + /// [`Writer`]: crate::writer::Writer + fn arbitrary_column() -> impl Strategy { + prop_oneof![ + sparse_array(any::()).prop_map(|v| { + let (data, valid, null_count) = densify(v.clone()); + Column { + influx_type: InfluxColumnType::Field(InfluxFieldType::Float), + valid, + data: ColumnData::F64( + data, + StatValues::new(None, None, v.len() as _, Some(null_count as _)), + ), + } + }), + sparse_array(any::()).prop_map(|v| { + let (data, valid, null_count) = densify(v.clone()); + Column { + influx_type: InfluxColumnType::Field(InfluxFieldType::Integer), + valid, + data: ColumnData::I64( + data, + StatValues::new(None, None, v.len() as _, Some(null_count as _)), + ), + } + }), + sparse_array(any::()).prop_map(|v| { + let (data, valid, null_count) = densify(v.clone()); + Column { + influx_type: InfluxColumnType::Field(InfluxFieldType::UInteger), + valid, + data: ColumnData::U64( + data, + StatValues::new(None, None, v.len() as _, Some(null_count as _)), + ), + } + }), + sparse_array(any::()).prop_map(|v| { + let (strings, valid, null_count) = densify(v.clone()); + let mut data = PackedStringArray::new(); + for s in strings { + data.append(&s); + } + Column { + influx_type: InfluxColumnType::Field(InfluxFieldType::String), + valid, + data: ColumnData::String( + data, + StatValues::new(None, None, v.len() as _, Some(null_count as _)), + ), + } + }), + sparse_array(any::()).prop_map(|v| { + let (values, valid, null_count) = densify(v.clone()); + let mut data = BitSet::new(); + for v in values { + match v { + true => data.append_set(1), + false => data.append_unset(1), + } + } + Column { + influx_type: InfluxColumnType::Field(InfluxFieldType::Boolean), + valid, + data: ColumnData::Bool( + data, + StatValues::new(None, None, v.len() as _, Some(null_count as _)), + ), + } + }), + // This artificially weights string generation to produce arrays + // with a higher chance of covering both dense and sparse arrays + // where distinct values != array length. + prop_oneof![ + sparse_array( + prop::string::string_regex("[a-b]").expect("invalid repetition regex") + ), + sparse_array(any::()), + ] + .prop_map(|v| { + // The NULL encoding of the dictionary is a bit of a snowflake. + // + // Walk the NULL-able input, and for any NULLs insert NULL_DID + // into the data array without inserting into the dictionary. + let mut data = Vec::new(); + let mut dict = Dictionary::new(); + let mut valid = BitSet::new(); + + let mut nulls = 0; + for v in &v { + match v { + Some(v) => { + valid.append_set(1); + data.push(dict.lookup_value_or_insert(v)); + } + None => { + data.push(NULL_DID); + valid.append_unset(1); + nulls += 1; + } + } + } + + // A NULL is a distinct value, that does not appear in the + // dictionary. + let distinct_count = if nulls > 0 { + dict.values().len() + 1 + } else { + dict.values().len() + }; + + Column { + influx_type: InfluxColumnType::Tag, + valid, + data: ColumnData::Tag( + data, + dict, + StatValues::new_with_distinct( + None, + None, + v.len() as _, + Some(nulls), + NonZeroU64::try_from(distinct_count as u64).ok(), + ), + ), + } + }), + ] + } + // Set the number of test cases higher than the default (256) to ensure better + // coverage of the generated arbitrary columns without compromising too + // much on the input space. + proptest! { + #![proptest_config(ProptestConfig::with_cases(2048))] + + /// Asserts the correctness of the [`Column::split_off()`] method, using + /// the Arrow "Array" slice method as a test oracle. 
+ /// + /// Asserts the following invariants after splitting: + /// + /// - Never panics due to out-of-bounds split position + /// - Data types remain unchanged + /// - Metadata for influx data model unchanged + /// - NULL mask is of the correct length + /// - Data length matches count statistics + /// - NULL value count matches NULL count statistics + /// - Tag distinct values matches distinct count statistics + /// - Tag dictionary contains correct number of entries, with NULLs + /// - Total count statistics are equal to input statistics + /// - NULL count statistics are equal to input statistics + /// - Both sides of the split match equivalent Arrow oracle splits + /// + #[test] + fn prop_split_off( + input in arbitrary_column(), + split_at in 0..=MAX_ROWS, + ) { + // Split the column. + let mut col = input.clone(); + let col2 = col.split_off(split_at); + + // Assert no rows were lost. + assert_eq!(col.len() + col2.len(), input.len()); + + // Because "split_at" may be greater than the number of rows in the + // input column, compute how many rows should remain after the + // split. + let want_remaining_rows = input.len().min(split_at); + assert_eq!(col.len(), want_remaining_rows); + + // And validate the rest of the rows wound up in the col2 half. + assert_eq!(col2.len(), input.len() - want_remaining_rows); + + for c in [&col, &col2] { + // The data type should remain the same. + assert_eq!(c.influx_type(), input.influx_type()); + assert_eq!(discriminant(c.data()), discriminant(input.data())); + + // Inspect the statistics for each. + let data_len = match c.data() { + ColumnData::F64(data, _) => data.len(), + ColumnData::I64(data, _) => data.len(), + ColumnData::U64(data, _) => data.len(), + ColumnData::String(data, _) => data.len(), + ColumnData::Bool(data, _) => data.len(), + ColumnData::Tag(data, dict, stats) => { + // Tags have an additional distinct count statistics + // maintained throughout the split. + let want = stats.distinct_count.map(|v| v.get()).unwrap_or_default(); + let have = data.iter().collect::>().len() as u64; + assert_eq!(have, want); + + // If there are no nulls, the dictionary length must + // match the number of distinct values. If there are + // NULLs, +1 to the dictionary length (it does not + // contain NULLs). + if stats.null_count.unwrap_or_default() == 0 { + assert_eq!(have, dict.values().len() as u64); + } else { + // Otherwise there must be one more distinct value. + assert_eq!(have, dict.values().len() as u64 + 1); + } + + data.len() + }, + }; + + // First check the consistency of the total count: + assert_eq!(c.valid_mask().len(), data_len); + assert_eq!(data_len as u64, c.stats().total_count()); + + // Null counts: + let nulls = c.valid_mask().count_zeros() as u64; + assert_eq!(c.stats().null_count(), Some(nulls)); + } + + // The sum of various statistics must match the input counts. + let count = col.stats().total_count() + col2.stats().total_count(); + assert_eq!(input.stats().total_count(), count); + + // Null counts must sum to the input count + let nulls = col.stats().null_count().unwrap_or_default() + + col2.stats().null_count().unwrap_or_default(); + assert_eq!(input.stats().null_count().unwrap_or_default(), nulls); + + // Generate arrow arrays from both inputs + let col = col.to_arrow().unwrap(); + let col2 = col2.to_arrow().unwrap(); + + // And the test oracle + let input = input.to_arrow().unwrap(); + + // Slice the input data using arrow's slice methods. 
+ let want = input.slice(0, split_at.min(input.len())); + + // And assert the split_off() data is equal. + assert!(col.eq(&want)); + + // Only attempt to slice off and validate the right side if it would + // be non-empty (or arrow panics) + if split_at >= input.len() { + assert_eq!(col2.len(), 0); + } else { + let want2 = input.slice(split_at, input.len() - split_at); + assert!(col2.eq(&want2)); + } + } + + /// Exercise [`recompute_min_max()`] against a [`Column`], asserting the + /// resulting [`StatValues`] match that produced by using the [`Writer`] + /// to populate the [`Column`]. + #[test] + fn prop_recompute_min_max( + mut input in arbitrary_column(), + ) { + // Compute a `StatValues` using the test oracle implementation. + fn stats_oracle(data: S, valid: impl IntoIterator) -> StatValues + where + S: IntoIterator, + T: Borrow, + U: ToOwned + PartialOrd + IsNan, + { + data.into_iter() + .zip(valid.into_iter()) + .filter_map(|(v, valid)| if valid { Some(v) } else { None }) + .fold(StatValues::default(), |mut acc, v| { + acc.update(&v); + acc + }) + } + + match input.clone().data() { + ColumnData::F64(data,_) => { + let want = stats_oracle(data, input.valid.iter()); + + recompute_min_max(&mut input); + let got = assert_matches!(input.stats(), Statistics::F64(v) => v); + + assert_eq!(want.min.cloned(), got.min); + assert_eq!(want.max.cloned(), got.max); + assert!(got.min <= got.max); + }, + ColumnData::I64(data, _) => { + let want = stats_oracle(data, input.valid.iter()); + + recompute_min_max(&mut input); + let got = assert_matches!(input.stats(), Statistics::I64(v) => v); + + assert_eq!(want.min.cloned(), got.min); + assert_eq!(want.max.cloned(), got.max); + assert!(got.min <= got.max); + }, + ColumnData::U64(data, _) => { + let want = stats_oracle(data, input.valid.iter()); + + recompute_min_max(&mut input); + let got = assert_matches!(input.stats(), Statistics::U64(v) => v); + + assert_eq!(want.min.cloned(), got.min); + assert_eq!(want.max.cloned(), got.max); + assert!(got.min <= got.max); + }, + ColumnData::Bool(data, _) => { + let want = stats_oracle(data.iter(), input.valid.iter()); + + recompute_min_max(&mut input); + let got = assert_matches!(input.stats(), Statistics::Bool(v) => v); + + assert_eq!(want.min, got.min); + assert_eq!(want.max, got.max); + assert!(got.min <= got.max); + }, + ColumnData::String(data, _) => { + let want = stats_oracle(data.iter().map(ToString::to_string), input.valid.iter()); + + recompute_min_max(&mut input); + let got = assert_matches!(input.stats(), Statistics::String(v) => v); + + assert_eq!(want.min, got.min); + assert_eq!(want.max, got.max); + assert!(got.min <= got.max); + }, + ColumnData::Tag(_data, dict, _) => { + let want = stats_oracle( + dict.values().iter().map(ToString::to_string), + iter::repeat(true) + ); + + recompute_min_max(&mut input); + let got = assert_matches!(input.stats(), Statistics::String(v) => v); + + assert_eq!(want.min, got.min); + assert_eq!(want.max, got.max); + assert!(got.min <= got.max); + }, + } + } + } } diff --git a/mutable_batch/src/lib.rs b/mutable_batch/src/lib.rs index 681bb5a06b2..62244d6944a 100644 --- a/mutable_batch/src/lib.rs +++ b/mutable_batch/src/lib.rs @@ -20,6 +20,12 @@ //! permitting fast conversion to [`RecordBatch`]. // Workaround for "unused crate" lint false positives. 
+#[cfg(test)] +use partition as _; +#[cfg(test)] +use pretty_assertions as _; +#[cfg(test)] +use rand as _; use workspace_hack as _; use crate::column::{Column, ColumnData}; @@ -156,16 +162,13 @@ impl MutableBatch { /// Returns a summary of the write timestamps in this chunk if a /// time column exists pub fn timestamp_summary(&self) -> Option { - let time = self.column_names.get(TIME_COLUMN_NAME)?; + let col_data = self.time_column().ok()?; let mut summary = TimestampSummary::default(); - match &self.columns[*time].data { - ColumnData::I64(col_data, _) => { - for t in col_data { - summary.record_nanos(*t) - } - } - _ => unreachable!(), + + for t in col_data { + summary.record_nanos(*t) } + Some(summary) } @@ -205,6 +208,27 @@ impl MutableBatch { Ok(&self.columns[*idx]) } + /// Returns a reference to the column at the specified index + pub fn column_by_index(&self, idx: usize) -> Result<&Column> { + self.columns.get(idx).with_context(|| ColumnNotFoundSnafu { + column: format!("index {}", idx), + }) + } + + /// Return the values in the time column in this batch. Returns an error if the batch has no + /// time column. + /// + /// # Panics + /// + /// If a time column exists but its data isn't of type `i64`, this function will panic. + fn time_column(&self) -> Result<&[i64]> { + let time_column = self.column(TIME_COLUMN_NAME)?; + match &time_column.data { + ColumnData::I64(col_data, _) => Ok(col_data), + x => unreachable!("expected i64 got {} for time column", x), + } + } + /// Return the approximate memory size of the batch, in bytes. /// /// This includes `Self`. @@ -222,6 +246,31 @@ impl MutableBatch { pub fn size_data(&self) -> usize { self.columns.iter().map(|c| c.size_data()).sum::() } + + /// Split this [`MutableBatch`] at the specified row boundary, such that + /// after this call, `self` contains the range of rows indexed from `[0, n)` + /// and the returned value contains `[n, len)`. + /// + /// # Panics + /// + /// Panics if `n > self.rows()`. + /// + /// # Performance + /// + /// This implementation is heavily optimised towards splitting `self` at a + /// `n` value skewed towards the high end of the row count - see [`Column`]. + pub fn split_off(&mut self, n: usize) -> Self { + assert!(n <= self.row_count); + + let right_row_count = self.row_count - n; + self.row_count = n; + + Self { + column_names: self.column_names.clone(), + columns: self.columns.iter_mut().map(|v| v.split_off(n)).collect(), + row_count: right_row_count, + } + } } /// A description of the distribution of timestamps in a @@ -262,7 +311,9 @@ impl TimestampSummary { #[cfg(test)] mod tests { + use arrow_util::assert_batches_eq; use mutable_batch_lp::lines_to_batches; + use schema::Projection; #[test] fn size_data_without_nulls() { @@ -298,4 +349,168 @@ mod tests { assert_eq!(batch.size_data(), 124); assert_eq!(batch.columns().len(), 5); } + + /// Assert the correct row index is split off using + /// [`MutableBatch::split_off()`]. + /// + /// Correctness of the [`Column`] splitting is handled by tests against the + /// [`Column`] itself. 
+ #[test] + fn test_split_off() { + let mut batches = lines_to_batches( + "\ + cpu,t1=hello,t2=world f1=1.1 1234\n\ + cpu,t2=w f1=2.2,f2=2i 1234\n\ + ", + 0, + ) + .unwrap(); + let mut batch = batches.remove("cpu").unwrap(); + assert_eq!(batch.rows(), 2); + assert_eq!(batch.column_names().len(), 5); + + let got = batch.split_off(1); + + assert_batches_eq!( + &[ + "+-----+----+-------+-------+--------------------------------+", + "| f1 | f2 | t1 | t2 | time |", + "+-----+----+-------+-------+--------------------------------+", + "| 1.1 | | hello | world | 1970-01-01T00:00:00.000001234Z |", + "+-----+----+-------+-------+--------------------------------+", + ], + &[batch.to_arrow(Projection::All).unwrap()] + ); + assert_batches_eq!( + &[ + "+-----+----+----+----+--------------------------------+", + "| f1 | f2 | t1 | t2 | time |", + "+-----+----+----+----+--------------------------------+", + "| 2.2 | 2 | | w | 1970-01-01T00:00:00.000001234Z |", + "+-----+----+----+----+--------------------------------+", + ], + &[got.to_arrow(Projection::All).unwrap()] + ); + + assert_eq!(batch.rows(), 1); + assert_eq!(got.rows(), 1); + + // Actual Column instances + assert_eq!(got.columns().len(), batch.columns().len()); + + // Column name map + assert_eq!(got.column_names().len(), 5); + assert_eq!(got.column_names(), batch.column_names()); + assert_eq!(got.column_names().len(), got.columns().len()); + + // Schema + assert_eq!( + got.schema(Projection::All).unwrap(), + batch.schema(Projection::All).unwrap() + ); + assert_eq!( + got.schema(Projection::All).unwrap().len(), + got.columns().len() + ); + } + + #[test] + fn test_split_off_n_0() { + let mut batches = lines_to_batches( + "\ + cpu,t1=hello,t2=world f1=1.1 1234\n\ + cpu,t2=w f1=2.2,f2=2i 1234\n\ + ", + 0, + ) + .unwrap(); + let mut batch = batches.remove("cpu").unwrap(); + assert_eq!(batch.rows(), 2); + assert_eq!(batch.column_names().len(), 5); + + let got = batch.split_off(0); + + assert_batches_eq!( + &[ + "+-----+----+-------+-------+--------------------------------+", + "| f1 | f2 | t1 | t2 | time |", + "+-----+----+-------+-------+--------------------------------+", + "| 1.1 | | hello | world | 1970-01-01T00:00:00.000001234Z |", + "| 2.2 | 2 | | w | 1970-01-01T00:00:00.000001234Z |", + "+-----+----+-------+-------+--------------------------------+", + ], + &[got.to_arrow(Projection::All).unwrap()] + ); + + assert_eq!(batch.rows(), 0); + assert_eq!(got.rows(), 2); + + // Actual Column instances + assert_eq!(got.columns().len(), batch.columns().len()); + + // Column name map + assert_eq!(got.column_names().len(), 5); + assert_eq!(got.column_names(), batch.column_names()); + assert_eq!(got.column_names().len(), got.columns().len()); + + // Schema + assert_eq!( + got.schema(Projection::All).unwrap(), + batch.schema(Projection::All).unwrap() + ); + assert_eq!( + got.schema(Projection::All).unwrap().len(), + got.columns().len() + ); + } + + #[test] + fn test_split_off_none() { + let mut batches = lines_to_batches( + "\ + cpu,t1=hello,t2=world f1=1.1 1234\n\ + cpu,t2=w f1=2.2,f2=2i 1234\n\ + ", + 0, + ) + .unwrap(); + let mut batch = batches.remove("cpu").unwrap(); + assert_eq!(batch.rows(), 2); + assert_eq!(batch.column_names().len(), 5); + + let got = batch.split_off(2); + + assert_batches_eq!( + &[ + "+-----+----+-------+-------+--------------------------------+", + "| f1 | f2 | t1 | t2 | time |", + "+-----+----+-------+-------+--------------------------------+", + "| 1.1 | | hello | world | 1970-01-01T00:00:00.000001234Z |", + "| 2.2 | 2 | | w | 
1970-01-01T00:00:00.000001234Z |", + "+-----+----+-------+-------+--------------------------------+", + ], + &[batch.to_arrow(Projection::All).unwrap()] + ); + + assert_eq!(batch.rows(), 2); + assert_eq!(got.rows(), 0); + + // Actual Column instances + assert_eq!(got.columns().len(), batch.columns().len()); + + // Column name map + assert_eq!(got.column_names().len(), 5); + assert_eq!(got.column_names(), batch.column_names()); + assert_eq!(got.column_names().len(), got.columns().len()); + + // Schema + assert_eq!( + got.schema(Projection::All).unwrap(), + batch.schema(Projection::All).unwrap() + ); + assert_eq!( + got.schema(Projection::All).unwrap().len(), + got.columns().len() + ); + } } diff --git a/mutable_batch/src/payload.rs b/mutable_batch/src/payload.rs index 7cb85de5a63..0fb64037639 100644 --- a/mutable_batch/src/payload.rs +++ b/mutable_batch/src/payload.rs @@ -1,15 +1,6 @@ //! Write payload abstractions derived from [`MutableBatch`] -use crate::{column::ColumnData, MutableBatch, Result}; -use data_types::{partition_template::TablePartitionTemplateOverride, PartitionKey}; -use hashbrown::HashMap; -use schema::TIME_COLUMN_NAME; -use std::{num::NonZeroUsize, ops::Range}; - -pub use self::partition::PartitionKeyError; - -mod filter; -mod partition; +use crate::{MutableBatch, Result}; /// A payload that can be written to a mutable batch pub trait WritePayload { @@ -22,142 +13,3 @@ impl WritePayload for MutableBatch { batch.extend_from(self) } } - -/// A [`MutableBatch`] with a non-zero set of row ranges to write -#[derive(Debug)] -pub struct PartitionWrite<'a> { - batch: &'a MutableBatch, - ranges: Vec>, - min_timestamp: i64, - max_timestamp: i64, - row_count: NonZeroUsize, -} - -impl<'a> PartitionWrite<'a> { - /// Create a new [`PartitionWrite`] with the entire range of the provided batch - /// - /// # Panic - /// - /// Panics if the batch has no rows - pub fn new(batch: &'a MutableBatch) -> Self { - let row_count = NonZeroUsize::new(batch.row_count).unwrap(); - let time = get_time_column(batch); - let (min_timestamp, max_timestamp) = min_max_time(time); - - // This `allow` can be removed when this issue is fixed and released: - // - #[allow(clippy::single_range_in_vec_init)] - Self { - batch, - ranges: vec![0..batch.row_count], - min_timestamp, - max_timestamp, - row_count, - } - } - - /// Returns the minimum timestamp in the write - pub fn min_timestamp(&self) -> i64 { - self.min_timestamp - } - - /// Returns the maximum timestamp in the write - pub fn max_timestamp(&self) -> i64 { - self.max_timestamp - } - - /// Returns the number of rows in the write - pub fn rows(&self) -> NonZeroUsize { - self.row_count - } - - /// Returns a [`PartitionWrite`] containing just the rows of `Self` that pass - /// the provided time predicate, or None if no rows - pub fn filter(&self, predicate: impl Fn(i64) -> bool) -> Option> { - let mut min_timestamp = i64::MAX; - let mut max_timestamp = i64::MIN; - let mut row_count = 0_usize; - - // Construct a predicate that lets us inspect the timestamps as they are filtered - let inspect = |t| match predicate(t) { - true => { - min_timestamp = min_timestamp.min(t); - max_timestamp = max_timestamp.max(t); - row_count += 1; - true - } - false => false, - }; - - let ranges: Vec<_> = filter::filter_time(self.batch, &self.ranges, inspect); - let row_count = NonZeroUsize::new(row_count)?; - - Some(PartitionWrite { - batch: self.batch, - ranges, - min_timestamp, - max_timestamp, - row_count, - }) - } - - /// Create a collection of [`PartitionWrite`] indexed by 
partition key - /// from a [`MutableBatch`] and [`TablePartitionTemplateOverride`] - pub fn partition( - batch: &'a MutableBatch, - partition_template: &TablePartitionTemplateOverride, - ) -> Result, PartitionKeyError> { - use hashbrown::hash_map::Entry; - let time = get_time_column(batch); - - let mut partition_ranges = HashMap::new(); - for (partition, range) in partition::partition_batch(batch, partition_template) { - let row_count = NonZeroUsize::new(range.end - range.start).unwrap(); - let (min_timestamp, max_timestamp) = min_max_time(&time[range.clone()]); - - match partition_ranges.entry(PartitionKey::from(partition?)) { - Entry::Vacant(v) => { - v.insert(PartitionWrite { - batch, - ranges: vec![range], - min_timestamp, - max_timestamp, - row_count, - }); - } - Entry::Occupied(mut o) => { - let pw = o.get_mut(); - pw.min_timestamp = pw.min_timestamp.min(min_timestamp); - pw.max_timestamp = pw.max_timestamp.max(max_timestamp); - pw.row_count = NonZeroUsize::new(pw.row_count.get() + row_count.get()).unwrap(); - pw.ranges.push(range); - } - } - } - Ok(partition_ranges) - } -} - -impl<'a> WritePayload for PartitionWrite<'a> { - fn write_to_batch(&self, batch: &mut MutableBatch) -> Result<()> { - batch.extend_from_ranges(self.batch, &self.ranges) - } -} - -fn get_time_column(batch: &MutableBatch) -> &[i64] { - let time_column = batch.column(TIME_COLUMN_NAME).expect("time column"); - match &time_column.data { - ColumnData::I64(col_data, _) => col_data, - x => unreachable!("expected i64 got {} for time column", x), - } -} - -fn min_max_time(col: &[i64]) -> (i64, i64) { - let mut min_timestamp = i64::MAX; - let mut max_timestamp = i64::MIN; - for t in col { - min_timestamp = min_timestamp.min(*t); - max_timestamp = max_timestamp.max(*t); - } - (min_timestamp, max_timestamp) -} diff --git a/mutable_batch/src/writer.rs b/mutable_batch/src/writer.rs index 3a1e2bcf117..8158077628f 100644 --- a/mutable_batch/src/writer.rs +++ b/mutable_batch/src/writer.rs @@ -1,7 +1,7 @@ //! 
A panic-safe write abstraction for [`MutableBatch`] use crate::{ - column::{Column, ColumnData, INVALID_DID}, + column::{Column, ColumnData, NULL_DID}, MutableBatch, }; use arrow_util::bitset::{iter_set_positions, iter_set_positions_with_offset, BitSet}; @@ -325,7 +325,7 @@ impl<'a> Writer<'a> { let mut stats = StatValues::new_empty(); match &mut col.data { ColumnData::Tag(col_data, dict, _) => { - col_data.resize(initial_rows + to_insert, INVALID_DID); + col_data.resize(initial_rows + to_insert, NULL_DID); for idx in set_position_iterator(valid_mask, to_insert) { let value = values.next().ok_or(Error::InsufficientValues)?; @@ -375,7 +375,7 @@ impl<'a> Writer<'a> { // Lazily compute mappings to handle dictionaries with unused mappings let mut mapping: Vec<_> = values.map(|value| (value, None)).collect(); - col_data.resize(initial_rows + to_insert, INVALID_DID); + col_data.resize(initial_rows + to_insert, NULL_DID); for idx in set_position_iterator(valid_mask, to_insert) { let key = keys.next().ok_or(Error::InsufficientValues)?; @@ -483,7 +483,7 @@ impl<'a> Writer<'a> { .collect(); dst_data.extend(src_data.iter().map(|src_id| match *src_id { - INVALID_DID => INVALID_DID, + NULL_DID => NULL_DID, _ => mapping[*src_id as usize], })); @@ -567,9 +567,9 @@ impl<'a> Writer<'a> { for range in ranges { dst_data.extend(src_data[range.clone()].iter().map( |src_id| match *src_id { - INVALID_DID => { + NULL_DID => { stats.update_for_nulls(1); - INVALID_DID + NULL_DID } _ => { let maybe_did = &mut mapping[*src_id as usize]; diff --git a/mutable_batch/tests/writer.rs b/mutable_batch/tests/writer.rs index 87da6b55575..96e1aa0575e 100644 --- a/mutable_batch/tests/writer.rs +++ b/mutable_batch/tests/writer.rs @@ -2,7 +2,7 @@ use arrow_util::assert_batches_eq; use data_types::{StatValues, Statistics}; use mutable_batch::{writer::Writer, MutableBatch, TimestampSummary}; use schema::Projection; -use std::num::NonZeroU64; +use std::{f64::NAN, num::NonZeroU64}; fn get_stats(batch: &MutableBatch) -> Vec<(&str, Statistics)> { let mut stats: Vec<_> = batch @@ -343,3 +343,110 @@ fn test_basic() { let timestamps = batch.timestamp_summary().unwrap(); assert_eq!(timestamps, expected_timestamps); } + +#[test] +fn test_null_only() { + let mut batch = MutableBatch::new(); + + let mut writer = Writer::new(&mut batch, 1); + + writer + .write_bool("b1", Some(&[0b00000000]), vec![].into_iter()) + .unwrap(); + + writer + .write_f64("f64", Some(&[0b00000000]), vec![].into_iter()) + .unwrap(); + + writer + .write_i64("i64", Some(&[0b00000000]), vec![].into_iter()) + .unwrap(); + + writer + .write_u64("u64", Some(&[0b00000000]), vec![].into_iter()) + .unwrap(); + + writer + .write_string("string", Some(&[0b00000000]), vec![].into_iter()) + .unwrap(); + + writer.write_time("time", vec![42].into_iter()).unwrap(); + + writer + .write_tag("tag1", Some(&[0b00000000]), vec![].into_iter()) + .unwrap(); + + writer.commit(); + + let stats: Vec<_> = get_stats(&batch); + + let expected_data = &[ + "+----+-----+-----+--------+------+--------------------------------+-----+", + "| b1 | f64 | i64 | string | tag1 | time | u64 |", + "+----+-----+-----+--------+------+--------------------------------+-----+", + "| | | | | | 1970-01-01T00:00:00.000000042Z | |", + "+----+-----+-----+--------+------+--------------------------------+-----+", + ]; + + let expected_stats = vec![ + ( + "b1", + Statistics::Bool(StatValues::new(None, None, 1, Some(1))), + ), + ( + "f64", + Statistics::F64(StatValues::new(None, None, 1, Some(1))), + ), + ( + "i64", + 
Statistics::I64(StatValues::new(None, None, 1, Some(1))), + ), + ( + "string", + Statistics::String(StatValues::new(None, None, 1, Some(1))), + ), + ( + "tag1", + Statistics::String(StatValues::new_with_distinct( + None, + None, + 1, + Some(1), + Some(1.try_into().unwrap()), + )), + ), + ( + "time", + Statistics::I64(StatValues::new(Some(42), Some(42), 1, Some(0))), + ), + ( + "u64", + Statistics::U64(StatValues::new(None, None, 1, Some(1))), + ), + ]; + + assert_batches_eq!(expected_data, &[batch.to_arrow(Projection::All).unwrap()]); + pretty_assertions::assert_eq!(expected_stats, stats); +} + +#[test] +fn test_nan_stats() { + let mut batch = MutableBatch::new(); + + let mut writer = Writer::new(&mut batch, 3); + + writer + .write_f64("f64", None, vec![4.2, NAN, 2.4].into_iter()) + .unwrap(); + + writer.commit(); + + let stats: Vec<_> = get_stats(&batch); + + let expected_stats = vec![( + "f64", + Statistics::F64(StatValues::new(Some(2.4), Some(4.2), 3, Some(0))), + )]; + + pretty_assertions::assert_eq!(expected_stats, stats); +} diff --git a/mutable_batch/tests/writer_fuzz.rs b/mutable_batch/tests/writer_fuzz.rs index 31c23f51c78..bf8183d4e96 100644 --- a/mutable_batch/tests/writer_fuzz.rs +++ b/mutable_batch/tests/writer_fuzz.rs @@ -19,7 +19,8 @@ use data_types::{ IsNan, StatValues, Statistics, }; use hashbrown::HashSet; -use mutable_batch::{writer::Writer, MutableBatch, PartitionWrite, WritePayload}; +use mutable_batch::{writer::Writer, MutableBatch, WritePayload}; +use partition::PartitionWrite; use rand::prelude::*; use schema::Projection; use std::{collections::BTreeMap, num::NonZeroU64, ops::Range, sync::Arc}; @@ -416,7 +417,7 @@ fn test_partition_write() { let mut batch = MutableBatch::new(); let expected = extend_batch(&mut rng, &mut batch); - let w = PartitionWrite::new(&batch); + let w = PartitionWrite::new(&batch).unwrap(); assert_eq!(w.rows().get(), expected.tag_expected.len()); let verify_write = |write: &PartitionWrite<'_>| { diff --git a/mutable_batch_lp/Cargo.toml b/mutable_batch_lp/Cargo.toml index 3f75bb21aee..89e367aa351 100644 --- a/mutable_batch_lp/Cargo.toml +++ b/mutable_batch_lp/Cargo.toml @@ -6,11 +6,15 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] hashbrown = { workspace = true } influxdb-line-protocol = { path = "../influxdb_line_protocol" } +itertools = "0.12.0" mutable_batch = { path = "../mutable_batch" } -snafu = "0.7" +snafu = "0.8" workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] @@ -18,12 +22,12 @@ arrow_util = { path = "../arrow_util" } assert_matches = "1.5.0" criterion = { version = "0.5", default-features = false, features = ["rayon"]} schema = { path = "../schema" } +test_helpers = { path = "../test_helpers" } [[bench]] name = "parse_lp" harness = false - [lib] # Allow --save-baseline to work # https://github.com/bheisler/criterion.rs/issues/275 diff --git a/mutable_batch_lp/fuzz/.gitignore b/mutable_batch_lp/fuzz/.gitignore new file mode 100644 index 00000000000..1a45eee7760 --- /dev/null +++ b/mutable_batch_lp/fuzz/.gitignore @@ -0,0 +1,4 @@ +target +corpus +artifacts +coverage diff --git a/mutable_batch_lp/fuzz/Cargo.lock b/mutable_batch_lp/fuzz/Cargo.lock new file mode 100644 index 00000000000..db2c6c7e4e5 --- /dev/null +++ b/mutable_batch_lp/fuzz/Cargo.lock @@ -0,0 +1,4129 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 3 + +[[package]] +name = "addr2line" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "ahash" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01" +dependencies = [ + "cfg-if", + "const-random", + "getrandom", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "aho-corasick" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2969dcb958b36655471fc61f7e416fa76033bdd4bfed0678d8fee1e2d07a1f0" +dependencies = [ + "memchr", +] + +[[package]] +name = "allocator-api2" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" + +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anstream" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d664a92ecae85fd0a7392615844904654d1d5f5514837f471ddef4a057aba1b6" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7079075b41f533b8c61d2a4d073c4676e1f8b249ff94a393b0595db304e0dd87" + +[[package]] +name = "anstyle-parse" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" +dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" +dependencies = [ + "anstyle", + "windows-sys 0.52.0", +] + +[[package]] +name = "anyhow" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "080e9890a082662b09c1ad45f567faeeb47f22b5fb23895fbe1e651e718e25ca" + +[[package]] +name = "arbitrary" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110" + +[[package]] +name = "arrow" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5bc25126d18a012146a888a0298f2c22e1150327bd2765fc76d710a556b2d614" +dependencies = [ + "ahash", + "arrow-arith", + 
"arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34ccd45e217ffa6e53bbb0080990e77113bdd4e91ddb84e97b77649810bcf1a7" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "num", +] + +[[package]] +name = "arrow-array" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bda9acea48b25123c08340f3a8ac361aa0f74469bb36f5ee9acf923fce23e9d" +dependencies = [ + "ahash", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "chrono-tz", + "half", + "hashbrown 0.14.3", + "num", +] + +[[package]] +name = "arrow-buffer" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01a0fc21915b00fc6c2667b069c1b64bdd920982f426079bc4a7cab86822886c" +dependencies = [ + "bytes", + "half", + "num", +] + +[[package]] +name = "arrow-cast" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dc0368ed618d509636c1e3cc20db1281148190a78f43519487b2daf07b63b4a" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "base64", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num", +] + +[[package]] +name = "arrow-csv" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e09aa6246a1d6459b3f14baeaa49606cfdbca34435c46320e14054d244987ca" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "csv", + "csv-core", + "lazy_static", + "lexical-core", + "regex", +] + +[[package]] +name = "arrow-data" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "907fafe280a3874474678c1858b9ca4cb7fd83fb8034ff5b6d6376205a08c634" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half", + "num", +] + +[[package]] +name = "arrow-ipc" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79a43d6808411886b8c7d4f6f7dd477029c1e77ffffffb7923555cc6579639cd" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "flatbuffers", +] + +[[package]] +name = "arrow-json" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d82565c91fd627922ebfe2810ee4e8346841b6f9361b87505a9acea38b614fee" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "indexmap 2.1.0", + "lexical-core", + "num", + "serde", + "serde_json", +] + +[[package]] +name = "arrow-ord" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b23b0e53c0db57c6749997fd343d4c0354c994be7eca67152dd2bdb9a3e1bb4" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "half", + "num", +] + +[[package]] +name = "arrow-row" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "361249898d2d6d4a6eeb7484be6ac74977e48da12a4dd81a708d620cc558117a" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half", + "hashbrown 
0.14.3", +] + +[[package]] +name = "arrow-schema" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09e28a5e781bf1b0f981333684ad13f5901f4cd2f20589eab7cf1797da8fc167" + +[[package]] +name = "arrow-select" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f6208466590960efc1d2a7172bc4ff18a67d6e25c529381d7f96ddaf0dc4036" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num", +] + +[[package]] +name = "arrow-string" +version = "49.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a48149c63c11c9ff571e50ab8f017d2a7cb71037a882b42f6354ed2da9acc7" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "num", + "regex", + "regex-syntax 0.8.2", +] + +[[package]] +name = "arrow_util" +version = "0.1.0" +dependencies = [ + "ahash", + "arrow", + "chrono", + "comfy-table", + "hashbrown 0.14.3", + "num-traits", + "once_cell", + "regex", + "snafu", + "uuid", + "workspace-hack", +] + +[[package]] +name = "async-stream" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "async-trait" +version = "0.1.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + +[[package]] +name = "atomic-write-file" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edcdbedc2236483ab103a53415653d6b4442ea6141baf1ffa85df29635e88436" +dependencies = [ + "nix", + "rand", +] + +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + +[[package]] +name = "axum" +version = "0.6.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf" +dependencies = [ + "async-trait", + "axum-core", + "bitflags 1.3.2", + "bytes", + "futures-util", + "http", + "http-body", + "hyper", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "sync_wrapper", + "tower", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum-core" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "759fa577a247914fd3f7f76d62972792636412fbfd634cd452f6a385a74d2d2c" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http", + "http-body", + "mime", + "rustversion", + "tower-layer", + "tower-service", +] + +[[package]] +name = "backtrace" +version = "0.3.69" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +dependencies = [ + "addr2line", + "cc", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", +] + +[[package]] +name = "base64" +version = "0.21.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35636a1494ede3b646cc98f74f8e62c773a38a659ebc777a2cf26b9b74171df9" + +[[package]] +name = "base64ct" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" + +[[package]] +name = "bit-set" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0700ddab506f33b20a03b13996eccd309a48e5ff77d0d95926aa0210fb4e95f1" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" +dependencies = [ + "serde", +] + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "bumpalo" +version = "3.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" + +[[package]] +name = "cc" +version = "1.0.83" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +dependencies = [ + "jobserver", + "libc", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chrono" +version = "0.4.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "js-sys", + "num-traits", + "serde", + "wasm-bindgen", + "windows-targets 0.48.5", +] + +[[package]] +name = "chrono-tz" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91d7b79e99bfaa0d47da0687c43aa3b7381938a62ad3a6498599039321f660b7" +dependencies = [ + "chrono", + "chrono-tz-build", + "phf", +] + +[[package]] +name = "chrono-tz-build" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"433e39f13c9a060046954e0592a8d0a4bcb1040125cbf91cb8ee58964cfb350f" +dependencies = [ + "parse-zoneinfo", + "phf", + "phf_codegen", +] + +[[package]] +name = "clap" +version = "4.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcfab8ba68f3668e89f6ff60f5b205cea56aa7b769451a59f34b8682f51c056d" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb7fb5e4e979aec3be7791562fcba452f94ad85e954da024396433e0e25a79e9" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf9804afaaf59a91e75b022a30fb7229a7901f60c755489cc61c9b423b836442" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "clap_lex" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "702fc72eb24e5a1e48ce58027a675bc24edd52096d5397d4aea7c6dd9eca0bd1" + +[[package]] +name = "colorchoice" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" + +[[package]] +name = "comfy-table" +version = "7.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c64043d6c7b7a4c58e39e7efccfdea7b93d885a795d0c054a69dbbf4dd52686" +dependencies = [ + "strum", + "strum_macros", + "unicode-width", +] + +[[package]] +name = "const-oid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" + +[[package]] +name = "const-random" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5aaf16c9c2c612020bcfd042e170f6e32de9b9d75adb5277cdbbd2e2c8c8299a" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom", + "once_cell", + "tiny-keccak", +] + +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" + +[[package]] +name = "cpufeatures" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce420fe07aecd3e67c5f910618fe65e94158f6dcc0adf44e00d69ce2bdfe0fd0" +dependencies = [ + "libc", +] + +[[package]] +name = "crc" +version = "3.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86ec7a15cbe22e59248fc7eadb1907dab5ba09372595da4d73dd805ed4417dfe" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + +[[package]] +name = "crc32fast" +version = "1.3.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "croaring" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7266f0a7275b00ce4c4f4753e8c31afdefe93828101ece83a06e2ddab1dd1010" +dependencies = [ + "byteorder", + "croaring-sys", +] + +[[package]] +name = "croaring-sys" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e47112498c394a7067949ebc07ef429b7384a413cf0efcf675846a47bcd307fb" +dependencies = [ + "cc", +] + +[[package]] +name = "crossbeam-queue" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adc6598521bb5a83d491e8c1fe51db7296019d2ca3cb93cc6c2a20369a4d78a2" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3a430a770ebd84726f584a90ee7f020d28db52c6d02138900f22341f866d39c" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "csv" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac574ff4d437a7b5ad237ef331c17ccca63c46479e5b5453eb8e10bb99a759fe" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "csv-core" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70" +dependencies = [ + "memchr", +] + +[[package]] +name = "data_types" +version = "0.1.0" +dependencies = [ + "arrow-buffer", + "bytes", + "chrono", + "croaring", + "generated_types", + "influxdb-line-protocol", + "iox_time", + "murmur3", + "observability_deps", + "once_cell", + "ordered-float 4.2.0", + "percent-encoding", + "prost", + "schema", + "serde_json", + "sha2", + "siphasher 1.0.0", + "snafu", + "sqlx", + "thiserror", + "uuid", + "workspace-hack", +] + +[[package]] +name = "der" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fffa369a668c8af7dbf8b5e56c9f744fbd399949ed171606040001947de40b1c" +dependencies = [ + "const-oid", + "pem-rfc7468", + "zeroize", +] + +[[package]] +name = "difflib" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "const-oid", + "crypto-common", + "subtle", +] + +[[package]] +name = "doc-comment" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" + +[[package]] +name = "dotenvy" +version = "0.15.7" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" + +[[package]] +name = "dyn-clone" +version = "1.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "545b22097d44f8a9581187cdf93de7a71e4722bf51200cfaba810865b49a495d" + +[[package]] +name = "either" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" +dependencies = [ + "serde", +] + +[[package]] +name = "encoding_rs" +version = "0.8.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7268b386296a025e474d5140678f75d6de9493ae55a5d709eeb9dd08149945e1" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "errno" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "etcetera" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "136d1b5283a1ab77bd9257427ffd09d8667ced0570b6f938942bc7568ed5b943" +dependencies = [ + "cfg-if", + "home", + "windows-sys 0.48.0", +] + +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + +[[package]] +name = "fastrand" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" + +[[package]] +name = "finl_unicode" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6" + +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + +[[package]] +name = "flatbuffers" +version = "23.5.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dac53e22462d78c16d64a1cd22371b54cc3fe94aa15e7886a2fa6e5d1ab8640" +dependencies = [ + "bitflags 1.3.2", + "rustc_version", +] + +[[package]] +name = "flate2" +version = "1.0.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "float-cmp" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98de4bbd547a563b716d8dfa9aad1cb19bfab00f4fa09a6a4ed21dbcf44ce9c4" +dependencies = [ + "num-traits", +] + +[[package]] +name = "flume" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55ac459de2512911e4b674ce33cf20befaba382d05b62b008afc1c8b57cbf181" +dependencies = [ + "futures-core", + "futures-sink", + "spin 0.9.8", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = 
"form_urlencoded" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "futures" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "645c6916888f6cb6350d2550b80fb63e734897a8498abe35cfb732b6487804b0" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" + +[[package]] +name = "futures-executor" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a576fc72ae164fca6b9db127eaa9a9dda0d61316034f33a0a0d4eda41f02b01d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-intrusive" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d930c203dd0b6ff06e0201a4a2fe9149b43c684fd4420555b26d21b1a02956f" +dependencies = [ + "futures-core", + "lock_api", + "parking_lot", +] + +[[package]] +name = "futures-io" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" + +[[package]] +name = "futures-macro" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "futures-sink" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" + +[[package]] +name = "futures-task" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" + +[[package]] +name = "futures-util" +version = "0.3.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "generated_types" +version = "0.1.0" +dependencies = [ + "observability_deps", + "pbjson", + "pbjson-build", + "pbjson-types", + "prost", + "prost-build", + "serde", + "tonic", + "tonic-build", + "uuid", + "workspace-hack", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"fe9006bed769170c11f845cf00c7c1e9092aeb3f268e007c3e760ac68008070f" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "gimli" +version = "0.28.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" + +[[package]] +name = "h2" +version = "0.3.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d6250322ef6e60f93f9a2162799302cd6f68f79f6e5d85c8c16f14d1d958178" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap 2.1.0", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "half" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc52e53916c08643f1b56ec082790d1e86a32e58dc5268f897f313fbae7b4872" +dependencies = [ + "cfg-if", + "crunchy", + "num-traits", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name = "hashbrown" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" +dependencies = [ + "ahash", + "allocator-api2", +] + +[[package]] +name = "hashlink" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7" +dependencies = [ + "hashbrown 0.14.3", +] + +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" +dependencies = [ + "unicode-segmentation", +] + +[[package]] +name = "hermit-abi" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "hkdf" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b5f8eb2ad728638ea2c7d47a21db23b7b58a72ed6a38256b8a1849f15fbbdf7" +dependencies = [ + "hmac", +] + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + +[[package]] +name = "home" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "http" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8947b1a6fad4393052c7ba1f4cd97bed3e953a95c79c92ad9b051a04611d9fbb" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" +dependencies = [ + "bytes", + "http", + "pin-project-lite", +] + +[[package]] +name = "http-range-header" +version = "0.3.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "add0ab9360ddbd88cfeb3bd9574a1d85cfdfa14db10b3e21d3700dbc4328758f" + +[[package]] +name = "httparse" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "humantime" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" + +[[package]] +name = "hyper" +version = "0.14.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf96e135eb83a2a8ddf766e426a841d8ddd7449d5f00d34ea02b41d2f19eef80" +dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" +dependencies = [ + "futures-util", + "http", + "hyper", + "log", + "rustls", + "rustls-native-certs", + "tokio", + "tokio-rustls", +] + +[[package]] +name = "hyper-timeout" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" +dependencies = [ + "hyper", + "pin-project-lite", + "tokio", + "tokio-io-timeout", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.59" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6a67363e2aa4443928ce15e57ebae94fd8949958fd1223c4cfc0cd473ad7539" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "idna" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +dependencies = [ + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", +] + +[[package]] +name = "indexmap" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d530e1a18b1cb4c484e6e34556a0d948706958449fca0cab753d649f2bce3d1f" +dependencies = [ + "equivalent", + "hashbrown 0.14.3", +] + +[[package]] +name = "influxdb-line-protocol" +version = "1.0.0" +dependencies = [ + "bytes", + "log", + "nom", + "smallvec", + "snafu", +] + +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + +[[package]] +name = "iox_time" 
+version = "0.1.0" +dependencies = [ + "chrono", + "parking_lot", + "tokio", + "workspace-hack", +] + +[[package]] +name = "ipnet" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" + +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25db6b064527c5d482d0423354fcd07a89a2dfe07b67892e62411946db7f07b0" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" + +[[package]] +name = "jobserver" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c37f63953c4c63420ed5fd3d6d398c719489b9f872b9fa683262f8edd363c7d" +dependencies = [ + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.66" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cee9c64da59eae3b50095c18d3e74f8b73c0b86d2792824ff01bbce68ba229ca" +dependencies = [ + "wasm-bindgen", +] + +[[package]] +name = "json-patch" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55ff1e1486799e3f64129f8ccad108b38290df9cd7015cd31bed17239f0789d6" +dependencies = [ + "serde", + "serde_json", + "thiserror", + "treediff", +] + +[[package]] +name = "k8s-openapi" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edc3606fd16aca7989db2f84bb25684d0270c6d6fa1dbcd0025af7b4130523a6" +dependencies = [ + "base64", + "bytes", + "chrono", + "schemars", + "serde", + "serde-value", + "serde_json", +] + +[[package]] +name = "kube-core" +version = "0.87.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5bba93d054786eba7994d03ce522f368ef7d48c88a1826faa28478d85fb63ae" +dependencies = [ + "chrono", + "form_urlencoded", + "http", + "json-patch", + "k8s-openapi", + "once_cell", + "schemars", + "serde", + "serde_json", + "thiserror", +] + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +dependencies = [ + "spin 0.5.2", +] + +[[package]] +name = "lexical-core" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2cde5de06e8d4c2faabc400238f9ae1c74d5412d03a7bd067645ccbc47070e46" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683b3a5ebd0130b8fb52ba0bdc718cc56815b6a097e28ae5a6997d0ad17dc05f" +dependencies = [ + "lexical-parse-integer", + "lexical-util", + "static_assertions", +] + +[[package]] +name = "lexical-parse-integer" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d0994485ed0c312f6d965766754ea177d07f9c00c9b82a5ee62ed5b47945ee9" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] 
+name = "lexical-util" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5255b9ff16ff898710eb9eb63cb39248ea8a5bb036bea8085b1a767ff6c4e3fc" +dependencies = [ + "static_assertions", +] + +[[package]] +name = "lexical-write-float" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accabaa1c4581f05a3923d1b4cfd124c329352288b7b9da09e766b0668116862" +dependencies = [ + "lexical-util", + "lexical-write-integer", + "static_assertions", +] + +[[package]] +name = "lexical-write-integer" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1b6f3d1f4422866b68192d62f77bc5c700bee84f3069f2469d7bc8c77852446" +dependencies = [ + "lexical-util", + "static_assertions", +] + +[[package]] +name = "libc" +version = "0.2.151" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "302d7ab3130588088d277783b1e2d2e10c9e9e4a16dd9050e6ec93fb3e7048f4" + +[[package]] +name = "libfuzzer-sys" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a96cfd5557eb82f2b83fed4955246c988d331975a002961b07c81584d107e7f7" +dependencies = [ + "arbitrary", + "cc", + "once_cell", +] + +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" + +[[package]] +name = "libsqlite3-sys" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf4e226dcd58b4be396f7bd3c20da8fdee2911400705297ba7d2d7cc2c30f716" +dependencies = [ + "cc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "linux-raw-sys" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4cd1a83af159aa67994778be9070f0ae1bd732942279cabb14f86f986a21456" + +[[package]] +name = "lock_api" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" + +[[package]] +name = "matchers" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558" +dependencies = [ + "regex-automata 0.1.10", +] + +[[package]] +name = "matchit" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" + +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + +[[package]] +name = "memchr" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "miniz_oxide" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7" +dependencies = [ + "adler", +] + +[[package]] +name = "mio" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f3d0b296e374a4e6f3c7b0a1f5a51d748a0d34c85e7dc48fc3fa9a87657fe09" +dependencies = [ + "libc", + "log", + "wasi", + "windows-sys 0.48.0", +] + +[[package]] +name = "multimap" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" + +[[package]] +name = "murmur3" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9252111cf132ba0929b6f8e030cac2a24b507f3a4d6db6fb2896f27b354c714b" + +[[package]] +name = "mutable_batch" +version = "0.1.0" +dependencies = [ + "arrow", + "arrow_util", + "chrono", + "data_types", + "hashbrown 0.14.3", + "iox_time", + "itertools 0.12.0", + "percent-encoding", + "schema", + "snafu", + "thiserror", + "unicode-segmentation", + "workspace-hack", +] + +[[package]] +name = "mutable_batch_lp" +version = "0.0.0" +dependencies = [ + "hashbrown 0.14.3", + "libfuzzer-sys", + "mutable_batch", + "mutable_batch_lp 0.1.0", +] + +[[package]] +name = "mutable_batch_lp" +version = "0.1.0" +dependencies = [ + "hashbrown 0.14.3", + "influxdb-line-protocol", + "itertools 0.12.0", + "mutable_batch", + "snafu", + "workspace-hack", +] + +[[package]] +name = "nix" +version = "0.27.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053" +dependencies = [ + "bitflags 2.4.1", + "cfg-if", + "libc", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "normalize-line-endings" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61807f77802ff30975e01f4f071c8ba10c022052f98b3294119f3e615d13e5be" + +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + +[[package]] +name = "num" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af" +dependencies = [ + "num-bigint", + "num-complex", + "num-integer", + "num-iter", + "num-rational", + "num-traits", +] + +[[package]] +name = "num-bigint" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "608e7659b5c3d7cba262d894801b9ec9d00de989e8a82bd4bef91d08da45cdc0" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-bigint-dig" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc84195820f291c7697304f3cbdadd1cb7199c0efc917ff5eafd71225c136151" +dependencies = [ + "byteorder", + "lazy_static", + "libm", + "num-integer", + "num-iter", + 
"num-traits", + "rand", + "smallvec", + "zeroize", +] + +[[package]] +name = "num-complex" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-integer" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" +dependencies = [ + "autocfg", + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-rational" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" +dependencies = [ + "autocfg", + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi", + "libc", +] + +[[package]] +name = "object" +version = "0.32.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +dependencies = [ + "memchr", +] + +[[package]] +name = "object_store" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2524735495ea1268be33d200e1ee97455096a0846295a21548cd2f3541de7050" +dependencies = [ + "async-trait", + "base64", + "bytes", + "chrono", + "futures", + "humantime", + "hyper", + "itertools 0.11.0", + "parking_lot", + "percent-encoding", + "quick-xml", + "rand", + "reqwest", + "ring", + "rustls-pemfile", + "serde", + "serde_json", + "snafu", + "tokio", + "tracing", + "url", + "walkdir", +] + +[[package]] +name = "observability_deps" +version = "0.1.0" +dependencies = [ + "tracing", + "workspace-hack", +] + +[[package]] +name = "once_cell" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +dependencies = [ + "parking_lot_core", +] + +[[package]] +name = "openssl-probe" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" + +[[package]] +name = "ordered-float" +version = "2.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c" +dependencies = [ + "num-traits", +] + +[[package]] +name = "ordered-float" +version = "4.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a76df7075c7d4d01fdcb46c912dd17fba5b60c78ea480b475f2b6ab6f666584e" +dependencies = [ + "num-traits", +] + +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + +[[package]] +name = "parking_lot" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets 0.48.5", +] + +[[package]] +name = "parse-zoneinfo" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c705f256449c60da65e11ff6626e0c16a0a0b96aaa348de61376b249bc340f41" +dependencies = [ + "regex", +] + +[[package]] +name = "paste" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" + +[[package]] +name = "pbjson" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1030c719b0ec2a2d25a5df729d6cff1acf3cc230bf766f4f97833591f7577b90" +dependencies = [ + "base64", + "serde", +] + +[[package]] +name = "pbjson-build" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2580e33f2292d34be285c5bc3dba5259542b083cfad6037b6d70345f24dcb735" +dependencies = [ + "heck", + "itertools 0.11.0", + "prost", + "prost-types", +] + +[[package]] +name = "pbjson-types" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18f596653ba4ac51bdecbb4ef6773bc7f56042dc13927910de1684ad3d32aa12" +dependencies = [ + "bytes", + "chrono", + "pbjson", + "pbjson-build", + "prost", + "prost-build", + "serde", +] + +[[package]] +name = "pem-rfc7468" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412" +dependencies = [ + "base64ct", +] + +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + +[[package]] +name = "petgraph" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1d3afd2628e69da2be385eb6f2fd57c8ac7977ceeff6dc166ff1657b0e386a9" +dependencies = [ + "fixedbitset", + "indexmap 2.1.0", +] + +[[package]] +name = "phf" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_codegen" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8d39688d359e6b34654d328e262234662d16cc0f60ec8dcbe5e718709342a5a" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48e4cc64c2ad9ebe670cb8fd69dd50ae301650392e81c05f9bfcb2d5bdbc24b0" +dependencies = [ + "phf_shared", + "rand", +] + +[[package]] +name = "phf_shared" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" 
+dependencies = [ + "siphasher 0.3.11", +] + +[[package]] +name = "pin-project" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fda4ed1c6c173e3fc7a83629421152e01d7b1f9b7f65fb301e490e8cfc656422" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4359fd9c9171ec6e8c62926d6faaf553a8dc3f64e1507e76da7911b4f6a04405" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkcs1" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" +dependencies = [ + "der", + "pkcs8", + "spki", +] + +[[package]] +name = "pkcs8" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" +dependencies = [ + "der", + "spki", +] + +[[package]] +name = "pkg-config" +version = "0.3.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69d3587f8a9e599cc7ec2c00e331f71c4e69a5f9a4b8a6efd5b07466b9736f9a" + +[[package]] +name = "ppv-lite86" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" + +[[package]] +name = "predicates" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dfc28575c2e3f19cb3c73b93af36460ae898d426eba6fc15b9bd2a5220758a0" +dependencies = [ + "anstyle", + "difflib", + "float-cmp", + "itertools 0.11.0", + "normalize-line-endings", + "predicates-core", + "regex", +] + +[[package]] +name = "predicates-core" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b794032607612e7abeb4db69adb4e33590fa6cf1149e95fd7cb00e634b92f174" + +[[package]] +name = "prettyplease" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a41cf62165e97c7f814d2221421dbb9afcbcdb0a88068e5ea206e19951c2cbb5" +dependencies = [ + "proc-macro2", + "syn 2.0.46", +] + +[[package]] +name = "proc-macro2" +version = "1.0.74" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2de98502f212cfcea8d0bb305bd0f49d7ebdd75b64ba0a68f937d888f4e0d6db" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "proptest" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31b476131c3c86cb68032fdc5cb6d5a1045e3e42d96b69fa599fd77701e1f5bf" +dependencies = [ + "bit-set", + "bit-vec", + "bitflags 2.4.1", + "lazy_static", + "num-traits", + "rand", + "rand_chacha", + "rand_xorshift", + "regex-syntax 0.8.2", + "rusty-fork", + "tempfile", + "unarray", +] + +[[package]] +name = "prost" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "146c289cda302b98a28d40c8b3b90498d6e526dd24ac2ecea73e4e491685b94a" +dependencies = [ + "bytes", + 
"prost-derive", +] + +[[package]] +name = "prost-build" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c55e02e35260070b6f716a2423c2ff1c3bb1642ddca6f99e1f26d06268a0e2d2" +dependencies = [ + "bytes", + "heck", + "itertools 0.11.0", + "log", + "multimap", + "once_cell", + "petgraph", + "prettyplease", + "prost", + "prost-types", + "regex", + "syn 2.0.46", + "tempfile", + "which", +] + +[[package]] +name = "prost-derive" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "efb6c9a1dd1def8e2124d17e83a20af56f1570d6c2d2bd9e266ccb768df3840e" +dependencies = [ + "anyhow", + "itertools 0.11.0", + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "prost-types" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "193898f59edcf43c26227dcd4c8427f00d99d61e95dcde58dabd49fa291d470e" +dependencies = [ + "prost", +] + +[[package]] +name = "quick-error" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" + +[[package]] +name = "quick-xml" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "quote" +version = "1.0.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rand_xorshift" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d25bf25ec5ae4a3f1b92f929810509a2f53d7dca2f50b794ff57e3face536c8f" +dependencies = [ + "rand_core", +] + +[[package]] +name = "redox_syscall" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "regex" +version = "1.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata 0.4.3", + "regex-syntax 0.8.2", +] + +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" +dependencies = [ + "regex-syntax 0.6.29", +] + +[[package]] +name = "regex-automata" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax 0.8.2", +] + +[[package]] +name = "regex-syntax" +version = "0.6.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" + +[[package]] +name = "regex-syntax" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" + +[[package]] +name = "reqwest" +version = "0.11.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b1ae8d9ac08420c66222fb9096fc5de435c3c48542bc5336c51892cffafb41" +dependencies = [ + "base64", + "bytes", + "encoding_rs", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "hyper", + "hyper-rustls", + "ipnet", + "js-sys", + "log", + "mime", + "once_cell", + "percent-encoding", + "pin-project-lite", + "rustls", + "rustls-native-certs", + "rustls-pemfile", + "serde", + "serde_json", + "serde_urlencoded", + "system-configuration", + "tokio", + "tokio-rustls", + "tokio-util", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", + "webpki-roots", + "winreg", +] + +[[package]] +name = "ring" +version = "0.17.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "688c63d65483050968b2a8937f7995f443e27041a0f7700aa59b0822aedebb74" +dependencies = [ + "cc", + "getrandom", + "libc", + "spin 0.9.8", + "untrusted", + "windows-sys 0.48.0", +] + +[[package]] +name = "rsa" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d0e5124fcb30e76a7e79bfee683a2746db83784b86289f6251b54b7950a0dfc" +dependencies = [ + "const-oid", + "digest", + "num-bigint-dig", + "num-integer", + "num-traits", + "pkcs1", + "pkcs8", + "rand_core", + "signature", + "spki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" + +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + +[[package]] +name = "rustix" +version = "0.38.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +dependencies = [ + "bitflags 2.4.1", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.52.0", +] + +[[package]] +name = "rustls" +version = "0.21.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d5a6813c0759e4609cd494e8e725babae6a2ca7b62a5536a13daaec6fcb7ba" +dependencies = [ + "log", + "ring", + "rustls-webpki", + "sct", +] + +[[package]] +name = "rustls-native-certs" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" +dependencies = [ + "openssl-probe", + "rustls-pemfile", + "schannel", + "security-framework", +] + +[[package]] +name = "rustls-pemfile" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" +dependencies = [ + 
"base64", +] + +[[package]] +name = "rustls-webpki" +version = "0.101.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "rustversion" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ffc183a10b4478d04cbbbfc96d0873219d962dd5accaff2ffbd4ceb7df837f4" + +[[package]] +name = "rusty-fork" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb3dcc6e454c328bb824492db107ab7c0ae8fcffe4ad210136ef014458c1bc4f" +dependencies = [ + "fnv", + "quick-error", + "tempfile", + "wait-timeout", +] + +[[package]] +name = "ryu" +version = "1.0.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "schannel" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" +dependencies = [ + "windows-sys 0.52.0", +] + +[[package]] +name = "schema" +version = "0.1.0" +dependencies = [ + "arrow", + "hashbrown 0.14.3", + "indexmap 2.1.0", + "observability_deps", + "snafu", + "workspace-hack", +] + +[[package]] +name = "schemars" +version = "0.8.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45a28f4c49489add4ce10783f7911893516f15afe45d015608d41faca6bc4d29" +dependencies = [ + "dyn-clone", + "schemars_derive", + "serde", + "serde_json", +] + +[[package]] +name = "schemars_derive" +version = "0.8.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c767fd6fa65d9ccf9cf026122c1b555f2ef9a4f0cea69da4d7dbc3e258d30967" +dependencies = [ + "proc-macro2", + "quote", + "serde_derive_internals", + "syn 1.0.109", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "sct" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "security-framework" +version = "2.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05b64fb303737d99b81884b2c63433e9ae28abebe5eb5045dcdd175dc2ecf4de" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e932934257d3b408ed8f30db49d85ea163bfe74961f017f405b025af298f0c7a" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "semver" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97ed7a9823b74f99c7742f5336af7be5ecd3eeafcb1507d1fa93347b1d589b0" + +[[package]] +name = "serde" +version = "1.0.194" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0b114498256798c94a0689e1a15fec6005dee8ac1f41de56404b67afc2a4b773" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde-value" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3a1a3341211875ef120e117ea7fd5228530ae7e7036a779fdc9117be6b3282c" +dependencies = [ + "ordered-float 2.10.1", + "serde", +] + +[[package]] +name = "serde_derive" +version = "1.0.194" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3385e45322e8f9931410f01b3031ec534c3947d0e94c18049af4d9f9907d4e0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "serde_derive_internals" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85bf8229e7920a9f636479437026331ce11aa132b4dde37d121944a44d6e5f3c" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "serde_json" +version = "1.0.110" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fbd975230bada99c8bb618e0c365c2eefa219158d5c6c29610fd09ff1833257" +dependencies = [ + "indexmap 2.1.0", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" +dependencies = [ + "libc", +] + +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "digest", + "rand_core", +] + +[[package]] +name = "similar" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32fea41aca09ee824cc9724996433064c89f7777e60762749a4170a14abbfa21" + +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + +[[package]] +name = "siphasher" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54ac45299ccbd390721be55b412d41931911f654fa99e2cb8bfb57184b2061fe" + +[[package]] +name = "slab" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" +dependencies = [ + "autocfg", +] + +[[package]] +name = 
"smallvec" +version = "1.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dccd0940a2dcdf68d092b8cbab7dc0ad8fa938bf95787e1b916b0e3d0e8e970" + +[[package]] +name = "snafu" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4de37ad025c587a29e8f3f5605c00f70b98715ef90b9061a815b9e59e9042d6" +dependencies = [ + "doc-comment", + "snafu-derive", +] + +[[package]] +name = "snafu-derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990079665f075b699031e9c08fd3ab99be5029b96f3b78dc0709e8f77e4efebf" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "socket2" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b5fac59a5cb5dd637972e5fca70daf0523c9067fcdc4842f053dae04a18f8e9" +dependencies = [ + "libc", + "windows-sys 0.48.0", +] + +[[package]] +name = "spin" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" + +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +dependencies = [ + "lock_api", +] + +[[package]] +name = "spki" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +dependencies = [ + "base64ct", + "der", +] + +[[package]] +name = "sqlformat" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce81b7bd7c4493975347ef60d8c7e8b742d4694f4c49f93e0a12ea263938176c" +dependencies = [ + "itertools 0.12.0", + "nom", + "unicode_categories", +] + +[[package]] +name = "sqlx" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dba03c279da73694ef99763320dea58b51095dfe87d001b1d4b5fe78ba8763cf" +dependencies = [ + "sqlx-core", + "sqlx-macros", + "sqlx-mysql", + "sqlx-postgres", + "sqlx-sqlite", +] + +[[package]] +name = "sqlx-core" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d84b0a3c3739e220d94b3239fd69fb1f74bc36e16643423bd99de3b43c21bfbd" +dependencies = [ + "ahash", + "atoi", + "byteorder", + "bytes", + "crc", + "crossbeam-queue", + "dotenvy", + "either", + "event-listener", + "futures-channel", + "futures-core", + "futures-intrusive", + "futures-io", + "futures-util", + "hashlink", + "hex", + "indexmap 2.1.0", + "log", + "memchr", + "once_cell", + "paste", + "percent-encoding", + "rustls", + "rustls-pemfile", + "serde", + "serde_json", + "sha2", + "smallvec", + "sqlformat", + "thiserror", + "tokio", + "tokio-stream", + "tracing", + "url", + "uuid", + "webpki-roots", +] + +[[package]] +name = "sqlx-macros" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89961c00dc4d7dffb7aee214964b065072bff69e36ddb9e2c107541f75e4f2a5" +dependencies = [ + "proc-macro2", + "quote", + "sqlx-core", + "sqlx-macros-core", + "syn 1.0.109", +] + +[[package]] +name = "sqlx-macros-core" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0bd4519486723648186a08785143599760f7cc81c52334a55d6a83ea1e20841" +dependencies = [ + "atomic-write-file", + "dotenvy", + "either", + "heck", + "hex", + "once_cell", + 
"proc-macro2", + "quote", + "serde", + "serde_json", + "sha2", + "sqlx-core", + "sqlx-mysql", + "sqlx-postgres", + "sqlx-sqlite", + "syn 1.0.109", + "tempfile", + "tokio", + "url", +] + +[[package]] +name = "sqlx-mysql" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e37195395df71fd068f6e2082247891bc11e3289624bbc776a0cdfa1ca7f1ea4" +dependencies = [ + "atoi", + "base64", + "bitflags 2.4.1", + "byteorder", + "bytes", + "crc", + "digest", + "dotenvy", + "either", + "futures-channel", + "futures-core", + "futures-io", + "futures-util", + "generic-array", + "hex", + "hkdf", + "hmac", + "itoa", + "log", + "md-5", + "memchr", + "once_cell", + "percent-encoding", + "rand", + "rsa", + "serde", + "sha1", + "sha2", + "smallvec", + "sqlx-core", + "stringprep", + "thiserror", + "tracing", + "uuid", + "whoami", +] + +[[package]] +name = "sqlx-postgres" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6ac0ac3b7ccd10cc96c7ab29791a7dd236bd94021f31eec7ba3d46a74aa1c24" +dependencies = [ + "atoi", + "base64", + "bitflags 2.4.1", + "byteorder", + "crc", + "dotenvy", + "etcetera", + "futures-channel", + "futures-core", + "futures-io", + "futures-util", + "hex", + "hkdf", + "hmac", + "home", + "itoa", + "log", + "md-5", + "memchr", + "once_cell", + "rand", + "serde", + "serde_json", + "sha1", + "sha2", + "smallvec", + "sqlx-core", + "stringprep", + "thiserror", + "tracing", + "uuid", + "whoami", +] + +[[package]] +name = "sqlx-sqlite" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "210976b7d948c7ba9fced8ca835b11cbb2d677c59c79de41ac0d397e14547490" +dependencies = [ + "atoi", + "flume", + "futures-channel", + "futures-core", + "futures-executor", + "futures-intrusive", + "futures-util", + "libsqlite3-sys", + "log", + "percent-encoding", + "serde", + "sqlx-core", + "tracing", + "url", + "urlencoding", + "uuid", +] + +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + +[[package]] +name = "stringprep" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb41d74e231a107a1b4ee36bd1214b11285b77768d2e3824aedafa988fd36ee6" +dependencies = [ + "finl_unicode", + "unicode-bidi", + "unicode-normalization", +] + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "strum" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.25.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23dc1fa9ac9c169a78ba62f0b841814b7abae11bdd047b9c58f893439e309ea0" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.46", +] + +[[package]] +name = "subtle" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89456b690ff72fddcecf231caedbe615c59480c93358a93dfae7fc29e3ebbf0e" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sync_wrapper" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2047c6ded9c721764247e62cd3b03c09ffc529b2ba5b10ec482ae507a4a70160" + +[[package]] +name = "system-configuration" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba3a3adc5c275d719af8cb4272ea1c4a6d668a777f37e115f6d11ddbc1c8e0e7" +dependencies = [ + "bitflags 1.3.2", + "core-foundation", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75fb188eb626b924683e3b95e3a48e63551fcfb51949de2f06a9d91dbee93c9" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "tempfile" +version = "3.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" +dependencies = [ + "cfg-if", + "fastrand", + "redox_syscall", + "rustix", + "windows-sys 0.52.0", +] + +[[package]] +name = "thiserror" +version = "1.0.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d54378c645627613241d077a3a79db965db602882668f9136ac42af9ecb730ad" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "thread_local" +version = "1.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152" +dependencies = [ + "cfg-if", + "once_cell", +] + +[[package]] +name = "threadpool" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d050e60b33d41c19108b32cea32164033a9013fe3b46cbd4457559bfbf77afaa" +dependencies = [ + "num_cpus", +] + +[[package]] +name = "thrift" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" +dependencies = [ + "byteorder", + "integer-encoding", + "log", + "ordered-float 2.10.1", + "threadpool", +] + +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + +[[package]] +name = "tinyvec" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "tokio" +version = "1.35.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c89b4efa943be685f629b149f53829423f8f5531ea21249408e8e2f8671ec104" +dependencies = [ + "backtrace", + "bytes", + "libc", + "mio", + "num_cpus", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "tracing", + "windows-sys 0.48.0", +] + +[[package]] +name = "tokio-io-timeout" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30b74022ada614a1b4834de765f9bb43877f910cc8ce4be40e89042c9223a8bf" +dependencies = [ + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-macros" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "tokio-rustls" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" +dependencies = [ + "rustls", + "tokio", +] + +[[package]] +name = "tokio-stream" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "397c988d37662c7dda6d2208364a706264bf3d6138b11d436cbac0ad38832842" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" +dependencies = [ + "bytes", + "futures-core", + "futures-io", + "futures-sink", + "pin-project-lite", + "slab", + "tokio", + "tracing", +] + +[[package]] +name = "tonic" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d560933a0de61cf715926b9cac824d4c883c2c43142f787595e48280c40a1d0e" +dependencies = [ + "async-stream", + "async-trait", + "axum", + "base64", + "bytes", + "h2", + "http", + "http-body", + "hyper", + "hyper-timeout", + "percent-encoding", + "pin-project", + "prost", + "rustls", + "rustls-native-certs", + "rustls-pemfile", + "tokio", + "tokio-rustls", + "tokio-stream", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-build" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d021fc044c18582b9a2408cd0dd05b1596e3ecdb5c4df822bb0183545683889" +dependencies = [ + "prettyplease", + "proc-macro2", + "prost-build", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "tower" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "indexmap 1.9.3", + "pin-project", + "pin-project-lite", + "rand", + "slab", + "tokio", + "tokio-util", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-http" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c5bb1d698276a2443e5ecfabc1008bf15a36c12e6a7176e7bf089ea9131140" +dependencies = [ + "base64", + "bitflags 2.4.1", + "bytes", + "futures-core", + "futures-util", + "http", + "http-body", + "http-range-header", + "mime", + "pin-project-lite", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-layer" +version = "0.3.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" + +[[package]] +name = "tower-service" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" + +[[package]] +name = "tracing" +version = "0.1.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +dependencies = [ + "log", + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "tracing-core" +version = "0.1.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +dependencies = [ + "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-serde" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1" +dependencies = [ + "serde", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "parking_lot", + "regex", + "serde", + "serde_json", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", + "tracing-serde", +] + +[[package]] +name = "treediff" +version = "4.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52984d277bdf2a751072b5df30ec0377febdb02f7696d64c2d7d54630bac4303" +dependencies = [ + "serde_json", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "typenum" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" + +[[package]] +name = "unarray" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" + +[[package]] +name = "unicode-bidi" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f2528f27a9eb2b21e69c95319b30bd0efd85d09c379741b0f78ea1d86be2416" + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "unicode-normalization" +version = "0.1.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-segmentation" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" + +[[package]] +name = "unicode-width" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" + +[[package]] +name = "unicode_categories" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" + +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "url" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + +[[package]] +name = "utf8parse" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" + +[[package]] +name = "uuid" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e395fcf16a7a3d8127ec99782007af141946b4795001f876d54fb0d55978560" +dependencies = [ + "getrandom", +] + +[[package]] +name = "valuable" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "version_check" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" + +[[package]] +name = "wait-timeout" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f200f5b12eb75f8c1ed65abd4b2db8a6e1b138a20de009dacee265a2498f3f6" +dependencies = [ + "libc", +] + +[[package]] +name = "walkdir" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71d857dc86794ca4c280d616f7da00d2dbfd8cd788846559a6813e6aa4b54ee" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "wasm-bindgen" +version = "0.2.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ed0d4f68a3015cc185aff4db9506a015f4b96f95303897bfa23f846db54064e" 
+dependencies = [ + "cfg-if", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b56f625e64f3a1084ded111c4d5f477df9f8c92df113852fa5a374dbda78826" +dependencies = [ + "bumpalo", + "log", + "once_cell", + "proc-macro2", + "quote", + "syn 2.0.46", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac36a15a220124ac510204aec1c3e5db8a22ab06fd6706d881dc6149f8ed9a12" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0162dbf37223cd2afce98f3d0785506dcb8d266223983e4b5b525859e6e182b2" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0eb82fcb7930ae6219a7ecfd55b217f5f0893484b7a13022ebb2b2bf20b5283" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.46", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ab9b36309365056cd639da3134bf87fa8f3d86008abf99e612384a6eecd459f" + +[[package]] +name = "wasm-streams" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4609d447824375f43e1ffbc051b50ad8f4b3ae8219680c94452ea05eb240ac7" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "web-sys" +version = "0.3.66" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50c24a44ec86bb68fbecd1b3efed7e85ea5621b39b35ef2766b66cd984f8010f" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "webpki-roots" +version = "0.25.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1778a42e8b3b90bff8d0f5032bf22250792889a5cdc752aa0020c84abe3aaf10" + +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix", +] + +[[package]] +name = "whoami" +version = "1.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22fc3756b8a9133049b26c7f61ab35416c130e8c09b660f5b3958b446f52cc50" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets 0.52.0", +] + +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.0", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = "windows-targets" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a18201040b24831fbb9e4eb208f8892e1f50a37feb53cc7ff887feb8f50e7cd" +dependencies = [ + "windows_aarch64_gnullvm 0.52.0", + "windows_aarch64_msvc 0.52.0", + "windows_i686_gnu 0.52.0", + "windows_i686_msvc 0.52.0", + "windows_x86_64_gnu 0.52.0", + "windows_x86_64_gnullvm 0.52.0", + "windows_x86_64_msvc 0.52.0", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb7764e35d4db8a7921e09562a0304bf2f93e0a51bfccee0bd0bb0b666b015ea" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbaa0368d4f1d2aaefc55b6fcfee13f41544ddf36801e793edbbfd7d7df075ef" + +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28637cb1fa3560a16915793afb20081aba2c92ee8af57b4d5f28e4b3e7df313" + +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffe5e8e31046ce6230cc7215707b816e339ff4d4d67c65dffa206fd0f7aa7b9a" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d6fa32db2bc4a2f5abeacf2b69f7992cd09dca97498da74a151a3132c26befd" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a657e1e9d3f514745a572a6846d3c7aa7dbe1658c056ed9c3344c4109a6949e" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" + +[[package]] +name = "winreg" +version = "0.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + +[[package]] +name = "workspace-hack" +version = "0.1.0" +dependencies = [ + "ahash", + "base64", + "bitflags 2.4.1", + "byteorder", + "bytes", + "cc", + "chrono", + "clap", + "clap_builder", + "crossbeam-utils", + "crypto-common", + "digest", + "either", + "fixedbitset", + "flatbuffers", + "flate2", + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", + "getrandom", + "hashbrown 0.14.3", + "heck", + "hyper", + "hyper-rustls", + "indexmap 2.1.0", + "itertools 0.11.0", + "k8s-openapi", + "kube-core", + "libc", + "lock_api", + "log", + "md-5", + "memchr", + "mio", + "nix", + "nom", + "num-traits", + "object_store", + "once_cell", + "parking_lot", + "percent-encoding", + "petgraph", + "phf_shared", + "predicates", + "proptest", + "prost", + "prost-types", + "rand", + "rand_core", + "regex", + "regex-automata 0.4.3", + "regex-syntax 0.8.2", + "reqwest", + "ring", + "rustls", + "serde", + "serde_json", + "sha2", + "similar", + "spin 0.9.8", + "sqlx", + "sqlx-core", + "sqlx-macros", + "sqlx-macros-core", + "sqlx-postgres", + "sqlx-sqlite", + "strum", + "syn 1.0.109", + "syn 2.0.46", + "thrift", + "tokio", + "tokio-stream", + "tokio-util", + "tower", + "tower-http", + "tracing", + "tracing-core", + "tracing-log", + "tracing-subscriber", + "unicode-bidi", + "unicode-normalization", + "url", + "uuid", + "winapi", + "windows-sys 0.48.0", + "windows-sys 0.52.0", +] + +[[package]] +name = "zerocopy" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.46", +] + +[[package]] +name = "zeroize" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" diff --git a/mutable_batch_lp/fuzz/Cargo.toml b/mutable_batch_lp/fuzz/Cargo.toml new file mode 100644 index 00000000000..7a564adb7cf --- /dev/null +++ b/mutable_batch_lp/fuzz/Cargo.toml @@ -0,0 +1,27 @@ +[package] +name = "mutable_batch_lp" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +hashbrown = "0.14" +libfuzzer-sys = "0.4" +mutable_batch_lp = { path = ".." } +mutable_batch = { path = "../../mutable_batch" } + +# Prevent this from interfering with workspaces +[workspace] +members = ["."] + +[profile.release] +debug = 1 + +[[bin]] +name = "lines_converter" +path = "fuzz_targets/lines_converter.rs" +test = false +doc = false diff --git a/mutable_batch_lp/fuzz/README.md b/mutable_batch_lp/fuzz/README.md new file mode 100644 index 00000000000..b978638c87c --- /dev/null +++ b/mutable_batch_lp/fuzz/README.md @@ -0,0 +1,46 @@ +# Fuzz tests + +The fuzz tests in this `fuzz` crate were created using [cargo-fuzz] version 0.11.3. + +[cargo-fuzz]: https://rust-fuzz.github.io/book/introduction.html + +## One-time setup + +To install `cargo-fuzz`: + +``` +$ cargo install cargo-fuzz +``` + +You'll also need a nightly Rust: + +``` +$ rustup install nightly +``` + +## Running + +To run an existing fuzz test, change to the `mutable_batch_lp` directory and run: + +``` +$ cargo +nightly fuzz run +``` + +where `` is the name of one of the files in `fuzz/fuzz_targets`. To list all targets, run: + +``` +$ cargo fuzz list +``` + +## Adding more + +To add more fuzzing targets, run: + +``` +$ cargo fuzz add +``` + +which will add a new file in `fuzz/fuzz_targets`. Edit the new file to call the code you want to +fuzz; see the [`cargo-fuzz` tutorial] for examples. 
+ +[`cargo-fuzz` tutorial]: https://rust-fuzz.github.io/book/cargo-fuzz/tutorial.html diff --git a/mutable_batch_lp/fuzz/fuzz_targets/lines_converter.rs b/mutable_batch_lp/fuzz/fuzz_targets/lines_converter.rs new file mode 100644 index 00000000000..34421898f63 --- /dev/null +++ b/mutable_batch_lp/fuzz/fuzz_targets/lines_converter.rs @@ -0,0 +1,66 @@ +#![no_main] + +use hashbrown::HashMap; +use libfuzzer_sys::fuzz_target; +use mutable_batch::{column::ColumnData, MutableBatch, PartitionWrite, WritePayload}; +use mutable_batch_lp::LinesConverter; + +fuzz_target!(|data: &[u8]| { + if let Ok(body) = std::str::from_utf8(data) { + let table_partition_template = Default::default(); + let mut converter = LinesConverter::new(10); + let errors = match converter.write_lp(body) { + Ok(_) => vec![], + Err(mutable_batch_lp::Error::PerLine { lines }) => lines, + Err(other) => panic!("unexpected error: `{other}` input: `{body}`"), + }; + + if let Ok((batches, stats)) = converter.finish() { + let mut total_rows = 0; + + let mut partitions: HashMap<_, HashMap<String, MutableBatch>> = + HashMap::default(); + + for (table_name, mutable_batch) in &batches { + assert!( + mutable_batch.column("time").is_ok(), + "batch for table `{table_name}` does not have a time column: \ + {mutable_batch:#?}\ninput: `{body}`\nerrors: `{errors:#?}`" + ); + + let data = mutable_batch.column("time").unwrap().data(); + assert!( + matches!(data, ColumnData::I64(_, _)), + "expected the time column to be I64, instead got `{data:?}`.\ninput: `{body}`" + ); + + for (partition_key, partition_payload) in + PartitionWrite::partition(&mutable_batch, &table_partition_template).unwrap() + { + let partition = partitions.entry(partition_key).or_default(); + + let mut table_batch = partition + .raw_entry_mut() + .from_key(table_name.as_str()) + .or_insert_with(|| (table_name.to_owned(), MutableBatch::default())); + partition_payload + .write_to_batch(&mut table_batch.1) + .unwrap(); + } + + total_rows += mutable_batch.rows(); + } + + for (_partition_key, table_batches) in partitions { + for (_table_name, batch) in table_batches { + assert_ne!(batch.rows(), 0); + } + } + + assert_eq!( + stats.num_lines, total_rows, + "batches: {batches:#?}\ninput: `{body}`\nerrors: `{errors:#?}`" + ); + } + } +}); diff --git a/mutable_batch_lp/src/lib.rs b/mutable_batch_lp/src/lib.rs index 9d0c07c8efc..5579a431963 100644 --- a/mutable_batch_lp/src/lib.rs +++ b/mutable_batch_lp/src/lib.rs @@ -26,24 +26,37 @@ use mutable_batch::writer::Writer; use mutable_batch::MutableBatch; use snafu::{ResultExt, Snafu}; -/// Error type for line protocol conversion +const MAXIMUM_RETURNED_ERRORS: usize = 100; + +/// Error type for a conversion attempt on a set of line protocol lines #[derive(Debug, Snafu)] #[allow(missing_docs)] pub enum Error { + #[snafu(display( + "errors encountered on line(s):\n{}", + itertools::join(lines.iter(), "\n") + ))] + PerLine { lines: Vec<LineError> }, + + #[snafu(display("empty write payload"))] + EmptyPayload, +} + +/// Errors which occur independently per line +#[derive(Debug, Snafu)] +#[allow(missing_docs)] +pub enum LineError { #[snafu(display("error parsing line {} (1-based): {}", line, source))] LineProtocol { source: influxdb_line_protocol::Error, line: usize, }, - #[snafu(display("error writing line {}: {}", line, source))] + #[snafu(display("error writing line {} (1-based): {}", line, source))] Write { source: LineWriteError, line: usize }, - #[snafu(display("empty write payload"))] - EmptyPayload, - - #[snafu(display("timestamp overflows i64"))] - TimestampOverflow, + 
#[snafu(display("timestamp overflows i64 on line {} (1-based)", line))] + TimestampOverflow { line: usize }, } /// Result type for line protocol conversion @@ -107,40 +120,83 @@ impl LinesConverter { /// [`mutable_batch::writer::Error::TypeMismatch`] /// pub fn write_lp(&mut self, lines: &str) -> Result<()> { - for (line_idx, maybe_line) in parse_lines(lines).enumerate() { - let mut line = maybe_line.context(LineProtocolSnafu { line: line_idx + 1 })?; - - if let Some(t) = line.timestamp.as_mut() { - *t = t - .checked_mul(self.timestamp_base) - .ok_or(Error::TimestampOverflow)?; - } - - self.stats.num_lines += 1; - self.stats.num_fields += line.field_set.len(); - - let measurement = line.series.measurement.as_str(); - - let (_, batch) = self - .batches - .raw_entry_mut() - .from_key(measurement) - .or_insert_with(|| (measurement.to_string(), MutableBatch::new())); + let errors = parse_lines(lines) + .enumerate() + .filter_map(|(line_idx, maybe_line)| { + maybe_line + .context(LineProtocolSnafu { line: line_idx + 1 }) + .and_then(|line| self.rebase_timestamp(line, line_idx)) + .and_then(|line| self.add_line_to_batch(line, line_idx)) + .err() + }) + .take(MAXIMUM_RETURNED_ERRORS) + .collect::>(); + + if !errors.is_empty() { + return Err(Error::PerLine { lines: errors }); + } + Ok(()) + } - // TODO: Reuse writer - let mut writer = Writer::new(batch, 1); - write_line(&mut writer, &line, self.default_time) - .context(WriteSnafu { line: line_idx + 1 })?; - writer.commit(); + fn rebase_timestamp<'a>( + &self, + mut line: ParsedLine<'a>, + line_idx: usize, + ) -> Result, LineError> { + if let Some(t) = line.timestamp.as_mut() { + let updated_timestamp = match t.checked_mul(self.timestamp_base) { + Some(t) => t, + None => return Err(LineError::TimestampOverflow { line: line_idx + 1 }), + }; + *t = updated_timestamp; } + Ok(line) + } + + fn add_line_to_batch( + &mut self, + line: ParsedLine<'_>, + line_idx: usize, + ) -> Result<(), LineError> { + let measurement = line.series.measurement.as_str(); + + let (_, batch) = self + .batches + .raw_entry_mut() + .from_key(measurement) + .or_insert_with(|| (measurement.to_string(), MutableBatch::new())); + + // TODO: Reuse writer + let mut writer = Writer::new(batch, 1); + match write_line(&mut writer, &line, self.default_time) + .context(WriteSnafu { line: line_idx + 1 }) + { + Ok(_) => { + writer.commit(); + self.stats.num_lines += 1; + self.stats.num_fields += line.field_set.len(); + } + Err(e) => return Err(e), + }; Ok(()) } /// Consume this [`LinesConverter`] returning the [`MutableBatch`] /// and the [`PayloadStatistics`] for the written data pub fn finish(self) -> Result<(HashMap, PayloadStatistics)> { - match self.batches.is_empty() { - false => Ok((self.batches, self.stats)), + let Self { batches, stats, .. } = self; + + // Keep only batches that have rows. If add_line_to_batch returned a WriteError for all + // lines of that table, there will be an empty mutable batch in `batches` that will violate + // the assumptions that the partitioner makes later. + let nonempty_batches: HashMap<_, _> = batches + .into_iter() + .filter(|(_table, batch)| batch.rows() > 0) + .collect(); + + // If there aren't any nonempty batches, then we have an empty payload. 
+ match nonempty_batches.is_empty() { + false => Ok((nonempty_batches, stats)), true => Err(Error::EmptyPayload), } } @@ -332,6 +388,7 @@ pub mod test_helpers { #[cfg(test)] mod tests { use super::*; + use ::test_helpers::assert_error; use arrow_util::assert_batches_eq; use assert_matches::assert_matches; use schema::Projection; @@ -376,6 +433,58 @@ mod tests { ); } + #[test] + fn test_partial_line_conversion() { + let lp = r#"cpu,tag1=v1,tag2=v2 val=2i 0 + cpu,tag1=v4,tag2=v1 val=2i 0 + mem,tag1=v2 ival=3i 0 + ,tag2=v2 val=3i 1 + cpu,tag1=v1,tag2=v2 fval=2.0 + bad_line + mem,tag1=v5 ival=2i 1 + "#; + + let mut converter = LinesConverter::new(5); + let result = converter.write_lp(lp); + assert_matches!( + result, + Err(Error::PerLine { lines }) if matches!(&lines[..], [LineError::LineProtocol { .. }, LineError::LineProtocol { .. }]), + "expected an error returned from write_lp(), but found {:?}", result + ); + let (batches, _) = converter.finish().unwrap(); + assert_eq!( + batches.len(), + 2, + "expected both batches are written, instead found {:?}", + batches.len(), + ); + + assert_batches_eq!( + &[ + "+------+------+------+--------------------------------+-----+", + "| fval | tag1 | tag2 | time | val |", + "+------+------+------+--------------------------------+-----+", + "| | v1 | v2 | 1970-01-01T00:00:00Z | 2 |", + "| | v4 | v1 | 1970-01-01T00:00:00Z | 2 |", + "| 2.0 | v1 | v2 | 1970-01-01T00:00:00.000000005Z | |", + "+------+------+------+--------------------------------+-----+", + ], + &[batches["cpu"].to_arrow(Projection::All).unwrap()] + ); + + assert_batches_eq!( + &[ + "+------+------+--------------------------------+", + "| ival | tag1 | time |", + "+------+------+--------------------------------+", + "| 3 | v2 | 1970-01-01T00:00:00Z |", + "| 2 | v5 | 1970-01-01T00:00:00.000000001Z |", + "+------+------+--------------------------------+", + ], + &[batches["mem"].to_arrow(Projection::All).unwrap()] + ); + } + #[test] fn test_nulls_string_and_float() { let lp = r#"m f0="cat" 1639612800000000000 @@ -501,13 +610,12 @@ m b=t 1639612800000000000 let err = lines_to_batches(lp, 5).expect_err("type conflicted write should fail"); assert_matches!(err, - Error::Write { + Error::PerLine { lines } if matches!(&lines[..], + [LineError::Write { source: LineWriteError::ConflictedFieldTypes { name }, line: 1 - } - => { - assert_eq!(name, "val"); - }); + }] if name == "val" + )); } #[test] @@ -516,13 +624,13 @@ m b=t 1639612800000000000 let err = lines_to_batches(lp, 5).expect_err("duplicate tag write should fail"); assert_matches!(err, - Error::Write { - source: LineWriteError::DuplicateTag { name }, - line: 1 - } - => { - assert_eq!(name, "tag"); - }); + Error::PerLine { lines } if matches!( + &lines[..], + [LineError::Write { + source: LineWriteError::DuplicateTag { name }, + line: 1 + }] if name == "tag" + )); } #[test] @@ -531,13 +639,13 @@ m b=t 1639612800000000000 let err = lines_to_batches(lp, 5).expect_err("duplicate tag write should fail"); assert_matches!(err, - Error::Write { - source: LineWriteError::DuplicateTag { name }, - line: 1 - } - => { - assert_eq!(name, "tag"); - }); + Error::PerLine { lines } if matches!( + &lines[..], + [LineError::Write { + source: LineWriteError::DuplicateTag { name }, + line: 1 + }] if name == "tag" + )); } // NOTE: All tags are strings, so this should never be a type conflict. 
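// [Editor's note, not part of the patch] The error-shape assertions rewritten
// above and below all follow the same pattern: the top-level error is now
// Error::PerLine, and the tests match a single-element slice of LineError
// (with a guard on the field name where relevant) instead of destructuring a
// flat Error::Write.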
@@ -547,13 +655,13 @@ m b=t 1639612800000000000 let err = lines_to_batches(lp, 5).expect_err("type conflicted write should fail"); assert_matches!(err, - Error::Write { - source: LineWriteError::DuplicateTag { name }, - line: 1 - } - => { - assert_eq!(name, "tag"); - }); + Error::PerLine { lines } if matches!( + &lines[..], + [LineError::Write { + source: LineWriteError::DuplicateTag { name }, + line: 1 + }] if name == "tag" + )); } // NOTE: disallowed in IOx but accepted in TSM @@ -564,13 +672,14 @@ m b=t 1639612800000000000 let lp = "m1,v=1i v=1i 0"; let err = lines_to_batches(lp, 5).expect_err("type conflicted write should fail"); - assert_matches!( - err, - Error::Write { - source: LineWriteError::MutableBatch { .. }, - line: 1 - } - ); + assert_matches!(err, + Error::PerLine { lines } if matches!( + &lines[..], + [LineError::Write { + source: LineWriteError::MutableBatch { .. }, + line: 1 + }] + )); } #[test] @@ -578,13 +687,49 @@ m b=t 1639612800000000000 let lp = "m1,v=1i v=1.0 0"; let err = lines_to_batches(lp, 5).expect_err("type conflicted write should fail"); - assert_matches!( - err, - Error::Write { - source: LineWriteError::MutableBatch { .. }, - line: 1 - } - ); + assert_matches!(err, + Error::PerLine { lines } if matches!( + &lines[..], + [LineError::Write { + source: LineWriteError::MutableBatch { .. }, + line: 1 + }] + )); } } + + #[test] + fn dont_add_batches_when_there_are_write_errors() { + let lp = r#"6,,=0,,=^/+\---6,,=yY\w\w\,y-/- (=" +\_/1 (=""#; + + let mut converter = LinesConverter::new(10); + let _errors = match converter.write_lp(lp) { + Ok(_) => vec![], + Err(Error::PerLine { lines }) => lines, + Err(other) => panic!("unexpected error: `{other}` input: `{lp}`"), + }; + + assert_error!(converter.finish(), Error::EmptyPayload); + } + + #[test] + fn dont_add_stats_when_there_are_write_errors() { + let lp = "cpu,tag1=v1,tag2=v2 val=2i 0 +cpu val=4u"; + + let mut converter = LinesConverter::new(10); + // The second line has a different type for val + converter.write_lp(lp).unwrap_err(); + let (batches, stats) = converter.finish().unwrap(); + + let total_rows: usize = batches.iter().map(|(_table, batch)| batch.rows()).sum(); + assert_eq!(stats.num_lines, total_rows); + } + + #[test] + fn duplicate_field_names_when_one_contains_optional_escaping_doesnt_panic() { + let lp = "table ,field=33,\\,field=333"; + lines_to_batches(lp, 5).unwrap(); + } } diff --git a/mutable_batch_pb/Cargo.toml b/mutable_batch_pb/Cargo.toml index c9bc27acbd0..5af7558c399 100644 --- a/mutable_batch_pb/Cargo.toml +++ b/mutable_batch_pb/Cargo.toml @@ -6,6 +6,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] arrow_util = { path = "../arrow_util" } dml = { path = "../dml" } @@ -13,9 +16,10 @@ generated_types = { path = "../generated_types" } hashbrown = { workspace = true } mutable_batch = { path = "../mutable_batch" } schema = { path = "../schema" } -snafu = "0.7" +snafu = "0.8" workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] -mutable_batch_lp = { path = "../mutable_batch_lp" } data_types = { path = "../data_types" } +mutable_batch_lp = { path = "../mutable_batch_lp" } +partition = { path = "../partition" } diff --git a/mutable_batch_pb/src/decode.rs b/mutable_batch_pb/src/decode.rs index 4c334b0ef39..5cb4fa528dc 100644 --- a/mutable_batch_pb/src/decode.rs +++ b/mutable_batch_pb/src/decode.rs @@ -4,6 +4,7 @@ use generated_types::influxdata::pbdata::v1::{ column::{SemanticType, Values as 
PbValues}, Column as PbColumn, DatabaseBatch, PackedStrings, TableBatch, }; +use generated_types::DecodeError; use hashbrown::{HashMap, HashSet}; use mutable_batch::{writer::Writer, MutableBatch}; use schema::{InfluxColumnType, InfluxFieldType, TIME_COLUMN_NAME}; @@ -49,6 +50,9 @@ pub enum Error { #[snafu(display("column \"{}\" contains more than one type of values", column))] MultipleValues { column: String }, + #[snafu(display("unknown type for column {column}: {source}"))] + UnknownType { source: DecodeError, column: String }, + #[snafu(display("cannot infer type for column: {}", column))] InvalidType { column: String }, } @@ -365,12 +369,14 @@ fn pb_column_type(col: &PbColumn) -> Result { })?; let value_type = pb_value_type(&col.column_name, values)?; - let semantic_type = SemanticType::from_i32(col.semantic_type); + let semantic_type = SemanticType::try_from(col.semantic_type).context(UnknownTypeSnafu { + column: &col.column_name, + })?; match (semantic_type, value_type) { - (Some(SemanticType::Tag), InfluxFieldType::String) => Ok(InfluxColumnType::Tag), - (Some(SemanticType::Field), field) => Ok(InfluxColumnType::Field(field)), - (Some(SemanticType::Time), InfluxFieldType::Integer) + (SemanticType::Tag, InfluxFieldType::String) => Ok(InfluxColumnType::Tag), + (SemanticType::Field, field) => Ok(InfluxColumnType::Field(field)), + (SemanticType::Time, InfluxFieldType::Integer) if col.column_name.as_str() == TIME_COLUMN_NAME => { Ok(InfluxColumnType::Timestamp) diff --git a/mutable_batch_pb/src/lib.rs b/mutable_batch_pb/src/lib.rs index f42f556264d..6babb30310c 100644 --- a/mutable_batch_pb/src/lib.rs +++ b/mutable_batch_pb/src/lib.rs @@ -20,6 +20,8 @@ use data_types as _; #[cfg(test)] use mutable_batch_lp as _; +#[cfg(test)] +use partition as _; use workspace_hack as _; pub mod decode; diff --git a/mutable_batch_pb/tests/encode.rs b/mutable_batch_pb/tests/encode.rs index f43c2208e9f..2fd818bd038 100644 --- a/mutable_batch_pb/tests/encode.rs +++ b/mutable_batch_pb/tests/encode.rs @@ -1,7 +1,8 @@ use arrow_util::assert_batches_eq; use data_types::PartitionKey; -use mutable_batch::{writer::Writer, MutableBatch, PartitionWrite, WritePayload}; +use mutable_batch::{writer::Writer, MutableBatch, WritePayload}; use mutable_batch_pb::{decode::write_table_batch, encode::encode_batch}; +use partition::PartitionWrite; use schema::Projection; #[test] diff --git a/mutable_batch_tests/Cargo.toml b/mutable_batch_tests/Cargo.toml index 5b44ada698f..d6e14947556 100644 --- a/mutable_batch_tests/Cargo.toml +++ b/mutable_batch_tests/Cargo.toml @@ -6,19 +6,26 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] flate2 = "1.0" [dev-dependencies] bytes = "1.5" -criterion = { version = "0.5", default-features = false, features = ["rayon"]} +criterion = { version = "0.5", default-features = false, features = ["rayon"] } data_types = { path = "../data_types", default-features = false } dml = { path = "../dml" } generated_types = { path = "../generated_types" } mutable_batch = { path = "../mutable_batch" } mutable_batch_lp = { path = "../mutable_batch_lp" } mutable_batch_pb = { path = "../mutable_batch_pb" } -prost = "0.11" +prost = { workspace = true } + +[[bench]] +name = "statistics" +harness = false [[bench]] name = "write_lp" @@ -28,7 +35,6 @@ harness = false name = "write_pb" harness = false - [lib] # Allow --save-baseline to work # https://github.com/bheisler/criterion.rs/issues/275 diff --git a/mutable_batch_tests/benches/statistics.rs 
b/mutable_batch_tests/benches/statistics.rs new file mode 100644 index 00000000000..ef8d14cf16f --- /dev/null +++ b/mutable_batch_tests/benches/statistics.rs @@ -0,0 +1,184 @@ +use std::hint::black_box; + +use criterion::{criterion_group, criterion_main, BatchSize, Criterion, Throughput}; + +use data_types::StatValues; +use mutable_batch::{ + column::{recompute_min_max, Column, ColumnData}, + writer::Writer, + MutableBatch, +}; + +const N_VALUES: usize = 16_000; // Must be multiple of 8 + +fn generate_f64() -> Column { + let mut mb = MutableBatch::default(); + + let mut w = Writer::new(&mut mb, N_VALUES); + let mask = std::iter::repeat(0b01010101) + .take(N_VALUES / 8) + .collect::>(); + + let values = (0..).map(|v| v as f64).take(N_VALUES / 2); + + w.write_f64("v", Some(mask.as_slice()), values) + .expect("failed to generate test column"); + + w.commit(); + + mb.column("v").unwrap().clone() +} + +fn generate_u64() -> Column { + let mut mb = MutableBatch::default(); + + let mut w = Writer::new(&mut mb, N_VALUES); + let mask = std::iter::repeat(0b01010101) + .take(N_VALUES / 8) + .collect::>(); + + let values = (0..).map(|v| v as u64).take(N_VALUES / 2); + + w.write_u64("v", Some(mask.as_slice()), values) + .expect("failed to generate test column"); + + w.commit(); + + mb.column("v").unwrap().clone() +} + +fn generate_bool() -> Column { + let mut mb = MutableBatch::default(); + + let mut w = Writer::new(&mut mb, N_VALUES); + let mask = std::iter::repeat(0b01010101) + .take(N_VALUES / 8) + .collect::>(); + + let values = (0..).map(|v| v & 1 == 0).take(N_VALUES / 2); + + w.write_bool("v", Some(mask.as_slice()), values) + .expect("failed to generate test column"); + + w.commit(); + + mb.column("v").unwrap().clone() +} + +fn generate_tag() -> Column { + let mut mb = MutableBatch::default(); + + let mut w = Writer::new(&mut mb, N_VALUES); + let mask = std::iter::repeat(0b01010101) + .take(N_VALUES / 8) + .collect::>(); + + let values = (0..) 
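// [Editor's note, not part of the patch] As in the other generators in this
// new benchmark, the 0b01010101 validity mask marks every other slot NULL, so
// only N_VALUES / 2 values are produced; the `% 100` below keeps the tag
// dictionary at 100 distinct entries. The benchmark is run via
// `cargo bench --bench statistics` from the mutable_batch_tests crate, per the
// [[bench]] entry added to its Cargo.toml above.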
+ .map(|v| (v % 100).to_string()) + .take(N_VALUES / 2) + .collect::>(); + + w.write_tag( + "v", + Some(mask.as_slice()), + values.iter().map(|v| v.as_str()), + ) + .expect("failed to generate test column"); + + w.commit(); + + mb.column("v").unwrap().clone() +} + +fn bench_rebuild(data: &mut Column) { + recompute_min_max(data); +} + +fn bench_stats(col: &Column) { + match col.data() { + ColumnData::F64(data, _) => { + let mut s = StatValues::new(None, None, N_VALUES as _, None); + for (i, v) in data.iter().enumerate() { + if col.valid_mask().get(i) { + s.update(v) + } + } + black_box(s); + } + ColumnData::I64(data, _) => { + let mut s = StatValues::new(None, None, N_VALUES as _, None); + for (i, v) in data.iter().enumerate() { + if col.valid_mask().get(i) { + s.update(v) + } + } + black_box(s); + } + ColumnData::U64(data, _) => { + let mut s = StatValues::new(None, None, N_VALUES as _, None); + for (i, v) in data.iter().enumerate() { + if col.valid_mask().get(i) { + s.update(v) + } + } + black_box(s); + } + ColumnData::Bool(data, _) => { + let mut s = StatValues::new(None, None, N_VALUES as _, None); + for (i, v) in data.iter().enumerate() { + if col.valid_mask().get(i) { + s.update(&v) + } + } + black_box(s); + } + ColumnData::String(data, _) => { + let mut s = StatValues::new(None, None, N_VALUES as _, None); + for (i, v) in data.iter().enumerate() { + if col.valid_mask().get(i) { + s.update(v) + } + } + black_box(s); + } + ColumnData::Tag(data, dict, _) => { + let mut s = StatValues::new(None, None, N_VALUES as _, None); + for (i, id) in data.iter().enumerate() { + if col.valid_mask().get(i) { + s.update(dict.lookup_id(*id).unwrap()) + } + } + black_box(s); + } + } +} + +fn run_bench(col: Column, c: &mut Criterion) { + let mut group = c.benchmark_group(col.data().to_string()); + group.throughput(Throughput::Bytes(col.size() as u64)); + group.bench_function("StatValues", |b| { + b.iter(|| { + bench_stats(&col); + }); + }); + group.bench_function("recompute_min_max", |b| { + b.iter_batched( + || col.clone(), + |mut col| { + bench_rebuild(&mut col); + }, + BatchSize::SmallInput, + ); + }); + group.finish(); +} + +pub fn bench_statistics(c: &mut Criterion) { + run_bench(generate_f64(), c); + run_bench(generate_u64(), c); + run_bench(generate_bool(), c); + run_bench(generate_tag(), c); +} + +criterion_group!(benches, bench_statistics); +criterion_main!(benches); diff --git a/object_store_metrics/Cargo.toml b/object_store_metrics/Cargo.toml index 216213bd47b..3a9488516b8 100644 --- a/object_store_metrics/Cargo.toml +++ b/object_store_metrics/Cargo.toml @@ -5,17 +5,20 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # In alphabetical order -async-trait = "0.1.73" +async-trait = "0.1.77" bytes = "1.5" futures = "0.3" iox_time = { version = "0.1.0", path = "../iox_time" } metric = { version = "0.1.0", path = "../metric" } object_store = { workspace = true } pin-project = "1.1.3" -tokio = { version = "1.32", features = ["io-util"] } +tokio = { version = "1.35", features = ["io-util"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] # In alphabetical order -snafu = "0.7" -tokio = { version = "1.32", features = ["macros", "io-util"] } +snafu = "0.8" +tokio = { version = "1.35", features = ["macros", "io-util"] } diff --git a/object_store_metrics/src/dummy.rs b/object_store_metrics/src/dummy.rs index d25325b1855..9960e2ed910 100644 --- a/object_store_metrics/src/dummy.rs +++ 
b/object_store_metrics/src/dummy.rs @@ -3,11 +3,13 @@ use async_trait::async_trait; use bytes::Bytes; +use futures::StreamExt; use snafu::Snafu; use std::ops::Range; use object_store::{ - path::Path, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, Result, + path::Path, Error as ObjectStoreError, GetOptions, GetResult, ListResult, MultipartId, + ObjectMeta, ObjectStore, PutOptions, PutResult, Result, }; use tokio::io::AsyncWrite; @@ -36,13 +38,13 @@ impl From for object_store::Error { #[derive(Debug, Clone)] #[allow(missing_copy_implementations)] /// An object store that always generates an error -pub struct DummyObjectStore { +pub(crate) struct DummyObjectStore { name: &'static str, } impl DummyObjectStore { /// Create a new [`DummyObjectStore`] that always fails - pub fn new(name: &'static str) -> Self { + pub(crate) fn new(name: &'static str) -> Self { Self { name } } } @@ -55,7 +57,12 @@ impl std::fmt::Display for DummyObjectStore { #[async_trait] impl ObjectStore for DummyObjectStore { - async fn put(&self, _location: &Path, _bytes: Bytes) -> Result<()> { + async fn put_opts( + &self, + _location: &Path, + _bytes: Bytes, + _opts: PutOptions, + ) -> Result { Ok(NotSupportedSnafu { name: self.name }.fail()?) } @@ -90,11 +97,16 @@ impl ObjectStore for DummyObjectStore { Ok(NotSupportedSnafu { name: self.name }.fail()?) } - async fn list( - &self, - _prefix: Option<&Path>, - ) -> Result>> { - Ok(NotSupportedSnafu { name: self.name }.fail()?) + fn list(&self, _prefix: Option<&Path>) -> futures::stream::BoxStream<'_, Result> { + futures::stream::once(async move { + NotSupportedSnafu { name: self.name } + .fail() + .map_err(|e| ObjectStoreError::Generic { + store: self.name, + source: Box::new(e), + }) + }) + .boxed() } async fn list_with_delimiter(&self, _prefix: Option<&Path>) -> Result { diff --git a/object_store_metrics/src/lib.rs b/object_store_metrics/src/lib.rs index c7d6812bdb8..09292baf450 100644 --- a/object_store_metrics/src/lib.rs +++ b/object_store_metrics/src/lib.rs @@ -14,7 +14,7 @@ unused_crate_dependencies )] -use object_store::{GetOptions, GetResultPayload}; +use object_store::{GetOptions, GetResultPayload, PutOptions, PutResult}; // Workaround for "unused crate" lint false positives. use workspace_hack as _; @@ -41,6 +41,118 @@ use tokio::io::AsyncWrite; #[cfg(test)] mod dummy; +#[derive(Debug, Clone)] +struct Metrics { + success_duration: DurationHistogram, + error_duration: DurationHistogram, +} + +impl Metrics { + fn new(registry: &metric::Registry, op: &'static str) -> Self { + // Call durations broken down by op & result + let duration: Metric = registry.register_metric( + "object_store_op_duration", + "object store operation duration", + ); + + Self { + success_duration: duration.recorder(&[("op", op), ("result", "success")]), + error_duration: duration.recorder(&[("op", op), ("result", "error")]), + } + } + + fn record(&self, t_begin: Time, t_end: Time, success: bool) { + // Avoid exploding if time goes backwards - simply drop the measurement + // if it happens. 
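// [Editor's note, not part of the patch] checked_duration_since() yields None
// when t_end precedes t_begin (the time provider is not guaranteed monotonic),
// so such a sample is silently discarded rather than recorded as zero or
// allowed to underflow.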
+ let Some(delta) = t_end.checked_duration_since(t_begin) else { + return; + }; + + if success { + self.success_duration.record(delta); + } else { + self.error_duration.record(delta); + } + } +} + +#[derive(Debug, Clone)] +struct MetricsWithBytes { + inner: Metrics, + success_bytes: U64Counter, + error_bytes: U64Counter, +} + +impl MetricsWithBytes { + fn new(registry: &metric::Registry, op: &'static str) -> Self { + // Byte counts up/down + let bytes = registry.register_metric::( + "object_store_transfer_bytes", + "cumulative count of file content bytes transferred to/from the object store", + ); + + Self { + inner: Metrics::new(registry, op), + success_bytes: bytes.recorder(&[("op", op), ("result", "success")]), + error_bytes: bytes.recorder(&[("op", op), ("result", "error")]), + } + } + + fn record_bytes_only(&self, success: bool, bytes: u64) { + if success { + self.success_bytes.inc(bytes); + } else { + self.error_bytes.inc(bytes); + } + } + + fn record(&self, t_begin: Time, t_end: Time, success: bool, bytes: Option) { + if let Some(bytes) = bytes { + self.record_bytes_only(success, bytes); + } + + self.inner.record(t_begin, t_end, success); + } +} + +#[derive(Debug, Clone)] +struct MetricsWithCount { + inner: Metrics, + success_count: U64Counter, + error_count: U64Counter, +} + +impl MetricsWithCount { + fn new(registry: &metric::Registry, op: &'static str) -> Self { + let count = registry.register_metric::( + "object_store_transfer_objects", + "cumulative count of objects transferred to/from the object store", + ); + + Self { + inner: Metrics::new(registry, op), + success_count: count.recorder(&[("op", op), ("result", "success")]), + error_count: count.recorder(&[("op", op), ("result", "error")]), + } + } + + fn record_count_only(&self, success: bool, count: u64) { + if success { + self.success_count.inc(count); + } else { + self.error_count.inc(count); + } + } + + fn record(&self, t_begin: Time, t_end: Time, success: bool, count: Option) { + if let Some(count) = count { + self.record_count_only(success, count); + } + + self.inner.record(t_begin, t_end, success); + } +} + /// An instrumentation decorator, wrapping an underlying [`ObjectStore`] /// implementation and recording bytes transferred and call latency. 
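// [Editor's sketch, not part of the patch] Wiring the decorator up, assuming
// the constructor shown further down and the InMemory store / SystemProvider
// already used by the tests in this file:
//
//     let registry = Arc::new(metric::Registry::default());
//     let time = Arc::new(SystemProvider::new());
//     let store = ObjectStoreMetrics::new(Arc::new(InMemory::new()), time, &registry);
//
// Every ObjectStore call made through `store` is then recorded under
// object_store_op_duration, object_store_transfer_bytes and
// object_store_transfer_objects, labelled by ("op", "result").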
/// @@ -92,26 +204,20 @@ pub struct ObjectStoreMetrics { inner: Arc, time_provider: Arc, - put_success_duration: DurationHistogram, - put_error_duration: DurationHistogram, - put_bytes: U64Counter, - - get_success_duration: DurationHistogram, - get_error_duration: DurationHistogram, - get_bytes: U64Counter, - - get_range_success_duration: DurationHistogram, - get_range_error_duration: DurationHistogram, - get_range_bytes: U64Counter, - - head_success_duration: DurationHistogram, - head_error_duration: DurationHistogram, - - delete_success_duration: DurationHistogram, - delete_error_duration: DurationHistogram, - - list_success_duration: DurationHistogram, - list_error_duration: DurationHistogram, + put: MetricsWithBytes, + get: MetricsWithBytes, + get_range: MetricsWithBytes, + get_ranges: MetricsWithBytes, + head: Metrics, + delete: Metrics, + delete_stream: MetricsWithCount, + list: MetricsWithCount, + list_with_offset: MetricsWithCount, + list_with_delimiter: MetricsWithCount, + copy: Metrics, + rename: Metrics, + copy_if_not_exists: Metrics, + rename_if_not_exists: Metrics, } impl ObjectStoreMetrics { @@ -121,65 +227,24 @@ impl ObjectStoreMetrics { time_provider: Arc, registry: &metric::Registry, ) -> Self { - // Byte counts up/down - let bytes = registry.register_metric::( - "object_store_transfer_bytes", - "cumulative count of file content bytes transferred to/from the object store", - ); - let put_bytes = bytes.recorder(&[("op", "put")]); - let get_bytes = bytes.recorder(&[("op", "get")]); - let get_range_bytes = bytes.recorder(&[("op", "get_range")]); - - // Call durations broken down by op & result - let duration: Metric = registry.register_metric( - "object_store_op_duration", - "object store operation duration", - ); - - let put_success_duration = duration.recorder(&[("op", "put"), ("result", "success")]); - let put_error_duration = duration.recorder(&[("op", "put"), ("result", "error")]); - - let get_success_duration = duration.recorder(&[("op", "get"), ("result", "success")]); - let get_error_duration = duration.recorder(&[("op", "get"), ("result", "error")]); - - let get_range_success_duration = - duration.recorder(&[("op", "get_range"), ("result", "success")]); - let get_range_error_duration = - duration.recorder(&[("op", "get_range"), ("result", "error")]); - - let head_success_duration = duration.recorder(&[("op", "head"), ("result", "success")]); - let head_error_duration = duration.recorder(&[("op", "head"), ("result", "error")]); - - let delete_success_duration = duration.recorder(&[("op", "delete"), ("result", "success")]); - let delete_error_duration = duration.recorder(&[("op", "delete"), ("result", "error")]); - - let list_success_duration = duration.recorder(&[("op", "list"), ("result", "success")]); - let list_error_duration = duration.recorder(&[("op", "list"), ("result", "error")]); - Self { inner, time_provider, - put_success_duration, - put_error_duration, - put_bytes, - - get_bytes, - get_success_duration, - get_error_duration, - - get_range_bytes, - get_range_success_duration, - get_range_error_duration, - - head_success_duration, - head_error_duration, - - delete_success_duration, - delete_error_duration, - - list_success_duration, - list_error_duration, + put: MetricsWithBytes::new(registry, "put"), + get: MetricsWithBytes::new(registry, "get"), + get_range: MetricsWithBytes::new(registry, "get_range"), + get_ranges: MetricsWithBytes::new(registry, "get_ranges"), + head: Metrics::new(registry, "head"), + delete: Metrics::new(registry, "delete"), + 
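// [Editor's note, not part of the patch] One helper instance per op replaces
// the hand-rolled duration/byte recorder fields above; ops that previously
// were not instrumented at all (get_ranges, delete_stream, list_with_offset,
// copy, rename and the *_if_not_exists variants), plus list_with_delimiter,
// which used to share the "list" label, each get their own "op" label here.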
delete_stream: MetricsWithCount::new(registry, "delete_stream"), + list: MetricsWithCount::new(registry, "list"), + list_with_offset: MetricsWithCount::new(registry, "list_with_offset"), + list_with_delimiter: MetricsWithCount::new(registry, "list_with_delimiter"), + copy: Metrics::new(registry, "copy"), + rename: Metrics::new(registry, "rename"), + copy_if_not_exists: Metrics::new(registry, "copy_if_not_exists"), + rename_if_not_exists: Metrics::new(registry, "rename_if_not_exists"), } } } @@ -192,22 +257,12 @@ impl std::fmt::Display for ObjectStoreMetrics { #[async_trait] impl ObjectStore for ObjectStoreMetrics { - async fn put(&self, location: &Path, bytes: Bytes) -> Result<()> { + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { let t = self.time_provider.now(); - let size = bytes.len(); - let res = self.inner.put(location, bytes).await; - self.put_bytes.inc(size as _); - - // Avoid exploding if time goes backwards - simply drop the measurement - // if it happens. - if let Some(delta) = self.time_provider.now().checked_duration_since(t) { - match &res { - Ok(_) => self.put_success_duration.record(delta), - Err(_) => self.put_error_duration.record(delta), - }; - } - + let res = self.inner.put_opts(location, bytes, opts).await; + self.put + .record(t, self.time_provider.now(), res.is_ok(), Some(size as _)); res } @@ -231,15 +286,12 @@ impl ObjectStore for ObjectStoreMetrics { Ok(mut res) => { res.payload = match res.payload { GetResultPayload::File(file, path) => { - // Record the file size in bytes and time the inner call took. - if let Ok(m) = file.metadata() { - self.get_bytes.inc(m.len()); - if let Some(d) = - self.time_provider.now().checked_duration_since(started_at) - { - self.get_success_duration.record(d) - } - } + self.get.record( + started_at, + self.time_provider.now(), + true, + file.metadata().ok().map(|m| m.len()), + ); GetResultPayload::File(file, path) } GetResultPayload::Stream(s) => { @@ -249,9 +301,7 @@ impl ObjectStore for ObjectStoreMetrics { StreamMetricRecorder::new( s, started_at, - self.get_success_duration.clone(), - self.get_error_duration.clone(), - BytesStreamDelegate(self.get_bytes.clone()), + BytesStreamDelegate::new(self.get.clone()), ) .fuse(), ))) @@ -260,10 +310,8 @@ impl ObjectStore for ObjectStoreMetrics { Ok(res) } Err(e) => { - // Record the call duration in the error histogram. - if let Some(delta) = self.time_provider.now().checked_duration_since(started_at) { - self.get_error_duration.record(delta); - } + self.get + .record(started_at, self.time_provider.now(), false, None); Err(e) } } @@ -271,113 +319,135 @@ impl ObjectStore for ObjectStoreMetrics { async fn get_range(&self, location: &Path, range: Range) -> Result { let t = self.time_provider.now(); - let res = self.inner.get_range(location, range).await; + self.get_range.record( + t, + self.time_provider.now(), + res.is_ok(), + res.as_ref().ok().map(|b| b.len() as _), + ); + res + } - // Avoid exploding if time goes backwards - simply drop the measurement - // if it happens. 
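// [Editor's note, not part of the patch] In the upgraded object_store API,
// list() returns the stream directly rather than a Result wrapped in a future,
// so there is no single call to time; the duration and the object count are
// recorded by the stream decorator below once the listing completes (or is
// dropped part-way through).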
- if let Some(delta) = self.time_provider.now().checked_duration_since(t) { - match &res { - Ok(data) => { - self.get_range_success_duration.record(delta); - self.get_range_bytes.inc(data.len() as _); - } - Err(_) => self.get_range_error_duration.record(delta), - }; - } - + async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { + let t = self.time_provider.now(); + let res = self.inner.get_ranges(location, ranges).await; + self.get_ranges.record( + t, + self.time_provider.now(), + res.is_ok(), + res.as_ref() + .ok() + .map(|b| b.iter().map(|b| b.len() as u64).sum()), + ); res } async fn head(&self, location: &Path) -> Result { let t = self.time_provider.now(); - let res = self.inner.head(location).await; - - // Avoid exploding if time goes backwards - simply drop the measurement - // if it happens. - if let Some(delta) = self.time_provider.now().checked_duration_since(t) { - match &res { - Ok(_) => self.head_success_duration.record(delta), - Err(_) => self.head_error_duration.record(delta), - }; - } - + self.head.record(t, self.time_provider.now(), res.is_ok()); res } async fn delete(&self, location: &Path) -> Result<()> { let t = self.time_provider.now(); - let res = self.inner.delete(location).await; + self.delete.record(t, self.time_provider.now(), res.is_ok()); + res + } - // Avoid exploding if time goes backwards - simply drop the measurement - // if it happens. - if let Some(delta) = self.time_provider.now().checked_duration_since(t) { - match &res { - Ok(_) => self.delete_success_duration.record(delta), - Err(_) => self.delete_error_duration.record(delta), - }; - } + fn delete_stream<'a>( + &'a self, + locations: BoxStream<'a, Result>, + ) -> BoxStream<'a, Result> { + let started_at = self.time_provider.now(); - res + let s = self.inner.delete_stream(locations); + + // Wrap the object store data stream in a decorator to track the + // yielded data / wall clock, inclusive of the inner call above. + StreamMetricRecorder::new( + s, + started_at, + CountStreamDelegate::new(self.delete_stream.clone()), + ) + .fuse() + .boxed() } - async fn list(&self, prefix: Option<&Path>) -> Result>> { + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { let started_at = self.time_provider.now(); - let res = self.inner.list(prefix).await; + let s = self.inner.list(prefix); - match res { - Ok(s) => { - // Wrap the object store data stream in a decorator to track the - // yielded data / wall clock, inclusive of the inner call above. - Ok(Box::pin(Box::new( - StreamMetricRecorder::new( - s, - started_at, - self.list_success_duration.clone(), - self.list_error_duration.clone(), - NopStreamDelegate::default(), - ) - .fuse(), - ))) - } - Err(e) => { - // Record the call duration in the error histogram. - if let Some(delta) = self.time_provider.now().checked_duration_since(started_at) { - self.list_error_duration.record(delta); - } - Err(e) - } - } + // Wrap the object store data stream in a decorator to track the + // yielded data / wall clock, inclusive of the inner call above. 
+ StreamMetricRecorder::new(s, started_at, CountStreamDelegate::new(self.list.clone())) + .fuse() + .boxed() } - async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { - let t = self.time_provider.now(); + fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> BoxStream<'_, Result> { + let started_at = self.time_provider.now(); - let res = self.inner.list_with_delimiter(prefix).await; + let s = self.inner.list_with_offset(prefix, offset); - // Avoid exploding if time goes backwards - simply drop the measurement - // if it happens. - if let Some(delta) = self.time_provider.now().checked_duration_since(t) { - match &res { - Ok(_) => self.list_success_duration.record(delta), - Err(_) => self.list_error_duration.record(delta), - }; - } + // Wrap the object store data stream in a decorator to track the + // yielded data / wall clock, inclusive of the inner call above. + StreamMetricRecorder::new( + s, + started_at, + CountStreamDelegate::new(self.list_with_offset.clone()), + ) + .fuse() + .boxed() + } + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + let t = self.time_provider.now(); + let res = self.inner.list_with_delimiter(prefix).await; + self.list_with_delimiter.record( + t, + self.time_provider.now(), + res.is_ok(), + res.as_ref().ok().map(|res| res.objects.len() as _), + ); res } async fn copy(&self, from: &Path, to: &Path) -> Result<()> { - // TODO: Instrument me - self.inner.copy(from, to).await + let t = self.time_provider.now(); + let res = self.inner.copy(from, to).await; + self.copy.record(t, self.time_provider.now(), res.is_ok()); + res + } + + async fn rename(&self, from: &Path, to: &Path) -> Result<()> { + let t = self.time_provider.now(); + let res = self.inner.rename(from, to).await; + self.rename.record(t, self.time_provider.now(), res.is_ok()); + res } async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { - // TODO: Instrument me - self.inner.copy_if_not_exists(from, to).await + let t = self.time_provider.now(); + let res = self.inner.copy_if_not_exists(from, to).await; + self.copy_if_not_exists + .record(t, self.time_provider.now(), res.is_ok()); + res + } + + async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + let t = self.time_provider.now(); + let res = self.inner.rename_if_not_exists(from, to).await; + self.rename_if_not_exists + .record(t, self.time_provider.now(), res.is_ok()); + res } } @@ -386,8 +456,12 @@ impl ObjectStore for ObjectStoreMetrics { trait MetricDelegate { /// The type this delegate observes. type Item; + /// Invoked when the stream yields an `Ok(Item)`. fn observe_ok(&self, value: &Self::Item); + + /// Finish stream. + fn finish(&self, t_begin: Time, t_end: Time, success: bool); } /// A [`MetricDelegate`] for instrumented streams of [`Bytes`]. @@ -395,30 +469,44 @@ trait MetricDelegate { /// This impl is used to record the number of bytes yielded for /// [`ObjectStore::get()`] calls. 
#[derive(Debug)] -struct BytesStreamDelegate(U64Counter); +struct BytesStreamDelegate(MetricsWithBytes); + +impl BytesStreamDelegate { + fn new(metrics: MetricsWithBytes) -> Self { + Self(metrics) + } +} impl MetricDelegate for BytesStreamDelegate { type Item = Bytes; fn observe_ok(&self, bytes: &Self::Item) { - self.0.inc(bytes.len() as _); + self.0.record_bytes_only(true, bytes.len() as _); + } + + fn finish(&self, t_begin: Time, t_end: Time, success: bool) { + self.0.record(t_begin, t_end, success, None); } } #[derive(Debug)] -struct NopStreamDelegate(PhantomData); +struct CountStreamDelegate(MetricsWithCount, PhantomData); -impl Default for NopStreamDelegate { - fn default() -> Self { - Self(Default::default()) +impl CountStreamDelegate { + fn new(metrics: MetricsWithCount) -> Self { + Self(metrics, Default::default()) } } -impl MetricDelegate for NopStreamDelegate { +impl MetricDelegate for CountStreamDelegate { type Item = T; fn observe_ok(&self, _value: &Self::Item) { - // it does nothing! + self.0.record_count_only(true, 1); + } + + fn finish(&self, t_begin: Time, t_end: Time, success: bool) { + self.0.record(t_begin, t_end, success, None); } } @@ -472,9 +560,6 @@ where // Called when the stream yields an `Ok(T)` to allow the delegate to inspect // the `T`. metric_delegate: D, - - success_duration: DurationHistogram, - error_duration: DurationHistogram, } impl StreamMetricRecorder @@ -482,13 +567,7 @@ where S: Stream, D: MetricDelegate, { - fn new( - stream: S, - started_at: Time, - success_duration: DurationHistogram, - error_duration: DurationHistogram, - metric_delegate: D, - ) -> Self { + fn new(stream: S, started_at: Time, metric_delegate: D) -> Self { let time_provider = SystemProvider::default(); Self { inner: stream, @@ -504,8 +583,6 @@ where started_at, time_provider, - success_duration, - error_duration, metric_delegate, } } @@ -542,21 +619,13 @@ where Poll::Ready(None) => { // The stream has terminated - record the wall clock duration // immediately. - let hist = match this.last_call_ok { - true => this.success_duration, - false => this.error_duration, - }; - - // Take the last_yielded_at option, marking metrics as emitted - // so the drop impl does not duplicate them. - if let Some(d) = this - .last_yielded_at - .take() - .expect("no last_yielded_at value for fused stream") - .checked_duration_since(*this.started_at) - { - hist.record(d) - } + this.metric_delegate.finish( + *this.started_at, + this.last_yielded_at + .take() + .expect("no last_yielded_at value for fused stream"), + *this.last_call_ok, + ); Poll::Ready(None) } @@ -581,14 +650,8 @@ where // Only emit metrics if the end of the stream was not observed (and // therefore last_yielded_at is still Some). 
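// [Editor's note, not part of the patch] When the stream runs to completion,
// Poll::Ready(None) above takes last_yielded_at, so this drop path becomes a
// no-op and the duration is not double-counted; for a stream dropped part-way
// through, finish() attributes the wall clock to the success or error
// histogram according to the last item observed.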
if let Some(last) = self.last_yielded_at { - let hist = match self.last_call_ok { - true => &self.success_duration, - false => &self.error_duration, - }; - - if let Some(d) = last.checked_duration_since(self.started_at) { - hist.record(d) - } + self.metric_delegate + .finish(self.started_at, last, self.last_call_ok); } } } @@ -601,7 +664,7 @@ mod tests { time::Duration, }; - use futures::stream; + use futures::{stream, TryStreamExt}; use metric::Attributes; use std::io::Read; @@ -610,6 +673,7 @@ mod tests { use super::*; + #[track_caller] fn assert_histogram_hit( metrics: &metric::Registry, name: &'static str, @@ -626,6 +690,24 @@ mod tests { assert!(hit_count > 0, "metric {name} did not record any calls"); } + #[track_caller] + fn assert_histogram_not_hit( + metrics: &metric::Registry, + name: &'static str, + attr: [(&'static str, &'static str); N], + ) { + let histogram = metrics + .get_instrument::>(name) + .expect("failed to read histogram") + .get_observer(&Attributes::from(&attr)) + .expect("failed to get observer") + .fetch(); + + let hit_count = histogram.sample_count(); + assert!(hit_count == 0, "metric {name} did record {hit_count} calls"); + } + + #[track_caller] fn assert_counter_value( metrics: &metric::Registry, name: &'static str, @@ -656,7 +738,12 @@ mod tests { .await .expect("put should succeed"); - assert_counter_value(&metrics, "object_store_transfer_bytes", [("op", "put")], 5); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "put"), ("result", "success")], + 5, + ); assert_histogram_hit( &metrics, "object_store_op_duration", @@ -679,7 +766,12 @@ mod tests { .await .expect_err("put should error"); - assert_counter_value(&metrics, "object_store_transfer_bytes", [("op", "put")], 5); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "put"), ("result", "error")], + 5, + ); assert_histogram_hit( &metrics, "object_store_op_duration", @@ -691,12 +783,71 @@ mod tests { async fn test_list() { let metrics = Arc::new(metric::Registry::default()); let store = Arc::new(InMemory::new()); + store + .put(&Path::from("foo"), Bytes::default()) + .await + .unwrap(); + store + .put(&Path::from("bar"), Bytes::default()) + .await + .unwrap(); + let time = Arc::new(SystemProvider::new()); + let store = ObjectStoreMetrics::new(store, time, &metrics); + + store.list(None).try_collect::>().await.unwrap(); + + assert_counter_value( + &metrics, + "object_store_transfer_objects", + [("op", "list"), ("result", "success")], + 2, + ); + assert_histogram_hit( + &metrics, + "object_store_op_duration", + [("op", "list"), ("result", "success")], + ); + } + + #[tokio::test] + async fn test_list_with_offset() { + let metrics = Arc::new(metric::Registry::default()); + let store = Arc::new(InMemory::new()); + store + .put(&Path::from("foo"), Bytes::default()) + .await + .unwrap(); + store + .put(&Path::from("bar"), Bytes::default()) + .await + .unwrap(); + store + .put(&Path::from("baz"), Bytes::default()) + .await + .unwrap(); let time = Arc::new(SystemProvider::new()); let store = ObjectStoreMetrics::new(store, time, &metrics); - store.list(None).await.expect("list should succeed"); + store + .list_with_offset(None, &Path::from("bar")) + .try_collect::>() + .await + .unwrap(); + assert_counter_value( + &metrics, + "object_store_transfer_objects", + [("op", "list_with_offset"), ("result", "success")], + 2, + ); assert_histogram_hit( + &metrics, + "object_store_op_duration", + [("op", "list_with_offset"), ("result", "success")], + ); + + // NOT raw 
`list` call + assert_histogram_not_hit( &metrics, "object_store_op_duration", [("op", "list"), ("result", "success")], @@ -710,7 +861,10 @@ mod tests { let time = Arc::new(SystemProvider::new()); let store = ObjectStoreMetrics::new(store, time, &metrics); - assert!(store.list(None).await.is_err(), "mock configured to fail"); + assert!( + store.list(None).try_collect::>().await.is_err(), + "mock configured to fail" + ); assert_histogram_hit( &metrics, @@ -734,7 +888,7 @@ mod tests { assert_histogram_hit( &metrics, "object_store_op_duration", - [("op", "list"), ("result", "success")], + [("op", "list_with_delimiter"), ("result", "success")], ); } @@ -756,7 +910,7 @@ mod tests { assert_histogram_hit( &metrics, "object_store_op_duration", - [("op", "list"), ("result", "error")], + [("op", "list_with_delimiter"), ("result", "error")], ); } @@ -817,6 +971,212 @@ mod tests { ); } + #[tokio::test] + async fn test_getranges() { + let metrics = Arc::new(metric::Registry::default()); + let store = Arc::new(InMemory::new()); + store + .put(&Path::from("foo"), Bytes::from_static(b"bar")) + .await + .unwrap(); + let time = Arc::new(SystemProvider::new()); + let store = ObjectStoreMetrics::new(store, time, &metrics); + + store + .get_ranges(&Path::from("foo"), &[0..2, 1..2, 0..1]) + .await + .unwrap(); + + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "get_ranges"), ("result", "success")], + 4, + ); + assert_histogram_hit( + &metrics, + "object_store_op_duration", + [("op", "get_ranges"), ("result", "success")], + ); + + // NO `get_range` used! + assert_histogram_not_hit( + &metrics, + "object_store_op_duration", + [("op", "get_range"), ("result", "success")], + ); + } + + #[tokio::test] + async fn test_copy() { + let metrics = Arc::new(metric::Registry::default()); + let store = Arc::new(InMemory::new()); + store + .put(&Path::from("foo"), Bytes::default()) + .await + .unwrap(); + let time = Arc::new(SystemProvider::new()); + let store = ObjectStoreMetrics::new(store, time, &metrics); + + store + .copy(&Path::from("foo"), &Path::from("bar")) + .await + .unwrap(); + + assert_histogram_hit( + &metrics, + "object_store_op_duration", + [("op", "copy"), ("result", "success")], + ); + } + + #[tokio::test] + async fn test_copy_if_not_exists() { + let metrics = Arc::new(metric::Registry::default()); + let store = Arc::new(InMemory::new()); + store + .put(&Path::from("foo"), Bytes::default()) + .await + .unwrap(); + let time = Arc::new(SystemProvider::new()); + let store = ObjectStoreMetrics::new(store, time, &metrics); + + store + .copy_if_not_exists(&Path::from("foo"), &Path::from("bar")) + .await + .unwrap(); + + assert_histogram_hit( + &metrics, + "object_store_op_duration", + [("op", "copy_if_not_exists"), ("result", "success")], + ); + } + + #[tokio::test] + async fn test_rename() { + let metrics = Arc::new(metric::Registry::default()); + let store = Arc::new(InMemory::new()); + store + .put(&Path::from("foo"), Bytes::default()) + .await + .unwrap(); + let time = Arc::new(SystemProvider::new()); + let store = ObjectStoreMetrics::new(store, time, &metrics); + + store + .rename(&Path::from("foo"), &Path::from("bar")) + .await + .unwrap(); + + assert_histogram_hit( + &metrics, + "object_store_op_duration", + [("op", "rename"), ("result", "success")], + ); + + // NO `copy`/`delete` used! 
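// [Editor's note, not part of the patch] These negative assertions pin down
// that the decorator forwards rename() to the inner store as a single
// instrumented op rather than decomposing it into copy + delete, so only the
// "rename" histograms are expected to move.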
+ assert_histogram_not_hit( + &metrics, + "object_store_op_duration", + [("op", "copy"), ("result", "success")], + ); + assert_histogram_not_hit( + &metrics, + "object_store_op_duration", + [("op", "delete"), ("result", "success")], + ); + } + + #[tokio::test] + async fn test_rename_if_not_exists() { + let metrics = Arc::new(metric::Registry::default()); + let store = Arc::new(InMemory::new()); + store + .put(&Path::from("foo"), Bytes::default()) + .await + .unwrap(); + let time = Arc::new(SystemProvider::new()); + let store = ObjectStoreMetrics::new(store, time, &metrics); + + store + .rename_if_not_exists(&Path::from("foo"), &Path::from("bar")) + .await + .unwrap(); + + assert_histogram_hit( + &metrics, + "object_store_op_duration", + [("op", "rename_if_not_exists"), ("result", "success")], + ); + + // NO `copy`/`copy_if_not_exists`/`delete` used! + assert_histogram_not_hit( + &metrics, + "object_store_op_duration", + [("op", "copy"), ("result", "success")], + ); + assert_histogram_not_hit( + &metrics, + "object_store_op_duration", + [("op", "copy_if_not_exists"), ("result", "success")], + ); + assert_histogram_not_hit( + &metrics, + "object_store_op_duration", + [("op", "delete"), ("result", "success")], + ); + } + + #[tokio::test] + async fn test_delete_stream() { + let metrics = Arc::new(metric::Registry::default()); + let store = Arc::new(InMemory::new()); + store + .put(&Path::from("foo"), Bytes::default()) + .await + .unwrap(); + store + .put(&Path::from("bar"), Bytes::default()) + .await + .unwrap(); + store + .put(&Path::from("baz"), Bytes::default()) + .await + .unwrap(); + let time = Arc::new(SystemProvider::new()); + let store = ObjectStoreMetrics::new(store, time, &metrics); + + store + .delete_stream( + stream::iter(["foo", "baz"]) + .map(|s| Ok(Path::from(s))) + .boxed(), + ) + .try_collect::>() + .await + .unwrap(); + + assert_counter_value( + &metrics, + "object_store_transfer_objects", + [("op", "delete_stream"), ("result", "success")], + 2, + ); + assert_histogram_hit( + &metrics, + "object_store_op_duration", + [("op", "delete_stream"), ("result", "success")], + ); + + // NOT raw `delete` call + assert_histogram_not_hit( + &metrics, + "object_store_op_duration", + [("op", "delete"), ("result", "success")], + ); + } + #[tokio::test] async fn test_put_get_getrange_head_delete_file() { let metrics = Arc::new(metric::Registry::default()); @@ -844,7 +1204,12 @@ mod tests { v => panic!("not a file: {v:?}"), } - assert_counter_value(&metrics, "object_store_transfer_bytes", [("op", "get")], 5); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "get"), ("result", "success")], + 5, + ); assert_histogram_hit( &metrics, "object_store_op_duration", @@ -858,7 +1223,7 @@ mod tests { assert_counter_value( &metrics, "object_store_transfer_bytes", - [("op", "get_range")], + [("op", "get_range"), ("result", "success")], 3, ); assert_histogram_hit( @@ -905,7 +1270,12 @@ mod tests { v => panic!("not a stream: {v:?}"), } - assert_counter_value(&metrics, "object_store_transfer_bytes", [("op", "get")], 5); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "get"), ("result", "success")], + 5, + ); assert_histogram_hit( &metrics, "object_store_op_duration", @@ -930,21 +1300,12 @@ mod tests { let time_provider = SystemProvider::default(); let metrics = Arc::new(metric::Registry::default()); - let hist: Metric = metrics.register_metric("wall_clock", ""); - - let bytes = metrics - .register_metric::( - "object_store_transfer_bytes", - 
"cumulative count of file content bytes transferred to/from the object store", - ) - .recorder(&[]); + let m = MetricsWithBytes::new(&metrics, "test"); let mut stream = StreamMetricRecorder::new( inner, time_provider.now(), - hist.recorder(&[("result", "success")]), - hist.recorder(&[("result", "error")]), - BytesStreamDelegate(bytes), + BytesStreamDelegate::new(m.clone()), ); let got = stream @@ -953,7 +1314,12 @@ mod tests { .expect("should yield data") .expect("should succeed"); assert_eq!(got.len(), 1); - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 1); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 1, + ); // Sleep at least 10ms to assert the recorder to captures the wall clock // time. @@ -966,11 +1332,14 @@ mod tests { .expect("should yield data") .expect("should succeed"); assert_eq!(got.len(), 3); - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 4); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 4, + ); - let success_hist = hist - .get_observer(&metric::Attributes::from(&[("result", "success")])) - .expect("failed to get observer"); + let success_hist = &m.inner.success_duration; // Until the stream is fully consumed, there should be no wall clock // metrics emitted. @@ -983,7 +1352,12 @@ mod tests { // recorded. let hit_count = success_hist.fetch().sample_count(); assert_eq!(hit_count, 1, "wall clock duration recorded incorrectly"); - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 4); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 4, + ); // And it must be in a SLEEP or higher bucket. let hit_count: u64 = success_hist @@ -1002,7 +1376,12 @@ mod tests { drop(stream); let hit_count = success_hist.fetch().sample_count(); assert_eq!(hit_count, 1, "wall clock duration duplicated"); - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 4); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 4, + ); } // Ensures the stream decorator correctly records the wall clock duration @@ -1022,21 +1401,12 @@ mod tests { let time_provider = SystemProvider::default(); let metrics = Arc::new(metric::Registry::default()); - let hist: Metric = metrics.register_metric("wall_clock", ""); - - let bytes = metrics - .register_metric::( - "object_store_transfer_bytes", - "cumulative count of file content bytes transferred to/from the object store", - ) - .recorder(&[]); + let m = MetricsWithBytes::new(&metrics, "test"); let mut stream = StreamMetricRecorder::new( inner, time_provider.now(), - hist.recorder(&[("result", "success")]), - hist.recorder(&[("result", "error")]), - BytesStreamDelegate(bytes), + BytesStreamDelegate::new(m.clone()), ); let got = stream @@ -1045,7 +1415,12 @@ mod tests { .expect("should yield data") .expect("should succeed"); assert_eq!(got.len(), 1); - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 1); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 1, + ); // Sleep at least 10ms to assert the recorder to captures the wall clock // time. @@ -1057,15 +1432,16 @@ mod tests { // Now the stream is complete, the wall clock duration must have been // recorded. 
- let hit_count = hist - .get_observer(&metric::Attributes::from(&[("result", "success")])) - .expect("failed to get observer") - .fetch() - .sample_count(); + let hit_count = m.inner.success_duration.fetch().sample_count(); assert_eq!(hit_count, 1, "wall clock duration recorded incorrectly"); // And the number of bytes read must match the pre-drop value. - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 1); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 1, + ); } // Ensures the stream decorator records the wall clock duration into the @@ -1085,21 +1461,12 @@ mod tests { let time_provider = SystemProvider::default(); let metrics = Arc::new(metric::Registry::default()); - let hist: Metric = metrics.register_metric("wall_clock", ""); - - let bytes = metrics - .register_metric::( - "object_store_transfer_bytes", - "cumulative count of file content bytes transferred to/from the object store", - ) - .recorder(&[]); + let m = MetricsWithBytes::new(&metrics, "test"); let mut stream = StreamMetricRecorder::new( inner, time_provider.now(), - hist.recorder(&[("result", "success")]), - hist.recorder(&[("result", "error")]), - BytesStreamDelegate(bytes), + BytesStreamDelegate::new(m.clone()), ); let got = stream @@ -1108,7 +1475,18 @@ mod tests { .expect("should yield data") .expect("should succeed"); assert_eq!(got.len(), 1); - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 1); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 1, + ); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "error")], + 0, + ); let _err = stream .next() @@ -1120,15 +1498,22 @@ mod tests { drop(stream); // Ensure the wall clock was added to the "error" histogram. 
- let hit_count = hist - .get_observer(&metric::Attributes::from(&[("result", "error")])) - .expect("failed to get observer") - .fetch() - .sample_count(); + let hit_count = m.inner.error_duration.fetch().sample_count(); assert_eq!(hit_count, 1, "wall clock duration recorded incorrectly"); // And the number of bytes read must match - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 1); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 1, + ); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "error")], + 0, + ); } // Ensures the stream decorator records the wall clock duration into the @@ -1148,21 +1533,12 @@ mod tests { let time_provider = SystemProvider::default(); let metrics = Arc::new(metric::Registry::default()); - let hist: Metric = metrics.register_metric("wall_clock", ""); - - let bytes = metrics - .register_metric::( - "object_store_transfer_bytes", - "cumulative count of file content bytes transferred to/from the object store", - ) - .recorder(&[]); + let m = MetricsWithBytes::new(&metrics, "test"); let mut stream = StreamMetricRecorder::new( inner, time_provider.now(), - hist.recorder(&[("result", "success")]), - hist.recorder(&[("result", "error")]), - BytesStreamDelegate(bytes), + BytesStreamDelegate::new(m.clone()), ); let got = stream @@ -1171,7 +1547,12 @@ mod tests { .expect("should yield data") .expect("should succeed"); assert_eq!(got.len(), 1); - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 1); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 1, + ); let _err = stream .next() @@ -1185,22 +1566,28 @@ mod tests { .expect("should yield data") .expect("should succeed"); assert_eq!(got.len(), 3); - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 4); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 4, + ); // Drop after observing an error drop(stream); // Ensure the wall clock was added to the "success" histogram after // progressing past the transient error. 
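// [Editor's note, not part of the patch] last_call_ok tracks only the most
// recent poll result, so a transient mid-stream error followed by further
// successful items is recorded in the success histogram; only a stream whose
// final observed item was an error lands in the error histogram.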
- let hit_count = hist - .get_observer(&metric::Attributes::from(&[("result", "success")])) - .expect("failed to get observer") - .fetch() - .sample_count(); + let hit_count = m.inner.success_duration.fetch().sample_count(); assert_eq!(hit_count, 1, "wall clock duration recorded incorrectly"); // And the number of bytes read must match - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 4); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 4, + ); } // Ensures the wall clock time recorded by the stream decorator includes the @@ -1216,36 +1603,28 @@ mod tests { let time_provider = SystemProvider::default(); let metrics = Arc::new(metric::Registry::default()); - let hist: Metric = metrics.register_metric("wall_clock", ""); - - let bytes = metrics - .register_metric::( - "object_store_transfer_bytes", - "cumulative count of file content bytes transferred to/from the object store", - ) - .recorder(&[]); + let m = MetricsWithBytes::new(&metrics, "test"); let stream = StreamMetricRecorder::new( inner, time_provider.now(), - hist.recorder(&[("result", "success")]), - hist.recorder(&[("result", "error")]), - BytesStreamDelegate(bytes), + BytesStreamDelegate::new(m.clone()), ); // Drop immediately drop(stream); // Ensure the wall clock was added to the "success" histogram - let hit_count = hist - .get_observer(&metric::Attributes::from(&[("result", "success")])) - .expect("failed to get observer") - .fetch() - .sample_count(); + let hit_count = m.inner.success_duration.fetch().sample_count(); assert_eq!(hit_count, 1, "wall clock duration recorded incorrectly"); // And the number of bytes read must match - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 0); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 0, + ); } // Ensures the wall clock time recorded by the stream decorator emits a wall @@ -1260,35 +1639,27 @@ mod tests { let time_provider = SystemProvider::default(); let metrics = Arc::new(metric::Registry::default()); - let hist: Metric = metrics.register_metric("wall_clock", ""); - - let bytes = metrics - .register_metric::( - "object_store_transfer_bytes", - "cumulative count of file content bytes transferred to/from the object store", - ) - .recorder(&[]); + let m = MetricsWithBytes::new(&metrics, "test"); let mut stream = StreamMetricRecorder::new( inner, time_provider.now(), - hist.recorder(&[("result", "success")]), - hist.recorder(&[("result", "error")]), - BytesStreamDelegate(bytes), + BytesStreamDelegate::new(m.clone()), ); assert!(stream.next().await.is_none()); // Ensure the wall clock was added to the "success" histogram even // though it yielded no data. 
- let hit_count = hist - .get_observer(&metric::Attributes::from(&[("result", "success")])) - .expect("failed to get observer") - .fetch() - .sample_count(); + let hit_count = m.inner.success_duration.fetch().sample_count(); assert_eq!(hit_count, 1, "wall clock duration recorded incorrectly"); // And the number of bytes read must match - assert_counter_value(&metrics, "object_store_transfer_bytes", [], 0); + assert_counter_value( + &metrics, + "object_store_transfer_bytes", + [("op", "test"), ("result", "success")], + 0, + ); } } diff --git a/observability_deps/Cargo.toml b/observability_deps/Cargo.toml index 20b64a89860..a24de4fb002 100644 --- a/observability_deps/Cargo.toml +++ b/observability_deps/Cargo.toml @@ -6,6 +6,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # In alphabetical order tracing = { version = "0.1", features = ["max_level_trace"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/panic_logging/Cargo.toml b/panic_logging/Cargo.toml index 75010930e0a..3c704e26821 100644 --- a/panic_logging/Cargo.toml +++ b/panic_logging/Cargo.toml @@ -5,6 +5,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # In alphabetical order metric = { path = "../metric" } observability_deps = { path = "../observability_deps" } diff --git a/panic_logging/src/lib.rs b/panic_logging/src/lib.rs index c1413b536c1..ef17394f2fe 100644 --- a/panic_logging/src/lib.rs +++ b/panic_logging/src/lib.rs @@ -60,7 +60,17 @@ impl SendPanicsToTracing { if let Some(metrics) = &metrics { metrics.inc(panic_type); } - error!(panic_type=panic_type.name(), panic_info=%info, "Thread panic"); + + let location = info.location(); + error!( + panic_type = panic_type.name(), + panic_message = message(info), + panic_file = location.map(|l| l.file()), + panic_line = location.map(|l| l.line()), + panic_column = location.map(|l| l.column()), + "Thread panic", + ); + current_panic_hook(info); })); @@ -149,20 +159,23 @@ impl PanicType { } fn classify(panic_info: &PanicInfo<'_>) -> Self { - let payload_any = panic_info.payload(); - - let maybe_msg = payload_any - .downcast_ref::<&str>() - .copied() - .or(payload_any.downcast_ref::().map(|s| s.as_str())); - - match maybe_msg { + match message(panic_info) { Some("offset overflow" | "offset") => Self::OffsetOverflow, _ => Self::Unknown, } } } +/// Extract string message from [`PanicInfo`] +fn message<'a>(panic_info: &'a PanicInfo<'a>) -> Option<&'a str> { + let payload_any = panic_info.payload(); + + payload_any + .downcast_ref::<&str>() + .copied() + .or(payload_any.downcast_ref::().map(|s| s.as_str())) +} + /// Metrics used for panics. 
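/// Counts observed panics grouped by classified [`PanicType`]; the tests below
/// assert on the "offset_overflow" and "unknown" buckets via `assert_count`.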
#[derive(Debug)] struct Metrics { @@ -195,6 +208,8 @@ impl Metrics { #[cfg(test)] mod tests { + use std::panic::panic_any; + use metric::{Attributes, Metric}; use test_helpers::{maybe_start_logging, tracing::TracingCapture}; @@ -246,6 +261,14 @@ mod tests { .join() .expect_err("wat"); + let capture2 = Arc::clone(&capture); + std::thread::spawn(move || { + capture2.register_in_current_thread(); + panic_any(1); + }) + .join() + .expect_err("wat"); + drop(guard); let capture2 = Arc::clone(&capture); std::thread::spawn(move || { @@ -256,16 +279,14 @@ mod tests { .expect_err("wat"); assert_count(&metrics, "offset_overflow", 2); - assert_count(&metrics, "unknown", 1); + assert_count(&metrics, "unknown", 2); assert_eq!( capture.to_string(), - "level = ERROR; message = Thread panic; panic_type = \"unknown\"; panic_info = panicked at panic_logging/src/lib.rs:227:13:\n\ - it's bananas; \n\ - level = ERROR; message = Thread panic; panic_type = \"offset_overflow\"; panic_info = panicked at panic_logging/src/lib.rs:235:13:\n\ - offset; \n\ - level = ERROR; message = Thread panic; panic_type = \"offset_overflow\"; panic_info = panicked at panic_logging/src/lib.rs:244:13:\n\ - offset overflow; " + "level = ERROR; message = Thread panic; panic_type = \"unknown\"; panic_message = \"it's bananas\"; panic_file = \"panic_logging/src/lib.rs\"; panic_line = 242; panic_column = 13; \n\ + level = ERROR; message = Thread panic; panic_type = \"offset_overflow\"; panic_message = \"offset\"; panic_file = \"panic_logging/src/lib.rs\"; panic_line = 250; panic_column = 13; \n\ + level = ERROR; message = Thread panic; panic_type = \"offset_overflow\"; panic_message = \"offset overflow\"; panic_file = \"panic_logging/src/lib.rs\"; panic_line = 259; panic_column = 13; \n\ + level = ERROR; message = Thread panic; panic_type = \"unknown\"; panic_file = \"panic_logging/src/lib.rs\"; panic_line = 267; panic_column = 13; " ); } } diff --git a/parquet_cache/Cargo.toml b/parquet_cache/Cargo.toml new file mode 100644 index 00000000000..7fc1b907166 --- /dev/null +++ b/parquet_cache/Cargo.toml @@ -0,0 +1,60 @@ +[package] +name = "parquet_cache" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[features] +test-with-server-port = [] + +[dependencies] +arc-swap = "1.6.0" +async-channel = "2.1.1" +async-trait = "0.1.77" +backoff = { path = "../backoff" } +bytes = "1.5.0" +chrono = "0.4.31" +data_types = { path = "../data_types" } +fnv = "1.0.7" +futures = "0.3.30" +http = "0.2.11" +hyper = { version = "0.14.27", features = ["http2"] } +iox_catalog = { path = "../iox_catalog" } +k8s-openapi = { version = "0.20.0", features = ["schemars", "earliest"] } +kube = { version = "0.87.1", features = ["runtime", "client", "derive"] } +moka = { version = "0.12.3", features = ["future"] } +mpchash = "1.2.1" +notify = "6.1.1" +object_store = { workspace = true } +observability_deps = { path = "../observability_deps" } +parking_lot = "0.12.1" +parquet_file = { path = "../parquet_file" } +pin-project = "1.1.3" +reqwest = { version = "0.11", default-features = false, features = ["json", "rustls-tls-native-roots"] } +schemars = "0.8.16" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0.111" +thiserror = "1.0.56" +tokio = "1.35.1" +tokio-util = { version = "0.7.10", features = ["codec"] } +tower = "0.4.13" +url = "2.5.0" +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] +ahash = "0.8.7" +assert_matches = 
"1.5.0" +bytes = "1.5.0" +iox_tests = { path = "../iox_tests" } +iox_time = { path = "../iox_time" } +kube_test = { path = "../kube_test" } +lazy_static = "1.4.0" +rand = "0.8.5" +tempfile = "3.9.0" +tokio-stream = "0.1.14" +uuid = "1.6.1" + diff --git a/parquet_cache/src/client.rs b/parquet_cache/src/client.rs new file mode 100644 index 00000000000..0f767f588ab --- /dev/null +++ b/parquet_cache/src/client.rs @@ -0,0 +1,16 @@ +//! Contains the cache client. + +/// Interface for the object store. Consumed by Iox components. +pub mod object_store; +/// Interface for write hinting. Consumed by Iox components. +pub mod write_hints; + +/// Connection to remote data cache. Used by the ObjectStore cache impl. +pub(crate) mod cache_connector; +pub(crate) mod http; +pub(crate) mod keyspace; +pub(crate) mod request; + +/// Mocks used for internal testing +#[cfg(test)] +pub(crate) mod mock; diff --git a/parquet_cache/src/client/cache_connector.rs b/parquet_cache/src/client/cache_connector.rs new file mode 100644 index 00000000000..6ec3c185bc7 --- /dev/null +++ b/parquet_cache/src/client/cache_connector.rs @@ -0,0 +1,37 @@ +use std::fmt::Debug; + +use tower::{Layer, ServiceBuilder}; + +use super::{http::HttpService, keyspace::HostKeyspaceService}; + +pub type ClientCacheConnector = HostKeyspaceService; + +/// Data cache errors. +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// Failure getting data from the cache. + #[error("Fetch error: {0}")] + FetchData(#[from] super::keyspace::Error), + + /// Failure reading the (already fetched) data from cache. + #[error("Data error: {0}")] + ReadData(String), +} + +/// Builder for the cache connector service. +pub fn build_cache_connector(ns_service_addr: impl ToString) -> ClientCacheConnector { + ServiceBuilder::new() + .layer(MapToHost(ns_service_addr.to_string())) + .service(HttpService::new()) +} + +#[derive(Debug)] +struct MapToHost(pub String); + +impl Layer for MapToHost { + type Service = HostKeyspaceService; + + fn layer(&self, service: S) -> Self::Service { + HostKeyspaceService::new(service, self.0.clone()) + } +} diff --git a/parquet_cache/src/client/http.rs b/parquet_cache/src/client/http.rs new file mode 100644 index 00000000000..6bac49e0ff5 --- /dev/null +++ b/parquet_cache/src/client/http.rs @@ -0,0 +1,62 @@ +use std::{pin::Pin, sync::Arc, task::Poll}; + +use futures::Future; +use hyper::{client::HttpConnector, Body, Client, Request, Response, StatusCode}; +use tower::Service; + +use super::request::{PinnedFuture, RawRequest}; + +#[derive(Debug, Clone)] +pub struct HttpService { + /// Pool of connections. 
+ client: Arc>, +} + +impl HttpService { + pub fn new() -> Self { + let client = Client::builder() + .http2_keep_alive_while_idle(true) + .http2_only(true) + .retry_canceled_requests(true) + .build_http::(); + + Self { + client: Arc::new(client), + } + } +} + +impl Default for HttpService { + fn default() -> Self { + Self::new() + } +} + +impl Service for HttpService { + type Response = Response; + type Error = hyper::Error; + type Future = PinnedFuture; + + fn poll_ready(&mut self, _cx: &mut std::task::Context<'_>) -> Poll> { + Poll::Ready(Ok(())) + } + + fn call(&mut self, req: RawRequest) -> Self::Future { + match Request::::try_from(req) { + Ok(req) => Box::pin(self.client.request(req)), + Err(e) => invalid_request(e), + } + } +} + +fn invalid_request( + error: impl std::error::Error, +) -> Pin, hyper::Error>> + Send>> { + let (mut parts, _) = Response::new("invalid request").into_parts(); + parts.status = StatusCode::BAD_REQUEST; + + let body = Body::from( + serde_json::json!({"status": 400, "description": error.to_string()}).to_string(), + ); + Box::pin(futures::future::ok(Response::from_parts(parts, body))) +} diff --git a/parquet_cache/src/client/keyspace.rs b/parquet_cache/src/client/keyspace.rs new file mode 100644 index 00000000000..d65a0b5d0d5 --- /dev/null +++ b/parquet_cache/src/client/keyspace.rs @@ -0,0 +1,314 @@ +use std::{collections::HashMap, sync::Arc, task::Poll}; + +use arc_swap::ArcSwap; +use backoff::{Backoff, BackoffConfig}; +use bytes::Buf; +use http::uri::Authority; +use hyper::{Body, Method, Response, StatusCode, Uri}; +use mpchash::HashRing; +use observability_deps::tracing::warn; +use tokio::sync::OnceCell; +use tower::{Service, ServiceExt}; + +use super::request::{PinnedFuture, RawRequest}; +use crate::data_types::{KeyspaceResponseBody, ServiceNode, ServiceNodeId}; + +/// Errors associated fetching data from the cache. +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// Generic connection failure to remote data cache service. + #[error("Connection error: {0}")] + Connection(#[from] hyper::Error), + + /// Error in constructing request. + #[error("Request error: {0}")] + Request(String), + + /// Error with hashring keyspace + #[error("Keyspace error: {0}")] + Keyspace(String), + + /// Invalid addr + #[error("Invalid addr: {0}")] + InvalidAddr(#[from] http::uri::InvalidUri), + + /// Failure reading data from cache. + #[error("Data error: {0}")] + ReadData(String), +} + +#[derive(Debug, Clone)] +pub struct HostKeyspaceService { + /// Inner service + service: S, + /// Namespace service addr (for requests to any cache server). + dst: String, + /// Inner state + inner: Arc, +} + +impl HostKeyspaceService { + /// Create keyspace middleware [`HostKeyspaceService`] + pub fn new(service: S, dst: String) -> Self { + Self { + service, + dst, + inner: Default::default(), + } + } +} + +impl HostKeyspaceService +where + S: Clone + Send + Sync + Service, Error = hyper::Error>, + for<'b> >::Future: std::marker::Send + 'b, +{ + /// Primary goal of [`HostKeyspaceService`] is to add the host to the [`RawRequest`]. + async fn add_host_to_request(&mut self, mut req: RawRequest) -> Result { + let host = match &req.key { + Some(obj_key) => self.hostname(obj_key).await?, + None => self.dst.clone(), // k8s namespace service addr + }; + + req.uri_parts.authority = + Some(Authority::from_maybe_shared(host).map_err(Error::InvalidAddr)?); + + Ok(req) + } + + /// Hostname provided based upon hashed keyspace. + /// Lookup, if missing the re-query service for the latest keyspace. 
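    /// The flow: hash the object key onto the ring to get a node id, look the
    /// id up in the node-to-hostname table, and on a miss refetch the keyspace
    /// from the namespace service (the `/keyspace` endpoint on `self.dst`),
    /// rebuild the ring and table, and retry the lookup once; a second miss
    /// surfaces as `Error::Keyspace`.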
+ async fn hostname(&mut self, key: &String) -> Result { + let node = self.inner.key_to_node(key); + + match self.inner.hostname_table.load().get(&node) { + Some(hostname) => Ok(hostname.to_owned()), + None => { + let keyspace = self.get_service_nodes().await?; + let inner = &mut self.inner; + inner.build_keyspace(keyspace); + let node = inner.key_to_node(key); + + let hostname = inner.hostname_table + .load() + .get(&node) + .ok_or(Error::Keyspace(format!("key {} was assigned to node {}, but node was not found in latest keyspace hosts", key, node)))? + .to_owned(); + Ok(hostname) + } + } + } + + /// Get list of [`ServiceNode`]s from cache service. + async fn get_service_nodes(&mut self) -> Result, Error> { + // use the Namespace service addr (self.dst), and not an individual server, to fetch the keyspace. + let uri_parts = format!("{}/keyspace", &self.dst) + .parse::() + .map(http::uri::Parts::from) + .map_err(Error::InvalidAddr)?; + + let req = RawRequest { + uri_parts, + method: Method::GET, + ..Default::default() + }; + + let service = self.service.ready().await?; + let resp = service.call(req).await.map_err(Error::Connection)?; + + match resp.status() { + StatusCode::OK => { + let reader = hyper::body::aggregate(resp.into_body()) + .await + .map_err(|e| Error::Keyspace(e.to_string()))? + .reader(); + + let keyspace_nodes: KeyspaceResponseBody = + serde_json::from_reader(reader).map_err(|e| Error::Keyspace(e.to_string()))?; + + Ok(keyspace_nodes.nodes) + } + _ => Err(Error::Keyspace(String::from("keyspace request failure"))), + } + } + + /// Initialize the keyspace on service start. + /// Has backoff-and-retry; intended to be called once. + async fn initialized(&mut self) { + Backoff::new(&BackoffConfig::default()) + .retry_all_errors("probe data cache service for keyspace", || { + let mut this = self.clone(); + async move { + let probe = this + .get_service_nodes() + .await + .map(|keyspace| this.inner.build_keyspace(keyspace)); + if probe.is_err() { + warn!("failed to build data cache keyspace"); + } + probe + } + }) + .await + .expect("retry forever") + } +} + +impl Service for HostKeyspaceService +where + S: Clone + + Service, Error = hyper::Error> + + Send + + Sync + + 'static, + for<'b> >::Future: std::marker::Send + 'b, +{ + type Response = S::Response; + type Error = Error; + type Future = PinnedFuture; + + fn poll_ready(&mut self, _cx: &mut std::task::Context<'_>) -> Poll> { + Poll::Ready(Ok(())) + } + + fn call(&mut self, req: RawRequest) -> Self::Future { + let mut this = self.clone(); + Box::pin(async move { + Arc::clone(&this.inner) + .initialize_once + .get_or_init(|| this.initialized()) + .await; + let req = this.add_host_to_request(req).await?; + this.service.call(req).await.map_err(Error::Connection) + }) + } +} + +#[derive(Debug, Default)] +struct HostKeyspace { + /// Hashring + keyspace: ArcSwap>, + /// Map nodes to hostname. + hostname_table: ArcSwap>, + /// A single init of the shared, clonable keyspace. + /// (Note that the re-building of an invalidated keyspace, is separate from this init.) + initialize_once: OnceCell<()>, +} + +impl HostKeyspace { + /// Lookup key in keyspace + fn key_to_node(&self, key: &String) -> ServiceNodeId { + self.keyspace + .load() + .as_ref() + .primary_node(key) + .unwrap() + .to_owned() + } + + /// Build keyspace for cache connector, from list of [`ServiceNode`]s. 
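    /// Each node id is added to the consistent-hash ring and mapped to its
    /// hostname; both structures are then swapped in atomically via `ArcSwap`,
    /// so concurrent lookups keep reading the previous keyspace until the swap
    /// completes.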
+ fn build_keyspace(&self, keyspace_nodes: Vec) { + let mut keyspace = HashRing::new(); + let mut hostname_table = HashMap::new(); + + for node in keyspace_nodes { + keyspace.add(node.id); + hostname_table.insert(node.id, node.hostname); + } + + self.keyspace.swap(Arc::new(keyspace)); + self.hostname_table.swap(Arc::new(hostname_table)); + } +} + +#[cfg(test)] +mod test { + use std::collections::hash_map::Entry; + + use parking_lot::Mutex; + use rand::seq::SliceRandom; + use uuid::Uuid; + + use super::super::http::HttpService; + use crate::data_types::ServiceNode; + + use super::*; + + async fn assert_consistent_hashing( + mut cache_connector: HostKeyspaceService, + prev_assignments: Arc>>, + ) { + // test with 100 files + for _ in 0..100 { + let key = format!("unique/location/{}/file.parquet", Uuid::new_v4()); + for _ in 0..1000 { + let key = key.clone(); + + let got = cache_connector + .hostname(&key) + .await + .expect("should assign hostname"); + let expected = match prev_assignments.lock().entry(key) { + Entry::Vacant(v) => { + v.insert(got.clone()); + got.clone() + } + Entry::Occupied(o) => o.get().clone(), + }; + + assert_eq!( + got, expected, + "should match previous assignment {}, instead got {}", + expected, got + ); + } + } + } + + #[tokio::test] + async fn test_keyspace_hashing_is_consistent() { + let remote_cache_connector = + HostKeyspaceService::new(HttpService::default(), "foo".to_string()); + + let keyspace_nodes = (0..100) + .map(|id| ServiceNode { + id, + hostname: format!("cache-server-hostname-{}", id), + }) + .collect(); + remote_cache_connector.inner.build_keyspace(keyspace_nodes); + + let prev_assignments = Arc::new(Mutex::new(HashMap::new())); // location_key, hostname_assigned + assert_consistent_hashing(remote_cache_connector, prev_assignments).await; + } + + #[tokio::test] + async fn test_keyspace_population_is_not_ordering_sensitive() { + // Sanity check. Asserting that the expected hashing properties hold true. 
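        // Concretely: populating the keyspace from the same set of nodes in a
        // different order must not change any key-to-hostname assignment, since
        // placement depends only on the hashed ring positions and not on
        // insertion order.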
+ + let remote_cache_connector = + HostKeyspaceService::new(HttpService::default(), "foo".to_string()); + let prev_assignments = Arc::new(Mutex::new(HashMap::new())); // location_key, hostname_assigned + + // test with 0..100 ordered nodes, used when building keyspace + let mut keyspace_nodes: Vec = (0..100) + .map(|id| ServiceNode { + id, + hostname: format!("cache-server-hostname-{}", id), + }) + .collect(); + remote_cache_connector + .inner + .build_keyspace(keyspace_nodes.clone()); + assert_consistent_hashing( + remote_cache_connector.clone(), + Arc::clone(&prev_assignments), + ) + .await; + + // shuffled nodes, test against same/original assignments + keyspace_nodes.shuffle(&mut rand::thread_rng()); + remote_cache_connector.inner.build_keyspace(keyspace_nodes); + assert_consistent_hashing(remote_cache_connector, prev_assignments).await; + } +} diff --git a/parquet_cache/src/client/mock.rs b/parquet_cache/src/client/mock.rs new file mode 100644 index 00000000000..584cb314641 --- /dev/null +++ b/parquet_cache/src/client/mock.rs @@ -0,0 +1,153 @@ +use std::collections::HashSet; +use std::{ops::Range, sync::Arc}; + +use async_trait::async_trait; +use bytes::Bytes; +use data_types::ParquetFileParams; +use futures::stream::BoxStream; +use object_store::{ + path::Path, GetOptions, GetResult, ListResult, MultipartId, ObjectMeta, ObjectStore, + PutOptions, PutResult, Result, +}; +use parking_lot::Mutex; +use tokio::io::AsyncWrite; + +use crate::{ + data_types::WriteHintAck, DataCacheObjectStore, MockCacheServer, WriteHintingObjectStore, +}; + +use super::cache_connector::build_cache_connector; + +/// Build a cache client, +/// with a mocked server and mocked direct-to-store fallback. +pub async fn build_cache_server_client( + direct_to_store: Arc, +) -> (DataCacheObjectStore, MockCacheServer) { + // build server and client + let dst = "localhost:0"; + let cache_server = MockCacheServer::create(dst, Arc::clone(&direct_to_store)).await; + let cache_client = build_cache_connector(cache_server.addr()); + + // build object_store + let object_store = DataCacheObjectStore::new(cache_client, direct_to_store); + + (object_store, cache_server) +} + +/// A mocked direct-to-object-store, with the following characteristics: +/// * panics when used as fallback (for GET requests) +/// * tracks when called for PUT requests +#[derive(Debug, Default)] +pub struct MockDirectStore { + called: Mutex>, +} + +impl MockDirectStore { + pub fn was_called(&self, fn_name: &str) -> bool { + self.called.lock().contains(&String::from(fn_name)) + } +} + +#[async_trait] +impl ObjectStore for MockDirectStore { + async fn put_opts( + &self, + _location: &Path, + _bytes: Bytes, + _opts: PutOptions, + ) -> Result { + self.called.lock().insert(String::from("put")); + Ok(PutResult { + e_tag: None, + version: None, + }) + } + + async fn put_multipart( + &self, + _location: &Path, + ) -> Result<(MultipartId, Box)> { + self.called.lock().insert(String::from("put_multipart")); + Ok(( + String::from("AsyncWriter for MockDirectStore"), + Box::new(tokio::io::BufWriter::new(vec![])), + )) + } + + async fn abort_multipart(&self, _location: &Path, _multipart_id: &MultipartId) -> Result<()> { + self.called.lock().insert(String::from("abort_multipart")); + Ok(()) + } + + async fn get(&self, _location: &Path) -> Result { + panic!("object was not found in test cache") + } + + async fn get_opts(&self, _location: &Path, _options: GetOptions) -> Result { + // test may intentionally test fallback behavior of get_opts() + 
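        // The panic message below doubles as the `expected` string in the
        // #[should_panic] fallback tests, so a fallback hit is detected by the
        // test harness rather than by a call counter.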
panic!("direct_store.get_opts() was called during test") + } + + async fn get_range(&self, _location: &Path, _range: Range) -> Result { + panic!("direct_store should not be called during test") + } + + async fn get_ranges(&self, _location: &Path, _ranges: &[Range]) -> Result> { + panic!("direct_store should not be called during test") + } + + async fn head(&self, _location: &Path) -> Result { + // test may intentionally test fallback behavior of get_opts() + panic!("direct_store.head() was called during test") + } + + async fn delete(&self, _location: &Path) -> Result<()> { + self.called.lock().insert(String::from("delete")); + Ok(()) + } + + fn list(&self, _prefix: Option<&Path>) -> BoxStream<'_, Result> { + self.called.lock().insert(String::from("list")); + Box::pin(tokio_stream::iter(vec![])) + } + + async fn list_with_delimiter(&self, _prefix: Option<&Path>) -> Result { + self.called + .lock() + .insert(String::from("list_with_delimiter")); + Ok(ListResult { + common_prefixes: vec![], + objects: vec![], + }) + } + + async fn copy(&self, _from: &Path, _to: &Path) -> Result<()> { + self.called.lock().insert(String::from("copy")); + Ok(()) + } + + async fn copy_if_not_exists(&self, _from: &Path, _to: &Path) -> Result<()> { + self.called + .lock() + .insert(String::from("copy_if_not_exists")); + Ok(()) + } +} + +#[async_trait] +impl WriteHintingObjectStore for MockDirectStore { + async fn write_hint<'a>( + &self, + _location: &'a Path, + _new_file: &'a ParquetFileParams, + _ack_setting: WriteHintAck, + ) -> Result<()> { + panic!("direct_store should not be called during test"); + } +} + +impl std::fmt::Display for MockDirectStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "DirectStore") + } +} diff --git a/parquet_cache/src/client/object_store.rs b/parquet_cache/src/client/object_store.rs new file mode 100644 index 00000000000..b00642b5767 --- /dev/null +++ b/parquet_cache/src/client/object_store.rs @@ -0,0 +1,776 @@ +use std::collections::HashMap; +use std::io::{Error, ErrorKind}; +use std::{ops::Range, sync::Arc}; + +use async_trait::async_trait; +use bytes::{Buf, Bytes}; +use futures::stream::{BoxStream, StreamExt, TryStreamExt}; +use http::Method; +use hyper::StatusCode; +use hyper::{Body, Response}; +use object_store::{ + path::Path, Error as ObjectStoreError, GetOptions, GetResult, ListResult, MultipartId, + ObjectMeta, ObjectStore, PutOptions, PutResult, Result, +}; +use tokio::io::AsyncWrite; +use tower::{Service, ServiceExt}; + +use crate::data_types::{ + extract_usize_header, GetObjectMetaResponse, X_RANGE_END_HEADER, X_RANGE_START_HEADER, +}; + +use super::cache_connector::{ClientCacheConnector, Error as CacheClientError}; +use super::request::RawRequest; + +/// identifier for `object_store::Error::Generic` +const DATA_CACHE: &str = "object store to data cache"; + +/// Data cache, consumable by IOX Components. +pub struct DataCacheObjectStore { + pub(crate) cache: ClientCacheConnector, + pub(crate) direct_passthru: Arc, +} + +impl DataCacheObjectStore { + /// Create a new [`DataCacheObjectStore`]. + pub fn new(cache: ClientCacheConnector, direct_store: Arc) -> Self { + Self { + cache, + direct_passthru: Arc::new(direct_store), + } + } +} + +/// ObjectStore client for using the data cache. +/// +/// Defines when to use the direct (passthru) object store, +/// versus the data cache. +/// +/// Iox components all utilize the [`ObjectStore`] for store connection. +/// Based upon startup configuration, this may be the data cache. 
+#[async_trait] +impl ObjectStore for DataCacheObjectStore { + async fn put_opts(&self, location: &Path, bytes: Bytes, opts: PutOptions) -> Result { + self.direct_passthru.put_opts(location, bytes, opts).await + } + + async fn put_multipart( + &self, + location: &Path, + ) -> Result<(MultipartId, Box)> { + self.direct_passthru.put_multipart(location).await + } + + async fn abort_multipart(&self, location: &Path, multipart_id: &MultipartId) -> Result<()> { + self.direct_passthru + .abort_multipart(location, multipart_id) + .await + } + + async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { + let object_meta: ObjectMeta = self.head(location).await?; + + let key = location.to_string(); + + let uri_parts = format!("/object?location={}", key) + .parse::() + .map(http::uri::Parts::from) + .expect("should be valid uri"); + + let GetOptions { + if_match, + if_none_match, + if_modified_since, + if_unmodified_since, + range, + version, + head, + } = &options; + let headers = Headers(&mut HashMap::new()) + .add_header("If-Match", if_match) + .add_header("If-None-Match", if_none_match) + .add_header("If-Modified-Since", if_modified_since) + .add_header("If-Unmodified-Since", if_unmodified_since) + // Pass other options as non standard headers + .add_header("X-Version", version) + .add_header("X-Head", &Some(head)) + .add_range(range) + .0 + .to_owned(); + + let req = RawRequest { + method: Method::GET, + uri_parts, + headers, + key: Some(key), + ..Default::default() + }; + + let mut cache = self.cache.clone(); + let service = cache.ready().await.map_err(|e| ObjectStoreError::Generic { + store: DATA_CACHE, + source: Box::new(e), + })?; + + match service.call(req).await { + Ok(resp) => match resp.status() { + StatusCode::OK => { + match transform_get_object_response(resp, object_meta, range) { + Ok(res) => Ok(res), + Err(_) => self.direct_passthru.get_opts(location, options).await, // read_data error + } + } + code => { + if use_fallback(code) { + self.direct_passthru.get_opts(location, options).await // http code error + } else { + let source = Box::new(Error::new(ErrorKind::Other, code.to_string())); + Err(ObjectStoreError::Generic { + store: DATA_CACHE, + source, + }) + } + } + }, + Err(_) => self.direct_passthru.get_opts(location, options).await, // connection error + } + } + + async fn get_range(&self, location: &Path, range: Range) -> Result { + self.get_opts( + location, + GetOptions { + range: Some(range), + ..Default::default() + }, + ) + .await? 
+ .bytes() + .await + } + + async fn head(&self, location: &Path) -> Result { + let key = location.to_string(); + + let uri_parts = format!("/metadata?location={}", key) + .parse::() + .map(http::uri::Parts::from) + .expect("should be valid uri"); + + let req = RawRequest { + method: Method::GET, + uri_parts, + key: Some(key), + ..Default::default() + }; + + let mut cache = self.cache.clone(); + let service = cache.ready().await.map_err(|e| ObjectStoreError::Generic { + store: DATA_CACHE, + source: Box::new(e), + })?; + + match service.call(req).await { + Ok(mut resp) => match resp.status() { + StatusCode::OK => { + let maybe_meta: Result = + hyper::body::aggregate(resp.body_mut()) + .await + .map_err(|e| CacheClientError::ReadData(e.to_string())) + .map(|buf| buf.reader()) + .and_then(|reader| { + serde_json::from_reader(reader) + .map_err(|e| CacheClientError::ReadData(e.to_string())) + }) + .map(|get_meta_resp: GetObjectMetaResponse| { + ObjectMeta::from(get_meta_resp) + }); + + match maybe_meta { + Ok(meta) => Ok(meta), + Err(_) => self.direct_passthru.head(location).await, // read_data error + } + } + code => { + if use_fallback(code) { + self.direct_passthru.head(location).await // http code error + } else { + let source = Box::new(Error::new(ErrorKind::Other, code.to_string())); + Err(ObjectStoreError::Generic { + store: DATA_CACHE, + source, + }) + } + } + }, + Err(_) => self.direct_passthru.head(location).await, // connection error + } + } + + async fn delete(&self, location: &Path) -> Result<()> { + // Do not delete from cache, instead let it age out. + // Querier runs off of catalog snapshots of object_store state. + self.direct_passthru.delete(location).await + } + + fn list(&self, prefix: Option<&Path>) -> BoxStream<'_, Result> { + // Use object_store directly as src of truth for currently existing files. + // Because cache cannot know about completeness of the file set. + self.direct_passthru.list(prefix) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> Result { + // Use object_store directly as src of truth for currently existing files. + // Because cache cannot know about completeness of the file set. + self.direct_passthru.list_with_delimiter(prefix).await + } + + async fn copy(&self, from: &Path, to: &Path) -> Result<()> { + self.direct_passthru.copy(from, to).await + } + + async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + self.direct_passthru.copy_if_not_exists(from, to).await + } +} + +impl std::fmt::Display for DataCacheObjectStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "DataCacheObjectStore") + } +} + +impl std::fmt::Debug for DataCacheObjectStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "DataCacheObjectStore") + } +} + +fn use_fallback(code: StatusCode) -> bool { + match code { + StatusCode::OK => unreachable!("should not be requesting fallback if response is OK"), + // Errors which should not result in trying the fallback. + StatusCode::BAD_REQUEST + | StatusCode::PRECONDITION_FAILED + | StatusCode::FORBIDDEN + | StatusCode::UNAUTHORIZED + | StatusCode::MOVED_PERMANENTLY + | StatusCode::NETWORK_AUTHENTICATION_REQUIRED => false, + // All other errors => use fallback. 
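        // This includes NOT_FOUND and 5xx responses, so a cache miss or a cache
        // outage degrades to a direct object-store read instead of surfacing an
        // error to the caller.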
+ _ => true, + } +} + +fn transform_get_object_response( + resp: Response, + meta: ObjectMeta, + expected_range: &Option>, +) -> Result { + let headers = resp.headers(); + let range = Range { + start: extract_usize_header(X_RANGE_START_HEADER, headers)?, + end: extract_usize_header(X_RANGE_END_HEADER, headers)?, + }; + + if let Some(expected_range) = expected_range { + if !expected_range.start.eq(&range.start) || !expected_range.end.eq(&range.end) { + return Err(CacheClientError::ReadData(format!( + "expected range {:?} but found range {:?}", + expected_range, range + ))); + } + }; + + let stream = resp + .into_body() + .map_err(|e| ObjectStoreError::Generic { + store: DATA_CACHE, + source: Box::new(e), + }) + .boxed(); + + Ok(GetResult { + payload: object_store::GetResultPayload::Stream(stream), + meta, + range, + }) +} + +/// Newtype around headers, for convenience methods. +struct Headers<'a>(pub &'a mut HashMap<&'static str, String>); + +impl<'a> Headers<'a> { + fn add_header(&mut self, k: &'static str, v: &Option) -> &mut Self { + if let Some(v) = v { + // let header_name = k.to_owned(); + self.0.insert(k, v.to_string()); + } + self + } + + fn add_range(&mut self, range: &Option>) -> &mut Self { + if let Some(v) = range { + self.0 + .insert("Range", format!("bytes={}-{}", v.start, v.end)); + } + self + } +} + +#[cfg(test)] +mod tests { + use assert_matches::assert_matches; + + use crate::client::mock::{build_cache_server_client, MockDirectStore}; + use crate::server::mock::{build_resp_body, ExpectedResponse}; + + use super::*; + + static FILE: &[u8] = "All my pretty data.".as_bytes(); + + #[tokio::test] + async fn test_writes_are_passed_to_store() { + let direct_to_store = Arc::new(MockDirectStore::default()); + + let casted_object_store = Arc::clone(&direct_to_store) as Arc; + let (object_store, cache_server) = build_cache_server_client(casted_object_store).await; + + assert!(object_store + .put(&Path::default(), FILE.into()) + .await + .is_ok()); + assert!( + Arc::clone(&direct_to_store).was_called("put"), + "put should be passed to direct store" + ); + + assert!(object_store.put_multipart(&Path::default()).await.is_ok()); + assert!( + Arc::clone(&direct_to_store).was_called("put_multipart"), + "put_multipart should be passed to direct store" + ); + + assert!(object_store + .abort_multipart(&Path::default(), &MultipartId::default()) + .await + .is_ok()); + assert!( + Arc::clone(&direct_to_store).was_called("abort_multipart"), + "abort_multipart should be passed to direct store" + ); + + assert!(object_store.delete(&Path::default()).await.is_ok()); + assert!( + Arc::clone(&direct_to_store).was_called("delete"), + "delete should be passed to direct store" + ); + + assert!(object_store + .copy(&Path::default(), &Path::default()) + .await + .is_ok()); + assert!( + Arc::clone(&direct_to_store).was_called("copy"), + "copy should be passed to direct store" + ); + + assert!(object_store + .copy_if_not_exists(&Path::default(), &Path::default()) + .await + .is_ok()); + assert!( + Arc::clone(&direct_to_store).was_called("copy_if_not_exists"), + "copy_if_not_exists should be passed to direct store" + ); + + cache_server.close().await; + } + + #[tokio::test] + async fn test_list_all_objects_are_passed_to_store() { + let direct_to_store = Arc::new(MockDirectStore::default()); + + let casted_object_store = Arc::clone(&direct_to_store) as Arc; + let (object_store, cache_server) = build_cache_server_client(casted_object_store).await; + + object_store.list(Some(&Path::default())); + assert!( + 
Arc::clone(&direct_to_store).was_called("list"), + "list should be passed to direct store" + ); + + assert!(object_store + .list_with_delimiter(Some(&Path::default())) + .await + .is_ok()); + assert!( + Arc::clone(&direct_to_store).was_called("list_with_delimiter"), + "list_with_delimiter should be passed to direct store" + ); + + cache_server.close().await; + } + + #[tokio::test] + async fn test_fetch_requests_hit_the_cache() { + let direct_to_store = Arc::new(MockDirectStore::default()); + + let casted_object_store = Arc::clone(&direct_to_store) as Arc; + let (object_store, cache_server) = build_cache_server_client(casted_object_store).await; + + let path = Path::from("my/scoped/data/file.parquet"); + + // GET /metadata + let route = format!("/metadata?location={}", &path.to_string()); + let expected_metadata_resp = GetObjectMetaResponse { + location: path.to_string(), + last_modified: Default::default(), + size: 42, + e_tag: None, + version: None, + }; + cache_server.respond_with( + route.clone(), + ExpectedResponse { + bytes: build_resp_body(&expected_metadata_resp), + range: None, + }, + ); + assert_matches!( + object_store.head(&path).await, + Ok(res) if res == ObjectMeta::from(expected_metadata_resp.clone()), + "payload was returned and parsed properly" + ); + assert!( + cache_server.was_called(&route), + "head should hit the cache server" + ); + + // GET fetch /object + // note: all fetch object requests use ObjectStore::get_opts() + let route = format!("/object?location={}", path); + cache_server.respond_with( + route.clone(), + ExpectedResponse { + bytes: std::str::from_utf8(FILE).unwrap().into(), + range: Some(Range { + start: 0, + end: FILE.len(), + }), + }, + ); + let object_resp = object_store.get(&path).await; + assert_matches!( + &object_resp, + Ok(GetResult {payload: _, meta, range: _}) if meta == &ObjectMeta::from(expected_metadata_resp), + "object metadata was returned and parsed properly" + ); // note: payload bytes will be asserted separately with the (non-mock-)server integration tests. 
+ assert!( + cache_server.was_called(&route), + "get should hit the cache server" + ); + + cache_server.close().await; + } + + #[tokio::test] + async fn test_fetch_range_request() { + let direct_to_store = Arc::new(MockDirectStore::default()); + + let casted_object_store = Arc::clone(&direct_to_store) as Arc; + let (object_store, cache_server) = build_cache_server_client(casted_object_store).await; + + let path = Path::from("my/scoped/data/file.parquet"); + + // add mock metadata + let route = format!("/metadata?location={}", &path.to_string()); + let expected_metadata_resp = GetObjectMetaResponse { + location: path.to_string(), + last_modified: Default::default(), + size: 42, + e_tag: None, + version: None, + }; + cache_server.respond_with( + route.clone(), + ExpectedResponse { + bytes: build_resp_body(&expected_metadata_resp), + range: None, + }, + ); + + // add mock file + let route = format!("/object?location={}", &path.to_string()); + cache_server.respond_with( + route.clone(), + ExpectedResponse { + bytes: std::str::from_utf8(&FILE[3..9]).unwrap().into(), + range: Some(Range { start: 3, end: 9 }), + }, + ); + + // TEST: get_range() + let range = Range { start: 3, end: 9 }; + let object_resp = object_store.get_range(&path, range.clone()).await; + assert_matches!( + &object_resp, + Ok(bytes) if bytes.len() == range.len(), + "returns proper bytes size for the range" + ); + assert!( + cache_server.was_called(&route), + "get should hit the cache server" + ); + + // TEST: multiple get_ranges() + let object_resp = object_store + .get_ranges(&path, &[range.clone(), range.clone()]) + .await; + assert_matches!( + &object_resp, + Ok(vec_bytes) if matches!( + &vec_bytes[..], + [bytes, bytes_2] if bytes.len() == range.len() && bytes_2.len() == range.len() + ), + "returns proper bytes size for multiple ranges" + ); + + cache_server.close().await; + } + + mod test_range_failures { + use super::*; + + #[should_panic(expected = "direct_store.get_opts() was called during test")] + #[tokio::test] + async fn test_get_opts_will_use_fallback_if_returned_range_does_not_match() { + let direct_to_store = Arc::new(MockDirectStore::default()); + + let casted_object_store = Arc::clone(&direct_to_store) as Arc; + let (object_store, cache_server) = build_cache_server_client(casted_object_store).await; + + let path = Path::from("my/scoped/data/file.parquet"); + + // add mock metadata + let route = format!("/metadata?location={}", &path.to_string()); + let expected_metadata_resp = GetObjectMetaResponse { + location: path.to_string(), + last_modified: Default::default(), + size: 42, + e_tag: None, + version: None, + }; + cache_server.respond_with( + route.clone(), + ExpectedResponse { + bytes: build_resp_body(&expected_metadata_resp), + range: None, + }, + ); + + // add mock file + let route = format!("/object?location={}", &path.to_string()); + cache_server.respond_with( + route.clone(), + ExpectedResponse { + bytes: std::str::from_utf8(&FILE[3..9]).unwrap().into(), + range: Some(Range { start: 3, end: 9 }), + }, + ); + + // TEST: get_range() + let range = Range { start: 1, end: 7 }; + let _ = object_store.get_range(&path, range.clone()).await; + + cache_server.close().await; + } + } + + mod test_head_failures { + use super::*; + + #[should_panic(expected = "direct_store.head() was called during test")] + #[tokio::test] + async fn test_use_fallback_when_missing_data() { + let direct_to_store = Arc::new(MockDirectStore::default()); + + let casted_object_store = Arc::clone(&direct_to_store) as Arc; + let 
(object_store, _cache_server) = + build_cache_server_client(casted_object_store).await; + + let path = Path::from("my/scoped/data/file.parquet"); + + // TEST: metadata never provided to mock + let _ = object_store.head(&path).await; + } + + #[should_panic(expected = "direct_store.head() was called during test")] + #[tokio::test] + async fn test_use_fallback_when_bad_data() { + let direct_to_store = Arc::new(MockDirectStore::default()); + + let casted_object_store = Arc::clone(&direct_to_store) as Arc; + let (object_store, cache_server) = build_cache_server_client(casted_object_store).await; + + let path = Path::from("my/scoped/data/file.parquet"); + + // TEST: incorrect metadata provided to mock + let route = format!("/metadata?location={}", &path.to_string()); + cache_server.respond_with( + route.clone(), + ExpectedResponse { + bytes: vec![].into(), // BAD: should be metadata + range: None, + }, + ); + let _ = object_store.head(&path).await; + } + + #[should_panic(expected = "direct_store.head() was called during test")] + #[tokio::test] + async fn test_use_fallback_on_connection_failed() { + let direct_to_store = Arc::new(MockDirectStore::default()); + + let casted_object_store = Arc::clone(&direct_to_store) as Arc; + let (object_store, cache_server) = build_cache_server_client(casted_object_store).await; + + let path = Path::from("my/scoped/data/file.parquet"); + + // GET /metadata is working + let route = format!("/metadata?location={}", &path.to_string()); + let expected_metadata_resp = GetObjectMetaResponse { + location: path.to_string(), + last_modified: Default::default(), + size: 42, + e_tag: None, + version: None, + }; + cache_server.respond_with( + route.clone(), + ExpectedResponse { + bytes: build_resp_body(&expected_metadata_resp), + range: None, + }, + ); + assert_matches!( + object_store.head(&path).await, + Ok(res) if res == ObjectMeta::from(expected_metadata_resp.clone()), + "payload was returned and parsed properly" + ); + + // kill server + cache_server.close().await; + + // TEST: connection fails + let _ = object_store.head(&path).await; + } + } + + mod test_get_opts_failures { + use crate::MockCacheServer; + + use super::*; + + async fn setup_metadata_head(path: &Path, cache_server: &MockCacheServer) { + // GET /metadata is working + let route = format!("/metadata?location={}", path); + let expected_metadata_resp = GetObjectMetaResponse { + location: path.to_string(), + last_modified: Default::default(), + size: 42, + e_tag: None, + version: None, + }; + cache_server.respond_with( + route.clone(), + ExpectedResponse { + bytes: build_resp_body(&expected_metadata_resp), + range: None, + }, + ); + } + + #[should_panic(expected = "direct_store.get_opts() was called during test")] + #[tokio::test] + async fn test_use_fallback_when_missing_data() { + let direct_to_store = Arc::new(MockDirectStore::default()); + + let casted_object_store = Arc::clone(&direct_to_store) as Arc; + let (object_store, cache_server) = build_cache_server_client(casted_object_store).await; + + let path = Path::from("my/scoped/data/file.parquet"); + setup_metadata_head(&path, &cache_server).await; + assert!( + object_store.head(&path).await.is_ok(), + "should have functioning metadata/head request" + ); + + // TEST: object never provided to mock + let _ = object_store.get(&path).await; + } + + #[should_panic(expected = "direct_store.get_opts() was called during test")] + #[tokio::test] + async fn test_use_fallback_when_bad_data() { + let direct_to_store = Arc::new(MockDirectStore::default()); + + let 
casted_object_store = Arc::clone(&direct_to_store) as Arc; + let (object_store, cache_server) = build_cache_server_client(casted_object_store).await; + + let path = Path::from("my/scoped/data/file.parquet"); + setup_metadata_head(&path, &cache_server).await; + assert!( + object_store.head(&path).await.is_ok(), + "should have functioning metadata/head request" + ); + + // TEST: incorrect metadata provided to mock + let route = format!("/object?location={}", &path.to_string()); + cache_server.respond_with( + route.clone(), + ExpectedResponse { + bytes: vec![].into(), // BAD: should be object + range: None, + }, + ); + let _ = object_store.get(&path).await; + } + + // since server is shutdown, will fail on head() request before get_opts() request + #[should_panic(expected = "direct_store.head() was called during test")] + #[tokio::test] + async fn test_use_fallback_on_connection_failed() { + let direct_to_store = Arc::new(MockDirectStore::default()); + + let casted_object_store = Arc::clone(&direct_to_store) as Arc; + let (object_store, cache_server) = build_cache_server_client(casted_object_store).await; + + let path = Path::from("my/scoped/data/file.parquet"); + setup_metadata_head(&path, &cache_server).await; + assert!( + object_store.head(&path).await.is_ok(), + "should have functioning metadata/head request" + ); + + // GET /object is working + let route = format!("/object?location={}", path); + cache_server.respond_with( + route.clone(), + ExpectedResponse { + bytes: std::str::from_utf8(FILE).unwrap().into(), + range: Some(Range { + start: 0, + end: FILE.len(), + }), + }, + ); + assert!(object_store.get(&path).await.is_ok()); + + // kill server + cache_server.close().await; + + // TEST: connection fails + let _ = object_store.get(&path).await; + } + } +} diff --git a/parquet_cache/src/client/request.rs b/parquet_cache/src/client/request.rs new file mode 100644 index 00000000000..bb0e7335af4 --- /dev/null +++ b/parquet_cache/src/client/request.rs @@ -0,0 +1,46 @@ +use std::{collections::HashMap, pin::Pin}; + +use futures::Future; +use http::uri::Scheme; +use hyper::{header::HeaderValue, Body, Method, Request, Uri}; + +pub type PinnedFuture = Pin> + Send>>; + +#[derive(Debug, Default)] +pub struct RawRequest { + pub headers: HashMap<&'static str, String>, + pub body: Body, + pub uri_parts: http::uri::Parts, + pub method: Method, + pub key: Option, +} + +impl TryFrom for Request { + type Error = http::Error; + + fn try_from(value: RawRequest) -> Result { + let RawRequest { + headers: req_headers, + body, + mut uri_parts, + method, + key: _, + } = value; + + // reduce unnecessary (within cluster) overhead from https + uri_parts.scheme = Some(Scheme::HTTP); + + let mut req = Request::builder() + .method(method) + .uri(Uri::from_parts(uri_parts)?); + + for (k, v) in req_headers.into_iter() { + req = req.header( + k, + HeaderValue::from_str(v.as_str()).map_err(http::Error::from)?, + ); + } + + req.body(body) + } +} diff --git a/parquet_cache/src/client/write_hints.rs b/parquet_cache/src/client/write_hints.rs new file mode 100644 index 00000000000..4d091ac3a7d --- /dev/null +++ b/parquet_cache/src/client/write_hints.rs @@ -0,0 +1,223 @@ +use async_trait::async_trait; +use bytes::{BufMut, BytesMut}; +use data_types::ParquetFileParams; +use futures::FutureExt; +use hyper::Method; +use object_store::{limit::LimitStore, path::Path, Error as ObjectStoreError, ObjectStore, Result}; +use tower::{Service, ServiceExt}; + +use crate::data_types::{WriteHint, WriteHintAck, WriteHintRequestBody}; +use 
crate::DataCacheObjectStore; + +use super::request::RawRequest; + +/// identifier for `object_store::Error::Generic` +const DATA_CACHE: &str = "write hint to data cache"; + +/// An [`ObjectStore`] which handles write hinting. +/// +/// In some cases, the write hinting request does nothing (e.g. for direct-to-store impls). +#[async_trait] +pub trait WriteHintingObjectStore: ObjectStore { + /// Handle any write hinting performed by the [`ObjectStore`]. + async fn write_hint<'a>( + &self, + location: &'a Path, + new_file: &'a ParquetFileParams, + ack_setting: WriteHintAck, + ) -> Result<()>; +} + +#[async_trait] +impl WriteHintingObjectStore for DataCacheObjectStore { + /// Provide write hinting to data cache. + /// + /// Response is configuration based on [`WriteHintAck`]. + async fn write_hint<'a>( + &self, + location: &'a Path, + new_file: &'a ParquetFileParams, + ack_setting: WriteHintAck, + ) -> Result<()> { + let mut buf = BytesMut::new().writer(); + serde_json::to_writer( + &mut buf, + &WriteHintRequestBody { + location: location.to_string(), + hint: WriteHint::from(new_file), + ack_setting, + }, + ) + .map_err(|e| ObjectStoreError::Generic { + store: DATA_CACHE, + source: Box::new(e), + })?; + + let key = location.to_string(); + + let uri_parts = "/write-hint" + .parse::() + .map(http::uri::Parts::from) + .expect("should be valid uri"); + + let req = RawRequest { + method: Method::POST, + uri_parts, + key: Some(key), + body: hyper::Body::from(buf.into_inner().freeze()), + ..Default::default() + }; + + let mut cache = self.cache.clone(); + let service = cache.ready().await.map_err(|e| ObjectStoreError::Generic { + store: DATA_CACHE, + source: Box::new(e), + })?; + + let write_hints = service.call(req); + + match ack_setting { + WriteHintAck::Sent => { + write_hints.now_or_never(); + Ok(()) + } + WriteHintAck::Received => { + // server responds ok after receipt + write_hints.await.map_err(|e| ObjectStoreError::Generic { + store: DATA_CACHE, + source: Box::new(e), + })?; + Ok(()) + } + WriteHintAck::Completed => { + // server responds ok after downstream actions complete + write_hints.await.map_err(|e| ObjectStoreError::Generic { + store: DATA_CACHE, + source: Box::new(e), + })?; + Ok(()) + } + } + } +} + +#[async_trait] +impl WriteHintingObjectStore for LimitStore { + /// Enable our store interface to always use `Arc`. + /// (Aws, Azure, and Gcp [`ObjectStore`] impls are all [`LimitStore`].) + /// + /// When data cache is not used, the write hinting does not occur. 
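    /// This no-op impl lets callers unconditionally chain a `write_hint` after
    /// a successful `put`, without checking whether the configured store is the
    /// data cache or a direct-to-store `LimitStore`.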
+ async fn write_hint<'a>( + &self, + _location: &'a Path, + _new_file: &'a ParquetFileParams, + _ack_setting: WriteHintAck, + ) -> Result<()> { + Ok(()) + } +} + +#[cfg(test)] +mod test { + use std::sync::Arc; + + use data_types::{ + ColumnId, ColumnSet, CompactionLevel, NamespaceId, ObjectStoreId, PartitionId, TableId, + Timestamp, + }; + use object_store::{ + aws::AmazonS3Builder, azure::MicrosoftAzureBuilder, gcp::GoogleCloudStorageBuilder, + limit::LimitStore, + }; + + use crate::client::mock::{build_cache_server_client, MockDirectStore}; + + use super::*; + + fn new_file() -> ParquetFileParams { + ParquetFileParams { + namespace_id: NamespaceId::new(0), + table_id: TableId::new(0), + partition_id: PartitionId::new(0), + partition_hash_id: None, + object_store_id: ObjectStoreId::new(), + min_time: Timestamp::new(1), + max_time: Timestamp::new(5), + file_size_bytes: 0, + row_count: 0, + compaction_level: CompactionLevel::Initial, + created_at: Timestamp::new(1234), + column_set: ColumnSet::new([ColumnId::new(1), ColumnId::new(2)]), + max_l0_created_at: Timestamp::new(1234), + } + } + + #[tokio::test] + async fn test_write_hinting_always_available() { + // This test confirms that any external interfaces can always utilize + // the object_store, without awareness of whether or not it's the data cache + // or a direct_to_store. + // + // if object_store.put(&location).await.is_ok() { + // object_store.write_hints(&location, new_files, ack_setting).await + // } + // + // This avoids leaking any configuration details (for conditional checks) across the codebase. + + let location = Path::from("my/scoped/data/file.parquet"); + let new_file = new_file(); + let ack_setting = WriteHintAck::Received; + + // impl with gcp store + let builder = GoogleCloudStorageBuilder::new().with_bucket_name("foo".to_string()); + let direct_store: Arc = + Arc::new(LimitStore::new(builder.build().unwrap(), 10)); + assert!(direct_store + .write_hint(&location, &new_file, ack_setting) + .await + .is_ok()); + + // impl with aws store + let builder = AmazonS3Builder::new() + .with_bucket_name("foo".to_string()) + .with_region("mars".to_string()); + let direct_store: Arc = + Arc::new(LimitStore::new(builder.build().unwrap(), 10)); + assert!(direct_store + .write_hint(&location, &new_file, ack_setting) + .await + .is_ok()); + + // impl with azure store + let builder = MicrosoftAzureBuilder::new() + .with_container_name("foo".to_string()) + .with_account("dabozz".to_string()); + let direct_store: Arc = + Arc::new(LimitStore::new(builder.build().unwrap(), 10)); + assert!(direct_store + .write_hint(&location, &new_file, ack_setting) + .await + .is_ok()); + } + + #[tokio::test] + async fn test_write_hinting_hits_the_cache() { + let direct_to_store = Arc::new(MockDirectStore::default()); + + let casted_object_store = Arc::clone(&direct_to_store) as Arc; + let (object_store, cache_server) = build_cache_server_client(casted_object_store).await; + + let location = Path::from("my/scoped/data/file.parquet"); + let new_file = new_file(); + let ack_setting = WriteHintAck::Received; + + assert!(object_store + .write_hint(&location, &new_file, ack_setting) + .await + .is_ok()); + assert!( + cache_server.was_called(&"/write-hint".to_string()), + "write-hint should hit the cache server" + ); // note: payload bytes will be asserted separately with the (non-mock-)server integration tests. 
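        // With WriteHintAck::Received the call awaits the server's receipt
        // acknowledgement, so the mock server has observed the POST /write-hint
        // request by the time was_called() is checked.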
+ } +} diff --git a/parquet_cache/src/controller.rs b/parquet_cache/src/controller.rs new file mode 100644 index 00000000000..e2ae2484663 --- /dev/null +++ b/parquet_cache/src/controller.rs @@ -0,0 +1,53 @@ +//! The controller module contains the API and functionality +//! used to implement the controller for a DataCacheSet. + +use futures::future::select; +use kube::Client; +use std::time::Duration; + +mod error; +pub use error::{Error, Result}; +mod kube_util; +mod parquet_cache; +pub use parquet_cache::{ + ParquetCache, ParquetCacheInstanceSet, ParquetCacheSpec, ParquetCacheStatus, +}; + +mod parquet_cache_controller; + +mod parquet_cache_set; +pub use parquet_cache_set::{ParquetCacheSet, ParquetCacheSetSpec, ParquetCacheSetStatus}; + +mod parquet_cache_set_controller; + +mod state_service; + +/// The name of the controller. +const CONTROLLER_NAME: &str = "parquet-cache-set-controller"; + +/// Label used to annotate the objects with the hash of the pod template. +const POD_TEMPLATE_HASH_LABEL: &str = "pod-template-hash"; + +/// Label used to annotate objects with the count of parquet cache replicas. +const PARQUET_CACHE_REPLICAS_LABEL: &str = "parquet-cache-replicas"; + +/// The time to wait before re-executing when waiting for cache instances to warm, or cool. +const SHORT_WAIT: Duration = Duration::from_secs(60); + +/// The time to wait before re-executing when there is no longer any active work to do, or +/// the controller will be awoken by changes to owned objects. +const LONG_WAIT: Duration = Duration::from_secs(3600); + +/// Run the controllers for ParquetCache and ParquetCacheSet resources to completion. +pub async fn run(client: Client, namespace: Option) -> Result<(), kube::Error> { + let parquet_cache_join_handle = + parquet_cache_controller::spawn_controller(client.clone(), namespace.clone()); + let parquet_cache_set_join_handle = + parquet_cache_set_controller::spawn_controller(client.clone(), namespace.clone()); + + select(parquet_cache_join_handle, parquet_cache_set_join_handle) + .await + .factor_first() + .0 + .unwrap() +} diff --git a/parquet_cache/src/controller/error.rs b/parquet_cache/src/controller/error.rs new file mode 100644 index 00000000000..bb3dc653350 --- /dev/null +++ b/parquet_cache/src/controller/error.rs @@ -0,0 +1,29 @@ +/// Errors that can be generated by the controller. +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// Error when encoding a resource object. + #[error("encoding error: {0}")] + EncodingError(#[from] serde_json::Error), + + /// Error performing a kubernetes operation. + #[error("kubernetes error: {0}")] + KubeError(#[from] kube::Error), + + /// Error getting a cache node's state. + #[error("node state error: {0}")] + NodeStateError(Box), + + /// Error caused by an internal failure, this is almost certainly a bug. + #[error("internal error: {0}")] + InternalError(String), +} + +impl Error { + /// Create a new [Error::InternalError] with the provided message. + pub fn internal(msg: &str) -> Self { + Self::InternalError(String::from(msg)) + } +} + +/// Result type for the controller. 
+pub type Result = std::result::Result; diff --git a/parquet_cache/src/controller/kube_util.rs b/parquet_cache/src/controller/kube_util.rs new file mode 100644 index 00000000000..67f847fdff3 --- /dev/null +++ b/parquet_cache/src/controller/kube_util.rs @@ -0,0 +1,93 @@ +use fnv::FnvHasher; +use k8s_openapi::apimachinery::pkg::apis::meta::v1::{LabelSelector, OwnerReference}; +use kube::{Api, Error, Resource, ResourceExt}; +use serde::de::DeserializeOwned; +use serde::Serialize; +use std::fmt::Debug; +use std::hash::Hasher; + +/// The set of characters kubernetes considers safe for generated strings. +const SAFE_CHARS: [char; 27] = [ + 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 'q', 'r', 's', 't', 'v', 'w', 'x', + 'z', '2', '4', '5', '6', '7', '8', '9', +]; + +/// Encode a string using a small character set that is considered safe. This +/// minimizes the chances of accidental vulgarity. +pub fn safe_string(s: &str) -> String { + s.chars() + .map(|c| SAFE_CHARS[c as usize % SAFE_CHARS.len()]) + .collect() +} + +/// Get a hash value for the provided object. The hashed value has no guaranteed properties +/// other than the same input will have the same resulting hash. There is no attempt made to +/// hash the value in the same way that kubernetes controllers will. +pub fn hash_object(obj: &T) -> Result +where + T: ?Sized + Serialize, +{ + let bytes = serde_json::to_vec(obj)?; + let mut hasher = FnvHasher::with_key(0); + hasher.write(&bytes); + Ok(safe_string(&format!( + "{}", + (hasher.finish() & 0xFFFFFFFF) as u32 + ))) +} + +/// Format label selectors so they can be used with ListParams. +pub fn selectors(selector: &LabelSelector) -> Option { + let mut clauses = vec![]; + if let Some(expressions) = &selector.match_expressions { + clauses.extend(expressions.iter().filter_map(|requirement| { + match requirement.operator.as_ref() { + "In" => requirement + .values + .as_ref() + .map(|values| format!("{} in ({})", requirement.key, values.join(","))), + "NotIn" => requirement + .values + .as_ref() + .map(|values| format!("{} notin ({})", requirement.key, values.join(","))), + "Exists" => Some(requirement.key.clone()), + "DoesNotExist" => Some(format!("!{}", requirement.key)), + _ => None, // Skip unknown operator. 
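                // Only In / NotIn / Exists / DoesNotExist are translated; an
                // unrecognised operator drops that clause rather than failing
                // the whole selector string.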
+ } + })); + } + if let Some(labels) = &selector.match_labels { + clauses.extend(labels.iter().map(|(k, v)| format!("{k}={v}"))) + } + match clauses.len() { + 0 => None, + _ => Some(clauses.join(",")), + } +} + +pub async fn list_owned(api: &Api, owner_uid: &String) -> Result, Error> +where + K: Debug + Clone + Resource + DeserializeOwned + Send + Sync + 'static, +{ + let object_list = api.list(&Default::default()).await?; + Ok(object_list + .items + .into_iter() + .filter(|obj| obj.owner_references().iter().any(|or| &or.uid == owner_uid)) + .collect()) +} + +pub fn owner_reference(obj: &R) -> OwnerReference +where + R: Resource, +{ + let meta = obj.meta(); + OwnerReference { + api_version: R::api_version(&()).into(), + block_owner_deletion: Some(true), + controller: Some(true), + kind: R::kind(&()).into(), + name: meta.name.clone().unwrap_or_default(), + uid: meta.uid.clone().unwrap_or_default(), + } +} diff --git a/parquet_cache/src/controller/parquet_cache.rs b/parquet_cache/src/controller/parquet_cache.rs new file mode 100644 index 00000000000..8c65bc7ddc5 --- /dev/null +++ b/parquet_cache/src/controller/parquet_cache.rs @@ -0,0 +1,139 @@ +use super::{Error, Result, PARQUET_CACHE_REPLICAS_LABEL}; +use k8s_openapi::api::core::v1::PodTemplateSpec; +use k8s_openapi::apimachinery::pkg::apis::meta::v1::LabelSelector; +use k8s_openapi::schemars::JsonSchema; +use kube::CustomResource; +use serde::{Deserialize, Serialize}; + +/// Specification of a ParquetCache. +#[derive(Debug, Default, Clone, CustomResource, Deserialize, Serialize, JsonSchema)] +#[kube( + kind = "ParquetCache", + group = "iox.influxdata.com", + version = "v1alpha1", + namespaced +)] +#[kube(status = "ParquetCacheStatus")] +#[kube(derive = "Default")] +#[serde(rename_all = "camelCase")] +pub struct ParquetCacheSpec { + /// The name of the config map to generate containing the data cache set + /// state. This config map must be volume mounted in the pod template. + /// If a name isn't set then the config map will have the same name as + /// the data cache set. + pub config_map_name: Option, + + /// The number of replicas that are required to be in the data cache set. + pub replicas: Option, + + /// Selector is a label query over pods that should match the replica + /// count. Label keys and values that must match in order to be controlled + /// by this data cache set. It must match the pod template's labels. + pub selector: LabelSelector, + + /// Port running on the pods that should be used to query the working state + /// using the `/state` endpoint. + pub state_port: Option, + + /// Template is the object that describes the pod that will be created + /// if insufficient replicas are detected. + pub template: PodTemplateSpec, +} + +/// Status of a ParquetCache. +#[derive(Debug, Default, Deserialize, Serialize, Clone, JsonSchema)] +pub struct ParquetCacheStatus { + /// The current cache instance set. + pub current: ParquetCacheInstanceSet, + + /// The upcoming cache instance set. + pub next: ParquetCacheInstanceSet, +} + +/// The set of instances that form a parquet cache group. +#[derive(Debug, Clone, Default, PartialEq, Deserialize, Serialize, JsonSchema)] +pub struct ParquetCacheInstanceSet { + /// The revision number of the cache instance set. + pub revision: i64, + + /// The set of instances that form the cache set. 
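+    /// Each entry is a pod name formed from the owning ParquetCacheSet name plus a
+    /// replica suffix.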
+ pub instances: Vec, +} + +impl ParquetCache { + fn name(&self) -> Result<&String> { + self.metadata + .name + .as_ref() + .ok_or(Error::internal("ParquetCache has no name")) + } + + /// Get the name of the [k8s_openapi::api::core::v1::ConfigMap] that should be created to + /// contain the status information required by the parquet servers. + pub(super) fn config_map_name(&self) -> Result<&String> { + if let Some(name) = &self.spec.config_map_name { + Ok(name) + } else { + self.name() + } + } + + /// The number of replicas specified for this ParquetCache. + pub(super) fn replicas(&self) -> i32 { + self.spec.replicas.unwrap_or(1) + } + + /// Get the PodTemplateSpec to pass on to the [super::ParquetCacheSet]. This will make necessary + /// changes to the template supplied in the [ParquetCacheSpec]. + /// + /// The generated [PodTemplateSpec] includes a label containing the requested replica count. + /// This ensures that a different [super::ParquetCacheSet] is created even if the only change to the + /// [ParquetCache] is a change in the replica count. + pub(super) fn parquet_cache_set_template(&self) -> PodTemplateSpec { + let mut template = self.spec.template.clone(); + let metadata = template.metadata.get_or_insert(Default::default()); + let labels = metadata.labels.get_or_insert(Default::default()); + labels.insert( + String::from(PARQUET_CACHE_REPLICAS_LABEL), + format!("{}", self.replicas()), + ); + template + } + + /// Generate a name for a ParquetCacheSet derived from this ParquetCache. + pub(super) fn parquet_cache_set_name(&self, pod_template_hash: &str) -> Result { + let name = self.name()?; + Ok(format!("{name}-{pod_template_hash}")) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta; + + #[test] + fn config_map_name() { + let pc = ParquetCache { + metadata: ObjectMeta { + name: Some(String::from("test-data-cache-set")), + ..Default::default() + }, + ..Default::default() + }; + assert_eq!("test-data-cache-set", pc.config_map_name().unwrap()); + + let pc = ParquetCache { + metadata: ObjectMeta { + name: Some("test-data-cache-set".to_string()), + ..Default::default() + }, + spec: ParquetCacheSpec { + config_map_name: Some(String::from("config-map")), + ..Default::default() + }, + ..Default::default() + }; + assert_eq!("config-map", pc.config_map_name().unwrap()); + } +} diff --git a/parquet_cache/src/controller/parquet_cache_controller.rs b/parquet_cache/src/controller/parquet_cache_controller.rs new file mode 100644 index 00000000000..3dba0587b66 --- /dev/null +++ b/parquet_cache/src/controller/parquet_cache_controller.rs @@ -0,0 +1,1446 @@ +use super::{ + kube_util::{hash_object, list_owned, owner_reference}, + Error, ParquetCache, ParquetCacheInstanceSet, ParquetCacheSet, ParquetCacheSetSpec, + ParquetCacheStatus, Result, LONG_WAIT, PARQUET_CACHE_REPLICAS_LABEL, POD_TEMPLATE_HASH_LABEL, + SHORT_WAIT, +}; +use crate::data_types::InstanceState; +use chrono::Utc; +use futures::StreamExt; +use k8s_openapi::api::core::v1::{ConfigMap, PodTemplateSpec}; +use k8s_openapi::apimachinery::pkg::apis::meta::v1::{ObjectMeta, OwnerReference}; +use kube::runtime::controller::Action; +use kube::runtime::Controller; +use kube::{Api, Client, Resource, ResourceExt}; +use observability_deps::tracing::{debug, error, info}; +use std::collections::{BTreeMap, BTreeSet}; +use std::fmt::Debug; +use std::sync::Arc; +use std::time::Duration; +use tokio::task::JoinHandle; + +/// Start a new controller task to reconcile [ParquetCacheSet] 
objects. +pub fn spawn_controller(client: Client, ns: Option) -> JoinHandle> { + tokio::spawn(run_controller(client, ns)) +} + +async fn run_controller(client: Client, ns: Option) -> Result<(), kube::Error> { + let parquet_cache_api = match &ns { + Some(ns) => Api::::namespaced(client.clone(), ns), + None => Api::::all(client.clone()), + }; + let parquet_cache_set_api = match &ns { + Some(ns) => Api::::namespaced(client.clone(), ns), + None => Api::::all(client.clone()), + }; + + Controller::new(parquet_cache_api, Default::default()) + .owns(parquet_cache_set_api, Default::default()) + .run(reconcile, error_policy, Arc::new(Context { client })) + .for_each(|_| futures::future::ready(())) + .await; + Ok(()) +} + +async fn reconcile(obj: Arc, ctx: Arc) -> Result { + let namespace = obj.metadata.namespace.as_deref(); + let name = obj.name_any(); + info!(namespace, name, "reconcile request"); + let sleep = ParquetCacheController::new(obj.as_ref().clone(), ctx.client.clone()) + .reconcile() + .await?; + Ok(Action::requeue(sleep)) +} + +fn error_policy(_object: Arc, err: &Error, _ctx: Arc) -> Action { + // TODO add exponential backoff + let sleep = Duration::from_secs(5); + error!( + err = err as &dyn std::error::Error, + "reconcile failed, requeue in {:?}", sleep + ); + Action::requeue(sleep) +} + +/// Context used when reconciling [ParquetCacheSet] objects. +struct Context { + client: Client, +} + +const COOLING_SECONDS: i64 = 300; + +/// Controller for the ParquetCache custom resource. This controller maintains ParquetCacheSet +/// resources for a ParquetCache. +#[derive(Debug)] +struct ParquetCacheController { + config_map_api: Api, + parquet_cache_api: Api, + parquet_cache_set_api: Api, + + parquet_cache: ParquetCache, +} + +impl ParquetCacheController { + /// Create a new ParquetCacheSetController instance for the provided [ParquetCacheSet] + /// and [Client]. + fn new(parquet_cache: ParquetCache, client: Client) -> Self { + let ns = parquet_cache.metadata.namespace.as_ref().unwrap(); + let config_maps = Api::namespaced(client.clone(), ns); + let parquet_caches = Api::namespaced(client.clone(), ns); + let parquet_cache_sets = Api::namespaced(client.clone(), ns); + + Self { + config_map_api: config_maps, + parquet_cache_api: parquet_caches, + parquet_cache_set_api: parquet_cache_sets, + parquet_cache, + } + } + + /// Perform the business logic required to move the DataCacheSet state forward towards the + /// desired state. + pub async fn reconcile(&mut self) -> Result { + // ensure the config map exists before attempting to start pods. + let cm = self.status_config_map()?; + match self.config_map_api.create(&Default::default(), &cm).await { + Ok(_) => { + info!(name = cm.metadata.name, "Created ConfigMap"); + } + Err(kube::Error::Api(status)) if status.reason == "AlreadyExists" => (), + Err(error) => return Err(error)?, + } + + let duration = self.reconcile_inner().await?; + + // update the config map with the latest set. + let cm = self.status_config_map()?; + debug!("update config map"); + self.config_map_api + .replace( + self.parquet_cache.config_map_name()?, + &Default::default(), + &cm, + ) + .await?; + debug!("update ParquetCache status"); + self.parquet_cache_api + .replace_status( + self.parquet_cache.metadata.name.as_ref().unwrap(), + &Default::default(), + serde_json::to_vec(&self.parquet_cache)?, + ) + .await?; + Ok(duration) + } + + /// Perform the changes required to reconcile the state of the ParquetCache. 
Changes to the + /// status are written to memory and will updated after this method returns. + async fn reconcile_inner(&mut self) -> Result { + let template = self.parquet_cache.parquet_cache_set_template(); + let pod_template_hash = hash_object(&template)?; + + // find and remove any owned cache sets that are no longer required. + self.remove_empty_cache_sets(&pod_template_hash).await?; + + if self.check_warming_pods().await? { + self.status_mut().current = self.status_mut().next.clone(); + } else { + // Some pods are still warming, check again soon. + return Ok(SHORT_WAIT); + } + if !self.check_cooling_pods(&pod_template_hash).await? { + // Some pods are still cooling, check again soon. + return Ok(SHORT_WAIT); + } + if self.status_mut().current.instances.len() != self.parquet_cache.replicas() as usize { + self.resize(&pod_template_hash, &template).await?; + } else { + self.migrate(&pod_template_hash, &template).await?; + } + + // If we get to here then either there is nothing to change, or some changes + // have been made and the controller will be woken by those changes. + Ok(LONG_WAIT) + } + + async fn remove_empty_cache_sets(&mut self, pod_template_hash: &String) -> Result<()> { + let parquet_cache_sets = self.owned_parquet_cache_sets().await?; + let to_delete = parquet_cache_sets + .into_iter() + .filter(|pcs| { + let is_latest = if let Some(pth) = pcs + .metadata + .labels + .as_ref() + .and_then(|labels| labels.get(POD_TEMPLATE_HASH_LABEL).cloned()) + { + &pth == pod_template_hash + } else { + false + }; + let is_empty = if let Some(pods) = + pcs.status.as_ref().and_then(|status| status.pods.as_ref()) + { + pods.is_empty() + } else { + true + }; + !is_latest && is_empty + }) + .collect::>(); + + for pcs in to_delete { + info!(name = pcs.metadata.name, "Deleting ParquetCacheSet"); + self.parquet_cache_set_api + .delete(&pcs.metadata.name.unwrap(), &Default::default()) + .await?; + } + Ok(()) + } + + async fn check_warming_pods(&mut self) -> Result { + let status = self.status_mut(); + if status.current.revision == status.next.revision { + return Ok(true); + } + for instance in status.next.instances.clone() { + let (parquet_cache_set_name, _) = instance.rsplit_once('-').unwrap(); + let parquet_cache_set = self + .parquet_cache_set_api + .get(parquet_cache_set_name) + .await?; + let parquet_cache_set_status = parquet_cache_set.status.unwrap_or_default(); + let pod_status = parquet_cache_set_status + .pods + .as_ref() + .and_then(|pods| pods.get(&instance)); + let phase = pod_status + .and_then(|status| status.phase.as_ref()) + .map(String::as_str); + let state = pod_status + .and_then(|status| status.state.as_ref()) + .map(|state| state.state.to_string()); + debug!(name = &instance, phase, state, "Checking Pod status"); + if phase.unwrap_or("") != "Running" { + return Ok(false); + } + if pod_status + .and_then(|status| status.state.as_ref()) + .map(|state| state.state != InstanceState::Warming) + .unwrap_or(true) + { + return Ok(false); + } + } + Ok(true) + } + + async fn check_cooling_pods(&mut self, pod_template_hash: &String) -> Result { + let mut live_pods = self + .status_mut() + .current + .instances + .iter() + .cloned() + .collect::>(); + for pod in &self.status_mut().next.instances { + live_pods.insert(pod.clone()); + } + let parquet_cache_sets = self.owned_parquet_cache_sets().await?; + + let current_status = parquet_cache_sets + .iter() + .filter_map(|pcs| pcs.status.as_ref()) + .filter_map(|status| status.pods.as_ref()) + .flat_map(|pods| 
pods.clone().into_iter().collect::>()) + .filter(|(k, _)| self.status_mut().current.instances.contains(k)) + .map(|(k, status)| { + let (_, suffix) = split_pod_name(&k); + (suffix, status) + }) + .collect::>(); + + let cooling_pods = parquet_cache_sets + .iter() + .filter(|&pcs| !has_pod_template_hash(pcs, pod_template_hash)) + .filter_map(|pcs| pcs.status.as_ref()) + .filter_map(|status| status.pods.as_ref()) + .flat_map(|pods| pods.keys().cloned().collect::>()) + .filter(|key| !live_pods.contains(key)) + .collect::>(); + + let mut cooling = false; + for pod in cooling_pods { + let (pcs_name, suffix) = split_pod_name(&pod); + if let Some(change) = current_status + .get(&suffix) + .and_then(|status| status.state.as_ref()) + .map(|state| state.state_changed) + { + if change > Utc::now().timestamp() - COOLING_SECONDS { + // If the pod has been cooling for less than the wait time, keep waiting. + cooling = true; + continue; + } + } + let mut pcs = self.parquet_cache_set_api.get(&pcs_name).await?; + pcs.spec + .replica_suffixes + .as_mut() + .unwrap() + .retain(|s| s != &suffix); + self.parquet_cache_set_api + .replace(&pcs_name, &Default::default(), &pcs) + .await?; + } + Ok(!cooling) + } + + async fn resize( + &mut self, + pod_template_hash: &String, + template: &PodTemplateSpec, + ) -> Result<()> { + let owned = self.owned_parquet_cache_sets().await?; + + // Clear any ParquetCacheSets that are not the required one. + for mut pcs in owned { + let is_current = pcs + .metadata + .labels + .as_ref() + .and_then(|labels| labels.get(POD_TEMPLATE_HASH_LABEL)) + .map(|v| v == pod_template_hash) + .unwrap_or_default(); + if is_current { + continue; + } + pcs.spec.replica_suffixes = None; + self.set_parquet_cache_set(&pcs).await?; + } + + // Create the desired ParquetCacheSet. 
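+        // Its name includes the pod template hash, and the replica suffixes are populated
+        // for the full requested replica count in one step; unlike `migrate`, a resize does
+        // not move pods across sets one at a time.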
+ let mut pcs = self + .get_parquet_cache_set(pod_template_hash, template) + .await?; + let suffixes = (0..self.parquet_cache.replicas()) + .map(|n| format!("{n}")) + .collect::>(); + pcs.spec.replica_suffixes = Some(suffixes.clone()); + self.set_parquet_cache_set(&pcs).await?; + let next_revision = self.status_mut().next.revision + 1; + let instances = suffixes + .iter() + .map(|suffix| format!("{}-{suffix}", pcs.metadata.name.as_ref().unwrap())) + .collect(); + self.status_mut().next = ParquetCacheInstanceSet { + revision: next_revision, + instances, + }; + self.status_mut().current = self.status_mut().next.clone(); + Ok(()) + } + + async fn migrate(&mut self, pod_template_hash: &str, template: &PodTemplateSpec) -> Result<()> { + let current = self.status_mut().current.clone(); + assert_eq!(current.revision, self.status_mut().next.revision); + let parquet_cache_set_name = self + .parquet_cache + .parquet_cache_set_name(pod_template_hash)?; + + for (idx, name) in current.instances.iter().enumerate() { + let (prefix, suffix) = split_pod_name(name); + if prefix == parquet_cache_set_name { + continue; + } + let mut pcs = self + .get_parquet_cache_set(pod_template_hash, template) + .await?; + if pcs.spec.replica_suffixes.is_none() { + pcs.spec.replica_suffixes = Some(vec![]); + } + pcs.spec + .replica_suffixes + .as_mut() + .unwrap() + .push(suffix.clone()); + self.set_parquet_cache_set(&pcs).await?; + self.status_mut().next.revision = current.revision + 1; + self.status_mut().next.instances[idx] = format!("{parquet_cache_set_name}-{suffix}"); + break; + } + Ok(()) + } + + async fn owned_parquet_cache_sets(&self) -> Result> { + let uid = self + .parquet_cache + .metadata + .uid + .as_ref() + .ok_or(Error::internal("ParquetCache has no uid"))?; + Ok(list_owned(&self.parquet_cache_set_api, uid).await?) + } + + /// Create or update the specified ParquetCacheSet. + async fn set_parquet_cache_set(&mut self, pcs: &ParquetCacheSet) -> Result { + let name = pcs.metadata.name.as_ref().ok_or(Error::internal( + "attempt to set a ParquetCacheSet without a name", + ))?; + let pp = Default::default(); + if pcs.metadata.uid.is_some() { + Ok(self.parquet_cache_set_api.replace(name, &pp, pcs).await?) + } else { + Ok(self.parquet_cache_set_api.create(&pp, pcs).await?) + } + } + + /// Retrieve the ParquetCacheSet for the specified Pod template hash. If there is no such + /// ParquetCacheSet then create a ParquetCacheSet object with appropriate defaults taken from + /// the current ParquetCache document. + async fn get_parquet_cache_set( + &mut self, + pod_template_hash: &str, + template: &PodTemplateSpec, + ) -> Result { + let name = self + .parquet_cache + .parquet_cache_set_name(pod_template_hash)?; + Ok(self + .parquet_cache_set_api + .get_opt(&name) + .await? 
+ .unwrap_or_else(|| self.new_parquet_cache_set(name, pod_template_hash, template))) + } + + fn new_parquet_cache_set( + &self, + name: String, + pod_template_hash: &str, + template: &PodTemplateSpec, + ) -> ParquetCacheSet { + let pod_template_hash_key = String::from(POD_TEMPLATE_HASH_LABEL); + let replica_count_key = String::from(PARQUET_CACHE_REPLICAS_LABEL); + let replica_count_value = format!("{}", self.parquet_cache.replicas()); + + let mut labels = self + .parquet_cache + .metadata + .labels + .clone() + .unwrap_or_default(); + labels.insert( + pod_template_hash_key.clone(), + String::from(pod_template_hash), + ); + labels.insert(replica_count_key.clone(), replica_count_value.clone()); + let mut selector = self.parquet_cache.spec.selector.clone(); + let match_labels = selector.match_labels.get_or_insert(Default::default()); + match_labels.insert( + pod_template_hash_key.clone(), + String::from(pod_template_hash), + ); + match_labels.insert(replica_count_key.clone(), replica_count_value.clone()); + + let mut template = template.clone(); + let template_metadata = template.metadata.get_or_insert(Default::default()); + template_metadata.namespace = self.parquet_cache.metadata.namespace.clone(); + let template_labels = template_metadata.labels.get_or_insert(Default::default()); + template_labels.insert( + pod_template_hash_key.clone(), + String::from(pod_template_hash), + ); + + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(labels), + name: Some(name), + namespace: self.parquet_cache.metadata.namespace.clone(), + owner_references: Some(vec![self.owner_reference()]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: None, + selector, + state_port: self.parquet_cache.spec.state_port.clone(), + template: Some(template), + }, + status: None, + } + } + + fn status_config_map(&mut self) -> Result { + let mut data = BTreeMap::new(); + let status = self.status_mut(); + data.insert( + "current".to_string(), + serde_json::to_string(&status.current)?, + ); + data.insert("next".to_string(), serde_json::to_string(&status.next)?); + Ok(ConfigMap { + metadata: ObjectMeta { + namespace: self.parquet_cache.metadata.namespace.clone(), + name: Some(self.parquet_cache.config_map_name()?.clone()), + owner_references: Some(vec![self.owner_reference()]), + ..Default::default() + }, + data: Some(data), + ..Default::default() + }) + } + + /// Generate an owner reference for the current ParquetCache document. 
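+    ///
+    /// Owned ConfigMaps and ParquetCacheSets carry this reference, so kubernetes garbage
+    /// collects them when the ParquetCache is deleted.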
+ fn owner_reference(&self) -> OwnerReference { + owner_reference(&self.parquet_cache) + } + + fn status_mut(&mut self) -> &mut ParquetCacheStatus { + self.parquet_cache.status.get_or_insert(Default::default()) + } +} + +fn split_pod_name(name: &str) -> (String, String) { + if let Some((prefix, suffix)) = name.rsplit_once('-') { + (String::from(prefix), String::from(suffix)) + } else { + (String::from(name), String::from("")) + } +} + +fn has_pod_template_hash(obj: &K, pod_template_hash: &String) -> bool +where + K: Resource, +{ + if let Some(hash) = obj + .meta() + .labels + .as_ref() + .and_then(|labels| labels.get(POD_TEMPLATE_HASH_LABEL)) + { + hash == pod_template_hash + } else { + false + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::controller::parquet_cache::{ParquetCacheInstanceSet, ParquetCacheSpec}; + use crate::controller::parquet_cache_set::InstanceStatus; + use crate::controller::{ParquetCacheSet, ParquetCacheSetStatus, SHORT_WAIT}; + use crate::data_types::{InstanceState, State}; + use hyper::Body; + use kube::client::ClientBuilder; + use kube::ResourceExt; + use kube_test::{AsHandler, ResourceHandler, Service}; + use std::ops::Sub; + use std::sync::Arc; + + #[tokio::test] + async fn create_config_map() { + let ns = "create_config_map"; + let name = "parquet-cache"; + let fixture: Fixture = Default::default(); + let pc = fixture.parquet_caches.set(ns, name, Default::default()); + let uid = pc.metadata.uid.clone().unwrap_or_default(); + + fixture.reconcile(ns, pc).await.unwrap(); + + let cm = fixture.config_maps.get(ns, name).unwrap(); + assert_eq!(ns, cm.metadata.namespace.as_ref().unwrap()); + assert_eq!(name, cm.metadata.name.as_ref().unwrap()); + assert_eq!(uid, cm.metadata.owner_references.as_ref().unwrap()[0].uid); + assert!(!cm.data.as_ref().unwrap().get("current").unwrap().is_empty()); + assert!(!cm.data.as_ref().unwrap().get("next").unwrap().is_empty()); + } + + #[tokio::test] + async fn create_config_map_no_fail_on_existing() { + let ns = "create_config_map_no_fail_on_existing"; + let name = "parquet-cache"; + let fixture: Fixture = Default::default(); + fixture.config_maps.set(ns, name, Default::default()); + let pc = fixture.parquet_caches.set(ns, name, Default::default()); + + fixture.reconcile(ns, pc).await.unwrap(); + + let cm = fixture.config_maps.get(ns, name).unwrap(); + assert_eq!(ns, cm.metadata.namespace.as_ref().unwrap()); + assert_eq!(name, cm.metadata.name.as_ref().unwrap()); + } + + #[tokio::test] + async fn create_initial_parquet_cache_set_at_full_size() { + let ns = "create_initial_parquet_cache_set_at_full_size"; + let name = "parquet-cache"; + let fixture: Fixture = Default::default(); + let pc = fixture.parquet_caches.set( + ns, + name, + ParquetCache { + spec: ParquetCacheSpec { + replicas: Some(5), + ..Default::default() + }, + ..Default::default() + }, + ); + let uid = pc.metadata.uid.clone().unwrap_or_default(); + let template_hash = hash_object(&pc.parquet_cache_set_template()).unwrap(); + + fixture.reconcile(ns, pc.clone()).await.unwrap(); + + let parquet_cache_sets = fixture + .parquet_cache_sets + .all(ns) + .into_iter() + .filter(|pcs| pcs.owner_references().iter().any(|or| or.uid == uid)) + .collect::>(); + + assert_eq!(1, parquet_cache_sets.len()); + let pcs = &parquet_cache_sets[0]; + assert_eq!( + &template_hash, + pcs.metadata + .labels + .as_ref() + .and_then(|map| map.get(POD_TEMPLATE_HASH_LABEL)) + .unwrap() + ); + assert_eq!( + 5, + pcs.spec + .replica_suffixes + .as_ref() + .map(Vec::len) + .unwrap_or_default() 
+ ); + + let cm = fixture.config_maps.get(ns, name).unwrap(); + let current = cm.data.as_ref().unwrap().get("current").unwrap().clone(); + let next = cm.data.as_ref().unwrap().get("next").unwrap().clone(); + assert_eq!(current, next); + + let pcis = serde_json::from_str::(¤t).unwrap(); + assert_eq!(5, pcis.instances.len()); + } + + #[tokio::test] + async fn old_parquet_cache_set_removed() { + let ns = "old_parquet_cache_set_removed"; + let name = "parquet-cache"; + let fixture: Fixture = Default::default(); + let pc = fixture.parquet_caches.set(ns, name, Default::default()); + let pcs1_name = format!("{name}-aaaaaaaaaa"); + fixture.parquet_cache_sets.set( + ns, + &pcs1_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("aaaaaaaaaa"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([])), + }), + }, + ); + let pcs2_name = format!("{name}-bbbbbbbbbb"); + fixture.parquet_cache_sets.set( + ns, + &pcs2_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("bbbbbbbbbb"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![String::from("0")]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + format!("{pcs2_name}-0"), + Default::default(), + )])), + }), + }, + ); + let template_hash = hash_object(&pc.parquet_cache_set_template()).unwrap(); + let pcs3_name = format!("{name}-{template_hash}"); + fixture.parquet_cache_sets.set( + ns, + &pcs3_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + template_hash.clone(), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([])), + }), + }, + ); + + fixture.reconcile(ns, pc).await.unwrap(); + + assert!(fixture.parquet_cache_sets.get(ns, &pcs1_name).is_none()); + assert!(fixture.parquet_cache_sets.get(ns, &pcs2_name).is_some()); + assert!(fixture.parquet_cache_sets.get(ns, &pcs3_name).is_some()); + } + + #[tokio::test] + async fn warming_pods_retry_shortly() { + let ns = "warming_pods_retry_shortly"; + let name = "parquet-cache"; + let fixture: Fixture = Default::default(); + let pc = fixture.parquet_caches.set( + ns, + name, + ParquetCache { + status: Some(ParquetCacheStatus { + current: ParquetCacheInstanceSet { + revision: 1, + instances: vec![format!("{name}-aaaaaaaaaa-0")], + }, + next: ParquetCacheInstanceSet { + revision: 2, + instances: vec![format!("{name}-bbbbbbbbbb-0")], + }, + }), + ..Default::default() + }, + ); + + let pcs1_name = format!("{name}-aaaaaaaaaa"); + fixture.parquet_cache_sets.set( + ns, + &pcs1_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("aaaaaaaaaa"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![]), + ..Default::default() + }, + status: 
Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + format!("{name}-aaaaaaaaaa-0"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(600)) + .timestamp(), + current_node_set_revision: 1, + next_node_set_revision: 2, + }), + }, + )])), + }), + }, + ); + + let pcs2_name = format!("{name}-bbbbbbbbbb"); + fixture.parquet_cache_sets.set( + ns, + &pcs2_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("bbbbbbbbbb"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + format!("{name}-bbbbbbbbbb-0"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(30)) + .timestamp(), + current_node_set_revision: 1, + next_node_set_revision: 2, + }), + }, + )])), + }), + }, + ); + + assert_eq!(SHORT_WAIT, fixture.reconcile(ns, pc).await.unwrap()); + } + + #[tokio::test] + async fn warm_pods_update_status() { + let ns = "warm_pods_update_status"; + let name = "parquet-cache"; + let fixture: Fixture = Default::default(); + let pc = fixture.parquet_caches.set( + ns, + name, + ParquetCache { + status: Some(ParquetCacheStatus { + current: ParquetCacheInstanceSet { + revision: 1, + instances: vec![format!("{name}-aaaaaaaaaa-0")], + }, + next: ParquetCacheInstanceSet { + revision: 2, + instances: vec![format!("{name}-bbbbbbbbbb-0")], + }, + }), + ..Default::default() + }, + ); + + let pcs1_name = format!("{name}-aaaaaaaaaa"); + fixture.parquet_cache_sets.set( + ns, + &pcs1_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("aaaaaaaaaa"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + format!("{name}-aaaaaaaaaa-0"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(600)) + .timestamp(), + current_node_set_revision: 1, + next_node_set_revision: 2, + }), + }, + )])), + }), + }, + ); + + let pcs2_name = format!("{name}-bbbbbbbbbb"); + fixture.parquet_cache_sets.set( + ns, + &pcs2_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("bbbbbbbbbb"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + format!("{name}-bbbbbbbbbb-0"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(30)) + .timestamp(), + current_node_set_revision: 1, + next_node_set_revision: 2, + }), + }, + )])), + }), + }, + ); + + fixture.reconcile(ns, pc).await.unwrap(); + + let status = 
fixture + .parquet_caches + .get(ns, name) + .unwrap() + .status + .unwrap(); + assert_eq!(status.next, status.current); + } + + #[tokio::test] + async fn cooling_pods_retry_shortly() { + let ns = "cooling_pods_retry_shortly"; + let name = "parquet-cache"; + let fixture: Fixture = Default::default(); + let pc = fixture.parquet_caches.set( + ns, + name, + ParquetCache { + status: Some(ParquetCacheStatus { + current: ParquetCacheInstanceSet { + revision: 2, + instances: vec![format!("{name}-bbbbbbbbbb-0")], + }, + next: ParquetCacheInstanceSet { + revision: 2, + instances: vec![format!("{name}-bbbbbbbbbb-0")], + }, + }), + ..Default::default() + }, + ); + + let pcs1_name = format!("{name}-aaaaaaaaaa"); + fixture.parquet_cache_sets.set( + ns, + &pcs1_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("aaaaaaaaaa"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + format!("{name}-aaaaaaaaaa-0"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(600)) + .timestamp(), + current_node_set_revision: 2, + next_node_set_revision: 2, + }), + }, + )])), + }), + }, + ); + + let pcs2_name = format!("{name}-bbbbbbbbbb"); + fixture.parquet_cache_sets.set( + ns, + &pcs2_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("bbbbbbbbbb"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + format!("{name}-bbbbbbbbbb-0"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(30)) + .timestamp(), + current_node_set_revision: 2, + next_node_set_revision: 2, + }), + }, + )])), + }), + }, + ); + + assert_eq!(SHORT_WAIT, fixture.reconcile(ns, pc).await.unwrap()); + } + + #[tokio::test] + async fn cooled_pods_are_removed() { + let ns = "cooled_pods_are_removed"; + let name = "parquet-cache"; + let fixture: Fixture = Default::default(); + let pc = fixture.parquet_caches.set( + ns, + name, + ParquetCache { + status: Some(ParquetCacheStatus { + current: ParquetCacheInstanceSet { + revision: 2, + instances: vec![format!("{name}-bbbbbbbbbb-0")], + }, + next: ParquetCacheInstanceSet { + revision: 2, + instances: vec![format!("{name}-bbbbbbbbbb-0")], + }, + }), + ..Default::default() + }, + ); + + let pcs1_name = format!("{name}-aaaaaaaaaa"); + fixture.parquet_cache_sets.set( + ns, + &pcs1_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("aaaaaaaaaa"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + format!("{name}-aaaaaaaaaa-0"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + 
state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(600)) + .timestamp(), + current_node_set_revision: 2, + next_node_set_revision: 2, + }), + }, + )])), + }), + }, + ); + + let pcs2_name = format!("{name}-bbbbbbbbbb"); + fixture.parquet_cache_sets.set( + ns, + &pcs2_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("bbbbbbbbbb"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + format!("{name}-bbbbbbbbbb-0"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(400)) + .timestamp(), + current_node_set_revision: 2, + next_node_set_revision: 2, + }), + }, + )])), + }), + }, + ); + + fixture.reconcile(ns, pc).await.unwrap(); + + let pcs1 = fixture.parquet_cache_sets.get(ns, pcs1_name).unwrap(); + assert!( + pcs1.spec.replica_suffixes.is_none() || pcs1.spec.replica_suffixes.unwrap().is_empty() + ); + } + + #[tokio::test] + async fn resizing_recreates_everything() { + let ns = "resizing_recreates_everything"; + let name = "parquet-cache"; + let fixture: Fixture = Default::default(); + let pc = fixture.parquet_caches.set( + ns, + name, + ParquetCache { + spec: ParquetCacheSpec { + replicas: Some(2), + ..Default::default() + }, + status: Some(ParquetCacheStatus { + current: ParquetCacheInstanceSet { + revision: 1, + instances: vec![format!("{name}-aaaaaaaaaa-0")], + }, + next: ParquetCacheInstanceSet { + revision: 1, + instances: vec![format!("{name}-aaaaaaaaaa-0")], + }, + }), + ..Default::default() + }, + ); + + let pcs1_name = format!("{name}-aaaaaaaaaa"); + fixture.parquet_cache_sets.set( + ns, + &pcs1_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("aaaaaaaaaa"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![String::from("0")]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + format!("{name}-aaaaaaaaaa-0"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(600)) + .timestamp(), + current_node_set_revision: 1, + next_node_set_revision: 1, + }), + }, + )])), + }), + }, + ); + let template = pc.parquet_cache_set_template(); + let hash = hash_object(&template).unwrap(); + let pcs2_name = pc.parquet_cache_set_name(&hash).unwrap(); + + fixture.reconcile(ns, pc).await.unwrap(); + + let pcs1 = fixture.parquet_cache_sets.get(ns, pcs1_name).unwrap(); + assert!( + pcs1.spec.replica_suffixes.is_none() || pcs1.spec.replica_suffixes.unwrap().is_empty() + ); + let pcs2 = fixture.parquet_cache_sets.get(ns, pcs2_name).unwrap(); + assert_eq!(2, pcs2.spec.replica_suffixes.unwrap().len()) + } + + #[tokio::test] + async fn template_change_starts_migration() { + let ns = "template_change_starts_migration"; + let name = "parquet-cache"; + let fixture: Fixture = Default::default(); + let pc = fixture.parquet_caches.set( + ns, + name, + ParquetCache { + status: Some(ParquetCacheStatus { + 
current: ParquetCacheInstanceSet { + revision: 1, + instances: vec![format!("{name}-aaaaaaaaaa-0")], + }, + next: ParquetCacheInstanceSet { + revision: 1, + instances: vec![format!("{name}-aaaaaaaaaa-0")], + }, + }), + ..Default::default() + }, + ); + + let pcs1_name = format!("{name}-aaaaaaaaaa"); + fixture.parquet_cache_sets.set( + ns, + &pcs1_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("aaaaaaaaaa"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![String::from("0")]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + format!("{name}-aaaaaaaaaa-0"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(600)) + .timestamp(), + current_node_set_revision: 1, + next_node_set_revision: 1, + }), + }, + )])), + }), + }, + ); + let template = pc.parquet_cache_set_template(); + let hash = hash_object(&template).unwrap(); + let pcs2_name = pc.parquet_cache_set_name(&hash).unwrap(); + + fixture.reconcile(ns, pc).await.unwrap(); + + let pc = fixture.parquet_caches.get(ns, name).unwrap(); + let status = &pc.status.unwrap(); + assert!(status.current.revision < status.next.revision); + assert_eq!(format!("{name}-aaaaaaaaaa-0"), status.current.instances[0]); + assert_eq!(format!("{name}-{hash}-0"), status.next.instances[0]); + + let pcs1 = fixture.parquet_cache_sets.get(ns, pcs1_name).unwrap(); + assert_eq!(1, pcs1.spec.replica_suffixes.unwrap().len()); + let pcs2 = fixture.parquet_cache_sets.get(ns, pcs2_name).unwrap(); + assert_eq!(1, pcs2.spec.replica_suffixes.unwrap().len()) + } + + #[tokio::test] + async fn one_pod_migrated_at_a_time() { + let ns = "template_change_starts_migration"; + let name = "parquet-cache"; + let fixture: Fixture = Default::default(); + let pc = fixture.parquet_caches.set( + ns, + name, + ParquetCache { + spec: ParquetCacheSpec { + replicas: Some(3), + ..Default::default() + }, + ..Default::default() + }, + ); + + let pcs1_name = format!("{name}-aaaaaaaaaa"); + fixture.parquet_cache_sets.set( + ns, + &pcs1_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + String::from("aaaaaaaaaa"), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![String::from("1"), String::from("2")]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([ + ( + format!("{name}-aaaaaaaaaa-1"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(1800)) + .timestamp(), + current_node_set_revision: 2, + next_node_set_revision: 2, + }), + }, + ), + ( + format!("{name}-aaaaaaaaaa-2"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(1800)) + .timestamp(), + current_node_set_revision: 2, + next_node_set_revision: 2, + }), + }, + ), + ])), + }), + }, + ); + let template = pc.parquet_cache_set_template(); + let hash = hash_object(&template).unwrap(); + let pcs2_name = 
pc.parquet_cache_set_name(&hash).unwrap(); + fixture.parquet_cache_sets.set( + ns, + &pcs2_name, + ParquetCacheSet { + metadata: ObjectMeta { + labels: Some(BTreeMap::from([( + String::from(POD_TEMPLATE_HASH_LABEL), + hash.clone(), + )])), + owner_references: Some(vec![owner_reference(&pc)]), + ..Default::default() + }, + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![String::from("0")]), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + format!("{name}-{hash}-0"), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + state_changed: chrono::Utc::now() + .sub(Duration::from_secs(600)) + .timestamp(), + current_node_set_revision: 2, + next_node_set_revision: 2, + }), + }, + )])), + }), + }, + ); + + let pc = fixture.parquet_caches.set( + ns, + name, + ParquetCache { + status: Some(ParquetCacheStatus { + current: ParquetCacheInstanceSet { + revision: 2, + instances: vec![ + format!("{name}-{hash}-0"), + format!("{name}-aaaaaaaaaa-1"), + format!("{name}-aaaaaaaaaa-2"), + ], + }, + next: ParquetCacheInstanceSet { + revision: 2, + instances: vec![ + format!("{name}-{hash}-0"), + format!("{name}-aaaaaaaaaa-1"), + format!("{name}-aaaaaaaaaa-2"), + ], + }, + }), + ..pc + }, + ); + + fixture.reconcile(ns, pc).await.unwrap(); + + let pc = fixture.parquet_caches.get(ns, name).unwrap(); + let status = &pc.status.unwrap(); + assert!(status.current.revision < status.next.revision); + assert_eq!(status.current.instances[0], status.next.instances[0]); + assert_eq!(format!("{name}-aaaaaaaaaa-1"), status.current.instances[1]); + assert_eq!(format!("{name}-{hash}-1"), status.next.instances[1]); + assert_eq!(status.current.instances[2], status.next.instances[2]); + + let pcs1 = fixture.parquet_cache_sets.get(ns, pcs1_name).unwrap(); + assert_eq!(2, pcs1.spec.replica_suffixes.unwrap().len()); + let pcs2 = fixture.parquet_cache_sets.get(ns, pcs2_name).unwrap(); + assert_eq!(2, pcs2.spec.replica_suffixes.unwrap().len()) + } + + #[derive(Debug, Default)] + struct Fixture { + pub config_maps: Arc>, + pub parquet_cache_sets: Arc>, + pub parquet_caches: Arc>, + } + + impl Fixture { + fn service(&self) -> Service { + let service = Service::new(); + service.add_handler(self.config_maps.as_handler()); + service.add_handler(self.parquet_cache_sets.as_handler()); + service.add_handler(self.parquet_caches.as_handler()); + service + } + + async fn reconcile( + &self, + ns: impl Into + Send, + pc: ParquetCache, + ) -> Result { + let service = self.service(); + let client = ClientBuilder::new(service, ns).build::(); + let mut controller = ParquetCacheController::new(pc, client); + let hnd = tokio::spawn(async move { controller.reconcile().await }); + hnd.await.unwrap() + } + } +} diff --git a/parquet_cache/src/controller/parquet_cache_set.rs b/parquet_cache/src/controller/parquet_cache_set.rs new file mode 100644 index 00000000000..bb172fa15bf --- /dev/null +++ b/parquet_cache/src/controller/parquet_cache_set.rs @@ -0,0 +1,75 @@ +use crate::data_types::{InstanceState, State}; +use k8s_openapi::api::core::v1::PodTemplateSpec; +use k8s_openapi::apimachinery::pkg::apis::meta::v1::LabelSelector; +use k8s_openapi::schemars::JsonSchema; +use kube::CustomResource; +use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; + +/// Specification of a ParquetCacheSet. 
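+///
+/// A ParquetCacheSet owns a fixed group of pods whose names are formed from the set name
+/// plus one of the replica suffixes, similar to the way a ReplicaSet tracks its pods.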
+#[derive(Debug, Default, Clone, CustomResource, Deserialize, Serialize, JsonSchema)] +#[kube( + kind = "ParquetCacheSet", + group = "iox.influxdata.com", + version = "v1alpha1", + namespaced +)] +#[kube(status = "ParquetCacheSetStatus")] +#[kube(derive = "Default")] +#[serde(rename_all = "camelCase")] +pub struct ParquetCacheSetSpec { + /// Suffixes for the pods required to be in the set. + pub replica_suffixes: Option>, + + /// Selector is a label query over pods that should match the replica + /// count. Label keys and values that must match in order to be controlled + /// by this parquet cache set. It must match the pod template's labels. + pub selector: LabelSelector, + + /// Port to connect to on the pod in order to enquire about the status of + /// the cache. + pub state_port: Option, + + /// Template is the object that describes the pod that will be created + /// if insufficient replicas are detected. + pub template: Option, +} + +/// Status of a ParquetCacheSet. +#[derive(Debug, Default, Deserialize, Serialize, Clone, JsonSchema)] +pub struct ParquetCacheSetStatus { + /// Status of the pods that form the set. + pub pods: Option>, +} + +#[derive(Debug, Default, Deserialize, Serialize, Clone, JsonSchema)] +pub struct InstanceStatus { + /// The phase the pod is in. + pub phase: Option, + + /// The state reported by the pod. This is only included if the pod is in the "Running" phase + /// and the state could be queried successfully. + pub state: Option, +} + +impl InstanceStatus { + /// Determine if the status represents a warming instance. + pub(super) fn is_warming(&self) -> bool { + match &self.phase { + None => false, + Some(phase) => match phase.as_str() { + "Running" => match &self.state { + None => true, + Some(state) => state.state == InstanceState::Warming, + }, + _ => false, + }, + } + } +} + +impl ParquetCacheSet { + pub(super) fn selectors(&self) -> Option { + super::kube_util::selectors(&self.spec.selector) + } +} diff --git a/parquet_cache/src/controller/parquet_cache_set_controller.rs b/parquet_cache/src/controller/parquet_cache_set_controller.rs new file mode 100644 index 00000000000..ad478bf1d10 --- /dev/null +++ b/parquet_cache/src/controller/parquet_cache_set_controller.rs @@ -0,0 +1,676 @@ +use super::{ + kube_util::owner_reference, parquet_cache_set::InstanceStatus, state_service, Error, + ParquetCacheSet, ParquetCacheSetStatus, Result, CONTROLLER_NAME, LONG_WAIT, + PARQUET_CACHE_REPLICAS_LABEL, POD_TEMPLATE_HASH_LABEL, SHORT_WAIT, +}; +use futures::StreamExt; +use k8s_openapi::api::core::v1::Pod; +use k8s_openapi::apimachinery::pkg::apis::meta::v1::{ObjectMeta, OwnerReference}; +use kube::api::{ListParams, PostParams}; +use kube::runtime::controller::Action; +use kube::runtime::watcher::Config; +use kube::runtime::Controller; +use kube::{Api, Client, ResourceExt}; +use observability_deps::tracing::{error, info}; +use std::collections::BTreeSet; +use std::fmt::Debug; +use std::sync::Arc; +use std::time::Duration; +use tokio::task::JoinHandle; + +/// Start a new controller task to reconcile [ParquetCacheSet] objects. 
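+///
+/// The controller watches ParquetCacheSet objects and the Pods they own that carry the
+/// pod-template-hash and replica-count labels, scoped to `ns` when a namespace is given.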
+pub fn spawn_controller(client: Client, ns: Option) -> JoinHandle> { + tokio::spawn(run_controller(client, ns)) +} + +async fn run_controller(client: Client, ns: Option) -> Result<(), kube::Error> { + let parquet_cache_set_api = match &ns { + Some(ns) => Api::::namespaced(client.clone(), ns), + None => Api::::all(client.clone()), + }; + let pod_api = match &ns { + Some(ns) => Api::::namespaced(client.clone(), ns), + None => Api::::all(client.clone()), + }; + + Controller::new(parquet_cache_set_api, Default::default()) + .owns( + pod_api, + Config::default().labels(&format!( + "{},{}", + PARQUET_CACHE_REPLICAS_LABEL, POD_TEMPLATE_HASH_LABEL + )), + ) + .run( + reconcile, + error_policy, + Arc::new(Context { + client, + state_service: Default::default(), + }), + ) + .for_each(|_| futures::future::ready(())) + .await; + Ok(()) +} + +async fn reconcile(obj: Arc, ctx: Arc) -> Result { + let namespace = obj.metadata.namespace.as_deref(); + let name = obj.name_any(); + info!(namespace, name, "reconcile request"); + let sleep = ParquetCacheSetController::new( + obj.as_ref().clone(), + ctx.state_service.clone(), + ctx.client.clone(), + ) + .reconcile() + .await?; + Ok(Action::requeue(sleep)) +} + +fn error_policy(_object: Arc, err: &Error, _ctx: Arc) -> Action { + // TODO add exponential backoff + let sleep = Duration::from_secs(5); + error!( + err = err as &dyn std::error::Error, + "reconcile failed, requeue in {:?}", sleep + ); + Action::requeue(sleep) +} + +/// Context used when reconciling [ParquetCacheSet] objects. +struct Context { + client: Client, + state_service: state_service::Client, +} + +/// Controller for the ParquetCacheSet custom resource. This controller maintains the set of pods +/// created by a ParquetCacheSet. +#[derive(Debug)] +struct ParquetCacheSetController { + parquet_cache_set_api: Api, + pod_api: Api, + state_service: state_service::Client, + + parquet_cache_set: ParquetCacheSet, +} + +impl ParquetCacheSetController { + /// Create a new ParquetCacheSetController instance for the provided [ParquetCacheSet] + /// and [Client]. + fn new( + parquet_cache_set: ParquetCacheSet, + state_service: state_service::Client, + client: Client, + ) -> Self { + let ns = parquet_cache_set.metadata.namespace.as_ref().unwrap(); + let parquet_cache_sets: Api = Api::namespaced(client.clone(), ns); + let pods: Api = Api::namespaced(client.clone(), ns); + + Self { + parquet_cache_set_api: parquet_cache_sets, + pod_api: pods, + state_service, + parquet_cache_set, + } + } + + /// Perform the business logic required to move the ParquetCacheSet state forward towards the + /// desired state. + async fn reconcile(&mut self) -> Result { + let duration = self.reconcile_inner().await?; + + // Ensure the status is always kept up-to-date. 
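+        // The status subresource is replaced even when reconcile_inner made no changes, so
+        // the recorded pod phases and cache states always reflect the latest observation.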
+ self.parquet_cache_set_api + .replace_status( + self.parquet_cache_set.metadata.name.as_ref().unwrap(), + &Default::default(), + serde_json::to_vec(&self.parquet_cache_set)?, + ) + .await?; + Ok(duration) + } + + async fn reconcile_inner(&mut self) -> Result { + let prefix = self.parquet_cache_set.metadata.name.as_ref().unwrap(); + let pod_names = self + .parquet_cache_set + .spec + .replica_suffixes + .as_ref() + .map_or_else(BTreeSet::new, |v| { + v.iter() + .map(|suffix| format!("{prefix}-{suffix}")) + .collect::>() + }); + + self.delete_removed(&pod_names).await?; + self.create_missing(&pod_names).await?; + self.update_status(&pod_names).await?; + + let warming = self + .status_mut() + .pods + .as_ref() + .map(|pods| pods.iter().any(|(_, status)| status.is_warming())) + .unwrap_or(false); + + // If there are cache pods in the warming state then check them in a minute, otherwise wait + // for an hour, or for a state change. + Ok(if warming { SHORT_WAIT } else { LONG_WAIT }) + } + + async fn delete_removed(&mut self, pod_names: &BTreeSet) -> Result<()> { + let pods = self + .pod_api + .list(&ListParams { + label_selector: self.parquet_cache_set.selectors(), + ..Default::default() + }) + .await?; + let to_delete = pods + .iter() + .filter_map(|pod| pod.metadata.name.as_ref()) + .filter(|&name| !pod_names.contains(name)) + .collect::>(); + + for pod_name in to_delete { + info!(name = pod_name, "Deleting Pod"); + self.pod_api.delete(pod_name, &Default::default()).await?; + } + Ok(()) + } + + async fn create_missing(&mut self, pods: &BTreeSet) -> Result<()> { + for pod in pods { + if !self.pod_exists(pod).await? { + info!(name = pod, "Creating Pod"); + self.create_pod(pod.clone()).await?; + } + } + Ok(()) + } + + async fn update_status(&mut self, pod_names: &BTreeSet) -> Result<()> { + if let Some(pods) = self.status_mut().pods.as_mut() { + pods.clear(); + } + for name in pod_names { + let pod = self.pod_api.get_status(name).await?; + let phase = pod.status.clone().and_then(|status| status.phase); + let state = match phase.as_deref() { + Some("Running") => { + self.state_service + .state(&pod, &self.parquet_cache_set.spec.state_port) + .await? + } + _ => None, + }; + self.status_mut() + .pods + .get_or_insert(Default::default()) + .insert(name.clone(), InstanceStatus { phase, state }); + } + Ok(()) + } + + async fn pod_exists(&self, name: &str) -> Result { + match self.pod_api.get(name).await { + Ok(_) => Ok(true), + Err(kube::Error::Api(error_response)) if error_response.reason == "NotFound" => { + Ok(false) + } + Err(error) => Err(Error::from(error)), + } + } + + /// Create a new data cache instance pod. + async fn create_pod(&self, name: String) -> Result { + let template = self + .parquet_cache_set + .spec + .template + .clone() + .unwrap_or_default(); + let pod = Pod { + metadata: ObjectMeta { + namespace: self.parquet_cache_set.metadata.namespace.clone(), + name: Some(name), + owner_references: Some(vec![self.owner_reference()]), + ..template.metadata.unwrap_or_default() + }, + spec: template.spec, + ..Default::default() + }; + Ok(self + .pod_api + .create( + &PostParams { + dry_run: false, + field_manager: Some(CONTROLLER_NAME.to_string()), + }, + &pod, + ) + .await?) + } + + /// Generate an owner reference for the current ParquetCacheSet document. 
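+    ///
+    /// Pods created by `create_pod` carry this reference, so deleting the ParquetCacheSet
+    /// also removes the pods it created.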
+ fn owner_reference(&self) -> OwnerReference { + owner_reference(&self.parquet_cache_set) + } + + fn status_mut(&mut self) -> &mut ParquetCacheSetStatus { + self.parquet_cache_set + .status + .get_or_insert(Default::default()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::controller::state_service::Request; + use crate::controller::{ParquetCacheSet, ParquetCacheSetSpec}; + use crate::data_types::{InstanceState, State}; + use hyper::Body; + use k8s_openapi::api::core::v1::{Pod, PodSpec, PodTemplateSpec}; + use kube::client::ClientBuilder; + use kube_test::{AsHandler, ResourceHandler, Service}; + use std::collections::BTreeMap; + use std::sync::Arc; + use std::task::{Context, Poll}; + + #[tokio::test] + async fn create_pods() { + let ns = "create_pods"; + let name = "parquet-cache-aaaaaaaaaa"; + + let fixture: Fixture = Default::default(); + + let pcs = fixture.parquet_cache_sets.set( + ns, + name, + ParquetCacheSet { + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![String::from("0"), String::from("1")]), + template: Some(PodTemplateSpec { + spec: Some(PodSpec { + priority: Some(2), + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }, + ..Default::default() + }, + ); + + fixture.reconcile(ns, pcs.clone()).await.unwrap(); + + let pods = fixture.pods.all(ns); + assert_eq!(2, pods.len()); + + let mut pod_names = pods + .iter() + .map(|pod| pod.metadata.name.as_ref().unwrap().clone()) + .collect::>(); + pod_names.sort(); + assert_eq!( + &vec!["parquet-cache-aaaaaaaaaa-0", "parquet-cache-aaaaaaaaaa-1"], + &pod_names + ); + + // Make sure the provided template has been used, and the pods are owned by the + // ParquetCacheSet. + for pod in &pods { + assert_eq!(2, pod.spec.as_ref().unwrap().priority.unwrap()); + assert_eq!( + owner_reference(&pcs), + pod.metadata.owner_references.as_ref().unwrap()[0].clone() + ); + } + } + + #[tokio::test] + async fn remove_pods() { + let ns = "remove_pods"; + let name = "parquet-cache-aaaaaaaaaa"; + + let fixture: Fixture = Default::default(); + + let pod0_name = format!("{name}-0"); + let pod1_name = format!("{name}-1"); + let pod2_name = format!("{name}-2"); + + let pcs = fixture.parquet_cache_sets.set( + ns, + name, + ParquetCacheSet { + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![String::from("1"), String::from("2")]), + template: Some(PodTemplateSpec { + spec: Some(PodSpec { + priority: Some(2), + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([ + ( + pod0_name.clone(), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + ..Default::default() + }), + }, + ), + ( + pod1_name.clone(), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + ..Default::default() + }), + }, + ), + ( + pod2_name.clone(), + InstanceStatus { + phase: Some(String::from("Running")), + state: Some(State { + state: InstanceState::Warming, + ..Default::default() + }), + }, + ), + ])), + }), + ..Default::default() + }, + ); + + fixture.pods.set( + ns, + &pod0_name, + Pod { + metadata: ObjectMeta { + owner_references: Some(vec![owner_reference(&pcs)]), + ..Default::default() + }, + ..Default::default() + }, + ); + + fixture.pods.set( + ns, + &pod1_name, + Pod { + metadata: ObjectMeta { + owner_references: Some(vec![owner_reference(&pcs)]), + ..Default::default() + }, + ..Default::default() + 
}, + ); + + fixture.pods.set( + ns, + &pod2_name, + Pod { + metadata: ObjectMeta { + owner_references: Some(vec![owner_reference(&pcs)]), + ..Default::default() + }, + ..Default::default() + }, + ); + + fixture.reconcile(ns, pcs).await.unwrap(); + + let pods = fixture.pods.all(ns); + assert_eq!(2, pods.len()); + + let mut pod_names = pods + .iter() + .map(|pod| pod.metadata.name.as_ref().unwrap().clone()) + .collect::>(); + pod_names.sort(); + assert_eq!(vec![pod1_name.clone(), pod2_name.clone()], pod_names); + } + + #[tokio::test] + async fn warming_pods_refresh_shortly() { + let ns = "warming_pods_refresh_shortly"; + let name = "parquet-cache-aaaaaaaaaa"; + + let mut fixture: Fixture = Default::default(); + + let pod0_name = format!("{name}-0"); + + let pcs = fixture.parquet_cache_sets.set( + ns, + name, + ParquetCacheSet { + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![String::from("0")]), + template: Some(PodTemplateSpec { + spec: Some(PodSpec { + priority: Some(2), + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + pod0_name.clone(), + InstanceStatus { + phase: Some(String::from("Pending")), + state: None, + }, + )])), + }), + ..Default::default() + }, + ); + + fixture.pods.set( + ns, + &pod0_name, + Pod { + metadata: ObjectMeta { + owner_references: Some(vec![owner_reference(&pcs)]), + ..Default::default() + }, + status: Some(k8s_openapi::api::core::v1::PodStatus { + phase: Some(String::from("Running")), + ..Default::default() + }), + ..Default::default() + }, + ); + + fixture.instance_state.insert( + pod0_name.clone(), + State { + state: InstanceState::Warming, + ..Default::default() + }, + ); + + assert_eq!(SHORT_WAIT, fixture.reconcile(ns, pcs).await.unwrap()); + + let pcs = fixture.parquet_cache_sets.get(ns, name).unwrap(); + assert_eq!( + "Running", + pcs.status + .as_ref() + .unwrap() + .pods + .as_ref() + .unwrap() + .get(&pod0_name) + .unwrap() + .phase + .as_deref() + .unwrap() + ); + assert_eq!( + InstanceState::Warming, + pcs.status + .as_ref() + .unwrap() + .pods + .as_ref() + .unwrap() + .get(&pod0_name) + .unwrap() + .state + .as_ref() + .unwrap() + .state + ); + } + + #[tokio::test] + async fn no_status_pods_refresh_shortly() { + let ns = "no_status_pods_refresh_shortly"; + let name = "parquet-cache-aaaaaaaaaa"; + + let fixture: Fixture = Default::default(); + + let pod0_name = format!("{name}-0"); + + let pcs = fixture.parquet_cache_sets.set( + ns, + name, + ParquetCacheSet { + spec: ParquetCacheSetSpec { + replica_suffixes: Some(vec![String::from("0")]), + template: Some(PodTemplateSpec { + spec: Some(PodSpec { + priority: Some(2), + ..Default::default() + }), + ..Default::default() + }), + ..Default::default() + }, + status: Some(ParquetCacheSetStatus { + pods: Some(BTreeMap::from([( + pod0_name.clone(), + InstanceStatus { + phase: Some(String::from("Pending")), + state: None, + }, + )])), + }), + ..Default::default() + }, + ); + + fixture.pods.set( + ns, + &pod0_name, + Pod { + metadata: ObjectMeta { + owner_references: Some(vec![owner_reference(&pcs)]), + ..Default::default() + }, + status: Some(k8s_openapi::api::core::v1::PodStatus { + phase: Some(String::from("Running")), + ..Default::default() + }), + ..Default::default() + }, + ); + + assert_eq!(SHORT_WAIT, fixture.reconcile(ns, pcs).await.unwrap()); + + let pcs = fixture.parquet_cache_sets.get(ns, name).unwrap(); + assert_eq!( + "Running", + pcs.status + .as_ref() + .unwrap() + .pods + 
.as_ref() + .unwrap() + .get(&pod0_name) + .unwrap() + .phase + .as_deref() + .unwrap() + ); + assert!(&pcs + .status + .unwrap() + .pods + .unwrap() + .get(&pod0_name) + .unwrap() + .state + .is_none()); + } + + #[derive(Debug, Default)] + struct Fixture { + pub parquet_cache_sets: Arc>, + pub pods: Arc>, + pub instance_state: BTreeMap, + } + + impl Fixture { + fn service(&self) -> Service { + let service = Service::new(); + service.add_handler(self.parquet_cache_sets.as_handler()); + service.add_handler(self.pods.as_handler()); + service + } + + async fn reconcile( + &self, + ns: impl Into + Send, + pcs: ParquetCacheSet, + ) -> Result { + let service = self.service(); + let client = ClientBuilder::new(service, ns).build::(); + let state_service_client = + state_service::Client::new(StateService(self.instance_state.clone())); + let mut controller = ParquetCacheSetController::new(pcs, state_service_client, client); + let hnd = tokio::spawn(async move { controller.reconcile().await }); + hnd.await.unwrap() + } + } + + #[derive(Debug, Clone)] + struct StateService(BTreeMap); + + impl tower::Service for StateService { + type Response = Option; + type Error = Box; + type Future = std::future::Ready>; + + fn poll_ready( + &mut self, + _cx: &mut Context<'_>, + ) -> Poll> { + Poll::Ready(Ok(())) + } + + fn call(&mut self, req: Request) -> Self::Future { + std::future::ready(Ok(self + .0 + .get(req.pod.metadata.name.as_deref().unwrap_or_default()) + .cloned())) + } + } +} diff --git a/parquet_cache/src/controller/state_service.rs b/parquet_cache/src/controller/state_service.rs new file mode 100644 index 00000000000..847d3b887cf --- /dev/null +++ b/parquet_cache/src/controller/state_service.rs @@ -0,0 +1,109 @@ +use super::{Error, Result}; +use crate::data_types::State; +use hyper::service::Service; +use k8s_openapi::api::core::v1::Pod; +use observability_deps::tracing::debug; +use std::fmt::{Debug, Formatter}; +use std::future::{poll_fn, Future}; +use std::pin::Pin; +use std::task::{Context, Poll}; +use tower::buffer::Buffer; +use tower::util::BoxService; +use tower::{BoxError, ServiceExt}; + +#[derive(Debug, Clone)] +pub struct Request { + pub pod: Pod, + pub port: Option, +} + +#[derive(Clone)] +pub struct Client { + inner: Buffer, BoxError>, Request>, +} + +impl Client { + pub fn new(svc: S) -> Self + where + S: Service> + Clone + Send + 'static, + S::Error: Into + Send + Sync, + S::Future: Future, S::Error>> + Send + 'static, + { + Self { + inner: Buffer::new(BoxService::new(svc.map_err(|e| e.into())), 1024), + } + } + + pub async fn state(&mut self, pod: &Pod, port: &Option) -> Result> { + let request = Request { + pod: pod.clone(), + port: port.clone(), + }; + poll_fn(|cx| (self.inner.poll_ready(cx))) + .await + .map_err(Error::NodeStateError)?; + self.inner + .call(request) + .await + .map_err(Error::NodeStateError) + } +} + +impl Debug for Client { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "pod state service client") + } +} + +impl Default for Client { + fn default() -> Self { + Self::new(ReqwestClient {}) + } +} + +#[derive(Debug, Clone)] +struct ReqwestClient {} + +impl Service for ReqwestClient { + type Response = Option; + type Error = reqwest::Error; + type Future = Pin> + Send>>; + + fn poll_ready(&mut self, _cx: &mut Context<'_>) -> Poll> { + Poll::Ready(Ok(())) + } + + fn call(&mut self, req: Request) -> Self::Future { + let fut = async { + let url = req + .pod + .status + .and_then(|status| status.pod_ip) + .map(|ip_addr| match req.port { + 
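// Illustration only (the address and port are placeholder values): with a pod IP of
// "10.1.2.3", the match below yields "http://10.1.2.3:8080/state" when a port of 8080 is
// configured, and "http://10.1.2.3/state" when no port is set.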
+                Some(port) => format!("http://{ip_addr}:{port}/state"),
+                None => format!("http://{ip_addr}/state"),
+            });
+        debug!(url, "Getting pod state");
+        if let Some(url) = url {
+            let response = match reqwest::get(url).await {
+                Ok(response) => Some(response.json().await?),
+                Err(error) => {
+                    debug!(
+                        error = &error as &dyn std::error::Error,
+                        "Error getting state"
+                    );
+                    if error.is_connect() {
+                        None
+                    } else {
+                        return Err(error);
+                    }
+                }
+            };
+            Ok(response)
+        } else {
+            Ok(None)
+        }
+    };
+    Box::pin(fut)
+    }
+}
diff --git a/parquet_cache/src/data_types.rs b/parquet_cache/src/data_types.rs
new file mode 100644
index 00000000000..f69dd60c321
--- /dev/null
+++ b/parquet_cache/src/data_types.rs
@@ -0,0 +1,12 @@
+//! Contains the datatypes to be shared across the data cache server and client.
+
+mod keyspace;
+pub use keyspace::*;
+mod objects;
+pub use objects::*;
+mod policy;
+pub use policy::*;
+mod state;
+pub use state::*;
+mod write_hints;
+pub use write_hints::*;
diff --git a/parquet_cache/src/data_types/keyspace.rs b/parquet_cache/src/data_types/keyspace.rs
new file mode 100644
index 00000000000..faecda8fed4
--- /dev/null
+++ b/parquet_cache/src/data_types/keyspace.rs
@@ -0,0 +1,164 @@
+use crate::data_types::State;
+use k8s_openapi::schemars::JsonSchema;
+use serde::{Deserialize, Serialize, Serializer};
+use std::time::{SystemTime, UNIX_EPOCH};
+
+use super::state::InstanceState;
+
+/// Response body for keyspace request.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct KeyspaceResponseBody {
+    /// Complete list of nodes for the hashring assignment of keyspace.
+    pub nodes: Vec<ServiceNode>,
+}
+
+/// Identifier used by a data cache node.
+///
+/// This identifier should remain consistent for any nodes being cycled (e.g. k8s),
+/// as it determines the location in the hashring.
+pub type ServiceNodeId = u64;
+
+/// Hostname of a data cache node.
+pub type ServiceNodeHostname = String;
+
+/// Data cache service node.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ServiceNode {
+    /// Id of data cache service node.
+    pub id: ServiceNodeId,
+    /// Hostname.
+    pub hostname: ServiceNodeHostname,
+}
+
+/// The set of instances that form a parquet cache group.
+#[derive(Debug, Clone, Default, PartialEq, Deserialize, Serialize, JsonSchema)]
+pub struct ParquetCacheInstanceSet {
+    /// The revision number of the cache instance set.
+    pub revision: i64,
+
+    /// The set of instances that form the cache set.
+    pub instances: Vec<ServiceNodeHostname>,
+}
+
+impl ParquetCacheInstanceSet {
+    /// Returns true if the given hostname is a member of this instance set.
+    pub fn contains(&self, node_hostname: &ServiceNodeHostname) -> bool {
+        self.instances.contains(node_hostname)
+    }
+}
+
+// TODO: make on-disc and in-mem representations match!
+/// Converts an on-disc representation of the keyspace from the controller
+/// into the keyspace representation consumed by the cache client & server.
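// For a concrete illustration of the conversion below (hostnames and revision are placeholder
// values): an on-disc instance set such as
//
//     ParquetCacheInstanceSet { revision: 3, instances: vec!["host-a".into(), "host-b".into()] }
//
// becomes
//
//     KeyspaceResponseBody {
//         nodes: vec![
//             ServiceNode { id: 0, hostname: "host-a".into() },
//             ServiceNode { id: 1, hostname: "host-b".into() },
//         ],
//     }
//
// i.e. a node's id is simply its index in the configured instance list, and the revision is not
// carried over into the keyspace response.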
+impl From<&ParquetCacheInstanceSet> for KeyspaceResponseBody { + fn from(value: &ParquetCacheInstanceSet) -> Self { + Self { + nodes: value + .clone() + .instances + .into_iter() + .enumerate() + .map(|(id, hostname)| ServiceNode { + id: id as u64, + hostname, + }) + .collect(), + } + } +} + +impl From<&KeyspaceVersion> for InstanceState { + fn from(value: &KeyspaceVersion) -> Self { + match (&value.current, &value.next) { + (Some(current), Some(next)) => { + match ( + current.contains(&value.self_node), + next.contains(&value.self_node), + ) { + (false, true) => Self::Warming, + (true, true) => Self::Running, + (true, false) => Self::Cooling, + (false, false) => Self::Cooling, + } + } + (None, Some(next)) if next.contains(&value.self_node) => Self::Warming, + (Some(_), None) => unreachable!("next should always be set, if curr exists"), + _ => Self::Pending, + } + } +} + +/// Tracker of Keyspace version changes. +/// +/// The response of `GET /state` is the serialized version of this struct. +#[derive(Clone, Debug)] +pub struct KeyspaceVersion { + /// Hostname of node, in order to identify self in [`ParquetCacheInstanceSet`]. + /// + /// Does not change. + self_node: ServiceNodeHostname, + /// current ParquetCacheInstanceSet + pub current: Option, + /// next ParquetCacheInstanceSet + pub next: Option, + /// time that the service was last updated + pub changed: SystemTime, +} + +impl Serialize for KeyspaceVersion { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let state = State { + state: InstanceState::from(self), + state_changed: self + .changed + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs() as i64, + current_node_set_revision: self.current.as_ref().map(|pcis| pcis.revision).unwrap_or(0), + next_node_set_revision: self.next.as_ref().map(|pcis| pcis.revision).unwrap_or(0), + }; + state.serialize(serializer) + } +} + +impl KeyspaceVersion { + /// Initialize the KeyspaceVersion with only the hostname (config option) known. + pub fn new(self_node: ServiceNodeHostname) -> Self { + Self { + self_node, + current: None, + next: None, + changed: SystemTime::now(), + } + } + + /// Get hostname. + pub fn hostname(&self) -> &ServiceNodeHostname { + &self.self_node + } + + /// Duplicate the `next` to `current`. + /// + /// This method is tightly coupled to the definition of InstanceState::from(KeyspaceVersion). + pub fn clone_next_to_curr(&self) -> Self { + Self { + self_node: self.self_node.clone(), + current: self.next.clone(), + next: self.next.clone(), + changed: SystemTime::now(), + } + } + + /// Set next. + pub fn set_next(&self, next: ParquetCacheInstanceSet) -> Self { + Self { + self_node: self.self_node.clone(), + current: self.next.clone(), // increment forward + next: Some(next), + changed: SystemTime::now(), + } + } +} diff --git a/parquet_cache/src/data_types/objects.rs b/parquet_cache/src/data_types/objects.rs new file mode 100644 index 00000000000..55555698313 --- /dev/null +++ b/parquet_cache/src/data_types/objects.rs @@ -0,0 +1,79 @@ +use hyper::{header::HeaderValue, HeaderMap}; +use serde::{Deserialize, Serialize}; + +use crate::client::cache_connector::Error as CacheServerError; + +pub static X_RANGE_START_HEADER: &str = "x-object-range-start"; +pub static X_RANGE_END_HEADER: &str = "x-object-range-end"; + +pub fn extract_usize_header( + header: &'static str, + values: &HeaderMap, +) -> Result { + let val = values + .get(header) + .ok_or(CacheServerError::ReadData(format!( + "missing header {}", + header + )))? 
+ .to_str() + .map_err(|_| CacheServerError::ReadData(format!("missing {} header", header)))?; + + val.parse::() + .map_err(|_| CacheServerError::ReadData(format!("invalid {} header", header))) +} + +/// Metadata for object. +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)] +pub struct GetObjectMetaResponse { + /// The full path to the object + pub location: String, + /// The last modified time + pub last_modified: chrono::DateTime, + /// The size in bytes of the object + pub size: usize, + /// The unique identifier for the object + pub e_tag: Option, + /// A version indicator for this object + pub version: Option, +} + +impl From for object_store::ObjectMeta { + fn from(value: GetObjectMetaResponse) -> Self { + let GetObjectMetaResponse { + location, + last_modified, + size, + e_tag, + version, + } = value; + + Self { + location: object_store::path::Path::parse(location).expect("should be valid path"), + last_modified, + size, + e_tag, + version, + } + } +} + +impl From for GetObjectMetaResponse { + fn from(value: object_store::ObjectMeta) -> Self { + let object_store::ObjectMeta { + location, + last_modified, + size, + e_tag, + version, + } = value; + + Self { + location: location.to_string(), + last_modified, + size, + e_tag, + version, + } + } +} diff --git a/parquet_cache/src/data_types/policy.rs b/parquet_cache/src/data_types/policy.rs new file mode 100644 index 00000000000..da13b6be98a --- /dev/null +++ b/parquet_cache/src/data_types/policy.rs @@ -0,0 +1,17 @@ +use serde::{Deserialize, Serialize}; + +/// TODO: clap_blocks +#[derive(Debug, Default, Clone, Copy)] +pub struct PolicyConfig { + pub max_capacity: u64, + pub event_recency_max_duration_nanoseconds: u64, +} + +#[derive(Debug, Serialize, Deserialize, Clone, Copy, Default)] +pub struct ObjectParams { + pub namespace_id: i64, + pub table_id: i64, + pub min_time: i64, + pub max_time: i64, + pub file_size_bytes: i64, +} diff --git a/parquet_cache/src/data_types/state.rs b/parquet_cache/src/data_types/state.rs new file mode 100644 index 00000000000..9afe54e8740 --- /dev/null +++ b/parquet_cache/src/data_types/state.rs @@ -0,0 +1,52 @@ +use schemars::JsonSchema; +use serde::{Deserialize, Serialize}; +use std::fmt::{Display, Formatter}; + +#[derive(Debug, Default, Clone, PartialEq, Deserialize, Copy, Serialize, JsonSchema)] +pub struct State { + /// The current state of the cache node. + pub state: InstanceState, + + /// Timestamp (seconds from unix epoch) that the state last changed. + pub state_changed: i64, + + /// The revision number of the current node set known to the cache node. + pub current_node_set_revision: i64, + + /// The revision number of the next node set known to the cache node. + pub next_node_set_revision: i64, +} + +#[derive(Debug, Deserialize, Serialize, PartialEq, Eq, Default, Copy, Clone, JsonSchema)] +pub enum InstanceState { + #[default] + /// Default state, prior to loading any configmap keyspace. + #[serde(rename = "pending")] + Pending, + /// Have configmap, are warming, and not receiving traffic. + /// + /// Can still respond to `GET /state` requests (from controller). + #[serde(rename = "warming")] + Warming, + /// Ready for traffic. + /// + /// Includes own host in `GET /keyspace` responses. + #[serde(rename = "running")] + Running, + /// Response to `GET /keyspace` requests are now directing traffic elsewhere. + /// + /// May still have ongoing requests. 
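// Given the serde renames on this enum and the field names on `State` above, a serialized
// state (the numeric values here are placeholders) looks roughly like:
//
//     {"state":"cooling","state_changed":1706201497,"current_node_set_revision":4,"next_node_set_revision":5}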
+ #[serde(rename = "cooling")] + Cooling, +} + +impl Display for InstanceState { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Pending => write!(f, "pending"), + Self::Warming => write!(f, "warming"), + Self::Running => write!(f, "running"), + Self::Cooling => write!(f, "cooling"), + } + } +} diff --git a/parquet_cache/src/data_types/write_hints.rs b/parquet_cache/src/data_types/write_hints.rs new file mode 100644 index 00000000000..fdff107a5fc --- /dev/null +++ b/parquet_cache/src/data_types/write_hints.rs @@ -0,0 +1,81 @@ +use data_types::{ParquetFile, ParquetFileParams}; +use serde::{Deserialize, Serialize}; + +use super::ObjectParams; + +/// Request payload provided on WriteHinting. +#[derive(Debug, Serialize, Deserialize)] +pub struct WriteHintRequestBody { + /// Object store [`Path`](object_store::path::Path) converted to cache key. + pub location: String, + /// The actual [`WriteHint`]. + pub hint: WriteHint, + /// Requested server contract to fulfill prior to ACK. + pub ack_setting: WriteHintAck, +} + +/// DataCache is a read-only, write-hinting service. +/// +/// Cache writes to store, then hints to pull into cache. +/// Return ok based upon a configurable level of cache server ack. +#[derive(Debug, Default, Copy, Clone, Serialize, Deserialize)] +pub enum WriteHintAck { + /// cache client sent write hint + Sent, + /// cache server received write hint + Received, + /// cache server completed downstream action + #[default] + Completed, +} + +impl std::fmt::Display for WriteHintAck { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:?}", self) + } +} + +/// Write hint metadata provided by the client. +pub type WriteHint = ObjectParams; + +impl From<&ParquetFileParams> for WriteHint { + fn from(value: &ParquetFileParams) -> Self { + let ParquetFileParams { + namespace_id, + table_id, + min_time, + max_time, + file_size_bytes, + .. + } = value; + + Self { + namespace_id: namespace_id.get(), + table_id: table_id.get(), + min_time: min_time.get(), + max_time: max_time.get(), + file_size_bytes: file_size_bytes.to_owned(), + } + } +} + +impl From<&ParquetFile> for WriteHint { + fn from(value: &ParquetFile) -> Self { + let ParquetFile { + namespace_id, + table_id, + min_time, + max_time, + file_size_bytes, + .. + } = value; + + Self { + namespace_id: namespace_id.get(), + table_id: table_id.get(), + min_time: min_time.get(), + max_time: max_time.get(), + file_size_bytes: file_size_bytes.to_owned(), + } + } +} diff --git a/parquet_cache/src/lib.rs b/parquet_cache/src/lib.rs new file mode 100644 index 00000000000..b4f4d5d1b77 --- /dev/null +++ b/parquet_cache/src/lib.rs @@ -0,0 +1,51 @@ +//! IOx parquet cache client. +//! +//! ParquetCache client interface to be used by IOx components to +//! get and put parquet files into the cache. + +#![deny(rustdoc::broken_intra_doc_links, rust_2018_idioms)] +#![warn( + missing_copy_implementations, + missing_docs, + clippy::explicit_iter_loop, + // See https://github.com/influxdata/influxdb_iox/pull/1671 + clippy::future_not_send, + clippy::use_self, + clippy::clone_on_ref_ptr, + clippy::todo, + clippy::dbg_macro, + unused_crate_dependencies +)] +#![allow(rustdoc::private_intra_doc_links, unreachable_pub)] + +// Workaround for "unused crate" lint false positives. 
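// A minimal usage sketch of the `make_client` constructor defined further down in this file
// (the in-memory store and service address are assumed example values):
//
//     use object_store::memory::InMemory;
//
//     let direct_store: Arc<dyn ObjectStore> = Arc::new(InMemory::new());
//     let cached_store = make_client("http://parquet-cache.iox.svc:8080".to_string(), direct_store);
//
// The returned handle wraps the provided direct store with the cache-aware
// `DataCacheObjectStore`, so reads can consult the cache service and fall back to the direct
// store on a miss.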
+use workspace_hack as _; + +mod client; +pub use client::{cache_connector::Error, write_hints::WriteHintingObjectStore}; + +pub mod controller; + +pub(crate) mod data_types; + +mod server; +#[cfg(test)] +pub use server::mock::MockCacheServer; +pub use server::{build_cache_server, ParquetCacheServer, ParquetCacheServerConfig, ServerError}; + +use object_store::ObjectStore; +use std::sync::Arc; + +use crate::client::{cache_connector::build_cache_connector, object_store::DataCacheObjectStore}; + +// TODO: change this to `Arc` +// and have consumers (e.g. ingester, compactor) issue write-hints. +// +/// Build a cache client. +pub fn make_client( + namespace_service_address: String, + object_store: Arc, +) -> Arc { + let server_connection = build_cache_connector(namespace_service_address); + Arc::new(DataCacheObjectStore::new(server_connection, object_store)) +} diff --git a/parquet_cache/src/server.rs b/parquet_cache/src/server.rs new file mode 100644 index 00000000000..0f5308b3f82 --- /dev/null +++ b/parquet_cache/src/server.rs @@ -0,0 +1,482 @@ +#![allow(dead_code)] +//! Contains the cache server. + +use std::sync::Arc; + +use iox_catalog::interface::Catalog; +use object_store::ObjectStore; +use tower::ServiceBuilder; + +use crate::data_types::PolicyConfig; + +use self::{ + cache::{BuildCacheService, CacheService}, + data::DataService, + keyspace::{BuildKeyspaceService, KeyspaceService}, + precondition::{BuildPreconditionService, PreconditionService}, +}; + +// Layers in the cache server: +mod cache; +mod data; +mod keyspace; +mod precondition; + +// Shared server types: +mod error; +pub use error::Error as ServerError; +mod response; + +#[cfg(test)] +pub(crate) mod mock; + +/// The cache server type. +pub type ParquetCacheServer = CacheService>>; + +/// Config for cache server. +#[derive(Debug)] +pub struct ParquetCacheServerConfig { + /// The path to the config file for the keyspace. + pub keyspace_config_path: String, + /// The hostname of the cache instance (k8s pod) running this process. + pub hostname: String, + /// The local directory to store data. + pub local_dir: String, + /// The policy config for the cache eviction. + pub policy_config: PolicyConfig, +} + +/// Build a cache server. 
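// Construction sketch (the paths, hostname, and in-scope `direct_store`/`catalog` values are
// assumptions for illustration):
//
//     let config = ParquetCacheServerConfig {
//         keyspace_config_path: "/etc/parquet-cache/keyspace.json".to_string(),
//         hostname: "parquet-cache-0".to_string(),
//         local_dir: "/var/lib/parquet-cache".to_string(),
//         policy_config: PolicyConfig::default(),
//     };
//     let server = build_cache_server(config, direct_store, catalog).await;
//
// The resulting tower stack is CacheService -> KeyspaceService -> PreconditionService ->
// DataService, matching the layer comments in the builder below.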
+pub async fn build_cache_server( + config: ParquetCacheServerConfig, + direct_store: Arc, + catalog: Arc, +) -> ParquetCacheServer { + let ParquetCacheServerConfig { + keyspace_config_path: configfile_path, + hostname: node_hostname, + local_dir, + policy_config, + } = config; + + ServiceBuilder::new() + // outermost layer 0 + .layer(BuildCacheService) + // layer 1 + .layer(BuildKeyspaceService { + configfile_path, + node_hostname, + }) + // layer 2 + .layer(BuildPreconditionService) + // innermost layer 3 + .service(DataService::new(direct_store, catalog, policy_config, Some(local_dir)).await) +} + +#[cfg(test)] +mod integration_tests { + use std::{ + fs::create_dir_all, + io::{Seek, Write}, + path::Path, + time::Duration, + }; + + use bytes::{Buf, BufMut, BytesMut}; + use http::{Method, StatusCode}; + use hyper::{Body, Request}; + use iox_tests::{TestCatalog, TestParquetFileBuilder}; + use object_store::{local::LocalFileSystem, ObjectMeta}; + use serde::Deserialize; + use serde_json::Deserializer; + use tempfile::{tempdir, NamedTempFile, TempDir}; + use tower::Service; + + use crate::data_types::{ + GetObjectMetaResponse, InstanceState, KeyspaceResponseBody, ParquetCacheInstanceSet, + ServiceNode, State, WriteHint, WriteHintRequestBody, + }; + use crate::server::response::Response as ServerInternalResponse; + + use super::*; + + fn create_fs_direct_store(local_dir: &Path) -> Arc { + create_dir_all(local_dir).unwrap(); + Arc::new(LocalFileSystem::new_with_prefix(local_dir).expect("should create fs ObjectStore")) + } + + #[tokio::test] + async fn test_invalid_path() { + let tmpdir = tempdir().unwrap(); + let direct_store = create_fs_direct_store(tmpdir.path()); + let catalog = iox_tests::TestCatalog::new(); + + let config = ParquetCacheServerConfig { + keyspace_config_path: "/tmp".to_string(), + hostname: "localhost".to_string(), + local_dir: tmpdir.path().to_str().unwrap().to_string(), + policy_config: PolicyConfig::default(), + }; + + let mut server = build_cache_server(config, direct_store, catalog.catalog()).await; + + let req = Request::get("http://foo.io/invalid-path/") + .body(Body::empty()) + .unwrap(); + let resp = server.call(req).await; + + // assert expected http response + assert_matches::assert_matches!( + resp, + Err(ServerError::BadRequest(msg)) if msg.contains("invalid path"), + "expected bad request, instead found {:?}", resp + ); + } + + const VALID_HOSTNAME: &str = "hostname-a"; + lazy_static::lazy_static! 
{ + static ref KEYSPACE_DEFINITION: ParquetCacheInstanceSet = ParquetCacheInstanceSet { + revision: 0, + // a single node in the keyspace, therefore all keys should hash to this keyspace + instances: vec![VALID_HOSTNAME].into_iter().map(String::from).collect(), + }; + } + + const LOCATION: &str = "0/0/partition_key/00000000-0000-0000-0000-000000000001.parquet"; + const DATA: &[u8] = b"all my pretty words"; + + async fn setup_service_and_direct_store( + direct_store: Arc, + cache_tmpdir: TempDir, + file: &mut NamedTempFile, + ) -> (ParquetCacheServer, Arc, ObjectMeta) { + let catalog = iox_tests::TestCatalog::new(); + + let policy_config = PolicyConfig { + max_capacity: 3_200_000_000, + event_recency_max_duration_nanoseconds: 1_000_000_000 * 5, // 5 seconds + }; + + writeln!(file, "{}", serde_json::json!(*KEYSPACE_DEFINITION)) + .expect("should write keyspace definition to configfile"); + + let obj_store_path = object_store::path::Path::from(LOCATION); + + let config = ParquetCacheServerConfig { + keyspace_config_path: file.path().to_str().unwrap().to_string(), + hostname: VALID_HOSTNAME.to_string(), + local_dir: cache_tmpdir.path().to_str().unwrap().to_string(), + policy_config, + }; + + let server = build_cache_server(config, Arc::clone(&direct_store), catalog.catalog()).await; + + // add object to direct store + direct_store + .put(&obj_store_path, DATA.into()) + .await + .expect("should write object to direct store"); + let expected_meta = direct_store + .head(&obj_store_path) + .await + .expect("should have object in direct store"); + + // wait until service is ready + let mut this = server.clone(); + futures::future::poll_fn(move |cx| this.poll_ready(cx)) + .await + .expect("should not have failed"); + + (server, catalog, expected_meta) + } + + async fn confirm_data_exists(expected_meta: ObjectMeta, server: &mut ParquetCacheServer) { + // issue read metadata + let req = Request::builder() + .method(Method::GET) + .uri(format!("http://foo.io/metadata?location={}", LOCATION)) + .body(Body::empty()) + .unwrap(); + let resp = server.call(req).await.expect("should get a response"); + + // assert expected http response for metadata + assert_eq!( + resp.status(), + StatusCode::OK, + "expected http 200, instead found {:?}", + resp + ); + let resp_body: GetObjectMetaResponse = serde_json::from_reader( + hyper::body::aggregate(resp.into_body()) + .await + .expect("should create reader") + .reader(), + ) + .expect("should read response body"); + let resp_meta: object_store::ObjectMeta = resp_body.into(); + assert_eq!( + resp_meta, expected_meta, + "expected proper metadata, instead found {:?}", + resp_meta + ); + + // issue read object + let req = Request::builder() + .method(Method::GET) + .uri(format!("http://foo.io/object?location={}", LOCATION)) + .body(Body::empty()) + .unwrap(); + let resp = server.call(req).await.expect("should get a response"); + + // assert expected http response for object + assert_eq!( + resp.status(), + StatusCode::OK, + "expected http 200, instead found {:?}", + resp + ); + let body = hyper::body::to_bytes(resp.into_body()) + .await + .expect("reading response body"); + assert_eq!( + body.len(), + DATA.to_vec().len(), + "expected data in body, instead found {}", + std::str::from_utf8(&body).unwrap() + ); + } + + #[tokio::test] + async fn test_write_hint_and_read() { + // keep in scope so they are not dropped + let dir_store_tmpdir = tempdir().unwrap(); + let cache_tmpdir = tempdir().unwrap(); + let mut configfile = NamedTempFile::new().unwrap(); + let direct_store = 
create_fs_direct_store(dir_store_tmpdir.path()); + + // setup server + let (mut server, _, expected_meta) = + setup_service_and_direct_store(direct_store, cache_tmpdir, &mut configfile).await; + + // issue write-hint + let mut buf = BytesMut::new().writer(); + serde_json::to_writer( + &mut buf, + &WriteHintRequestBody { + location: LOCATION.into(), + hint: WriteHint { + file_size_bytes: DATA.to_vec().len() as i64, + ..Default::default() + }, + ack_setting: crate::data_types::WriteHintAck::Completed, + }, + ) + .expect("should write request body"); + let req = Request::builder() + .method(Method::POST) + .uri(format!("http://foo.io/write-hint?location={}", LOCATION)) + .body(hyper::Body::from(buf.into_inner().freeze())) + .unwrap(); + let resp = server.call(req).await.expect("should get a response"); + + // assert expected http response for write-hint + let expected_resp = ServerInternalResponse::Written; + assert_eq!( + resp.status(), + expected_resp.code(), + "expected http response status code to match, instead found {:?}", + resp + ); + let body = hyper::body::to_bytes(resp.into_body()) + .await + .expect("reading response body"); + assert_eq!( + body.len(), + 0, + "expected empty body, instead found {}", + std::str::from_utf8(&body).unwrap() + ); + + confirm_data_exists(expected_meta, &mut server).await; + } + + #[tokio::test] + async fn test_cache_miss_writeback_and_read() { + // keep in scope so they are not dropped + let dir_store_tmpdir = tempdir().unwrap(); + let cache_tmpdir = tempdir().unwrap(); + let mut configfile = NamedTempFile::new().unwrap(); + let direct_store = create_fs_direct_store(dir_store_tmpdir.path()); + + // setup server + let (mut server, catalog, expected_meta) = + setup_service_and_direct_store(direct_store, cache_tmpdir, &mut configfile).await; + + // write-back requires catalog data, therefore insert into catalog + let namespace = catalog.create_namespace_1hr_retention("ns0").await; + let table = namespace.create_table("table0").await; + let partition = table.create_partition("partition_key").await; + + // insert parquet file into catalog, with proper matching object store id + let parquet_file_path = parquet_file::ParquetFilePath::try_from(&LOCATION.to_string()) + .expect("should be valid parquet file path"); + let parquet_file = TestParquetFileBuilder::default() + .with_creation_time(iox_time::Time::from_date_time(expected_meta.last_modified)) + .with_file_size_bytes(DATA.to_vec().len() as u64) + .with_object_store_id(parquet_file_path.object_store_id()); + partition + .create_parquet_file_catalog_record(parquet_file) + .await; + + // trigger cache miss + let req = Request::builder() + .method(Method::GET) + .uri(format!("http://foo.io/metadata?location={}", LOCATION)) + .body(Body::empty()) + .unwrap(); + let resp = server.call(req).await; + assert_matches::assert_matches!( + resp, + Err(ServerError::CacheMiss), + "expected cache miss, instead found {:?}", + resp + ); + + // wait for write-back to complete + tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; + + confirm_data_exists(expected_meta, &mut server).await; + } + + #[tokio::test] + async fn test_state_responses() { + // keep in scope so they are not dropped + let dir_store_tmpdir = tempdir().unwrap(); + let cache_tmpdir = tempdir().unwrap(); + let mut configfile = NamedTempFile::new().unwrap(); + let direct_store = create_fs_direct_store(dir_store_tmpdir.path()); + + // setup server + let (mut server, _, _meta) = + setup_service_and_direct_store(direct_store, cache_tmpdir, &mut 
configfile).await; + + // check keyspace status is running + let req = Request::builder() + .method(Method::GET) + .uri("http://foo.io/state") + .body(Body::empty()) + .unwrap(); + let resp = server.call(req).await.expect("should get a response"); + assert_eq!( + resp.status(), + StatusCode::OK, + "expected http 200, instead found {:?}", + resp + ); + let resp_body_json = hyper::body::to_bytes(resp.into_body()) + .await + .expect("should read response body"); + let mut de = Deserializer::from_slice(&resp_body_json); + let mut state = State::deserialize(&mut de).expect("valid State object"); + state.state_changed = 0; // ignore the timestamp + assert_eq!( + state, + State { + state: InstanceState::Running, + state_changed: 0, + current_node_set_revision: 0, + next_node_set_revision: 0, + }, + ); + + // tell keyspace to cool, by changing keyspace definition + let new_keyspace_definition = serde_json::json!(ParquetCacheInstanceSet { + revision: 1, + instances: vec!["another-node"].into_iter().map(String::from).collect(), + }) + .to_string(); + let mut file = std::fs::OpenOptions::new() + .write(true) + .truncate(true) + .open(configfile.path()) + .unwrap(); + file.seek(std::io::SeekFrom::Start(0)).unwrap(); // move pointer to start, to overwrite + writeln!(file, "{}", new_keyspace_definition.as_str()) + .expect("should write keyspace definition to configfile"); + file.sync_all().unwrap(); + + // waiting for new_keyspace_definition to load + // cannot use poll_ready, as it is already returning ready (to accept `GET /state` requests) + tokio::time::sleep(Duration::from_secs(10)).await; + + // check keyspace status is cooling + let req = Request::builder() + .method(Method::GET) + .uri("http://foo.io/state") + .body(Body::empty()) + .unwrap(); + let resp = server.call(req).await.expect("should get a response"); + assert_eq!( + resp.status(), + StatusCode::OK, + "expected http 200, instead found {:?}", + resp + ); + let resp_body_json = hyper::body::to_bytes(resp.into_body()) + .await + .expect("should read response body"); + let mut de = Deserializer::from_slice(&resp_body_json); + let mut state = State::deserialize(&mut de).expect("valid State object"); + state.state_changed = 0; // ignore the timestamp + assert_eq!( + state, + State { + state: InstanceState::Cooling, + state_changed: 0, + current_node_set_revision: 0, + next_node_set_revision: 1, + }, + ); + } + + #[tokio::test] + async fn test_keyspace_nodes() { + // keep in scope so they are not dropped + let dir_store_tmpdir = tempdir().unwrap(); + let cache_tmpdir = tempdir().unwrap(); + let mut configfile = NamedTempFile::new().unwrap(); + let direct_store = create_fs_direct_store(dir_store_tmpdir.path()); + + // setup server + let (mut server, _, _meta) = + setup_service_and_direct_store(direct_store, cache_tmpdir, &mut configfile).await; + + // get keyspace nodes + let req = Request::builder() + .method(Method::GET) + .uri("http://foo.io/keyspace") + .body(Body::empty()) + .unwrap(); + let resp = server.call(req).await.expect("should get a response"); + assert_eq!( + resp.status(), + StatusCode::OK, + "expected http 200, instead found {:?}", + resp + ); + let resp_body: KeyspaceResponseBody = serde_json::from_reader( + hyper::body::aggregate(resp.into_body()) + .await + .expect("should create reader") + .reader(), + ) + .expect("should read response body"); + assert_matches::assert_matches!( + resp_body, + KeyspaceResponseBody { nodes } if matches!( + &nodes[..], + [ServiceNode { id: 0, hostname }] if hostname == VALID_HOSTNAME + ) + ); 
+ } +} diff --git a/parquet_cache/src/server/cache.rs b/parquet_cache/src/server/cache.rs new file mode 100644 index 00000000000..6acb7c4e46c --- /dev/null +++ b/parquet_cache/src/server/cache.rs @@ -0,0 +1,113 @@ +use std::{pin::Pin, task::Poll}; + +use futures::{ready, Future}; +use http::{Method, Request, Response, StatusCode}; +use hyper::Body; +use tokio::sync::OnceCell; +use tower::{Layer, Service}; + +use super::response::PinnedFuture; + +pub type FinalResponseFuture = + Pin, super::error::Error>> + Send>>; + +/// Cache Service +#[derive(Debug, Clone)] +pub struct CacheService { + inner: S, + initialize_once: OnceCell<()>, +} + +impl CacheService +where + S: Service, Future = PinnedFuture> + Clone + Send + Sync + 'static, +{ + pub fn new(inner: S) -> Self { + Self { + inner, + initialize_once: Default::default(), + } + } + + pub async fn prewarm(&mut self) -> Result<(), super::error::Error> { + // TODO: + // 0. (already done): LruCacheManager::new() => should have cache policy. + // 1. (already done): Keyspace::poll_ready() => should have the keyspace. + // 2. TODO(optional): may have persisted state from previous LruCacheManager, to reduce catalog load + // 3. GET list of obj_keys from catalog. + // * Query limits based on cache policy. + // * Use slower prewarming, paginated catalog queries, prioritized cache insertion. + // 4. for key in list => self.call(<`/write-hint` request for key>) + // * inner KeyspaceService will filter by key hash + // * inner DataService will filter by cache eviction policy + // * inner WriteService will handle write-back + + // 5. message to inner that prewarming is done. + let req = Request::builder() + .method(Method::PATCH) + .uri("/warmed") + .body(Body::empty()) + .expect("should create prewarm PATCH /warmed req"); + self.inner + .call(req) + .await + .map_err(|e| super::error::Error::Warming(e.to_string()))?; + + Ok(()) + } +} + +impl Service> for CacheService +where + S: Service, Future = PinnedFuture> + Clone + Send + Sync + 'static, +{ + type Response = Response; + type Error = super::error::Error; + type Future = FinalResponseFuture; + + fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll> { + // wait for inner service to receive requests + let _ = ready!(self.inner.poll_ready(cx)); + + // initialize once (which issues a request to inner service) + let mut this = self.clone(); + Box::pin(async move { + self.initialize_once + .get_or_try_init(|| this.prewarm()) + .await + }) + .as_mut() + .poll(cx) + .map_ok(|_| ()) + } + + fn call(&mut self, req: Request) -> Self::Future { + let clone = self.inner.clone(); + let mut inner = std::mem::replace(&mut self.inner, clone); + Box::pin(async move { + match inner.call(req).await { + Ok(resp) => match Response::builder().status(resp.code()).body(resp.into()) { + Ok(resp) => Ok(resp), + Err(e) => Ok(Response::builder() + .status(StatusCode::INTERNAL_SERVER_ERROR) + .body(e.to_string().into()) + .expect("should build error response")), + }, + Err(e) => Err(e), + } + }) + } +} + +pub struct BuildCacheService; + +impl Layer for BuildCacheService +where + S: Service, Future = PinnedFuture> + Clone + Send + Sync + 'static, +{ + type Service = CacheService; + + fn layer(&self, service: S) -> Self::Service { + CacheService::new(service) + } +} diff --git a/parquet_cache/src/server/data.rs b/parquet_cache/src/server/data.rs new file mode 100644 index 00000000000..fd50aab2eaf --- /dev/null +++ b/parquet_cache/src/server/data.rs @@ -0,0 +1,810 @@ +mod manager; +mod reads; +mod store; +mod writes; + 
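// The eviction channel created in `DataService::new` below is currently unbounded (see the
// TODO there). A bounded variant, sketched here with an assumed capacity, would let
// `send().await` apply back pressure on cache inserts when the evictor falls behind:
//
//     let (evict_tx, evict_rx) = async_channel::bounded(1024);
//     let data_accessor_ = Arc::clone(&data_accessor);
//     let handle = tokio::spawn(async move {
//         while let Ok(key) = evict_rx.recv().await {
//             let _ = data_accessor_.delete_object(&key).await;
//         }
//     });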
+use std::{sync::Arc, task::Poll}; + +use backoff::{Backoff, BackoffConfig}; +use bytes::Buf; +use http::{Request, Uri}; +use hyper::{Body, Method}; +use iox_catalog::interface::Catalog; +use object_store::ObjectStore; +use observability_deps::tracing::{error, warn}; +use tokio::task::JoinHandle; +use tower::Service; + +use self::{ + manager::{CacheManager, CacheManagerValue}, + reads::ReadHandler, + store::LocalStore, + writes::WriteHandler, +}; +use super::{error::Error, response::Response}; +use crate::data_types::{PolicyConfig, WriteHint, WriteHintRequestBody}; + +#[derive(Debug, thiserror::Error)] +pub enum DataError { + #[error("Read error: {0}")] + Read(String), + #[error("Write-stream error: {0}")] + Stream(String), + #[error("Write-file error: {0}")] + File(String), + #[error("Bad Request: {0}")] + BadRequest(String), + #[error("Bad Request: object location does not exist in catalog or object store")] + DoesNotExist, +} + +/// Service that provides access to the data. +#[derive(Debug, Clone)] +pub struct DataService { + catalog: Arc, + cache_manager: Arc, + read_handler: ReadHandler, + write_hander: WriteHandler, + handle: Arc>, + backoff_config: BackoffConfig, +} + +impl DataService { + pub async fn new( + direct_store: Arc, + catalog: Arc, + config: PolicyConfig, + dir: Option, + ) -> Self { + let data_accessor = Arc::new(LocalStore::new(dir)); + + // TODO: use a bounded channel + // Apply back pressure if we can't keep up (a.k.a. the actual eviction from the local store). + let (evict_tx, evict_rx) = async_channel::unbounded(); + + // start background task to evict from local store + let data_accessor_ = Arc::clone(&data_accessor); + let handle = tokio::spawn(async move { + while let Ok(key) = evict_rx.recv().await { + let _ = data_accessor_.delete_object(&key).await; + } + }); + + Self { + catalog, + read_handler: ReadHandler::new(Arc::clone(&data_accessor)), + write_hander: WriteHandler::new(Arc::clone(&data_accessor), direct_store), + cache_manager: Arc::new(CacheManager::new(config, evict_tx)), + handle: Arc::new(handle), + backoff_config: Default::default(), + } + } + + async fn create_write_hint(&self, location: &String) -> Result { + let parquet_file_path = parquet_file::ParquetFilePath::try_from(location) + .map_err(|e| Error::BadRequest(e.to_string()))?; + + let maybe_parquet_file = Backoff::new(&self.backoff_config) + .retry_all_errors("lookup write-hint in catalog", || async { + self.catalog + .repositories() + .parquet_files() + .get_by_object_store_id(parquet_file_path.object_store_id()) + .await + }) + .await + .expect("retry forever"); + + match maybe_parquet_file { + None => Err(Error::DoesNotExist), + Some(parquet_file) => Ok(WriteHint::from(&parquet_file)), + } + } + + async fn write_back(&self, location: String, write_hint: WriteHint) -> Result<(), Error> { + // confirm valid location + parquet_file::ParquetFilePath::try_from(&location) + .map_err(|e| Error::BadRequest(e.to_string()))?; + + // write to local store + let metadata = self + .write_hander + .write_local(&location, &write_hint) + .await?; + + // update cache manager + self.cache_manager + .insert( + location, + CacheManagerValue { + params: write_hint, + metadata, + }, + ) + .await; + + Ok(()) + } +} + +impl Service> for DataService { + type Response = Response; + type Error = Error; + type Future = super::response::PinnedFuture; + + fn poll_ready(&mut self, _cx: &mut std::task::Context<'_>) -> Poll> { + Poll::Ready(Ok(())) + } + + fn call(&mut self, req: Request) -> Self::Future { + match 
(req.method(), req.uri().path()) { + (&Method::GET, "/state") + | (&Method::PATCH, "/warmed") + | (&Method::GET, "/keyspace") => { + unreachable!("`this request should have already been handled in the KeyspaceLayer`") + } + (&Method::GET, "/metadata") | (&Method::GET, "/object") => { + let this = self.clone(); + Box::pin(async move { + let obj_location = parse_object_location(req.uri())?; + match this.cache_manager.in_cache(&obj_location).await { + Ok(_) => match req.uri().path() { + "/metadata" => { + let meta = this.cache_manager.fetch_metadata(&obj_location).await?; + Ok(Response::Head(meta.into())) + } + "/object" => { + let stream = this.read_handler.read_local(&obj_location).await?; + Ok(Response::Data(stream)) + } + _ => unreachable!(), + }, + Err(Error::CacheMiss) => { + // trigger write-back on another thread + let this_ = this.clone(); + tokio::spawn(async move { + let write_hint = match this_.create_write_hint(&obj_location).await + { + Ok(write_hint) => write_hint, + Err(error) => { + warn!(%error, "write-back failed to create write-hint (likely missing from catalog)"); + return; + } + }; + + if let Err(error) = this_.write_back(obj_location, write_hint).await + { + error!(%error, "write-back failed to perform local-store write"); + } + }); + + // still return immediate response, such that client will use direct_store fallback + Err(Error::CacheMiss) + } + Err(e) => Err(e), + } + }) + } + (&Method::POST, "/write-hint") => { + let this = self.clone(); + Box::pin(async move { + let reader = hyper::body::aggregate(req.into_body()) + .await + .map_err(|e| Error::BadRequest(e.to_string()))? + .reader(); + let write_hint: WriteHintRequestBody = serde_json::from_reader(reader) + .map_err(|e| Error::BadRequest(e.to_string()))?; + + match this.cache_manager.in_cache(&write_hint.location).await { + Ok(_) => Ok(Response::Written), + Err(_) => { + this.write_back(write_hint.location, write_hint.hint) + .await?; + Ok(Response::Written) + } + } + }) + } + (any_method, any_path) => { + let msg = format!("invalid path: {} {}", any_method, any_path); + Box::pin(async { Err(Error::BadRequest(msg)) }) + } + } + } +} + +fn parse_object_location(uri: &Uri) -> Result { + let as_url = url::Url::parse(uri.to_string().as_str()) + .expect("should be already validated path & query"); + match as_url.query_pairs().find(|(k, _v)| k.eq("location")) { + None => Err(Error::BadRequest( + "missing required query parameter: location".into(), + )), + Some((_key, location)) => Ok(location.to_string()), + } +} + +#[cfg(test)] +mod tests { + use std::{collections::HashMap, fs::File, io::Write, ops::Range, path::PathBuf}; + + use assert_matches::assert_matches; + use bytes::{BufMut, Bytes, BytesMut}; + use chrono::{DateTime, Utc}; + use futures::{stream::BoxStream, TryStreamExt}; + use iox_tests::TestParquetFileBuilder; + use object_store::{ + path::Path, GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, ObjectMeta, + ObjectStore, PutOptions, PutResult, + }; + use tempfile::{tempdir, TempDir}; + use tokio::{fs::create_dir_all, io::AsyncWrite}; + + use crate::data_types::GetObjectMetaResponse; + + use super::*; + + const ONE_SECOND: u64 = 1_000_000_000; + + // refer to valid path in parquet_file::ParquetFilePath + const LOCATION_F: &str = "0/0/partition_key/00000000-0000-0000-0000-000000000000.parquet"; + const LOCATION_S: &str = "0/0/partition_key/00000000-0000-0000-0000-000000000001.parquet"; + const LOCATION_MISSING: &str = "0/0/partition_key/00000000-0000-0000-0000-000000000002.parquet"; // not in 
catalog, nor remote store + + const DATA: &[u8] = b"all my pretty words"; + + lazy_static::lazy_static! { + static ref LAST_MODIFIED: DateTime = Utc::now(); + } + + #[derive(Debug)] + struct MockData(Bytes, bool /* as_stream */); + + #[derive(Debug)] + struct MockDirectStore { + mocked: HashMap, + temp_dir: TempDir, + } + + impl MockDirectStore { + fn default() -> Self { + Self { + mocked: HashMap::new(), + temp_dir: tempdir().expect("should create temp dir"), + } + } + + fn put_mock(&mut self, location: String, data: MockData) { + self.mocked.insert(location, data); + } + } + + #[async_trait::async_trait] + impl ObjectStore for MockDirectStore { + async fn get_opts( + &self, + location: &Path, + _options: GetOptions, + ) -> object_store::Result { + let MockData(bytes, as_stream) = match self.mocked.get(&location.to_string()) { + Some(data) => data, + _ => { + return Err(object_store::Error::NotFound { + path: location.to_string(), + source: "not found in remote store".into(), + }) + } + }; + + let meta = ObjectMeta { + location: location.clone(), + last_modified: *LAST_MODIFIED, + size: DATA.to_vec().len(), + e_tag: Default::default(), + version: Default::default(), + }; + + let bytes = bytes.to_owned(); + let payload = + match as_stream { + true => GetResultPayload::Stream(Box::pin(futures::stream::once(async move { + Ok(bytes) + }))), + false => { + let path = self.temp_dir.path().join(location.to_string()); + create_dir_all(path.parent().unwrap()) + .await + .expect("should create nested path"); + let mut file = + File::create(path.as_path()).expect("should be able to open temp file"); + file.write_all(&bytes) + .expect("should be able to write to temp file"); + file.flush().expect("should be able to flush temp file"); + GetResultPayload::File(file, path) + } + }; + + Ok(GetResult { + payload, + meta, + range: Range { + start: 0, + end: DATA.to_vec().len(), + }, + }) + } + + async fn put_opts( + &self, + _location: &Path, + _bytes: Bytes, + _opts: PutOptions, + ) -> object_store::Result { + unimplemented!() + } + async fn put_multipart( + &self, + _location: &Path, + ) -> object_store::Result<(MultipartId, Box)> { + unimplemented!() + } + async fn abort_multipart( + &self, + _location: &Path, + _multipart_id: &MultipartId, + ) -> object_store::Result<()> { + unimplemented!() + } + async fn delete(&self, _location: &Path) -> object_store::Result<()> { + unimplemented!() + } + fn list(&self, _prefix: Option<&Path>) -> BoxStream<'_, object_store::Result> { + unimplemented!() + } + async fn list_with_delimiter( + &self, + _prefix: Option<&Path>, + ) -> object_store::Result { + unimplemented!() + } + async fn copy(&self, _from: &Path, _to: &Path) -> object_store::Result<()> { + unimplemented!() + } + async fn copy_if_not_exists(&self, _from: &Path, _to: &Path) -> object_store::Result<()> { + unimplemented!() + } + } + + impl std::fmt::Display for MockDirectStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "MockDirectStore") + } + } + + fn make_parquet_file(location: &str) -> TestParquetFileBuilder { + let parquet_file_path = parquet_file::ParquetFilePath::try_from(&location.to_string()) + .expect("should be valid parquet file path"); + + TestParquetFileBuilder::default() + .with_creation_time(iox_time::Time::from_date_time(*LAST_MODIFIED)) + .with_file_size_bytes(DATA.to_vec().len() as u64) + .with_object_store_id(parquet_file_path.object_store_id()) + } + + async fn make_service(temp_dir: PathBuf, policy_config: Option) -> DataService { + let mut 
direct_store = MockDirectStore::default(); + // data returned as file, for write-back + direct_store.put_mock( + LOCATION_F.to_string(), + MockData(Bytes::from(DATA.to_vec()), false), + ); + // data returned as stream, for write-back + direct_store.put_mock( + LOCATION_S.to_string(), + MockData(Bytes::from(DATA.to_vec()), true), + ); + + // create catalog + let test_catalog = iox_tests::TestCatalog::new(); + let namespace = test_catalog.create_namespace_1hr_retention("ns0").await; + let table = namespace.create_table("table0").await; + let partition = table.create_partition("partition_key").await; + + // add parquet files to catalog + partition + .create_parquet_file_catalog_record(make_parquet_file(LOCATION_F)) + .await; + partition + .create_parquet_file_catalog_record(make_parquet_file(LOCATION_S)) + .await; + + DataService::new( + Arc::new(direct_store), + test_catalog.catalog(), + policy_config.unwrap_or(PolicyConfig { + max_capacity: 3_200_000, + event_recency_max_duration_nanoseconds: ONE_SECOND * 60 * 2, + }), + Some(temp_dir.to_str().unwrap()), + ) + .await + } + + // note: uses file for write-back + #[tokio::test] + async fn test_metadata_writeback_on_cache_miss() { + // setup + let dir = tempdir().expect("should create temp dir"); + let mut service = make_service(PathBuf::from(dir.path()), None).await; + + // return cache miss + let req = Request::builder() + .method(Method::GET) + .uri(format!("http://foo.io/metadata?location={}", LOCATION_F)) + .body(Body::empty()) + .unwrap(); + let resp = service.call(req).await; + assert_matches!( + resp, + Err(Error::CacheMiss), + "should return cache miss, instead found {:?}", + resp + ); + + // wait for write-back to complete + tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; + + // return cache hit + let req = Request::builder() + .method(Method::GET) + .uri(format!("http://foo.io/metadata?location={}", LOCATION_F)) + .body(Body::empty()) + .unwrap(); + let resp = service.call(req).await; + let expected = GetObjectMetaResponse::from(ObjectMeta { + location: LOCATION_F.into(), + size: DATA.to_vec().len(), + last_modified: *LAST_MODIFIED, + e_tag: Default::default(), + version: Default::default(), + }); + assert_matches!( + resp, + Ok(Response::Head(meta)) if meta == expected, + "should return metadata for location, instead found {:?}", resp + ); + } + + // note: uses file for write-back + #[tokio::test] + async fn test_object_writeback_on_cache_miss() { + // setup + let dir = tempdir().expect("should create temp dir"); + let mut service = make_service(PathBuf::from(dir.path()), None).await; + + // return cache miss + let req = Request::builder() + .method(Method::GET) + .uri(format!("http://foo.io/object?location={}", LOCATION_F)) + .body(Body::empty()) + .unwrap(); + let resp = service.call(req).await; + assert_matches!( + resp, + Err(Error::CacheMiss), + "should return cache miss, instead found {:?}", + resp + ); + + // wait for write-back to complete + tokio::time::sleep(tokio::time::Duration::from_secs(1)).await; + + // return cache hit + let req = Request::builder() + .method(Method::GET) + .uri(format!("http://foo.io/object?location={}", LOCATION_F)) + .body(Body::empty()) + .unwrap(); + let resp = service.call(req).await; + match resp { + Ok(Response::Data(stream)) => { + let data = stream.try_collect::>().await.unwrap(); + assert_eq!( + data, + vec![DATA.to_vec()], + "should have returned matching bytes" + ); + } + _ => panic!("should return data for location, instead found {:?}", resp), + } + } + + // note: 
uses stream for write-back + #[tokio::test] + async fn test_write_hint() { + // setup + let dir = tempdir().expect("should create temp dir"); + let mut service = make_service(PathBuf::from(dir.path()), None).await; + + // issue write-hint + let mut buf = BytesMut::new().writer(); + serde_json::to_writer( + &mut buf, + &WriteHintRequestBody { + location: LOCATION_S.into(), + hint: WriteHint { + file_size_bytes: DATA.to_vec().len() as i64, + ..Default::default() + }, + ack_setting: crate::data_types::WriteHintAck::Completed, + }, + ) + .expect("should write request body"); + let req = Request::builder() + .method(Method::POST) + .uri("http://foo.io/write-hint") + .body(hyper::Body::from(buf.into_inner().freeze())) + .unwrap(); + let resp = service.call(req).await; + assert_matches!( + resp, + Ok(Response::Written), + "should return successful write-back, instead found {:?}", + resp + ); + + // return cache hit -- metadata + let req = Request::builder() + .method(Method::GET) + .uri(format!("http://foo.io/metadata?location={}", LOCATION_S)) + .body(Body::empty()) + .unwrap(); + let resp = service.call(req).await; + let expected = GetObjectMetaResponse::from(ObjectMeta { + location: LOCATION_S.into(), + size: DATA.to_vec().len(), + last_modified: *LAST_MODIFIED, + e_tag: Default::default(), + version: Default::default(), + }); + assert_matches!( + resp, + Ok(Response::Head(meta)) if meta == expected, + "should return metadata for location, instead found {:?}", resp + ); + + // return cache hit -- object + let req = Request::builder() + .method(Method::GET) + .uri(format!("http://foo.io/object?location={}", LOCATION_S)) + .body(Body::empty()) + .unwrap(); + let resp = service.call(req).await; + match resp { + Ok(Response::Data(stream)) => { + let data = stream.try_collect::>().await.unwrap(); + assert_eq!( + data, + vec![DATA.to_vec()], + "should have returned matching bytes" + ); + } + _ => panic!("should return data for location, instead found {:?}", resp), + } + } + + #[tokio::test] + async fn test_write_hint_fails_for_invalid_path() { + // setup + let dir = tempdir().expect("should create temp dir"); + let mut service = make_service(PathBuf::from(dir.path()), None).await; + + // issue write-hint + let mut buf = BytesMut::new().writer(); + serde_json::to_writer( + &mut buf, + &WriteHintRequestBody { + location: "not_a_valid_path.parquet".into(), + hint: WriteHint { + file_size_bytes: DATA.to_vec().len() as i64, + ..Default::default() + }, + ack_setting: crate::data_types::WriteHintAck::Completed, + }, + ) + .expect("should write request body"); + let req = Request::builder() + .method(Method::POST) + .uri("http://foo.io/write-hint") + .body(hyper::Body::from(buf.into_inner().freeze())) + .unwrap(); + let resp = service.call(req).await; + assert_matches!( + resp, + Err(Error::BadRequest(_)), + "should return failed write-back, instead found {:?}", + resp + ); + } + + #[tokio::test] + async fn test_write_hint_fails_for_incorrect_size() { + // setup + let dir = tempdir().expect("should create temp dir"); + let mut service = make_service(PathBuf::from(dir.path()), None).await; + + // issue write-hint + let mut buf = BytesMut::new().writer(); + serde_json::to_writer( + &mut buf, + &WriteHintRequestBody { + location: LOCATION_S.into(), + hint: WriteHint { + file_size_bytes: 12312, + ..Default::default() + }, + ack_setting: crate::data_types::WriteHintAck::Completed, + }, + ) + .expect("should write request body"); + let req = Request::builder() + .method(Method::POST) + 
.uri("http://foo.io/write-hint") + .body(hyper::Body::from(buf.into_inner().freeze())) + .unwrap(); + let resp = service.call(req).await; + assert_matches!( + resp, + Err(Error::Data(_)), + "should error for incorrect file size in write-hint, instead found {:?}", + resp + ); + } + + #[tokio::test] + async fn test_fails_for_nonexistent_object() { + // setup + let dir = tempdir().expect("should create temp dir"); + let mut service = make_service(PathBuf::from(dir.path()), None).await; + + // issue write-hint + // Fails when looking up in remote store. Does not check catalog first. + let mut buf = BytesMut::new().writer(); + serde_json::to_writer( + &mut buf, + &WriteHintRequestBody { + location: LOCATION_MISSING.into(), + hint: WriteHint { + file_size_bytes: DATA.to_vec().len() as i64, + ..Default::default() + }, + ack_setting: crate::data_types::WriteHintAck::Completed, + }, + ) + .expect("should write request body"); + let req = Request::builder() + .method(Method::POST) + .uri("http://foo.io/write-hint") + .body(hyper::Body::from(buf.into_inner().freeze())) + .unwrap(); + let resp = service.call(req).await; + assert_matches!( + resp, + Err(Error::Data(DataError::DoesNotExist)), + "should return failed write-back, instead found {:?}", + resp + ); + } + + #[tokio::test] + async fn test_eviction() { + // setup + let policy_config = PolicyConfig { + max_capacity: DATA.to_vec().len() as u64 + 1, + event_recency_max_duration_nanoseconds: ONE_SECOND * 60 * 2, + }; + let dir = tempdir().expect("should create temp dir"); + let mut service = make_service(PathBuf::from(dir.path()), Some(policy_config)).await; + + // issue write-hint + let mut buf = BytesMut::new().writer(); + serde_json::to_writer( + &mut buf, + &WriteHintRequestBody { + location: LOCATION_S.into(), + hint: WriteHint { + file_size_bytes: DATA.to_vec().len() as i64, + ..Default::default() + }, + ack_setting: crate::data_types::WriteHintAck::Completed, + }, + ) + .expect("should write request body"); + let req = Request::builder() + .method(Method::POST) + .uri("http://foo.io/write-hint") + .body(hyper::Body::from(buf.into_inner().freeze())) + .unwrap(); + let resp = service.call(req).await; + assert_matches!( + resp, + Ok(Response::Written), + "should return successful write-back, instead found {:?}", + resp + ); + + // return cache hit + let req = Request::builder() + .method(Method::GET) + .uri(format!("http://foo.io/metadata?location={}", LOCATION_S)) + .body(Body::empty()) + .unwrap(); + let resp = service.call(req).await; + assert_matches!( + resp, + Ok(Response::Head(_)), + "should return metadata for location, instead found {:?}", + resp + ); + service.cache_manager.flush_pending().await; + + // issue 2nd write-hint + let mut buf = BytesMut::new().writer(); + serde_json::to_writer( + &mut buf, + &WriteHintRequestBody { + location: LOCATION_F.into(), + hint: WriteHint { + file_size_bytes: DATA.to_vec().len() as i64, + ..Default::default() + }, + ack_setting: crate::data_types::WriteHintAck::Completed, + }, + ) + .expect("should write request body"); + let req = Request::builder() + .method(Method::POST) + .uri("http://foo.io/write-hint") + .body(hyper::Body::from(buf.into_inner().freeze())) + .unwrap(); + let resp = service.call(req).await; + assert_matches!( + resp, + Ok(Response::Written), + "should return successful write-back, instead found {:?}", + resp + ); + service.cache_manager.flush_pending().await; + + // eviction should have happened + // should return cache miss + let req = Request::builder() + .method(Method::GET) 
+ .uri(format!("http://foo.io/metadata?location={}", LOCATION_S)) + .body(Body::empty()) + .unwrap(); + let resp = service.call(req).await; + assert_matches!( + resp, + Err(Error::CacheMiss), + "should return cache miss, instead found {:?}", + resp + ); + + // other object should still be in cache + let req = Request::builder() + .method(Method::GET) + .uri(format!("http://foo.io/metadata?location={}", LOCATION_F)) + .body(Body::empty()) + .unwrap(); + let resp = service.call(req).await; + assert_matches!( + resp, + Ok(Response::Head(_)), + "should return metadata for location, instead found {:?}", + resp + ); + + dir.close().expect("should close temp dir"); + } +} diff --git a/parquet_cache/src/server/data/manager.rs b/parquet_cache/src/server/data/manager.rs new file mode 100644 index 00000000000..5b29e1dc9bb --- /dev/null +++ b/parquet_cache/src/server/data/manager.rs @@ -0,0 +1,836 @@ +use std::cmp::Ordering; +use std::collections::BinaryHeap; +use std::sync::atomic::{AtomicU64, Ordering as AtomicOrdering}; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use arc_swap::ArcSwap; +use async_channel::Sender; +use moka::future::{Cache, FutureExt}; +use moka::notification::ListenerFuture; +use moka::Expiry; +use object_store::ObjectMeta; +use observability_deps::tracing::error; +use parking_lot::Mutex; +use tokio::task::JoinSet; + +use crate::data_types::{ObjectParams, PolicyConfig}; +use crate::server::error::Error; + +type ExternalRequestKey = String; +type CacheManagerKey = Arc; + +#[derive(Debug, Clone)] +pub struct CacheManagerValue { + /// Required for eviction policy. + pub params: ObjectParams, + /// Returned on `GET /metadata` head requests. + pub metadata: ObjectMeta, +} + +type InMemoryCache = Cache; + +/// Manages the cache eviction policy. +/// +/// Cache manager built upon a fast, concurrent in-memory cache. +/// In-memory will be the keys, as well as minimum metadata for managing cache eviction. +#[derive(Debug)] +pub struct CacheManager { + /// High-concurrency in-memory cache, used for the eviction policy. + manager: Arc, + /// Current size of the cache. 
+    current_size: Arc<AtomicU64>,
+}
+
+impl CacheManager {
+    pub fn new(config: PolicyConfig, evict_tx: Sender<ExternalRequestKey>) -> Self {
+        let current_size = Arc::new(AtomicU64::new(0));
+
+        // listener => then evict from local store
+        let current_size_ = Arc::clone(&current_size);
+        let listener =
+            move |k: Arc<CacheManagerKey>, v: CacheManagerValue, _cause| -> ListenerFuture {
+                let evict_tx = evict_tx.clone();
+                let current_size = Arc::clone(&current_size_);
+                async move {
+                    // use async_channel to ensure evicted, before removing from current_size
+                    match evict_tx.send((**k).clone()).await {
+                        Ok(_) => {
+                            current_size
+                                .fetch_sub(v.params.file_size_bytes as u64, AtomicOrdering::SeqCst);
+                        }
+                        Err(e) => {
+                            error!("CacheManager eviction listener failed to send: {:?}", e);
+                        }
+                    }
+                }
+                .boxed()
+            };
+
+        // event-recency
+        let evicter = Arc::new(Evictor::new_with_placeholder_cache_ref());
+        let expiry = Arc::new(EventRecency::new(
+            Arc::clone(&current_size),
+            config.max_capacity,
+            Arc::clone(&evicter),
+        ));
+
+        // cache manager
+        let manager = Arc::new(
+            Cache::builder()
+                .max_capacity(config.max_capacity)
+                .weigher(Self::size_weigher) // triggers eviction
+                .expire_after(EntryExpiry::new(config, Arc::clone(&expiry))) // triggered on insert & read
+                .async_eviction_listener(listener)
+                .build(),
+        );
+
+        // set cache on evicter
+        evicter.set_cache(Arc::clone(&manager));
+
+        Self {
+            manager,
+            current_size,
+        }
+    }
+
+    /// Maps the max_capacity to the disk bytes.
+    fn size_weigher(_k: &CacheManagerKey, v: &CacheManagerValue) -> u32 {
+        v.params.file_size_bytes as u32
+    }
+
+    /// Inserts the key-value pair into the cache.
+    pub async fn insert(&self, k: ExternalRequestKey, v: CacheManagerValue) {
+        let size = v.params.file_size_bytes;
+        self.manager.entry(Arc::new(k)).or_insert(v).await;
+        self.current_size
+            .fetch_add(size as u64, AtomicOrdering::SeqCst);
+    }
+
+    /// Returns Ok if the key is in the cache.
+    pub async fn in_cache(&self, k: &ExternalRequestKey) -> Result<(), Error> {
+        self.manager
+            .get(k)
+            .await
+            .map(|_| ())
+            .ok_or(Error::CacheMiss)
+    }
+
+    /// Returns the metadata for the object.
+    pub async fn fetch_metadata(&self, k: &ExternalRequestKey) -> Result<ObjectMeta, Error> {
+        Ok(self.manager.get(k).await.ok_or(Error::CacheMiss)?.metadata)
+    }
+
+    /// Explicitly evict key from cache.
+    #[cfg(test)]
+    async fn invalidate(&self, k: ExternalRequestKey) {
+        self.manager.invalidate(&k).await;
+    }
+
+    /// Trigger moka to flush all pending tasks. Use for testing ONLY.
+    #[cfg(test)]
+    pub(crate) async fn flush_pending(&self) {
+        self.manager.run_pending_tasks().await;
+    }
+}
+
+#[derive(Clone)]
+pub struct EntryExpiry {
+    /// Outer bound on how long to hold.
+    max_recency_duration: Duration,
+    /// Handles event recency.
+    event_recency: Arc<EventRecency>,
+}
+
+impl EntryExpiry {
+    fn new(config: PolicyConfig, evicter: Arc<EventRecency>) -> Self {
+        Self {
+            max_recency_duration: Duration::from_nanos(
+                config.event_recency_max_duration_nanoseconds,
+            ),
+            event_recency: evicter,
+        }
+    }
+}
+
+/// Moka helps achieve high concurrency with buffered inserts.
+///
+/// When pending tasks are applied, if more space is needed then existing keys are flushed
+/// based upon expiration.
+impl Expiry<CacheManagerKey, CacheManagerValue> for EntryExpiry {
+    /// Sets the expiry duration for every insertion.
+    /// If incoming should not be inserted, then set expiry to 0.
+    fn expire_after_create(
+        &self,
+        k: &CacheManagerKey,
+        v: &CacheManagerValue,
+        _inserted_at: Instant,
+    ) -> Option<Duration> {
+        if !self.event_recency.should_insert(k, v) {
+            return Some(Duration::from_secs(0));
+        }
+
+        if let Some(now) = chrono::Utc::now().timestamp_nanos_opt() {
+            let event_timestamp_nanos = v.params.max_time;
+
+            let age_out_nanoseconds =
+                event_timestamp_nanos.saturating_add(self.max_recency_duration.as_nanos() as i64);
+            let duration_until_event_ages_out = age_out_nanoseconds.saturating_sub(now);
+
+            Some(Duration::from_nanos(duration_until_event_ages_out as u64))
+        } else {
+            None
+        }
+    }
+}
+
+/// Tracks the event time recency, and evicts based upon the event time.
+struct EventRecency {
+    /// Current size of the cache.
+    ///
+    /// Used to determine when to evict.
+    /// Does not rely upon the moka-buffered inserts (unlike [`Cache`].weighted_size()).
+    current_size: Arc<AtomicU64>,
+    /// Upper bound on cache size.
+    max_capacity: u64,
+
+    /// Min-heap, used to track event time recency.
+    min_heap: Arc<Mutex<BinaryHeap<Slot>>>,
+    /// Tracks the current min, which will be updated with store() to minimize lock contention.
+    current_min: Arc<AtomicU64>,
+    /// Handles updates to min-heap on separate threads, to avoid locking on the hot path.
+    background_tasks: JoinSet<()>,
+    insert_tx: tokio::sync::mpsc::UnboundedSender<Slot>,
+    remove_tx: tokio::sync::mpsc::UnboundedSender<()>,
+}
+
+impl EventRecency {
+    /// Creates a new [`EventRecency`].
+    fn new(current_size: Arc<AtomicU64>, max_capacity: u64, evictor: Arc<Evictor>) -> Self {
+        let min_heap: Arc<Mutex<BinaryHeap<Slot>>> = Default::default();
+        let current_min: Arc<AtomicU64> = Default::default();
+
+        // TODO: replace with bounded channels.
+        let (insert_tx, mut insert_rx) = tokio::sync::mpsc::unbounded_channel();
+        let (remove_tx, mut remove_rx) = tokio::sync::mpsc::unbounded_channel();
+
+        // insert into min-heap, off the hot path
+        let mut background_tasks = JoinSet::new();
+        let min_heap_ = Arc::clone(&min_heap);
+        background_tasks.spawn(async move {
+            loop {
+                if let Some(slot) = insert_rx.recv().await {
+                    let mut guard = min_heap_.lock();
+                    guard.push(slot);
+                    drop(guard);
+                }
+            }
+        });
+
+        // remove from min-heap, off the hot path
+        let min_heap_ = Arc::clone(&min_heap);
+        let current_min_ = Arc::clone(&current_min);
+        background_tasks.spawn(async move {
+            loop {
+                if remove_rx.recv().await.is_some() {
+                    let mut guard = min_heap_.lock();
+                    let to_evict = guard.pop().expect("should have entry via peek");
+                    let new_min = guard.peek().map(|slot| slot.max_time);
+                    drop(guard);
+
+                    if let Some(new_min) = new_min {
+                        current_min_.store(new_min as u64, AtomicOrdering::Release);
+                    }
+
+                    evictor.evict_from_cache(to_evict.key);
+                }
+            }
+        });
+
+        Self {
+            current_size,
+            max_capacity,
+            min_heap,
+            current_min,
+            background_tasks,
+            insert_tx,
+            remove_tx,
+        }
+    }
+
+    /// Returns true if the incoming entry should be inserted.
+    fn should_insert(&self, incoming_k: &CacheManagerKey, incoming_v: &CacheManagerValue) -> bool {
+        let incoming_size = incoming_v.params.file_size_bytes as u64;
+        let should_insert =
+            if self.current_size.load(AtomicOrdering::SeqCst) + incoming_size > self.max_capacity {
+                self.max_capacity_should_insert(incoming_v)
+            } else {
+                true
+            };
+
+        if should_insert {
+            self.insert_tx
+                .send(Slot {
+                    max_time: incoming_v.params.max_time,
+                    key: Arc::clone(incoming_k),
+                })
+                .expect("should send min heap insert");
+        }
+        should_insert
+    }
+
+    /// Returns true if incoming entry should be inserted.
+ /// + /// Handles the case where the cache is at max_capacity, + /// by either evicting based upon event time recency, + /// or rejecting the incoming entry. + fn max_capacity_should_insert(&self, incoming_v: &CacheManagerValue) -> bool { + match self + .current_min + .load(AtomicOrdering::Relaxed) + .partial_cmp(&(incoming_v.params.max_time as u64)) + { + Some(Ordering::Less) | Some(Ordering::Equal) => { + // incoming event time is more recent than current min + // therefore, evict current min + let _ = self.remove_tx.send(()); + true + } + Some(Ordering::Greater) => false, // incoming event time is older than current min + None => true, // no entries in min-heap + } + } +} + +/// Slot in the min-heap, used to evict based upon event timestamp recency. +/// +/// [`BinaryHeap`] is a max-heap, therefore the Ord implementation is reversed. +#[derive(Debug, Eq, PartialEq)] +struct Slot { + max_time: i64, + key: CacheManagerKey, +} + +#[allow(clippy::non_canonical_partial_ord_impl)] +impl PartialOrd for Slot { + fn partial_cmp(&self, other: &Self) -> Option { + other.max_time.partial_cmp(&self.max_time) + } +} + +impl Ord for Slot { + fn cmp(&self, other: &Self) -> Ordering { + self.partial_cmp(other).unwrap() + } +} + +/// Does the eviction. +#[derive(Debug)] +struct Evictor { + /// Ref to cache, in order to evict. + cache_manager: ArcSwap, +} + +impl Evictor { + /// Creates a new [`Evictor`], with a placeholder cache ref. + fn new_with_placeholder_cache_ref() -> Self { + Self { + cache_manager: ArcSwap::new(Arc::new(Cache::new(0))), + } + } + + /// Sets the cache ref. + fn set_cache(&self, cache: Arc) { + self.cache_manager.store(cache); + } + + /// Evicts the key from the cache. + /// + /// Must be a non-blocking downstream action from [`EntryExpiry`]. + /// + /// [`Cache`].invalidate() provides immediate invalidation of the entry, + /// outside of any pending moka insert tasks. + /// + /// When pending moka insert tasks are applied, if max_capacity is reached + /// then existing keys are flushed based upon expiration. + /// As we are spawning a non-blocking thread, we are not guaranteed + /// to have this eviction occur prior to the flushing of the task queue. + /// + /// Worst case scenario is that an incoming key is rejected (not accepted into cache) due to space. 
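The reversed comparison on `Slot` defined above is what makes the standard library's max-heap behave as a min-heap over `max_time`. A minimal, self-contained sketch of the same trick, using a hypothetical `DemoSlot` in place of the crate's `Slot`:

use std::cmp::Ordering;
use std::collections::BinaryHeap;

// Stand-in for Slot: only the event timestamp matters for ordering.
#[derive(Debug, Eq, PartialEq)]
struct DemoSlot {
    max_time: i64,
    key: String,
}

impl Ord for DemoSlot {
    // Reverse the comparison so the std max-heap yields the *smallest* max_time first.
    fn cmp(&self, other: &Self) -> Ordering {
        other.max_time.cmp(&self.max_time)
    }
}

impl PartialOrd for DemoSlot {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

fn main() {
    let mut heap = BinaryHeap::new();
    heap.push(DemoSlot { max_time: 30, key: "c".into() });
    heap.push(DemoSlot { max_time: 10, key: "a".into() });
    heap.push(DemoSlot { max_time: 20, key: "b".into() });

    // Oldest event time pops first, i.e. the next eviction candidate.
    assert_eq!(heap.pop().unwrap().key, "a");
    assert_eq!(heap.pop().unwrap().key, "b");
}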
+ fn evict_from_cache(&self, key: CacheManagerKey) { + let guard = self.cache_manager.load(); + let cache = guard.as_ref().clone(); + tokio::spawn(async move { + cache.invalidate(&key).await; + }); + } +} + +#[cfg(test)] +mod tests { + use assert_matches::assert_matches; + use async_channel::unbounded; + + use crate::data_types::PolicyConfig; + + use super::*; + + fn now_nanos() -> i64 { + chrono::Utc::now().timestamp_nanos_opt().unwrap() + } + + fn cache_manager_value(size: usize, max_time: Option) -> CacheManagerValue { + let max_time = max_time.unwrap_or(now_nanos()); + + CacheManagerValue { + params: ObjectParams { + file_size_bytes: size as i64, + max_time, + min_time: max_time - 1_000_000_000, + ..Default::default() + }, + metadata: ObjectMeta { + last_modified: chrono::Utc::now(), + location: object_store::path::Path::from("not_used"), + size, + e_tag: None, + version: None, + }, + } + } + + fn policy_config(max_capacity: u64) -> PolicyConfig { + PolicyConfig { + max_capacity, + event_recency_max_duration_nanoseconds: 1_000_000_000 * 60 * 60, + } + } + + #[tokio::test] + async fn test_eviction_listener() { + let (evict_tx, evict_rx) = unbounded(); + + // build cache manager + let max_capacity: usize = 3_200_000; + let cache_manager = Arc::new(CacheManager::new( + policy_config(max_capacity as u64), + evict_tx, + )); + + // insert entry + let value = cache_manager_value( + 1_000_000, None, // all will have same event timestamp + ); + let to_evict = "k_a".to_string(); + cache_manager.insert(to_evict.clone(), value.clone()).await; + + // check current_size + assert_eq!( + cache_manager.current_size.load(AtomicOrdering::SeqCst), + 1_000_000 + ); + + // explicitly evict + cache_manager.invalidate(to_evict.clone()).await; + + // eviction listener should receive notification + assert_matches!( + evict_rx.recv().await, + Ok(_), + "should have received eviction notice", + ); + + assert_eq!( + cache_manager.current_size.load(AtomicOrdering::SeqCst), + 0, + "should have zero current_size after eviction" + ); + } + + #[tokio::test] + async fn test_evicts_at_max_capacity() { + let (evict_tx, evict_rx) = unbounded(); + + // build cache manager + let max_capacity: usize = 3_200_000; + let cache_manager = Arc::new(CacheManager::new( + policy_config(max_capacity as u64), + evict_tx, + )); + + // insert 2 entries + let value = cache_manager_value( + max_capacity / 2, + None, // all will have same event timestamp + ); + let oldest = "k_a".to_string(); + cache_manager.insert(oldest.clone(), value.clone()).await; + cache_manager.insert("k_b".into(), value.clone()).await; + + // To Discuss: this flush is needed, in order to apply ordering in k_a+k_b, as before k_c. 
+ // otherwise, the k_c is evicted instead + cache_manager.manager.run_pending_tasks().await; + + // insert 1 more entry, which should force an eviction (over capacity) + cache_manager.insert("k_c".into(), value).await; + cache_manager.manager.run_pending_tasks().await; + + // should evict oldest inserted entry + let res = evict_rx.recv().await; + assert_matches!( + res, + Ok(v) if *v == oldest, + "should have evicted oldest inserted key, instead found {:?}", res + ); + + // should still have other 2 entries + assert!( + cache_manager.in_cache(&"k_b".to_string()).await.is_ok(), + "should still have k_b" + ); + assert!( + cache_manager.in_cache(&"k_c".to_string()).await.is_ok(), + "should still have k_c" + ); + } + + #[tokio::test] + async fn test_lfu_eviction() { + let (evict_tx, evict_rx) = unbounded(); + + // build cache manager + let max_capacity: usize = 3_200_000; + let cache_manager = Arc::new(CacheManager::new( + policy_config(max_capacity as u64), + evict_tx, + )); + + // insert 2 entries + let value = cache_manager_value( + max_capacity / 2, + None, // all will have same event timestamp + ); + let read = "k_a".to_string(); + cache_manager.insert(read.clone(), value.clone()).await; + let not_read = "k_b".to_string(); + cache_manager.insert(not_read.clone(), value.clone()).await; + + // read one entry, many times, to pass the probability threshold + // To Discuss: is this sufficient for LFU? + // * the write-back will be triggered on a single cache miss + // * the LFU eviction would be using moka's probabilistic algorithm + for _ in 0..63 { + assert!( + cache_manager.in_cache(&read).await.is_ok(), + "should have read key" + ); + } + + // insert 1 more entry + cache_manager.insert("k_c".into(), value).await; + cache_manager.manager.run_pending_tasks().await; + + // should evict unread entry + let res = evict_rx.recv().await; + assert_matches!( + res, + Ok(v) if *v == not_read, + "should have evicted unread key, instead found {:?}", res + ); + + // should have other 2 entries + assert!( + cache_manager.in_cache(&read).await.is_ok(), + "should still have the read key" + ); + assert!( + cache_manager.in_cache(&"k_c".to_string()).await.is_ok(), + "should have newly inserted k_c" + ); + } + + #[tokio::test] + async fn test_event_time_recency_eviction() { + let (evict_tx, evict_rx) = unbounded(); + + // build cache manager + let max_capacity: usize = 3_200_000; + let cache_manager = Arc::new(CacheManager::new( + policy_config(max_capacity as u64), + evict_tx, + )); + + // insert 2 entries, where the older entry has a more recent event time + let older_event_time = cache_manager_value( + max_capacity / 2, + Some(now_nanos() - 5_000_000_000), // 5 seconds ago + ); + let newer_event_time = cache_manager_value( + max_capacity / 2, + Some(now_nanos() - 1_000_000_000), // 1 second ago + ); + let should_keep = "younger_event_time_but_older_insert".to_string(); + cache_manager + .insert(should_keep.clone(), newer_event_time.clone()) + .await; + + let should_evict = "older_event_time_but_younger_insert".to_string(); + cache_manager + .insert(should_evict.clone(), older_event_time) + .await; + + // insert 1 more entry, with same event time as should_keep + cache_manager + .insert("k_c".into(), newer_event_time.clone()) + .await; + + // To Discuss: this is waiting for event time recency eviction to occur + // before the moka task queue is flushed. + // This is the race condition explained in doc comments for + // Evicter::evict_from_cache(). 
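The fixed sleep below papers over the race described in this comment. One possible alternative, sketched here under the assumption that the test keeps the `evict_rx` receiver it handed to `CacheManager::new`, is to await the eviction notification with a timeout instead of sleeping:

use std::time::Duration;
use async_channel::Receiver;

// Hypothetical test helper: block until an eviction notice arrives, or give up.
async fn wait_for_eviction(evict_rx: &Receiver<String>) -> Option<String> {
    tokio::time::timeout(Duration::from_secs(5), evict_rx.recv())
        .await
        .ok()                     // None if the timeout elapsed
        .and_then(|res| res.ok()) // None if the channel was closed
}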
+ tokio::time::sleep(Duration::from_micros(1)).await; + cache_manager.manager.run_pending_tasks().await; + + // should evict based on event time, not insertion order + let res = evict_rx.recv().await; + assert_matches!( + res, + Ok(v) if *v == should_evict, + "should have evicted older_event_time, instead found {:?}", res + ); + + // LFU as a tie-breaker with same event time + assert!( + cache_manager.in_cache(&should_keep).await.is_ok(), + "should have read key" + ); + cache_manager.insert("k_d".into(), newer_event_time).await; // now have 3 with newer_event_time + cache_manager.manager.run_pending_tasks().await; + let res = evict_rx.recv().await; + assert_matches!( + res, + Ok(v) if v == "k_c", + "should have evicted least recently queried key, instead found {:?}", res + ); + } + + #[tokio::test] + async fn test_event_time_trumps_lfu() { + let (evict_tx, evict_rx) = unbounded(); + + // build cache manager + let max_capacity: usize = 3_200_000; + let cache_manager = Arc::new(CacheManager::new( + policy_config(max_capacity as u64), + evict_tx, + )); + + // insert 2 entries, where the older entry has a more recent event time + let older_event_time = cache_manager_value( + max_capacity / 2, + Some(now_nanos() - 5_000_000_000), // 5 seconds ago + ); + let newer_event_time = cache_manager_value( + max_capacity / 2, + Some(now_nanos() - 1_000_000_000), // 1 second ago + ); + let should_keep = "younger_event_time_but_older_insert".to_string(); + cache_manager + .insert(should_keep.clone(), newer_event_time.clone()) + .await; + let should_evict = "older_event_time_but_younger_insert".to_string(); + cache_manager + .insert(should_evict.clone(), older_event_time) + .await; + + // query the older timestamp, many times, to pass the probability threshold + for _ in 0..63 { + assert!( + cache_manager.in_cache(&should_evict).await.is_ok(), + "should have read key" + ); + } + + // insert 1 more entry, with same event time as should_keep + cache_manager + .insert("k_c".into(), newer_event_time.clone()) + .await; + + // To Discuss: this is waiting for event time recency eviction to occur + // before the moka task queue is flushed. + // This is the race condition explained in doc comments for + // Evicter::evict_from_cache(). 
+ tokio::time::sleep(Duration::from_micros(1)).await; + cache_manager.manager.run_pending_tasks().await; + + // should evict based on event time, not LFU + let res = evict_rx.recv().await; + assert_matches!( + res, + Ok(v) if *v == should_evict, + "should have evicted older_event_time, instead found {:?}", res + ); + } + + #[tokio::test] + async fn test_event_time_recency_age_out() { + let (evict_tx, _) = unbounded(); + + // build cache manager, with 2 second ageout + let max_capacity: usize = 3_200_000; + let cache_manager = Arc::new(CacheManager::new( + PolicyConfig { + max_capacity: max_capacity as u64, + event_recency_max_duration_nanoseconds: 1_000_000_000 * 2, + }, + evict_tx, + )); + + // insert + let value = cache_manager_value( + max_capacity / 2, + None, // will have current event timestamp + ); + let now = "now_event_time".to_string(); + cache_manager.insert(now.clone(), value.clone()).await; + assert!( + cache_manager.in_cache(&now).await.is_ok(), + "should have now" + ); + + // age out + tokio::time::sleep(Duration::from_secs(3)).await; + cache_manager.manager.run_pending_tasks().await; + assert!( + cache_manager.in_cache(&now).await.is_err(), + "should no longer have now" + ); + } + + #[tokio::test] + async fn test_event_time_recency_age_out_with_future_time() { + let (evict_tx, _) = unbounded(); + + // build cache manager, with 2 second ageout + let max_capacity: usize = 3_200_000; + let cache_manager = Arc::new(CacheManager::new( + PolicyConfig { + max_capacity: max_capacity as u64, + event_recency_max_duration_nanoseconds: 1_000_000_000 * 2, + }, + evict_tx, + )); + + // insert + let value = cache_manager_value( + max_capacity / 2, + Some(now_nanos() + 2_000_000_000), // 2 seconds into future + ); + let future_event = "future_event_time".to_string(); + cache_manager + .insert(future_event.clone(), value.clone()) + .await; + assert!( + cache_manager.in_cache(&future_event).await.is_ok(), + "should have future_event" + ); + + // age out + tokio::time::sleep(Duration::from_secs(3 + 2)).await; + cache_manager.manager.run_pending_tasks().await; + assert!( + cache_manager.in_cache(&future_event).await.is_err(), + "should no longer have future_event" + ); + } + + #[tokio::test] + async fn test_fetch_metadata() { + let (evict_tx, evict_rx) = unbounded(); + + // build cache manager + let max_capacity: usize = 3_200_000; + let cache_manager = Arc::new(CacheManager::new( + policy_config(max_capacity as u64), + evict_tx, + )); + + // insert 2 entries + let value = cache_manager_value( + max_capacity / 2, + None, // all will have same event timestamp + ); + let read = "k_a".to_string(); + cache_manager.insert(read.clone(), value.clone()).await; + let not_read = "k_b".to_string(); + cache_manager.insert(not_read.clone(), value.clone()).await; + + // assert can find metadata + let expected_metadata = value.clone().metadata; + assert_matches!( + cache_manager.fetch_metadata(&read).await, + Ok(metadata) if metadata == expected_metadata, + "should have found metadata" + ); + + // assert metadata access applies to LFU eviction policy + for _ in 0..63 { + assert!( + cache_manager.fetch_metadata(&read).await.is_ok(), + "should have read key" + ); + } + cache_manager.manager.run_pending_tasks().await; + + // insert 1 more entry + cache_manager.insert("k_c".into(), value).await; + cache_manager.manager.run_pending_tasks().await; + + // should evict unread entry + let res = evict_rx.recv().await; + assert_matches!( + res, + Ok(v) if *v == not_read, + "should have evicted unread key, instead found 
{:?}", res + ); + + // should have other 2 entries + assert!( + cache_manager.in_cache(&read).await.is_ok(), + "should still have the read key" + ); + assert!( + cache_manager.in_cache(&"k_c".to_string()).await.is_ok(), + "should have newly inserted k_c" + ); + } + + #[tokio::test] + async fn test_cache_misses() { + let (evict_tx, _) = unbounded(); + + // build cache manager + let max_capacity: usize = 3_200_000; + let cache_manager = Arc::new(CacheManager::new( + policy_config(max_capacity as u64), + evict_tx, + )); + + // cache misses + assert_matches!( + cache_manager + .fetch_metadata(&"not_in_cache".to_string()) + .await, + Err(Error::CacheMiss), + "should have returned cache miss for metadata", + ); + assert_matches!( + cache_manager.in_cache(&"not_in_cache".to_string()).await, + Err(Error::CacheMiss), + "should have returned cache miss for object", + ); + // when cache miss: + // 1. return error + // 2. upper layer (DataService) will handle any write back + } +} diff --git a/parquet_cache/src/server/data/reads.rs b/parquet_cache/src/server/data/reads.rs new file mode 100644 index 00000000000..07c09afcae5 --- /dev/null +++ b/parquet_cache/src/server/data/reads.rs @@ -0,0 +1,23 @@ +use std::sync::Arc; + +use super::store::{LocalStore, StreamedObject}; +use super::DataError; + +/// Service that handles the READ requests (`GET /object`). +#[derive(Debug, Clone)] +pub struct ReadHandler { + cache: Arc, +} + +impl ReadHandler { + pub fn new(cache: Arc) -> Self { + Self { cache } + } + + pub async fn read_local(&self, location: &String) -> Result { + self.cache + .read_object(location) + .await + .map_err(|e| DataError::Read(e.to_string())) + } +} diff --git a/parquet_cache/src/server/data/store.rs b/parquet_cache/src/server/data/store.rs new file mode 100644 index 00000000000..e4ef7956f0b --- /dev/null +++ b/parquet_cache/src/server/data/store.rs @@ -0,0 +1,510 @@ +use std::{ + path::{Path, PathBuf}, + pin::Pin, + task::{Context, Poll}, +}; + +use bytes::{Bytes, BytesMut}; +use futures::{ + stream::{BoxStream, StreamExt}, + FutureExt, TryStreamExt, +}; +use pin_project::pin_project; +use tokio::fs::{create_dir_all, remove_dir, remove_file, File}; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt, Error, ReadBuf}; +use tokio_util::codec::{BytesCodec, FramedRead}; + +/// object_store expected stream IO type +pub type StreamedObject = BoxStream<'static, object_store::Result>; + +/// identifier for `object_store::Error::Generic` +const DATA_CACHE: &str = "local store accessor"; + +/// Access to stored data. 
+#[derive(Debug)] +pub struct LocalStore { + dir: PathBuf, +} + +impl LocalStore { + pub fn new(path: Option) -> Self { + let dir = path.map(|p| p.to_string()).unwrap_or("/tmp".to_string()); + Self { + dir: Path::new(dir.as_str()).to_owned(), + } + } + + fn local_path(&self, location: &String) -> PathBuf { + self.dir.join(location) + } + + /// Move a given file location, into cache + pub async fn move_file_to_cache(&self, from: PathBuf, location: &String) -> Result<(), Error> { + let to = self.local_path(location); + match to.parent() { + None => { + return Err(Error::new( + std::io::ErrorKind::InvalidData, + "object location is not valid", + )) + } + Some(path) => create_dir_all(path).await?, + }; + std::fs::rename(from, to) + } + + /// Async write operation + pub async fn write_object( + &self, + location: &String, + size: i64, + mut stream: StreamedObject, + ) -> Result<(), Error> { + if location.starts_with('/') { + return Err(Error::new( + std::io::ErrorKind::InvalidData, + "object location cannot be an absolute path", + )); + } + let path = self.local_path(location); + let mut obj = AsyncStoreObject::new(path.as_path(), size).await?; + + while let Some(maybe_bytes) = stream.next().await { + if maybe_bytes.is_err() { + let _ = obj.delete().await; + return Err(Error::new( + std::io::ErrorKind::InvalidData, + "error reading incoming byte stream", + )); + } + + match obj.write_all(&maybe_bytes.unwrap()).await { + Ok(_) => continue, + Err(e) => { + let _ = obj.delete().await; + return Err(e); + } + } + } + + Ok(()) + } + + /// Read `GET /object` returns a stream + pub async fn read_object(&self, location: &String) -> Result { + if location.starts_with('/') { + return Err(Error::new( + std::io::ErrorKind::InvalidData, + "object location cannot be an absolute path", + )); + } + + // Potential TODO: replace the StreamedObject with sendfile? + // the the client can return a GetResultPayload::File() through the interface. + let path = self.dir.join(location); + Ok(AsyncStoreObject::open(path.as_path()).await?.read_stream()) + } + + /// Delete object in local store, such as on cache eviction. + pub async fn delete_object(&self, location: &String) -> Result<(), Error> { + if location.starts_with('/') { + return Err(Error::new( + std::io::ErrorKind::InvalidData, + "object location cannot be an absolute path", + )); + } + + let path = self.dir.join(location); + AsyncStoreObject::open(path.as_path()).await?.delete().await + } +} + +#[pin_project] +pub struct AsyncStoreObject<'a> { + #[pin] + inner: File, + path: &'a Path, +} + +impl<'a> AsyncStoreObject<'a> { + /// Create a new AsyncStoreObject, honoring the path provided. + async fn new(path: &'a Path, size: i64) -> std::io::Result { + // The path of the object (in the ObjectStore implementations) is: + // ///. + // + // Future cache eviction policies may be mapped to resource allocation per table_id. 
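The absolute-path checks in `write_object`, `read_object`, and `delete_object` above exist because `Path::join` silently discards the base when handed an absolute path; a quick illustration with hypothetical paths:

use std::path::Path;

fn main() {
    let root = Path::new("/var/cache/parquet");
    // Relative keys stay under the cache root.
    assert_eq!(
        root.join("ns/table/partition/file.parquet"),
        Path::new("/var/cache/parquet/ns/table/partition/file.parquet")
    );
    // An absolute key would replace the root entirely, escaping the cache directory.
    assert_eq!(root.join("/etc/hosts"), Path::new("/etc/hosts"));
}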
+ create_dir_all(path.parent().unwrap_or(path)).await?; + let file = File::create(path).await?; + file.set_len(size as u64).await?; + + Ok(Self { inner: file, path }) + } + + async fn open(path: &'a Path) -> std::io::Result { + Ok(Self { + inner: File::open(path).await?, + path, + }) + } + + fn read_stream(self) -> StreamedObject { + Box::pin( + FramedRead::new(self.inner, BytesCodec::new()) + .map_ok(BytesMut::freeze) + .map_err(|e| object_store::Error::Generic { + store: DATA_CACHE, + source: Box::new(e), + }), + ) + } + + async fn delete(&self) -> std::io::Result<()> { + remove_file(self.path).await?; + let dir = self.path.parent().unwrap_or(self.path); + if dir.read_dir()?.next().is_none() { + remove_dir(dir).await + } else { + Ok(()) + } + } +} + +impl<'a> AsyncRead for AsyncStoreObject<'a> { + fn poll_read( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + let this = self.project(); + this.inner.poll_read(cx, buf) + } +} + +impl<'a> AsyncWrite for AsyncStoreObject<'a> { + fn poll_write( + self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + let this = self.project(); + this.inner.poll_write(cx, buf) + } + + fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.project(); + Box::pin(this.inner.get_mut().sync_all()).poll_unpin(cx) + } + + fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.project(); + Box::pin(this.inner.get_mut().shutdown()).poll_unpin(cx) + } +} + +#[cfg(test)] +mod test { + use std::{hash::Hasher, io::ErrorKind}; + + use assert_matches::assert_matches; + use rand::{distributions::Alphanumeric, thread_rng, Rng}; + use tempfile::TempDir; + use tokio::io::AsyncReadExt; + + use super::*; + + async fn create_incoming_stream(file_path: &PathBuf) -> StreamedObject { + let mut writeable = File::create(file_path) + .await + .expect("should create file in tempdir"); + + for _ in 0..5 { + let rand_string: String = thread_rng() + .sample_iter(&Alphanumeric) + .take(1_000_000) + .map(char::from) + .collect(); + writeable + .write_all(rand_string.as_bytes()) + .await + .expect("should write to mock incoming"); + } + writeable + .sync_all() + .await + .expect("should fsync incoming mock data file"); + + let readable = File::open(file_path) + .await + .expect("file should be readable"); + Box::pin( + FramedRead::new(readable, BytesCodec::new()) + .map_ok(BytesMut::freeze) + .map_err(|e| object_store::Error::Generic { + store: DATA_CACHE, + source: Box::new(e), + }), + ) + } + + async fn run_write_read_test() { + let tempdir = TempDir::new().expect("should make tempdir"); + let incoming_file_path = tempdir.path().join("./incoming-io.txt"); + let obj_stream = create_incoming_stream(&incoming_file_path).await; + + let local_store = LocalStore::new(tempdir.path().to_str()); + let location = "obj/to/write.parquet"; + + let write_res = local_store + .write_object(&location.to_string(), 1_000_000 * 5, obj_stream) + .await; + assert_matches!( + write_res, + Ok(()), + "write should return ok, instead found {:?}", + write_res + ); + + let read_res = local_store.read_object(&location.to_string()).await; + assert!(read_res.is_ok(), "read should return ok"); + + // expected == data which was streamed in to WRITE + let mut expected = Vec::new(); + File::open(incoming_file_path) + .await + .expect("should open original incoming data file") + .read_to_end(&mut expected) + .await + .unwrap(); + let mut expected_hash = ahash::AHasher::default(); + 
expected_hash.write(&expected); + + // got == data that was WRITE then READ + let mut got = Vec::new(); + tokio_util::io::StreamReader::new(read_res.unwrap()) + .read_to_end(&mut got) + .await + .unwrap(); + let mut got_hash = ahash::AHasher::default(); + got_hash.write(&got); + + assert_eq!( + 1_000_000 * 5, + expected.len(), + "incoming mock file stream was incorrect" + ); + assert_eq!( + expected.len(), + got.len(), + "expected {} bytes but found {} bytes", + expected.len(), + got.len() + ); + assert_eq!( + got_hash.finish(), + expected_hash.finish(), + "hash of file contents do not match" + ); + + tempdir.close().expect("should delete tempdir"); + } + + async fn can_duplicate_write_to_key() { + let tempdir = TempDir::new().expect("should make tempdir"); + let incoming_file_path = tempdir.path().join("./incoming-dupe-writes.txt"); + + let local_store = LocalStore::new(tempdir.path().to_str()); + let location = "obj/to/write.parquet"; + + let write_res = local_store + .write_object( + &location.to_string(), + 1_000_000 * 5, + create_incoming_stream(&incoming_file_path).await, + ) + .await; + assert!( + write_res.is_ok(), + "first write should succeed, instead found {:?}", + write_res + ); + + let duplicate_write = local_store + .write_object( + &location.to_string(), + 1_000_000 * 5, + create_incoming_stream(&incoming_file_path).await, + ) + .await; + assert!( + duplicate_write.is_ok(), + "second write should also succeed, instead found {:?}", + duplicate_write + ); + + tempdir.close().expect("should delete tempdir"); + } + + async fn run_delete_test() { + let tempdir = TempDir::new().expect("should make tempdir"); + let incoming_file_path = tempdir.path().join("./incoming-io.txt"); + let obj_stream = create_incoming_stream(&incoming_file_path).await; + + let local_store = LocalStore::new(tempdir.path().to_str()); + let location = "obj/to/write.parquet"; + + let write_res = local_store + .write_object(&location.to_string(), 1_000_000 * 5, obj_stream) + .await; + assert_matches!( + write_res, + Ok(()), + "write should return ok, instead found {:?}", + write_res + ); + + // confirm obj is written + let written_obj_path = tempdir.path().join(location); + let mut written_obj = Vec::new(); + File::open(written_obj_path.clone()) + .await + .expect("should open original incoming data file") + .read_to_end(&mut written_obj) + .await + .unwrap(); + assert_eq!( + 1_000_000 * 5, + written_obj.len(), + "object should be written to full length" + ); + + // delete obj + let del_res = local_store.delete_object(&location.to_string()).await; + assert!(del_res.is_ok(), "should return OK on delete"); + + // confirm does not exist + let should_be_err = File::open(written_obj_path).await; + assert_matches!( + should_be_err, + Err(e) if e.kind() == ErrorKind::NotFound, + "cache obj should not exist" + ); + + tempdir.close().expect("should delete tempdir"); + } + + async fn error_with_absolute_path_in_obj_key() { + let tempdir = TempDir::new().expect("should make tempdir"); + let incoming_file_path = tempdir.path().join("./incoming-abs-key-path.txt"); + + let local_store = LocalStore::new(tempdir.path().to_str()); + let location = "/absolute/pathed/object.parquet"; + + let write_res = local_store + .write_object( + &location.to_string(), + 1_000_000 * 5, + create_incoming_stream(&incoming_file_path).await, + ) + .await; + assert_matches!( + write_res, + Err(e) if e.to_string().contains("object location cannot be an absolute path"), + "expected write to error, instead found {:?}", + write_res + ); + + let 
read_res = local_store.read_object(&location.to_string()).await; + assert!(read_res.is_err(), "expected read to error",); + + let delete_res = local_store.delete_object(&location.to_string()).await; + assert!(delete_res.is_err(), "expected delete to error",); + + tempdir.close().expect("should delete tempdir"); + } + + async fn write_aborts_are_handled() { + let stream_with_partial_write = Box::pin(tokio_stream::iter(vec![Err( + object_store::Error::Generic { + store: "error in bytes stream from remote object store", + source: "delete on first write".into(), + }, + )])) as StreamedObject; + + let tempdir = TempDir::new().expect("should make tempdir"); + let local_store = LocalStore::new(tempdir.path().to_str()); + let location = "obj/to/write.parquet"; + + let write_res = local_store + .write_object( + &location.to_string(), + 1_000_000 * 5, + stream_with_partial_write, + ) + .await; + assert_matches!( + write_res, + Err(e) if e.to_string().contains("error reading incoming byte stream"), + "expected write to error, instead found {:?}", + write_res + ); + + tempdir.close().expect("should delete tempdir"); + } + + async fn partial_files_are_deleted_on_write_abort() { + let stream_with_partial_write = Box::pin(tokio_stream::iter(vec![ + Ok(Bytes::from(&b"good yield"[..])), + Err(object_store::Error::Generic { + store: "error in bytes stream from remote object store", + source: "foobar".into(), + }), + ])) as StreamedObject; + + let tempdir = TempDir::new().expect("should make tempdir"); + let local_store = LocalStore::new(tempdir.path().to_str()); + let location = "obj/to/write.parquet"; + + let write_res = local_store + .write_object( + &location.to_string(), + 1_000_000 * 5, + stream_with_partial_write, + ) + .await; + assert_matches!( + write_res, + Err(e) if e.to_string().contains("error reading incoming byte stream"), + "expected write to error, instead found {:?}", + write_res + ); + + let incoming_file_path = tempdir.path().join("./incoming-partial.txt"); + let should_not_exist = File::open(incoming_file_path).await; + assert_matches!( + should_not_exist, + Err(e) if e.kind() == ErrorKind::NotFound, + "file partial should not exist" + ); + + tempdir.close().expect("should delete tempdir"); + } + + #[tokio::test] + async fn test_write_read_object() { + futures::join!(run_write_read_test(), can_duplicate_write_to_key(),); + } + + #[tokio::test] + async fn test_delete_object() { + run_delete_test().await; + } + + #[tokio::test] + async fn test_error_handling() { + futures::join!( + error_with_absolute_path_in_obj_key(), + write_aborts_are_handled(), + partial_files_are_deleted_on_write_abort(), + ); + } +} diff --git a/parquet_cache/src/server/data/writes.rs b/parquet_cache/src/server/data/writes.rs new file mode 100644 index 00000000000..d42fc5b417f --- /dev/null +++ b/parquet_cache/src/server/data/writes.rs @@ -0,0 +1,69 @@ +use std::sync::Arc; + +use object_store::{GetResult, GetResultPayload, ObjectMeta, ObjectStore}; +use observability_deps::tracing::warn; + +use crate::data_types::WriteHint; + +use super::{store::LocalStore, DataError}; + +/// Handles the WRITE requests (`/write-hint`) +#[derive(Debug, Clone)] +pub struct WriteHandler { + cache: Arc, + direct_store: Arc, +} + +impl WriteHandler { + pub fn new(cache: Arc, direct_store: Arc) -> Self { + Self { + cache, + direct_store, + } + } + + pub async fn write_local( + &self, + location: &str, + write_hint: &WriteHint, + ) -> Result { + // get from remote + let WriteHint { + file_size_bytes, .. 
+ } = write_hint; + let GetResult { meta, payload, .. } = self + .direct_store + .get(&location.into()) + .await + .map_err(|e| match e { + object_store::Error::NotFound { .. } => DataError::DoesNotExist, + _ => DataError::Stream(e.to_string()), + })?; + + if !(meta.size as i64).eq(file_size_bytes) { + warn!( + "failed to perform writeback due to file size mismatch: {} != {}", + meta.size, file_size_bytes + ); + return Err(DataError::BadRequest( + "failed to perform writeback due to file size mismatch".to_string(), + )); + } + + // write local + match payload { + GetResultPayload::File(_, pathbuf) => self + .cache + .move_file_to_cache(pathbuf, &location.into()) + .await + .map_err(|e| DataError::File(e.to_string()))?, + GetResultPayload::Stream(stream) => self + .cache + .write_object(&location.into(), *file_size_bytes, stream) + .await + .map_err(|e| DataError::Stream(e.to_string()))?, + }; + + Ok(meta) + } +} diff --git a/parquet_cache/src/server/error.rs b/parquet_cache/src/server/error.rs new file mode 100644 index 00000000000..24e87c49f91 --- /dev/null +++ b/parquet_cache/src/server/error.rs @@ -0,0 +1,55 @@ +use hyper::StatusCode; + +use crate::server::data::DataError; + +/// Error type for the server. +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// Error in the keyspace layer. + #[error("Keyspace error: {0}")] + Keyspace(String), + /// Error in the precondition layer. + #[error("Precondition error: {0}")] + Precondition(String), + /// Error in the data layer. + #[error("Data error: {0}")] + Data(#[from] DataError), + + /// Error with warming. + #[error("Warming error: {0}")] + Warming(String), + /// Cache miss. + #[error("Cache miss")] + CacheMiss, + /// Bad request from the user. + #[error("Bad Request: {0}")] + BadRequest(String), + /// Object does not exist. + #[error("Bad Request: object location does not exist in catalog or object store")] + DoesNotExist, + /// Error due to server shutdown. + #[error("Server shutdown")] + ServerShutdown, +} + +impl Error { + /// Return the HTTP status code for this error. + /// + /// Should match the handling, per code, in the [client](crate::client::object_store::DataCacheObjectStore). + pub fn code(&self) -> StatusCode { + match self { + // If errors here, have the client return an error. + Self::BadRequest(_) + | Self::DoesNotExist + | Self::Data(DataError::BadRequest(_)) + | Self::Data(DataError::DoesNotExist) => StatusCode::BAD_REQUEST, + Self::Precondition(_) => StatusCode::PRECONDITION_FAILED, + // If errors below here, result in the client using the fallback. 
+ Self::CacheMiss => StatusCode::NOT_FOUND, + Self::Keyspace(_) | Self::Warming(_) | Self::Data(_) => { + StatusCode::INTERNAL_SERVER_ERROR + } + Self::ServerShutdown => StatusCode::SERVICE_UNAVAILABLE, + } + } +} diff --git a/parquet_cache/src/server/keyspace.rs b/parquet_cache/src/server/keyspace.rs new file mode 100644 index 00000000000..88ea0f8310f --- /dev/null +++ b/parquet_cache/src/server/keyspace.rs @@ -0,0 +1,957 @@ +use std::{path::Path, sync::Arc, task::Poll}; + +use arc_swap::ArcSwap; +use futures::Future; +use http::{Method, Request}; +use hyper::Body; +use mpchash::HashRing; +use notify::{RecommendedWatcher, RecursiveMode, Watcher}; +use observability_deps::tracing::error; +use tokio::{sync::Notify, task::JoinHandle}; +use tower::{Layer, Service}; + +use crate::{ + data_types::{ + InstanceState, KeyspaceResponseBody, KeyspaceVersion, ParquetCacheInstanceSet, ServiceNode, + ServiceNodeHostname, ServiceNodeId, + }, + server::response::Response, +}; + +use super::{error::Error, response::PinnedFuture}; + +struct BackgroundTask { + path: String, + fswatcher: RecommendedWatcher, + notifier_handle: JoinHandle<()>, +} + +impl Drop for BackgroundTask { + fn drop(&mut self) { + if let Err(e) = self.fswatcher.unwatch(Path::new(&self.path)) { + error!("KeyspaceService fswatcher failed to unwatch: {}", e) + } + self.notifier_handle.abort(); + } +} + +/// Service that applies the keyspace per request. +pub struct KeyspaceService { + shared: Arc, + ready_tx: Arc, + ready_rx: std::pin::Pin + Send + Sync + 'static>>, + keyspace: Arc, + inner: S, +} + +impl std::fmt::Debug for KeyspaceService { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("KeyspaceService") + .field("keyspace", &self.keyspace) + .finish_non_exhaustive() + } +} + +impl Clone for KeyspaceService { + fn clone(&self) -> Self { + let ready_rx = Arc::clone(&self.ready_tx); + let ready_rx = Box::pin(async move { + ready_rx.notified().await; + }); + + Self { + shared: Arc::clone(&self.shared), + ready_tx: Arc::clone(&self.ready_tx), + ready_rx, + keyspace: Arc::clone(&self.keyspace), + inner: self.inner.clone(), + } + } +} + +impl> + Clone + Send + Sync + 'static> KeyspaceService { + fn new(inner: S, configfile_path: String, node_hostname: String) -> Result { + let path = configfile_path.clone(); + + let data = Arc::new(KeyspaceData::new(node_hostname)); + let keyspace = Arc::new(Keyspace { + data: data.into(), + configfile_path, + }); + + let ready_tx = Arc::new(Notify::new()); + let (fswatcher, notifier_handle) = + Self::start_background_task(Arc::clone(&keyspace), Arc::clone(&ready_tx))?; + + let ready_rx = Arc::clone(&ready_tx); + let ready_rx = Box::pin(async move { + ready_rx.notified().await; + }); + + Ok(Self { + shared: Arc::new(BackgroundTask { + path, + fswatcher, + notifier_handle, + }), + ready_tx, + ready_rx, + keyspace, + inner, + }) + } + + fn start_background_task( + keyspace: Arc, + ready_tx: Arc, + ) -> Result<(RecommendedWatcher, JoinHandle<()>), Error> { + let changed = Arc::new(Notify::new()); + let has_changed = Arc::clone(&changed); + + let configfile_path = keyspace.configfile_path.clone(); + let ready_tx_ = Arc::clone(&ready_tx); + let keyspace_ = Arc::clone(&keyspace); + + // start watcher -- default is to poll for changes every 30 seconds + let watcher_and_listener = + notify::recommended_watcher(move |res: notify::Result| match res { + Ok(notify::Event { kind, .. 
}) => { + if kind.is_modify() || kind.is_create() { + has_changed.notify_one(); + } + } + Err(e) => error!(error=%e, "KeyspaceService fswatcher failed"), + }) + .and_then(move |mut watcher| { + watcher.watch(Path::new(&configfile_path), RecursiveMode::NonRecursive)?; + Ok(( + watcher, + tokio::spawn(async move { + loop { + changed.notified().await; + keyspace.update(Arc::clone(&ready_tx)).await; + } + }), + )) + }) + .map_err(|e| Error::Keyspace(e.to_string()))?; + + // handle race where the file is created before the watcher is started + if Path::exists(Path::new(&keyspace_.configfile_path)) { + tokio::spawn(async move { + keyspace_.update(ready_tx_).await; + }); + } + + Ok(watcher_and_listener) + } +} + +impl Service> for KeyspaceService +where + S: Service, Future = PinnedFuture, Error = Error> + Clone + Send + Sync + 'static, +{ + type Response = super::response::Response; + type Error = Error; + type Future = PinnedFuture; + + fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll> { + if !self.keyspace.ready() { + futures::ready!(self.ready_rx.as_mut().poll(cx)); + } + self.inner.poll_ready(cx) + } + + fn call(&mut self, req: Request) -> Self::Future { + match (req.method(), req.uri().path()) { + (&Method::GET, "/state") => { + let this = self.clone(); + Box::pin(async move { + // return the version we have loaded + // serde serialization will add the CacheState enum, based on this version + Ok(Response::KeyspaceVersion( + this.keyspace.data.load().version.clone(), + )) + }) + } + (&Method::PATCH, "/warmed") => { + let this = self.clone(); + Box::pin(async move { + this.keyspace.set_to_running(); + Ok(Response::Ready) + }) + } + (&Method::GET, "/keyspace") => { + let this = self.clone(); + Box::pin(async move { + let (_, _, keyspace) = this.keyspace.read_definition().await; + Ok(Response::Keyspace(keyspace)) + }) + } + (&Method::GET, "/metadata") + | (&Method::GET, "/object") + | (&Method::POST, "/write-hint") => { + let clone = self.inner.clone(); + let mut inner = std::mem::replace(&mut self.inner, clone); + let this = self.clone(); + Box::pin(async move { + let as_url = url::Url::parse(req.uri().to_string().as_str()) + .expect("should be already validated path & query"); + let obj_location = match as_url.query_pairs().find(|(k, _v)| k.eq("location")) { + None => { + return Err(Error::Keyspace( + "invalid or missing object location".into(), + )); + } + Some((_key, location)) => location.to_string(), + }; + + // when keyspace is invalid (being re-built), return error such that + // cache client decides to (1) re-fetch keyspace, and/or (2) uses fallback + match this.keyspace.in_keyspace(&obj_location) { + true => inner.call(req).await, + false => Err(Error::Keyspace(format!( + "object {} is not found in keyspace", + obj_location + ))), + } + }) + } + (any_method, any_path) => { + let msg = format!("invalid path: {} {}", any_method, any_path); + Box::pin(async { Err(Error::BadRequest(msg)) }) + } + } + } +} + +#[derive(Debug, Clone)] +struct KeyspaceData { + /// ID self + /// Is none if keyspace has been invalidated. + own: Option, + // Hashring + keyspace: Arc>, + /// Versioning, so can provide current vs next, per GET `/state` request + /// Is none if Self::Pending (a.k.a. 
no definition loaded yet) + version: KeyspaceVersion, +} + +impl KeyspaceData { + pub fn new(self_node: ServiceNodeHostname) -> Self { + Self { + own: None, + keyspace: Default::default(), + version: KeyspaceVersion::new(self_node), + } + } +} + +#[derive(Debug)] +struct Keyspace { + /// Atomically updated keyspace data. + data: ArcSwap, + /// Fs-accessible file containing the [`ParquetCacheInstanceSet`] + configfile_path: String, +} + +impl Keyspace { + /// `Valid` as in able to check keyspace hashring. + /// + /// Returns true if the keyspace definition exists, and own-node is within keyspace. + fn is_valid(&self) -> bool { + self.data.load().own.is_some() + } + + /// `Ready` as in poll_ready (to receive requests). + /// Returns true if the keyspace is not in the init phase. + /// + /// Request include `GET /state` requests during warming and cooling phases. + fn ready(&self) -> bool { + let data = self.data.load(); + match InstanceState::from(&data.as_ref().version) { + InstanceState::Pending => false, + InstanceState::Warming | InstanceState::Running | InstanceState::Cooling => true, + } + } + + /// `Running` as in the [`InstanceState`]. + fn set_to_running(&self) { + self.data.rcu(|data| KeyspaceData { + own: data.own, + keyspace: Arc::clone(&data.keyspace), + version: data.version.clone_next_to_curr(), + }); + } + + /// Returns true if the object location is in the keyspace. + fn in_keyspace(&self, object: &String) -> bool { + let data = self.data.load(); + self.is_valid() + && match data.own { + None => false, + Some(id) => match data.keyspace.primary_node(object) { + Some(&assigned_node) => assigned_node == id, + None => false, + }, + } + } + + /// Read keyspace definition from file. + async fn read_definition( + &self, + ) -> ( + ParquetCacheInstanceSet, /* KeyspaceVersion.next */ + Option, /* None == current node is not in KeyspaceVersion.next */ + Vec, /* full set of KeyspaceVersion.next hashring */ + ) { + let current_instance_set_next = tokio::fs::read_to_string(self.configfile_path.clone()) + .await + .expect("config map file should always exist on pod"); + let parquet_cache_instance_set: ParquetCacheInstanceSet = + serde_json::from_str(current_instance_set_next.as_str()) + .expect("should have valid ParquetCacheInstanceSet format"); + + let service_nodes = KeyspaceResponseBody::from(&parquet_cache_instance_set).nodes; + + let self_hostname = self.data.load().version.hostname().clone(); + ( + parquet_cache_instance_set, + service_nodes + .iter() + .position(|node| node.hostname == self_hostname) + .map(|node_id| node_id as u64), + service_nodes, + ) + } + + /// Update keyspace definition. + async fn update(&self, ready: Arc) { + let (next_version, own, all_nodes) = self.read_definition().await; + + let mut keyspace = HashRing::new(); + for ServiceNode { id, hostname: _ } in all_nodes { + keyspace.add(id); + } + let keyspace = Arc::new(keyspace); + + // determine if KeyspaceVersion changed + let prev_data = self.data.rcu(|curr_data| { + match &curr_data.version.next { + Some(next) if next_version.revision == next.revision => { + // no change -- already knows about next + Arc::clone(curr_data) + } + _ => Arc::new(KeyspaceData { + own, + keyspace: Arc::clone(&keyspace), + version: curr_data.version.set_next(next_version.to_owned()), + }), + } + }); + + if InstanceState::from(&prev_data.version) == InstanceState::Pending && self.ready() { + // Let anyone waiting on poll_ready know that we're no longer pending. 
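The membership check in `in_keyspace` above boils down to hashing the object location onto the ring and comparing the owning node id with this instance's own id. A minimal sketch, reusing the same `HashRing` calls and assuming node ids are the `u64` positions assigned in `read_definition`:

use mpchash::HashRing;

// Hypothetical helpers mirroring Keyspace::update and Keyspace::in_keyspace.
fn build_ring(node_ids: impl IntoIterator<Item = u64>) -> HashRing<u64> {
    let mut ring = HashRing::new();
    for id in node_ids {
        ring.add(id);
    }
    ring
}

fn owns(ring: &HashRing<u64>, own_id: u64, location: &String) -> bool {
    matches!(ring.primary_node(location), Some(&id) if id == own_id)
}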
+ ready.notify_waiters(); + } + } +} + +pub struct BuildKeyspaceService { + pub configfile_path: String, + pub node_hostname: String, +} + +impl> + Clone + Send + Sync + 'static> Layer for BuildKeyspaceService { + type Service = KeyspaceService; + + fn layer(&self, service: S) -> Self::Service { + KeyspaceService::new( + service, + self.configfile_path.clone(), + self.node_hostname.clone(), + ) + .expect("cache server failed to deploy due to keyspace layer init error") + } +} + +#[cfg(test)] +mod test { + use std::{ + io::{Seek, Write}, + sync::atomic::{AtomicU32, Ordering}, + task::Context, + time::Duration, + }; + + use assert_matches::assert_matches; + use futures::{future, task::noop_waker_ref}; + use tempfile::{NamedTempFile, TempDir}; + use tokio::io::AsyncWriteExt; + use tokio_stream::StreamExt; + use tower::{ServiceBuilder, ServiceExt}; + + use super::super::response::Response; + use super::*; + + const VALID_HOSTNAME: &str = "hostname-a"; + lazy_static::lazy_static! { + static ref KEYSPACE_DEFINITION: String = serde_json::json!(ParquetCacheInstanceSet { + revision: 0, + // a single node in the keyspace, therefore all keys should hash to this keyspace + instances: vec![VALID_HOSTNAME].into_iter().map(String::from).collect(), + }).to_string(); + } + + #[derive(Clone, Default)] + struct MockInnermostService { + call: Arc, + poll_ready: Arc, + } + + impl Service> for MockInnermostService { + type Response = Response; + type Error = Error; + type Future = PinnedFuture; + + fn poll_ready( + &mut self, + _cx: &mut std::task::Context<'_>, + ) -> Poll> { + self.poll_ready.fetch_add(1, Ordering::SeqCst); + Poll::Ready(Ok(())) + } + fn call(&mut self, _req: Request) -> Self::Future { + self.call.fetch_add(1, Ordering::SeqCst); + Box::pin(future::ok(Response::Ready)) + } + } + + fn metadata_req() -> Request { + Request::builder() + .method(Method::GET) + .uri("http://foo.io/metadata?location=bar") + .body(Body::empty()) + .unwrap() + } + + fn object_req() -> Request { + Request::builder() + .method(Method::GET) + .uri("http://foo.io/object?location=bar") + .body(Body::empty()) + .unwrap() + } + + fn write_hint_req() -> Request { + Request::builder() + .method(Method::POST) + .uri("http://foo.io/write-hint?location=bar") + .body(Body::empty()) + .unwrap() + } + + fn state_req() -> Request { + Request::builder() + .method(Method::GET) + .uri("/state") + .body(Body::empty()) + .unwrap() + } + + fn warmed_req() -> Request { + Request::builder() + .method(Method::PATCH) + .uri("/warmed") + .body(Body::empty()) + .unwrap() + } + + fn keyspace_defn_req() -> Request { + Request::builder() + .method(Method::GET) + .uri("/keyspace") + .body(Body::empty()) + .unwrap() + } + + async fn write_defn_to_file(defn: &[u8], configfile_path: &Path) { + let mut file = tokio::fs::File::create(&configfile_path).await.unwrap(); + file.write_all(defn) + .await + .expect("should write keyspace definition to configfile"); + + // notify fswatcher will sometimes skip events when the file descriptor is still open + file.shutdown() + .await + .expect("should shutdown file descriptor"); + } + + #[allow(clippy::future_not_send)] + async fn wait_until_service_is_ready(server: &mut KeyspaceService) { + future::poll_fn(move |cx| server.poll_ready(cx)) + .await + .expect("should not have failed"); + } + + #[tokio::test] + async fn test_keyspace_can_load_definition() { + let mut file = NamedTempFile::new().unwrap(); + writeln!(file, "{}", KEYSPACE_DEFINITION.as_str()) + .expect("should write keyspace definition to configfile"); + 
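For reference, the definition written to the watched file above is just the serialized `ParquetCacheInstanceSet`; assuming default serde field names, it looks roughly like the JSON sketched in this check:

// Hypothetical shape of the watched config file: {"revision":0,"instances":["hostname-a"]}
let expected = serde_json::json!({
    "revision": 0,
    "instances": ["hostname-a"],
});
assert_eq!(
    serde_json::from_str::<serde_json::Value>(KEYSPACE_DEFINITION.as_str()).unwrap(),
    expected
);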
+ let keyspace = Keyspace { + configfile_path: file.path().to_str().unwrap().to_string(), + data: Arc::new(KeyspaceData::new(VALID_HOSTNAME.into())).into(), + }; + + assert!( + !keyspace.is_valid(), + "default keyspace should be invalid, due to no definition loaded" + ); + + let notify = Arc::new(Notify::new()); + keyspace.update(Arc::clone(¬ify)).await; + assert!( + keyspace.is_valid(), + "keyspace should be valid, after definition is loaded" + ); + + // remove from keyspace, by changing keyspace definition + let new_keyspace_definition = serde_json::json!(ParquetCacheInstanceSet { + revision: 1, + instances: vec!["another-node"].into_iter().map(String::from).collect(), + }) + .to_string(); + let mut file = std::fs::OpenOptions::new() + .write(true) + .truncate(true) + .open(file.path()) + .unwrap(); + file.seek(std::io::SeekFrom::Start(0)).unwrap(); // move pointer to start, to overwrite + writeln!(file, "{}", new_keyspace_definition.as_str()) + .expect("should write keyspace definition to configfile"); + file.sync_all().unwrap(); + + // should no longer be in keyspace + keyspace.update(Arc::clone(¬ify)).await; + assert!( + !keyspace.is_valid(), + "keyspace should not be valid, when own-hostname not in definition" + ); + } + + #[tokio::test] + async fn test_keyspace_poll_ready_during_instance_phases() { + let mut file = NamedTempFile::new().unwrap(); + writeln!(file, "{}", KEYSPACE_DEFINITION.as_str()) + .expect("should write keyspace definition to configfile"); + + let keyspace = Keyspace { + configfile_path: file.path().to_str().unwrap().to_string(), + data: Arc::new(KeyspaceData::new(VALID_HOSTNAME.into())).into(), + }; + + // init phase + assert!(!keyspace.ready(), "default keyspace should not poll_ready"); + + // warming phase + // this in when the outer service layers will be calling the inner KeyspaceService + let notify = Arc::new(Notify::new()); + keyspace.update(Arc::clone(¬ify)).await; + assert!( + keyspace.ready(), + "keyspace should poll_ready, after definition (with own node) is loaded" + ); + + // running phase + keyspace.set_to_running(); + assert!(keyspace.ready(), "keyspace should poll_ready, when running"); + + // remove from keyspace, by changing keyspace definition + let new_keyspace_definition = serde_json::json!(ParquetCacheInstanceSet { + revision: 1, + instances: vec!["another-node"].into_iter().map(String::from).collect(), + }) + .to_string(); + let mut file = std::fs::OpenOptions::new() + .write(true) + .truncate(true) + .open(file.path()) + .unwrap(); + file.seek(std::io::SeekFrom::Start(0)).unwrap(); // move pointer to start, to overwrite + writeln!(file, "{}", new_keyspace_definition.as_str()) + .expect("should write keyspace definition to configfile"); + file.sync_all().unwrap(); + + // cooling phase + keyspace.update(notify).await; + assert!( + keyspace.ready(), + "keyspace should still poll_ready when cooling, to handle `GET /state` requests" + ); + } + + #[tokio::test] + async fn test_watcher_consumes_definition_file() { + // no keyspace definition + let dir = TempDir::new().unwrap(); + let configfile_path = dir.path().join("configfile.json"); + let mut file = tokio::fs::File::create(&configfile_path).await.unwrap(); + + // start service + let mut server = ServiceBuilder::new() + .layer(BuildKeyspaceService { + configfile_path: configfile_path.to_str().unwrap().to_string(), + node_hostname: VALID_HOSTNAME.into(), + }) + .service(MockInnermostService::default()); + + // assert poll_ready returns pending, when no keyspace definition + assert_matches!( + 
server.poll_ready(&mut Context::from_waker(noop_waker_ref())), + Poll::Pending, + "should return pending status, as keyspace definition does not yet exist" + ); + + // write keyspace definition to configfile + file.write_all(KEYSPACE_DEFINITION.as_bytes()) + .await + .expect("should write keyspace definition to configfile"); + file.shutdown() + .await + .expect("should shutdown file descriptor"); + + // wait for keyspace to be loaded by the watcher + wait_until_service_is_ready(&mut server).await; + + // call service + let res = server.call(state_req()).await; + assert_matches!( + res, + Ok(Response::KeyspaceVersion(ver)) if InstanceState::from(&ver) == InstanceState::Warming, + "should return successful response, instead found {:?}", + res + ); + } + + #[tokio::test] + async fn test_service_instance_phases() { + // provide keyspace definition + let dir = TempDir::new().unwrap(); + let configfile_path = dir.path().join("configfile.json"); + write_defn_to_file(KEYSPACE_DEFINITION.as_bytes(), &configfile_path).await; + + // start service + let innermost_service = MockInnermostService::default(); + let mut server = ServiceBuilder::new() + .layer(BuildKeyspaceService { + configfile_path: configfile_path.to_str().unwrap().to_string(), + node_hostname: VALID_HOSTNAME.into(), + }) + .service(innermost_service.clone()); + + // wait for service.poll_ready to return ready + wait_until_service_is_ready(&mut server).await; + + // call service when warming + let res = server.call(state_req()).await; + assert_matches!( + res, + Ok(Response::KeyspaceVersion(ver)) if InstanceState::from(&ver) == InstanceState::Warming, + "should return InstanceState::Warming, instead found {:?}", + res + ); + + // tell keyspace it's warmed + assert!( + server.call(warmed_req()).await.is_ok(), + "should be able to PATCH /warmed" + ); + + // call poll_ready when warmed + assert_matches!( + server.poll_ready(&mut Context::from_waker(noop_waker_ref())), + Poll::Ready(Ok(_)), + "should return ready status" + ); + + // call `GET /state` when warmed + let res = server.call(state_req()).await; + assert_matches!( + res, + Ok(Response::KeyspaceVersion(ver)) if InstanceState::from(&ver) == InstanceState::Running, + "should return InstanceState::Running, instead found {:?}", + res + ); + + // tell keyspace to cool, by changing keyspace definition + let new_keyspace_definition = serde_json::json!(ParquetCacheInstanceSet { + revision: 1, + instances: vec!["another-node"].into_iter().map(String::from).collect(), + }) + .to_string(); + let mut file = std::fs::OpenOptions::new() + .write(true) + .truncate(true) + .open(&configfile_path) + .unwrap(); + file.seek(std::io::SeekFrom::Start(0)).unwrap(); // move pointer to start, to overwrite + writeln!(file, "{}", new_keyspace_definition.as_str()) + .expect("should write keyspace definition to configfile"); + file.sync_all().unwrap(); + + // waiting for new_keyspace_definition to load + // cannot use poll_ready, as it is already returning ready (to accept `GET /state` requests) + tokio::time::sleep(Duration::from_secs(10)).await; + + // call poll_ready when cooling + assert_matches!( + server.poll_ready(&mut Context::from_waker(noop_waker_ref())), + Poll::Ready(Ok(_)), + "should return ready status" + ); + // call `GET /state` when cooling + let res = server.call(state_req()).await; + assert_matches!( + res, + Ok(Response::KeyspaceVersion(ver)) if InstanceState::from(&ver) == InstanceState::Cooling, + "should return InstanceState::Cooling, instead found {:?}", + res + ); + } + + 
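+    // A minimal sketch of what the configfile contents look like at each phase of
+    // the test above, assuming `ParquetCacheInstanceSet` serializes with its
+    // default serde field names:
+    //
+    //   warming/running: {"revision":0,"instances":["hostname-a"]}
+    //   cooling trigger: {"revision":1,"instances":["another-node"]}
+    //
+    // Bumping `revision` and dropping this node's own hostname from `instances`
+    // is what moves the service from Running to Cooling.
+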
#[tokio::test] + async fn test_keyspace_service_oks_for_included_key() { + // provide keyspace definition + let dir = TempDir::new().unwrap(); + let configfile_path = dir.path().join("configfile.json"); + write_defn_to_file(KEYSPACE_DEFINITION.as_bytes(), &configfile_path).await; + + // start service + let innermost_service = MockInnermostService::default(); + let mut server = ServiceBuilder::new() + .layer(BuildKeyspaceService { + configfile_path: configfile_path.to_str().unwrap().to_string(), + node_hostname: VALID_HOSTNAME.into(), + }) + .service(innermost_service.clone()); + + // wait for service.poll_ready to return ready + wait_until_service_is_ready(&mut server).await; + + // GET /metadata + let res = server.call(metadata_req()).await; + assert!( + res.is_ok(), + "should return successful `GET /metadata`, instead found {:?}", + res + ); + + // GET /object + let res = server.call(object_req()).await; + assert!( + res.is_ok(), + "should return successful `GET /object`, instead found {:?}", + res + ); + + // GET /write-hint + let res = server.call(write_hint_req()).await; + assert!( + res.is_ok(), + "should return successful `POST /write-hint`, instead found {:?}", + res + ); + } + + #[tokio::test] + async fn test_keyspace_service_errs_for_excluded_key() { + // provide keyspace definition + let dir = TempDir::new().unwrap(); + let configfile_path = dir.path().join("configfile.json"); + write_defn_to_file(KEYSPACE_DEFINITION.as_bytes(), &configfile_path).await; + + // start service + let innermost_service = MockInnermostService::default(); + let mut server = ServiceBuilder::new() + .layer(BuildKeyspaceService { + configfile_path: configfile_path.to_str().unwrap().to_string(), + node_hostname: VALID_HOSTNAME.into(), + }) + .service(innermost_service.clone()); + + // wait for keyspace to be loaded by the watcher + wait_until_service_is_ready(&mut server).await; + + // update, to remove self from keyspace + server.keyspace.data.rcu(|data| { + Arc::new(KeyspaceData { + own: None, + keyspace: Arc::clone(&data.keyspace), + version: data.version.set_next(ParquetCacheInstanceSet { + revision: data.version.next.as_ref().unwrap().revision + 1, + instances: vec!["another-node"].into_iter().map(String::from).collect(), + }), + }) + }); + + // GET /metadata + let res = server.call(metadata_req()).await; + assert_matches!( + res, + Err(Error::Keyspace(_)), + "should return errored `GET /metadata`, instead found {:?}", + res + ); + + // GET /object + let res = server.call(object_req()).await; + assert_matches!( + res, + Err(Error::Keyspace(_)), + "should return errored `GET /object`, instead found {:?}", + res + ); + + // GET /write-hint + let res = server.call(write_hint_req()).await; + assert_matches!( + res, + Err(Error::Keyspace(_)), + "should return errored `POST /write-hint`, instead found {:?}", + res + ); + } + + #[tokio::test] + async fn test_keyspace_service_fetch_keyspace() { + // provide keyspace definition + let dir = TempDir::new().unwrap(); + let configfile_path = dir.path().join("configfile.json"); + write_defn_to_file(KEYSPACE_DEFINITION.as_bytes(), &configfile_path).await; + + // start service + let innermost_service = MockInnermostService::default(); + let mut server = ServiceBuilder::new() + .layer(BuildKeyspaceService { + configfile_path: configfile_path.to_str().unwrap().to_string(), + node_hostname: VALID_HOSTNAME.into(), + }) + .service(innermost_service.clone()); + + // wait for service.poll_ready to return ready + wait_until_service_is_ready(&mut server).await; + + // GET 
/keyspace + let res = server.call(keyspace_defn_req()).await; + assert_matches!( + res, + Ok(Response::Keyspace(nodes)) if matches!( + &nodes[..], + [ServiceNode { id: 0, hostname }] if hostname == VALID_HOSTNAME + ), + "should return successful `GET /keyspace`, instead found {:?}", + res + ); + } + + mod usage_of_poll_ready { + use super::*; + + #[tokio::test] + async fn test_poll_ready_is_not_triggered_on_call() { + // provide keyspace definition + let dir = TempDir::new().unwrap(); + let configfile_path = dir.path().join("configfile.json"); + write_defn_to_file(KEYSPACE_DEFINITION.as_bytes(), &configfile_path).await; + + // start service + let innermost_service = MockInnermostService::default(); + let mut server = ServiceBuilder::new() + .layer(BuildKeyspaceService { + configfile_path: configfile_path.to_str().unwrap().to_string(), + node_hostname: VALID_HOSTNAME.into(), + }) + .service(innermost_service.clone()); + + // wait for keyspace to be loaded by the watcher + wait_until_service_is_ready(&mut server).await; + let init_poll_ready = innermost_service.poll_ready.load(Ordering::SeqCst); + + // call service + // use `GET /object` since it calls inner service + let res = server.call(object_req()).await; + assert!( + res.is_ok(), + "should return successful response, instead found {:?}", + res + ); + + // assert that poll_ready was not called + assert_eq!( + innermost_service.call.load(Ordering::SeqCst), + 1, + "should call innermost service once" + ); + assert_eq!( + innermost_service.poll_ready.load(Ordering::SeqCst), + init_poll_ready, + "should not have called innermost poll_ready, on Service::call()" + ); + } + + #[tokio::test] + async fn test_poll_ready_used_when_connected_to_stream() { + // provide keyspace definition + let dir = TempDir::new().unwrap(); + let configfile_path = dir.path().join("configfile.json"); + write_defn_to_file(KEYSPACE_DEFINITION.as_bytes(), &configfile_path).await; + + // start service + let innermost_service = MockInnermostService::default(); + let mut server = ServiceBuilder::new() + .layer(BuildKeyspaceService { + configfile_path: configfile_path.to_str().unwrap().to_string(), + node_hostname: VALID_HOSTNAME.into(), + }) + .service(innermost_service.clone()); + + // Stream of requests, processed by service. 
+ let (reqs, rx) = futures::channel::mpsc::unbounded(); + let mut resps = server.clone().call_all(rx); + + // wait for service.poll_ready to return ready + wait_until_service_is_ready(&mut server).await; + let init_poll_ready = innermost_service.poll_ready.load(Ordering::SeqCst); + + // stream Service::call() requests + vec![metadata_req(), object_req(), write_hint_req()] + .into_iter() + .for_each(|req| { + reqs.unbounded_send(req).unwrap(); + }); + drop(reqs); + + // await responses + while let Some(rsp) = resps.next().await { + assert!( + rsp.is_ok(), + "should return successful response, instead found {:?}", + rsp + ); + } + + // assert that Service::poll_ready() was called at least as many times as Service::call() + assert_eq!( + innermost_service.call.load(Ordering::SeqCst), + 3, + "should call innermost service once" + ); + assert!( + innermost_service.poll_ready.load(Ordering::SeqCst) >= 3 + init_poll_ready, + "should have called innermost poll_ready" + ); + } + } +} diff --git a/parquet_cache/src/server/mock.rs b/parquet_cache/src/server/mock.rs new file mode 100644 index 00000000000..deebe15ce58 --- /dev/null +++ b/parquet_cache/src/server/mock.rs @@ -0,0 +1,217 @@ +use std::{ + collections::{HashMap, HashSet}, + convert::Infallible, + ops::Range, + sync::Arc, +}; + +use bytes::{BufMut, Bytes, BytesMut}; +use hyper::{ + server::conn::{AddrIncoming, AddrStream}, + service::{make_service_fn, service_fn}, + Body, Method, Request, Response, Server, +}; +use object_store::ObjectStore; +use parking_lot::Mutex; +use std::net::SocketAddr; +use tokio::{net::TcpListener, sync::oneshot, task::JoinHandle}; + +use crate::data_types::{ + KeyspaceResponseBody, ServiceNode, X_RANGE_END_HEADER, X_RANGE_START_HEADER, +}; + +#[allow(missing_debug_implementations)] +pub struct MockCacheServer { + addr: SocketAddr, + stop: oneshot::Sender<()>, + join: JoinHandle<()>, + req_handler: Arc, +} + +impl MockCacheServer { + pub async fn create(addr: &str, _object_store: Arc) -> Self { + let listener = TcpListener::bind(addr) + .await + .expect("listener should have bound to addr"); + let addr = listener.local_addr().unwrap(); + + let req_handler: Arc = + Arc::new(MockCacheServerRequestHandler::new(addr.to_string())); + + let handler = Arc::clone(&req_handler); + let make_svc = make_service_fn(move |_socket: &AddrStream| { + let handler = Arc::clone(&handler); + async move { + Ok::<_, Infallible>(service_fn(move |req: Request| { + let handler = Arc::clone(&handler); + async move { Arc::clone(&handler).handle(req) } + })) + } + }); + + let (tx, rx) = tokio::sync::oneshot::channel::<()>(); + + let join = tokio::spawn(async { + Server::builder(AddrIncoming::from_listener(listener).unwrap()) + .http2_only(true) + .serve(make_svc) + .with_graceful_shutdown(async { + rx.await.ok(); + }) + .await + .unwrap() + }); + + Self { + addr, + stop: tx, + join, + req_handler, + } + } + + pub fn addr(&self) -> String { + format!("http://{}", self.addr) + } + + pub async fn close(self) { + self.stop + .send(()) + .expect("Error sending stop signal to server"); + self.join + .await + .expect("Error stopping parquet cache server"); + } + + pub fn was_called(&self, path_and_query: &String) -> bool { + self.req_handler.called.lock().contains(path_and_query) + } + + pub fn was_called_with_payload(&self, path_and_query: &String) -> bool { + self.req_handler.called.lock().contains(path_and_query) + } + + pub fn respond_with(&self, path_and_query: String, expected: ExpectedResponse) { + self.req_handler + .respond_with + .lock() + 
.insert(path_and_query, expected); + } +} + +#[derive(Clone)] +pub struct MockCacheServerRequestHandler { + pub hostname: String, + pub called: Arc>>, // route_&_query + pub respond_with: Arc>>, // route_&_query, reponse_payload_body +} + +#[derive(Clone, Debug)] +pub struct ExpectedResponse { + pub bytes: Bytes, + pub range: Option>, +} + +impl MockCacheServerRequestHandler { + fn new(hostname: String) -> Self { + Self { + hostname, + called: Default::default(), + respond_with: Default::default(), + } + } + + fn handle(&self, req: Request) -> Result, Infallible> { + let path_and_query = req.uri().path_and_query().unwrap().to_string(); + + match (req.method(), req.uri().path()) { + (&Method::GET, "/keyspace") => { + self.insert_into_tracker(req); + + let body = KeyspaceResponseBody { + nodes: vec![ServiceNode { + id: 42, + hostname: self.hostname.clone(), + }], + }; + + Ok::<_, Infallible>(Response::new(Body::from(build_resp_body(&body)))) + } + (&Method::GET, "/metadata") => { + self.insert_into_tracker(req); + Ok::<_, Infallible>(Response::new(self.get_resp_body(&path_and_query))) + } + (&Method::GET, "/object") => { + // assert range header in mock server + if let Some(range) = req.headers().get("range") { + // https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Range + // =- + let mut range = range.to_str().unwrap().to_string(); + range = range + .strip_prefix("bytes=") + .expect("should start range header with `bytes=`") + .to_string(); + let (start, end) = range.split_at( + range + .find('-') + .expect("should have dash delineating range `start-end`"), + ); + assert!(start.parse::().is_ok()); + assert!(end[1..].parse::().is_ok()); + }; + + self.insert_into_tracker(req); + + let range = self + .get_size_range(&path_and_query) + .expect("should have used respond_with() for mocked response"); + + let resp = Response::builder() + .header(X_RANGE_START_HEADER, range.start.to_string()) + .header(X_RANGE_END_HEADER, range.end.to_string()) + .body(self.get_resp_body(&path_and_query)) + .expect("should be a valid response"); + + Ok::<_, Infallible>(resp) + } + (&Method::POST, "/write-hint") => { + self.insert_into_tracker(req); + Ok::<_, Infallible>(Response::new(Body::empty())) + } + _ => unimplemented!(), + } + } + + fn insert_into_tracker(&self, req: Request) { + self.called.lock().insert( + req.uri() + .path_and_query() + .expect("should exist") + .to_string(), + ); + } + + fn get_resp_body(&self, path_and_query: &String) -> Body { + match self.respond_with.lock().get(path_and_query) { + None => Body::empty(), + Some(expected) => Body::from(expected.clone().bytes), + } + } + + fn get_size_range(&self, path_and_query: &String) -> Option> { + self.respond_with + .lock() + .get(path_and_query) + .map(|expected| expected.clone().range.unwrap()) + } +} + +pub fn build_resp_body(body: &T) -> Bytes +where + T: Sized + serde::Serialize, +{ + let mut buf = BytesMut::new().writer(); + serde_json::to_writer(&mut buf, body).expect("should write response body"); + + buf.into_inner().freeze() +} diff --git a/parquet_cache/src/server/precondition.rs b/parquet_cache/src/server/precondition.rs new file mode 100644 index 00000000000..a591e7c0f08 --- /dev/null +++ b/parquet_cache/src/server/precondition.rs @@ -0,0 +1,57 @@ +use std::task::Poll; + +use http::{HeaderMap, Request}; +use hyper::Body; +use object_store::ObjectMeta; +use tower::{Layer, Service}; + +use super::error::Error; +use super::response::PinnedFuture; + +/// Service that applies the preconditions per request. 
+/// +/// Refer to GetOptions: +/// +#[derive(Debug, Clone)] +pub struct PreconditionService { + inner: S, +} + +impl PreconditionService { + pub fn new(inner: S) -> Self { + Self { inner } + } + + fn passes(&self, _preconditions: HeaderMap, _metadata: ObjectMeta) -> bool { + unimplemented!("TODO: precondition applied for any request, per HTTP header contract") + } +} + +impl Service> for PreconditionService +where + S: Service, Future = PinnedFuture, Error = Error> + Clone + Send + Sync + 'static, +{ + type Response = super::response::Response; + type Error = Error; + type Future = super::response::PinnedFuture; + + fn poll_ready(&mut self, cx: &mut std::task::Context<'_>) -> Poll> { + self.inner.poll_ready(cx) + } + + fn call(&mut self, req: Request) -> Self::Future { + let clone = self.inner.clone(); + let mut inner = std::mem::replace(&mut self.inner, clone); + Box::pin(async move { inner.call(req).await }) + } +} + +pub struct BuildPreconditionService; + +impl Layer for BuildPreconditionService { + type Service = PreconditionService; + + fn layer(&self, service: S) -> Self::Service { + PreconditionService::new(service) + } +} diff --git a/parquet_cache/src/server/response.rs b/parquet_cache/src/server/response.rs new file mode 100644 index 00000000000..70cb31a255d --- /dev/null +++ b/parquet_cache/src/server/response.rs @@ -0,0 +1,83 @@ +use std::{fmt::Debug, pin::Pin}; + +use bytes::{BufMut, Bytes, BytesMut}; +use futures::{stream::BoxStream, Future}; +use http::StatusCode; +use hyper::Body; + +use crate::data_types::{ + GetObjectMetaResponse, KeyspaceResponseBody, KeyspaceVersion, ServiceNode, +}; + +pub type PinnedFuture = Pin> + Send>>; + +pub enum Response { + /// Internal-only response used during pre-warming, for `PATCH /warmed` + Ready, + /// For `GET /keyspace` + Keyspace(Vec), + /// For `GET /state` + KeyspaceVersion(KeyspaceVersion), + /// For `GET /metadata` + Head(GetObjectMetaResponse), + /// For `GET /object` + Data(BoxStream<'static, object_store::Result>), + /// For `POST /write-hint` + Written, +} + +impl Debug for Response { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Ready => write!(f, "Response::Ready"), + Self::Keyspace(k) => write!(f, "Response::Keyspace({:?})", k), + Self::KeyspaceVersion(v) => write!(f, "Response::KeyspaceVersion({:?})", v), + Self::Head(h) => write!(f, "Response::Head({:?})", h), + Self::Data(_) => write!(f, "Response::Data"), + Self::Written => write!(f, "Response::Written"), + } + } +} + +impl Response { + pub fn code(&self) -> StatusCode { + match self { + Self::Ready => { + unreachable!("should be an internal-only Response, and not sent across the wire") + } + Self::Keyspace(_) | Self::KeyspaceVersion(_) | Self::Head(_) | Self::Data(_) => { + StatusCode::OK + } + Self::Written => StatusCode::CREATED, + } + } +} + +impl From for Body { + fn from(value: Response) -> Self { + match value { + Response::Ready => { + unreachable!("should be an internal-only Response, and not sent across the wire") + } + Response::Keyspace(nodes) => { + Self::from(build_resp_body(&KeyspaceResponseBody { nodes })) + } + Response::KeyspaceVersion(version) => { + Self::from(serde_json::json!(version).to_string()) + } + Response::Head(data) => Self::from(build_resp_body(&data)), + Response::Data(stream) => Self::wrap_stream(stream), + Response::Written => Self::empty(), + } + } +} + +fn build_resp_body(body: &T) -> Bytes +where + T: Sized + serde::Serialize, +{ + let mut buf = BytesMut::new().writer(); + 
serde_json::to_writer(&mut buf, body).expect("should write response body"); + + buf.into_inner().freeze() +} diff --git a/parquet_file/Cargo.toml b/parquet_file/Cargo.toml index 5f616f1d3f3..4f59e04dd97 100644 --- a/parquet_file/Cargo.toml +++ b/parquet_file/Cargo.toml @@ -5,8 +5,11 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # In alphabetical order -arrow = { workspace = true, features = ["prettyprint"] } +arrow = { workspace = true } base64 = "0.21" bytes = "1.5" data_types = { path = "../data_types" } @@ -17,19 +20,19 @@ generated_types = { path = "../generated_types" } iox_time = { path = "../iox_time" } object_store = { workspace = true } observability_deps = { path = "../observability_deps" } -parquet = { workspace = true, features = ["experimental"]} -pbjson-types = "0.5" -prost = "0.11" +parquet = { workspace = true } +pbjson-types = { workspace = true } +prost = { workspace = true } schema = { path = "../schema" } -snafu = "0.7" -thiserror = "1.0.48" +snafu = "0.8" +thiserror = "1.0.56" thrift = "0.17" -tokio = { version = "1.32", features = ["macros", "parking_lot", "rt", "rt-multi-thread", "sync"] } +tokio = { version = "1.35", features = ["macros", "parking_lot", "rt", "rt-multi-thread", "sync"] } uuid = { version = "1", features = ["v4"] } -zstd = "0.12" +zstd = { version = "0.13", default-features = false } workspace-hack = { version = "0.1", path = "../workspace-hack" } - [dev-dependencies] # In alphabetical order +assert_matches = "1.5.0" rand = "0.8.3" -test_helpers = { version = "0.1.0", path = "../test_helpers" } \ No newline at end of file +test_helpers = { version = "0.1.0", path = "../test_helpers" } diff --git a/parquet_file/src/chunk.rs b/parquet_file/src/chunk.rs index 4e7f4e6bbbb..c9c43257b6b 100644 --- a/parquet_file/src/chunk.rs +++ b/parquet_file/src/chunk.rs @@ -5,10 +5,9 @@ use crate::{ storage::{ParquetExecInput, ParquetStorage}, ParquetFilePath, }; -use data_types::{ParquetFile, TimestampMinMax}; +use data_types::{ObjectStoreId, ParquetFile, TimestampMinMax}; use schema::Schema; use std::{mem, sync::Arc}; -use uuid::Uuid; /// A abstract representation of a Parquet file in object storage, with /// associated metadata. @@ -45,7 +44,7 @@ impl ParquetChunk { } /// Return object store id - pub fn object_store_id(&self) -> Uuid { + pub fn object_store_id(&self) -> ObjectStoreId { self.parquet_file.object_store_id } diff --git a/parquet_file/src/lib.rs b/parquet_file/src/lib.rs index 55ab7f470cb..4dc5a7f2234 100644 --- a/parquet_file/src/lib.rs +++ b/parquet_file/src/lib.rs @@ -17,6 +17,8 @@ )] #![allow(clippy::missing_docs_in_private_items)] +use std::{path::PathBuf, str::FromStr}; + // Workaround for "unused crate" lint false positives. use workspace_hack as _; @@ -26,9 +28,11 @@ pub mod serialize; pub mod storage; pub mod writer; -use data_types::{NamespaceId, ParquetFile, ParquetFileParams, TableId, TransitionPartitionId}; +use data_types::{ + NamespaceId, ObjectStoreId, ParquetFile, ParquetFileParams, PartitionKey, TableId, + TransitionPartitionId, +}; use object_store::path::Path; -use uuid::Uuid; /// Location of a Parquet file within a namespace's object store. /// The exact format is an implementation detail and is subject to change. 
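+// As exercised by the path-parsing tests added later in this patch, the current
+// layout is `<namespace_id>/<table_id>/<partition>/<object_store_id>.parquet`;
+// a minimal sketch (the IDs and partition segment here are illustrative only):
+//
+//   1/2/4/00000000-0000-0000-0000-000000000000.parquet
+//
+// The new `TryFrom<&String>` impl below splits such a path on
+// `object_store::path::DELIMITER` and parses the segments back into a
+// `ParquetFilePath`.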
@@ -37,7 +41,7 @@ pub struct ParquetFilePath { namespace_id: NamespaceId, table_id: TableId, partition_id: TransitionPartitionId, - object_store_id: Uuid, + object_store_id: ObjectStoreId, } impl ParquetFilePath { @@ -46,7 +50,7 @@ impl ParquetFilePath { namespace_id: NamespaceId, table_id: TableId, partition_id: &TransitionPartitionId, - object_store_id: Uuid, + object_store_id: ObjectStoreId, ) -> Self { Self { namespace_id, @@ -73,12 +77,12 @@ impl ParquetFilePath { } /// Get object store ID. - pub fn objest_store_id(&self) -> Uuid { + pub fn object_store_id(&self) -> ObjectStoreId { self.object_store_id } /// Set new object store ID. - pub fn with_object_store_id(self, object_store_id: Uuid) -> Self { + pub fn with_object_store_id(self, object_store_id: ObjectStoreId) -> Self { Self { object_store_id, ..self @@ -108,7 +112,10 @@ impl From<&ParquetFile> for ParquetFilePath { Self { namespace_id: f.namespace_id, table_id: f.table_id, - partition_id: f.partition_id.clone(), + partition_id: TransitionPartitionId::from_parts( + f.partition_id, + f.partition_hash_id.clone(), + ), object_store_id: f.object_store_id, } } @@ -116,19 +123,80 @@ impl From<&ParquetFile> for ParquetFilePath { impl From<&ParquetFileParams> for ParquetFilePath { fn from(f: &ParquetFileParams) -> Self { + let partition_id = + TransitionPartitionId::from_parts(f.partition_id, f.partition_hash_id.clone()); + Self { + partition_id, namespace_id: f.namespace_id, table_id: f.table_id, - partition_id: f.partition_id.clone(), object_store_id: f.object_store_id, } } } +impl TryFrom<&String> for ParquetFilePath { + type Error = object_store::path::Error; + + fn try_from(path: &String) -> Result { + let mut parts = path.split(object_store::path::DELIMITER); + + let namespace_id = parts + .next() + .ok_or(Self::Error::EmptySegment { + path: path.to_owned(), + })? + .parse::() + .map_err(|_| Self::Error::InvalidPath { + path: PathBuf::from(path.to_owned()), + })?; + + let table_id = parts + .next() + .ok_or(Self::Error::EmptySegment { + path: path.to_owned(), + })? 
+ .parse::() + .map_err(|_| Self::Error::InvalidPath { + path: path.clone().into(), + })?; + let table_id = TableId::new(table_id); + + let partition_id = parts.next().ok_or(Self::Error::EmptySegment { + path: path.to_owned(), + })?; + let partition_key = PartitionKey::from(partition_id); + + let object_store_id = parts.next().ok_or(Self::Error::EmptySegment { + path: path.to_owned(), + })?; // uuid.parquet + let object_store_id = + object_store_id + .split('.') + .next() + .ok_or(Self::Error::EmptySegment { + path: path.to_owned(), + })?; + + Ok(Self { + namespace_id: NamespaceId::new(namespace_id), + table_id, + partition_id: TransitionPartitionId::new(table_id, &partition_key), + object_store_id: ObjectStoreId::from_str(object_store_id).map_err(|_| { + Self::Error::InvalidPath { + path: path.clone().into(), + } + })?, + }) + } +} + #[cfg(test)] mod tests { use super::*; + use assert_matches::assert_matches; use data_types::{PartitionId, PartitionKey, TransitionPartitionId}; + use uuid::Uuid; #[test] fn parquet_file_absolute_dirs_and_file_path_database_partition_ids() { @@ -136,7 +204,7 @@ mod tests { NamespaceId::new(1), TableId::new(2), &TransitionPartitionId::Deprecated(PartitionId::new(4)), - Uuid::nil(), + ObjectStoreId::from_uuid(Uuid::nil()), ); let path = pfp.object_store_path(); assert_eq!( @@ -152,7 +220,7 @@ mod tests { NamespaceId::new(1), table_id, &TransitionPartitionId::new(table_id, &PartitionKey::from("hello there")), - Uuid::nil(), + ObjectStoreId::from_uuid(Uuid::nil()), ); let path = pfp.object_store_path(); assert_eq!( @@ -161,4 +229,99 @@ mod tests { /00000000-0000-0000-0000-000000000000.parquet", ); } + + #[test] + fn parquet_file_path_parsed_from_object_store_path() { + let object_store_id = uuid::Uuid::new_v4(); + + // valid + let path = format!("1/2/4/{}.parquet", object_store_id); + let pfp = ParquetFilePath::try_from(&path); + assert_matches!( + pfp, + Ok(res) if res == ParquetFilePath::new( + NamespaceId::new(1), + TableId::new(2), + &TransitionPartitionId::new( + TableId::new(2), + &PartitionKey::from("4"), + ), + ObjectStoreId::from_uuid(object_store_id), + ), + "should parse valid path, instead found {:?}", pfp + ); + + // namespace_id errors + let path = format!("2/4/{}.parquet", object_store_id); + let pfp = ParquetFilePath::try_from(&path); + assert_matches!( + pfp, + Err(e) if matches!(e, object_store::path::Error::EmptySegment { .. }), + "should error when missing part, instead found {:?}", pfp + ); + let path = format!("bad/2/4/{}.parquet", object_store_id); + let pfp = ParquetFilePath::try_from(&path); + assert_matches!( + pfp, + Err(e) if matches!(e, object_store::path::Error::InvalidPath { .. }), + "should error when invalid namespace_id, instead found {:?}", pfp + ); + + // table_id errors + let path = format!("1/4/{}.parquet", object_store_id); + let pfp = ParquetFilePath::try_from(&path); + assert_matches!( + pfp, + Err(e) if matches!(e, object_store::path::Error::EmptySegment { .. }), + "should error when missing part, instead found {:?}", pfp + ); + let path = format!("1/bad/4/{}.parquet", object_store_id); + let pfp = ParquetFilePath::try_from(&path); + assert_matches!( + pfp, + Err(e) if matches!(e, object_store::path::Error::InvalidPath { .. }), + "should error when invalid table_id, instead found {:?}", pfp + ); + + // namespace_id errors + let path = format!("2/4/{}.parquet", object_store_id); + let pfp = ParquetFilePath::try_from(&path); + assert_matches!( + pfp, + Err(e) if matches!(e, object_store::path::Error::EmptySegment { .. 
}), + "should error when missing part, instead found {:?}", pfp + ); + let path = format!("bad/2/4/{}.parquet", object_store_id); + let pfp = ParquetFilePath::try_from(&path); + assert_matches!( + pfp, + Err(e) if matches!(e, object_store::path::Error::InvalidPath { .. }), + "should error when invalid namespace_id, instead found {:?}", pfp + ); + + // partition_id errors + let path = format!("1/2/{}.parquet", object_store_id); + let pfp = ParquetFilePath::try_from(&path); + assert_matches!( + pfp, + Err(e) if matches!(e, object_store::path::Error::EmptySegment { .. }), + "should error when missing part, instead found {:?}", pfp + ); + + // object_store_id errors + let path = "1/2/4".to_string(); + let pfp = ParquetFilePath::try_from(&path); + assert_matches!( + pfp, + Err(e) if matches!(e, object_store::path::Error::EmptySegment { .. }), + "should error when missing part, instead found {:?}", pfp + ); + let path = "1/2/4/bad".to_string(); + let pfp = ParquetFilePath::try_from(&path); + assert_matches!( + pfp, + Err(e) if matches!(e, object_store::path::Error::InvalidPath { .. }), + "should error when invalid object_store_id, instead found {:?}", pfp + ); + } } diff --git a/parquet_file/src/metadata.rs b/parquet_file/src/metadata.rs index fc612e4b959..3e304a8c467 100644 --- a/parquet_file/src/metadata.rs +++ b/parquet_file/src/metadata.rs @@ -89,9 +89,9 @@ use base64::{prelude::BASE64_STANDARD, Engine}; use bytes::Bytes; use data_types::{ - ColumnId, ColumnSet, ColumnSummary, CompactionLevel, InfluxDbType, NamespaceId, - ParquetFileParams, PartitionKey, StatValues, Statistics, TableId, Timestamp, - TransitionPartitionId, + ColumnId, ColumnSet, ColumnSummary, CompactionLevel, CompactionLevelProtoError, InfluxDbType, + NamespaceId, ObjectStoreId, ParquetFileParams, PartitionHashId, PartitionId, PartitionKey, + StatValues, Statistics, TableId, Timestamp, }; use generated_types::influxdata::iox::ingester::v1 as proto; use iox_time::Time; @@ -108,6 +108,7 @@ use parquet::{ statistics::Statistics as ParquetStatistics, }, schema::types::SchemaDescriptor as ParquetSchemaDescriptor, + thrift::TSerializable, }; use prost::Message; use schema::{ @@ -116,9 +117,7 @@ use schema::{ }; use snafu::{ensure, OptionExt, ResultExt, Snafu}; use std::{convert::TryInto, fmt::Debug, mem, sync::Arc}; -use thrift::protocol::{ - TCompactInputProtocol, TCompactOutputProtocol, TOutputProtocol, TSerializable, -}; +use thrift::protocol::{TCompactInputProtocol, TCompactOutputProtocol, TOutputProtocol}; use uuid::Uuid; /// Current version for serialized metadata. @@ -218,6 +217,9 @@ pub enum Error { #[snafu(display("Field missing while parsing IOx metadata: {}", field))] IoxMetadataFieldMissing { field: String }, + #[snafu(display("Cannot parse timestamp from parquet metadata: {}", e))] + IoxInvalidTimestamp { e: String }, + #[snafu(display("Cannot parse IOx metadata from Protobuf: {}", source))] IoxMetadataBroken { source: Box, @@ -234,7 +236,7 @@ pub enum Error { #[snafu(display("{}: `{}`", source, compaction_level))] InvalidCompactionLevel { - source: Box, + source: CompactionLevelProtoError, compaction_level: i32, }, } @@ -251,7 +253,7 @@ pub type Result = std::result::Result; pub struct IoxMetadata { /// The uuid used as the location of the parquet file in the OS. /// This uuid will later be used as the catalog's ParquetFileId - pub object_store_id: Uuid, + pub object_store_id: ObjectStoreId, /// Timestamp when this file was created. 
pub creation_timestamp: Time, @@ -313,7 +315,7 @@ impl IoxMetadata { } /// Convert to protobuf v3 message. - pub(crate) fn to_protobuf(&self) -> std::result::Result, prost::EncodeError> { + pub fn to_protobuf(&self) -> std::result::Result, prost::EncodeError> { let sort_key = self.sort_key.as_ref().map(|key| proto::SortKey { expressions: key .iter() @@ -326,7 +328,7 @@ impl IoxMetadata { }); let proto_msg = proto::IoxMetadata { - object_store_id: self.object_store_id.as_bytes().to_vec(), + object_store_id: self.object_store_id.get_uuid().as_bytes().to_vec(), creation_timestamp: Some(self.creation_timestamp.date_time().into()), namespace_id: self.namespace_id.get(), namespace_name: self.namespace_name.to_string(), @@ -345,7 +347,7 @@ impl IoxMetadata { } /// Read from protobuf message - fn from_protobuf(data: &[u8]) -> Result { + pub fn from_protobuf(data: &[u8]) -> Result { // extract protobuf message from bytes let proto_msg = proto::IoxMetadata::decode(data) .map_err(|err| Box::new(err) as _) @@ -372,11 +374,13 @@ impl IoxMetadata { }); Ok(Self { - object_store_id: parse_uuid(&proto_msg.object_store_id)?.ok_or_else(|| { - Error::IoxMetadataFieldMissing { - field: "object_store_id".to_string(), - } - })?, + object_store_id: ObjectStoreId::from_uuid( + parse_uuid(&proto_msg.object_store_id)?.ok_or_else(|| { + Error::IoxMetadataFieldMissing { + field: "object_store_id".to_string(), + } + })?, + ), creation_timestamp, namespace_id: NamespaceId::new(proto_msg.namespace_id), namespace_name, @@ -399,7 +403,7 @@ impl IoxMetadata { /// the catalog should get valid values out-of-band. pub fn external(creation_timestamp_ns: i64, table_name: impl Into>) -> Self { Self { - object_store_id: Default::default(), + object_store_id: ObjectStoreId::from_uuid(Uuid::nil()), creation_timestamp: Time::from_timestamp_nanos(creation_timestamp_ns), namespace_id: NamespaceId::new(1), namespace_name: "external".into(), @@ -413,8 +417,8 @@ impl IoxMetadata { } /// verify uuid - pub fn match_object_store_id(&self, uuid: Uuid) -> bool { - uuid == self.object_store_id + pub fn match_object_store_id(&self, id: ObjectStoreId) -> bool { + id == self.object_store_id } /// Create a corresponding iox catalog's ParquetFile @@ -434,7 +438,8 @@ impl IoxMetadata { /// [`RecordBatch`]: arrow::record_batch::RecordBatch pub fn to_parquet_file( &self, - partition_id: TransitionPartitionId, + partition_id: PartitionId, + partition_hash_id: Option, file_size_bytes: usize, metadata: &IoxParquetMetaData, column_id_map: F, @@ -486,6 +491,7 @@ impl IoxMetadata { namespace_id: self.namespace_id, table_id: self.table_id, partition_id, + partition_hash_id, object_store_id: self.object_store_id, min_time, max_time, @@ -534,8 +540,7 @@ fn decode_timestamp_from_field( let date_time = value .context(IoxMetadataFieldMissingSnafu { field })? 
.try_into() - .map_err(|e| Box::new(e) as _) - .context(IoxMetadataBrokenSnafu)?; + .map_err(|e: &str| Error::IoxInvalidTimestamp { e: e.to_string() })?; Ok(Time::from_date_time(date_time)) } @@ -985,11 +990,11 @@ mod tests { }; use data_types::CompactionLevel; use datafusion_util::{unbounded_memory_pool, MemoryStream}; - use schema::builder::SchemaBuilder; + use schema::{builder::SchemaBuilder, TIME_DATA_TIMEZONE}; #[test] fn iox_metadata_protobuf_round_trip() { - let object_store_id = Uuid::new_v4(); + let object_store_id = ObjectStoreId::new(); let sort_key = SortKeyBuilder::new().with_col("sort_col").build(); @@ -1018,7 +1023,7 @@ mod tests { #[tokio::test] async fn test_metadata_from_parquet_metadata() { let meta = IoxMetadata { - object_store_id: Default::default(), + object_store_id: ObjectStoreId::new(), creation_timestamp: Time::from_timestamp_nanos(42), namespace_id: NamespaceId::new(1), namespace_name: "bananas".into(), @@ -1101,7 +1106,11 @@ mod tests { } fn to_timestamp_array(timestamps: &[i64]) -> ArrayRef { - let array: TimestampNanosecondArray = timestamps.iter().map(|v| Some(*v)).collect(); + let array = timestamps + .iter() + .map(|v| Some(*v)) + .collect::() + .with_timezone_opt(TIME_DATA_TIMEZONE()); Arc::new(array) } } diff --git a/parquet_file/src/serialize.rs b/parquet_file/src/serialize.rs index 5a25f07df7f..5cec2fb6c94 100644 --- a/parquet_file/src/serialize.rs +++ b/parquet_file/src/serialize.rs @@ -213,7 +213,7 @@ mod tests { record_batch::RecordBatch, }; use bytes::Bytes; - use data_types::{CompactionLevel, NamespaceId, TableId}; + use data_types::{CompactionLevel, NamespaceId, ObjectStoreId, TableId}; use datafusion::parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use datafusion_util::{unbounded_memory_pool, MemoryStream}; use iox_time::Time; @@ -222,7 +222,7 @@ mod tests { #[tokio::test] async fn test_encode_stream() { let meta = IoxMetadata { - object_store_id: Default::default(), + object_store_id: ObjectStoreId::new(), creation_timestamp: Time::from_timestamp_nanos(42), namespace_id: NamespaceId::new(1), namespace_name: "bananas".into(), diff --git a/parquet_file/src/storage.rs b/parquet_file/src/storage.rs index c520e3bd03b..69798b2abbc 100644 --- a/parquet_file/src/storage.rs +++ b/parquet_file/src/storage.rs @@ -122,7 +122,7 @@ impl ParquetExecInput { .clone() .with_metadata(Default::default()), ); - + let statistics = Statistics::new_unknown(&schema); let base_config = FileScanConfig { object_store_url: self.object_store_url.clone(), file_schema: schema, @@ -132,13 +132,12 @@ impl ParquetExecInput { range: None, extensions: None, }]], - statistics: Statistics::default(), + statistics, projection: None, limit: None, table_partition_cols: vec![], // Parquet files ARE actually sorted but we don't care here since we just construct a `collect` plan. 
output_ordering: vec![], - infinite_source: false, }; let exec = ParquetExec::new(base_config, None, None); let exec_schema = exec.schema(); @@ -204,7 +203,7 @@ impl ParquetStorage { pub fn test_df_context(&self) -> SessionContext { // set up "fake" DataFusion session let object_store = Arc::clone(&self.object_store); - let session_ctx = SessionContext::with_config(iox_session_config()); + let session_ctx = SessionContext::new_with_config(iox_session_config()); register_iox_object_store(session_ctx.runtime_env(), self.id, object_store); session_ctx } @@ -298,6 +297,7 @@ impl ParquetStorage { last_modified: Default::default(), size: file_size, e_tag: None, + version: None, }, } } @@ -326,10 +326,10 @@ pub enum ProjectionError { mod tests { use super::*; use arrow::{ - array::{ArrayRef, BinaryArray, Int64Array, StringArray}, + array::{ArrayRef, Int64Array, IntervalMonthDayNanoArray, StringArray}, record_batch::RecordBatch, }; - use data_types::{CompactionLevel, NamespaceId, PartitionId, TableId}; + use data_types::{CompactionLevel, NamespaceId, ObjectStoreId, PartitionId, TableId}; use datafusion::common::DataFusionError; use datafusion_util::{unbounded_memory_pool, MemoryStream}; use iox_time::Time; @@ -442,13 +442,13 @@ mod tests { #[tokio::test] async fn test_schema_check_fail_different_types() { - let batch = RecordBatch::try_from_iter([("a", to_binary_array(&["value"]))]).unwrap(); - let other_batch = RecordBatch::try_from_iter([("a", to_int_array(&[1]))]).unwrap(); + let batch = RecordBatch::try_from_iter([("a", to_interval_array(&[123456]))]).unwrap(); + let other_batch = RecordBatch::try_from_iter([("a", to_int_array(&[123456]))]).unwrap(); let schema = batch.schema(); assert_schema_check_fail( other_batch, schema, - "Error during planning: Cannot cast file schema field a of type Int64 to table schema field of type Binary", + "Error during planning: Cannot cast file schema field a of type Int64 to table schema field of type Interval(MonthDayNano)", ).await; } @@ -584,8 +584,8 @@ mod tests { Arc::new(array) } - fn to_binary_array(strs: &[&str]) -> ArrayRef { - let array: BinaryArray = strs.iter().map(|s| Some(*s)).collect(); + fn to_interval_array(vals: &[i128]) -> ArrayRef { + let array: IntervalMonthDayNanoArray = vals.iter().map(|v| Some(*v)).collect(); Arc::new(array) } @@ -598,7 +598,7 @@ mod tests { ( TransitionPartitionId::Deprecated(PartitionId::new(4)), IoxMetadata { - object_store_id: Default::default(), + object_store_id: ObjectStoreId::new(), creation_timestamp: Time::from_timestamp_nanos(42), namespace_id: NamespaceId::new(1), namespace_name: "bananas".into(), diff --git a/parquet_file/tests/metadata.rs b/parquet_file/tests/metadata.rs index 658d4dc756f..cfdf3ee855c 100644 --- a/parquet_file/tests/metadata.rs +++ b/parquet_file/tests/metadata.rs @@ -5,8 +5,8 @@ use arrow::{ record_batch::RecordBatch, }; use data_types::{ - ColumnId, CompactionLevel, NamespaceId, PartitionId, PartitionKey, TableId, Timestamp, - TransitionPartitionId, + ColumnId, CompactionLevel, NamespaceId, ObjectStoreId, PartitionHashId, PartitionId, + PartitionKey, TableId, Timestamp, TransitionPartitionId, }; use datafusion_util::{unbounded_memory_pool, MemoryStream}; use iox_time::Time; @@ -18,6 +18,7 @@ use parquet_file::{ }; use schema::{ builder::SchemaBuilder, sort::SortKey, InfluxColumnType, InfluxFieldType, TIME_COLUMN_NAME, + TIME_DATA_TIMEZONE, }; #[tokio::test] @@ -52,7 +53,7 @@ async fn test_decoded_iox_metadata() { // And the metadata the batch would be encoded with if it came through the 
// IOx write path. let meta = IoxMetadata { - object_store_id: Default::default(), + object_store_id: ObjectStoreId::new(), creation_timestamp: Time::from_timestamp_nanos(42), namespace_id: NamespaceId::new(1), namespace_name: "bananas".into(), @@ -193,7 +194,7 @@ async fn test_empty_parquet_file_panic() { // And the metadata the batch would be encoded with if it came through the // IOx write path. let meta = IoxMetadata { - object_store_id: Default::default(), + object_store_id: ObjectStoreId::new(), creation_timestamp: Time::from_timestamp_nanos(42), namespace_id: NamespaceId::new(1), namespace_name: "bananas".into(), @@ -285,7 +286,7 @@ async fn test_decoded_many_columns_with_null_cols_iox_metadata() { let sort_key = SortKey::from_columns(sort_key_data); let partition_id = TransitionPartitionId::Deprecated(PartitionId::new(4)); let meta = IoxMetadata { - object_store_id: Default::default(), + object_store_id: ObjectStoreId::new(), creation_timestamp: Time::from_timestamp_nanos(42), namespace_id: NamespaceId::new(1), namespace_name: "bananas".into(), @@ -371,10 +372,11 @@ async fn test_derive_parquet_file_params() { // IOx write path. let table_id = TableId::new(3); let partition_key = PartitionKey::from("potato"); - let partition_id = TransitionPartitionId::new(table_id, &partition_key); + let partition_hash_id = PartitionHashId::new(table_id, &partition_key); + let partition_id = TransitionPartitionId::Deterministic(partition_hash_id.clone()); let meta = IoxMetadata { - object_store_id: Default::default(), + object_store_id: ObjectStoreId::new(), creation_timestamp: Time::from_timestamp_nanos(1234), namespace_id: NamespaceId::new(1), namespace_name: "bananas".into(), @@ -412,9 +414,14 @@ async fn test_derive_parquet_file_params() { ("some_field".into(), ColumnId::new(1)), ("time".into(), ColumnId::new(2)), ]); - let catalog_data = meta.to_parquet_file(partition_id, file_size, &iox_parquet_meta, |name| { - *column_id_map.get(name).unwrap() - }); + let partition_id = PartitionId::new(1); + let catalog_data = meta.to_parquet_file( + partition_id, + Some(partition_hash_id), + file_size, + &iox_parquet_meta, + |name| *column_id_map.get(name).unwrap(), + ); // And verify the resulting statistics used in the catalog. 
// @@ -438,7 +445,11 @@ fn to_string_array(strs: &[&str]) -> ArrayRef { } fn to_timestamp_array(timestamps: &[i64]) -> ArrayRef { - let array: TimestampNanosecondArray = timestamps.iter().map(|v| Some(*v)).collect(); + let array = timestamps + .iter() + .map(|v| Some(*v)) + .collect::() + .with_timezone_opt(TIME_DATA_TIMEZONE()); Arc::new(array) } diff --git a/parquet_to_line_protocol/Cargo.toml b/parquet_to_line_protocol/Cargo.toml index ced9d53f4bc..a1e6b7fe5ca 100644 --- a/parquet_to_line_protocol/Cargo.toml +++ b/parquet_to_line_protocol/Cargo.toml @@ -5,6 +5,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] datafusion = { workspace = true } datafusion_util = { path = "../datafusion_util" } @@ -14,10 +17,9 @@ num_cpus = "1.16.0" object_store = { workspace = true } parquet_file = { path = "../parquet_file" } schema = { path = "../schema" } -tokio = "1.32" -snafu = "0.7" +tokio = "1.35" +snafu = "0.8" workspace-hack = { version = "0.1", path = "../workspace-hack" } - [dev-dependencies] mutable_batch_lp = { path = "../mutable_batch_lp" } diff --git a/parquet_to_line_protocol/src/batch.rs b/parquet_to_line_protocol/src/batch.rs index 1b0b3c22f95..734628ed64d 100644 --- a/parquet_to_line_protocol/src/batch.rs +++ b/parquet_to_line_protocol/src/batch.rs @@ -192,7 +192,7 @@ mod tests { } #[test] - #[should_panic = "Error parsing line protocol: LineProtocol { source: FieldSetMissing, line: 1 }"] + #[should_panic = "Error parsing line protocol: PerLine { lines: [LineProtocol { source: FieldSetMissing, line: 1 }] }"] fn no_fields() { round_trip("my_no_tag_measurement_name,tag=4 1000"); } diff --git a/parquet_to_line_protocol/src/lib.rs b/parquet_to_line_protocol/src/lib.rs index ed8c6a73d05..9efebb6dee9 100644 --- a/parquet_to_line_protocol/src/lib.rs +++ b/parquet_to_line_protocol/src/lib.rs @@ -207,7 +207,7 @@ impl ParquetFileReader { ) -> Result { let runtime = Arc::new(RuntimeEnv::default()); let session_config = iox_session_config(); - let session_state = SessionState::with_config_rt(session_config, runtime); + let session_state = SessionState::new_with_config_rt(session_config, runtime); // Keep metadata so we can find the measurement name let format = ParquetFormat::new().with_skip_metadata(Some(false)); @@ -219,7 +219,7 @@ impl ParquetFileReader { .await .context(InferringSchemaSnafu)?; - let session_ctx = SessionContext::with_state(session_state); + let session_ctx = SessionContext::new_with_state(session_state); Ok(Self { object_store, @@ -237,21 +237,22 @@ impl ParquetFileReader { /// read the parquet file as a stream pub async fn read(&self) -> Result { + let file_schema = self.schema(); + let statistics = Statistics::new_unknown(&file_schema); let base_config = FileScanConfig { object_store_url: self.object_store_url.clone(), - file_schema: self.schema(), + file_schema, file_groups: vec![vec![PartitionedFile { object_meta: self.object_meta.clone(), partition_values: vec![], range: None, extensions: None, }]], - statistics: Statistics::default(), + statistics, projection: None, limit: None, table_partition_cols: vec![], output_ordering: vec![], - infinite_source: false, }; // set up enough datafusion context to do the real read session diff --git a/partition/Cargo.toml b/partition/Cargo.toml new file mode 100644 index 00000000000..4ec967cf9a4 --- /dev/null +++ b/partition/Cargo.toml @@ -0,0 +1,37 @@ +[package] +name = "partition" +version.workspace = true +authors.workspace = true +edition.workspace = true 
+license.workspace = true + +[lints] +workspace = true + +[dependencies] +arrow = { workspace = true } +chrono = { version = "0.4", default-features = false } +data_types = { path = "../data_types" } +hashbrown = { workspace = true } +mutable_batch = { path = "../mutable_batch" } +percent-encoding = "2.3.1" +schema = { path = "../schema" } +thiserror = "1.0.56" +unicode-segmentation = "1.10.1" +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] +assert_matches = "1.5.0" +criterion = { version = "0.5", default-features = false, features = [ + "rayon", +] } +generated_types = { path = "../generated_types" } +mutable_batch_lp = { path = "../mutable_batch_lp" } +paste = "1.0.14" +proptest = { version = "1.4.0", default-features = false } +rand = "0.8" +test_helpers = { path = "../test_helpers" } + +[[bench]] +name = "partitioner" +harness = false diff --git a/partition/benches/partitioner.rs b/partition/benches/partitioner.rs new file mode 100644 index 00000000000..21a2f9f7865 --- /dev/null +++ b/partition/benches/partitioner.rs @@ -0,0 +1,246 @@ +use std::path::Path; + +use criterion::{ + criterion_group, criterion_main, measurement::WallTime, BatchSize, BenchmarkGroup, BenchmarkId, + Criterion, Throughput, +}; +use data_types::partition_template::TablePartitionTemplateOverride; +use generated_types::influxdata::iox::partition_template::v1::{self as proto, Bucket}; +use partition::partition_batch; +use schema::Projection; + +fn partitioner_benchmarks(c: &mut Criterion) { + let mut group = c.benchmark_group("partitioner"); + + //////////////////////////////////////////////////////////////////////////// + // A medium batch. + bench( + &mut group, + "tag_hit", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("env".to_string())), + }], + "test_fixtures/lineproto/prometheus.lp", + ); + + bench( + &mut group, + "tag_miss", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("bananas".to_string())), + }], + "test_fixtures/lineproto/prometheus.lp", + ); + + bench( + &mut group, + "YYYY-MM-DD strftime", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat( + "%Y-%m-%d".to_string(), + )), + }], + "test_fixtures/lineproto/prometheus.lp", + ); + + bench( + &mut group, + "long strftime", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("%Y-%C-%y-%m-%b-%B-%h-%d-%e-%a-%A-%w-%u-%U-%W-%G-%g-%V-%j-%D-%x-%F-%v-%H-%k-%I-%l-%P-%p-%M-%S-%f-%.f-%.3f-%.6f-%.9f-%3f-%6f-%9f-%R-%T-%X-%r-%Z-%z-%:z-%::z-%:::z-%c-%+-%s-%t-%n-%%".to_string())), + }], + "test_fixtures/lineproto/prometheus.lp", + ); + + bench( + &mut group, + "hash bucket on tag", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(Bucket { + tag_name: "env".to_string(), + num_buckets: 100, + })), + }], + "test_fixtures/lineproto/prometheus.lp", + ); + + //////////////////////////////////////////////////////////////////////////// + // A large batch. 
+ bench( + &mut group, + "tag_hit", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("host".to_string())), + }], + "test_fixtures/lineproto/metrics.lp", + ); + + bench( + &mut group, + "tag_miss", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("bananas".to_string())), + }], + "test_fixtures/lineproto/metrics.lp", + ); + + bench( + &mut group, + "YYYY-MM-DD strftime", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat( + "%Y-%m-%d".to_string(), + )), + }], + "test_fixtures/lineproto/metrics.lp", + ); + + bench( + &mut group, + "long strftime", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("%Y-%C-%y-%m-%b-%B-%h-%d-%e-%a-%A-%w-%u-%U-%W-%G-%g-%V-%j-%D-%x-%F-%v-%H-%k-%I-%l-%P-%p-%M-%S-%f-%.f-%.3f-%.6f-%.9f-%3f-%6f-%9f-%R-%T-%X-%r-%Z-%z-%:z-%::z-%:::z-%c-%+-%s-%t-%n-%%".to_string())), + }], + "test_fixtures/lineproto/metrics.lp", + ); + + bench( + &mut group, + "hash bucket on tag", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(Bucket { + tag_name: "host".to_string(), + num_buckets: 100, + })), + }], + "test_fixtures/lineproto/metrics.lp", + ); + + //////////////////////////////////////////////////////////////////////////// + // A small batch. + bench( + &mut group, + "tag_hit", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("location".to_string())), + }], + "test_fixtures/lineproto/temperature.lp", + ); + + bench( + &mut group, + "tag_miss", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TagValue("bananas".to_string())), + }], + "test_fixtures/lineproto/temperature.lp", + ); + + bench( + &mut group, + "YYYY-MM-DD strftime", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat( + "%Y-%m-%d".to_string(), + )), + }], + "test_fixtures/lineproto/temperature.lp", + ); + + bench( + &mut group, + "long strftime", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::TimeFormat("%Y-%C-%y-%m-%b-%B-%h-%d-%e-%a-%A-%w-%u-%U-%W-%G-%g-%V-%j-%D-%x-%F-%v-%H-%k-%I-%l-%P-%p-%M-%S-%f-%.f-%.3f-%.6f-%.9f-%3f-%6f-%9f-%R-%T-%X-%r-%Z-%z-%:z-%::z-%:::z-%c-%+-%s-%t-%n-%%".to_string())), + }], + "test_fixtures/lineproto/temperature.lp", + ); + + bench( + &mut group, + "hash bucket on tag", + vec![proto::TemplatePart { + part: Some(proto::template_part::Part::Bucket(Bucket { + tag_name: "location".to_string(), + num_buckets: 100, + })), + }], + "test_fixtures/lineproto/temperature.lp", + ); + + group.finish(); +} + +fn bench( + group: &mut BenchmarkGroup<'_, WallTime>, + template_name: &str, + partition_template: Vec, + file_path: &str, // Relative to the crate root +) { + // Un-normalise the path, adjusting back to the crate root. 
+ let file_path = format!("{}/../{}", env!("CARGO_MANIFEST_DIR"), file_path); + let path = Path::new(&file_path); + let partition_template = TablePartitionTemplateOverride::try_new( + Some(proto::PartitionTemplate { + parts: partition_template, + }), + &Default::default(), + ) + .unwrap(); + + // Read the benchmark data + let data = std::fs::read_to_string(path).unwrap(); + let row_count = data.chars().filter(|&v| v == '\n').count(); + + // Generate the mutable batch partitioner input + let mutable_batch_input: Vec<_> = mutable_batch_lp::lines_to_batches(&data, 42) + .unwrap() + .into_iter() + .map(|(_table_name, batch)| batch) + .collect(); + + // Generate the record batch partitioner input + let record_batch_input: Vec<_> = mutable_batch_input + .iter() + .map(|batch| batch.to_arrow(Projection::All).unwrap()) + .collect(); + + group.throughput(Throughput::Elements(row_count as _)); + group.bench_function( + BenchmarkId::new( + format!("{template_name} (mutable batch)"), + path.file_name().unwrap().to_str().unwrap(), + ), + |b| { + b.iter_batched( + || mutable_batch_input.clone(), + |input| { + for batch in input { + partition_batch(&batch, &partition_template).for_each(drop); + } + }, + BatchSize::NumIterations(1), + ) + }, + ); + group.bench_function( + BenchmarkId::new( + format!("{template_name} (record batch)"), + path.file_name().unwrap().to_str().unwrap(), + ), + |b| { + b.iter_batched( + || record_batch_input.clone(), + |input| { + for batch in input { + partition_batch(&batch, &partition_template).for_each(drop); + } + }, + BatchSize::NumIterations(1), + ) + }, + ); +} + +criterion_group!(benches, partitioner_benchmarks); +criterion_main!(benches); diff --git a/partition/src/bucket.rs b/partition/src/bucket.rs new file mode 100644 index 00000000000..6e7df804119 --- /dev/null +++ b/partition/src/bucket.rs @@ -0,0 +1,49 @@ +use data_types::partition_template; + +#[derive(Debug)] +pub(super) struct BucketHasher { + num_buckets: u32, + last_assigned_bucket: Option, +} + +impl BucketHasher { + pub(super) fn new(num_buckets: u32) -> Self { + Self { + num_buckets, + last_assigned_bucket: None, + } + } + + /// Assign a bucket for the provided `tag_value` using the [`BucketHasher`]s + /// configuration. + pub(super) fn assign_bucket(&mut self, tag_value: &str) -> u32 { + let bucket = partition_template::bucket_for_tag_value(tag_value, self.num_buckets); + self.last_assigned_bucket = Some(bucket); + bucket + } + + /// The last bucket assigned by the [`BucketHasher`]. + pub(super) fn last_assigned_bucket(&self) -> Option { + self.last_assigned_bucket + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_last_assigned_bucket() { + let mut bucketer = BucketHasher::new(10); + assert_eq!(bucketer.last_assigned_bucket, None); + + assert_eq!(bucketer.assign_bucket("foo"), 6); + assert_eq!(bucketer.last_assigned_bucket, Some(6)); + + assert_eq!(bucketer.assign_bucket("bat"), 5); + assert_eq!(bucketer.last_assigned_bucket, Some(5)); + + assert_eq!(bucketer.assign_bucket("qux"), 5); + assert_eq!(bucketer.last_assigned_bucket, Some(5)); + } +} diff --git a/partition/src/filter.rs b/partition/src/filter.rs new file mode 100644 index 00000000000..099c900b771 --- /dev/null +++ b/partition/src/filter.rs @@ -0,0 +1,145 @@ +//! Functions for filtering rows from a [`MutableBatch`] +//! +//! 
The returned ranges can then be used with `MutableBatch::extend_from_range` + +use crate::Batch; +use mutable_batch::MutableBatch; +use std::ops::Range; + +/// Given a [`MutableBatch`] a time predicate and a set of row ranges, returns the row +/// indexes that pass the predicate +/// +/// # Panic +/// +/// Panics if `batch` does not contain a time column of the correct type +pub(crate) fn filter_time<'a, F>( + batch: &'a MutableBatch, + ranges: &'a [Range], + mut predicate: F, +) -> Vec> +where + F: FnMut(i64) -> bool, +{ + let col_data = batch.time_column().expect("time column"); + + // Time column is not nullable so can skip checking mask + let mut ret = vec![]; + for range in ranges { + let offset = range.start; + ret.extend( + filter_slice(&col_data[range.clone()], &mut predicate) + .map(|r| (r.start + offset)..(r.end + offset)), + ) + } + ret +} + +fn filter_slice<'a, T, F>( + col_data: &'a [T], + predicate: &'a mut F, +) -> impl Iterator> + 'a +where + T: Copy, + F: 'a + FnMut(T) -> bool, +{ + let mut range: Range = 0..0; + let mut values = col_data.iter(); + + std::iter::from_fn(move || loop { + match values.next() { + Some(value) if predicate(*value) => { + range.end += 1; + continue; + } + // Either finished or predicate failed + _ if range.start != range.end => { + let t = range.clone(); + range.end += 1; + range.start = range.end; + return Some(t); + } + // Predicate failed and start == end + Some(_) => { + range.start += 1; + range.end += 1; + } + None => return None, + } + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use mutable_batch::writer::Writer; + use rand::prelude::*; + + fn make_rng() -> StdRng { + let seed = rand::rngs::OsRng.next_u64(); + println!("Seed: {seed}"); + StdRng::seed_from_u64(seed) + } + + #[test] + fn test_filter_slice() { + let collected: Vec<_> = + filter_slice(&[0, 1, 2, 3, 4, 5, 6], &mut |x| x != 1 && x != 4).collect(); + assert_eq!(collected, vec![0..1, 2..4, 5..7]); + + let collected: Vec<_> = + filter_slice(&[0, 1, 2, 3, 4, 5, 6], &mut |x| x == 1 || x == 2 || x == 6).collect(); + assert_eq!(collected, vec![1..3, 6..7]) + } + + #[test] + fn test_filter_fuzz() { + let mut rng = make_rng(); + let data: Vec<_> = std::iter::from_fn(|| Some(rng.next_u32())) + .take(1000) + .collect(); + + let mut predicate = |x: u32| x & 1 == 0; + + let indexes: Vec<_> = filter_slice(&data, &mut predicate).flatten().collect(); + + let expected: Vec<_> = data + .iter() + .enumerate() + .filter_map(|(idx, x)| match predicate(*x) { + true => Some(idx), + false => None, + }) + .collect(); + + assert_eq!(indexes, expected); + } + + #[test] + fn test_filter_batch() { + let mut batch = MutableBatch::new(); + let mut rng = make_rng(); + let data: Vec<_> = std::iter::from_fn(|| Some(rng.next_u32() as i64)) + .take(1000) + .collect(); + + let ranges = &[0..87, 90..442, 634..800]; + let mut predicate = |x: i64| x & 1 == 0; + + let mut writer = Writer::new(&mut batch, 1000); + writer.write_time("time", data.iter().cloned()).unwrap(); + writer.commit(); + + let actual: Vec<_> = filter_time(&batch, ranges, &mut predicate) + .into_iter() + .flatten() + .collect(); + + let expected: Vec<_> = ranges + .iter() + .flat_map(|r| r.clone()) + .filter(|idx| predicate(data[*idx])) + .collect(); + + assert_eq!(actual, expected); + } +} diff --git a/partition/src/lib.rs b/partition/src/lib.rs new file mode 100644 index 00000000000..d542bba454e --- /dev/null +++ b/partition/src/lib.rs @@ -0,0 +1,1704 @@ +//! Functionality for partitioning data based on a partition template. +//! +//! 
The partitioning template, derived partition key format, and encodings are
+//! described in detail in the [`data_types::partition_template`] module.
+
+mod bucket;
+mod filter;
+mod strftime;
+mod traits;
+
+use std::{borrow::Cow, num::NonZeroUsize, ops::Range};
+
+use data_types::{
+    partition_template::{
+        TablePartitionTemplateOverride, TemplatePart, ENCODED_PARTITION_KEY_CHARS,
+        MAXIMUM_NUMBER_OF_TEMPLATE_PARTS, PARTITION_KEY_DELIMITER, PARTITION_KEY_MAX_PART_LEN,
+        PARTITION_KEY_PART_TRUNCATED, PARTITION_KEY_VALUE_EMPTY_STR, PARTITION_KEY_VALUE_NULL_STR,
+    },
+    PartitionKey,
+};
+use hashbrown::HashMap;
+use mutable_batch::{MutableBatch, WritePayload};
+use percent_encoding::utf8_percent_encode;
+use thiserror::Error;
+use unicode_segmentation::UnicodeSegmentation;
+
+pub use self::traits::{Batch, PartitioningColumn, TimeColumnError};
+use self::{bucket::BucketHasher, strftime::StrftimeFormatter};
+
+/// An error generating a partition key for a row.
+#[allow(missing_copy_implementations)]
+#[derive(Debug, Error, PartialEq, Eq, Clone)]
+pub enum PartitionKeyError {
+    /// The partition template defines a [`Template::TimeFormat`] part, but the
+    /// provided strftime formatter is invalid.
+    #[error("invalid strftime format in partition template")]
+    InvalidStrftime,
+
+    /// The partition template defines a [`Template::TagValue`] part, but the
+    /// column type is not "tag".
+    #[error("tag value partitioner does not accept input columns of type {0}")]
+    TagValueNotTag(String),
+
+    /// A "catch all" error for when a formatter returns [`std::fmt::Error`],
+    /// which contains no context.
+    #[error("partition key generation error")]
+    FmtError(#[from] std::fmt::Error),
+}
+
+/// Returns an iterator identifying consecutive ranges for a given partition key
+pub fn partition_batch<'a, T>(
+    batch: &'a T,
+    template: &'a TablePartitionTemplateOverride,
+) -> impl Iterator<Item = (Result<String, PartitionKeyError>, Range<usize>)> + 'a
+where
+    T: Batch,
+{
+    let parts = template.len();
+    if parts > MAXIMUM_NUMBER_OF_TEMPLATE_PARTS {
+        panic!(
+            "partition template contains {} parts, which exceeds the maximum of {} parts",
+            parts, MAXIMUM_NUMBER_OF_TEMPLATE_PARTS
+        );
+    }
+
+    range_encode(partition_keys(batch, template.parts()))
+}
+
+/// A [`TablePartitionTemplateOverride`] is made up of one or more
+/// [`TemplatePart`]s that are rendered and joined together by
+/// [`PARTITION_KEY_DELIMITER`] to form a single partition key.
+///
+/// To avoid allocating intermediate strings, and performing column lookups for
+/// every row, each [`TemplatePart`] is converted to a [`Template`].
+///
+/// [`Template::fmt_row`] can then be used to render the template for that
+/// particular row to the provided string, without performing any additional
+/// column lookups.
+#[derive(Debug)]
+#[allow(clippy::large_enum_variant)]
+enum Template<'a, T: PartitioningColumn> {
+    TagValue(&'a T, Option<&'a T::TagIdentityKey>),
+    TimeFormat(&'a [i64], StrftimeFormatter<'a>),
+    Bucket(&'a T, BucketHasher, Option<&'a T::TagIdentityKey>),
+
+    /// This batch is missing a partitioning tag column.
+    MissingTag,
+}
+
+impl<'a, T> Template<'a, T>
+where
+    T: PartitioningColumn,
+{
+    /// Renders this template to `out` for the row `idx`.
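+    ///
+    /// As a side effect this updates the per-part "last seen" state (the tag
+    /// identity key, the last rendered timestamp, or the last assigned hash
+    /// bucket) that [`Template::is_identical`] consults to skip re-rendering
+    /// identical consecutive keys.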
+ fn fmt_row( + &mut self, + out: &mut W, + idx: usize, + ) -> Result<(), PartitionKeyError> { + match self { + Template::TagValue(col, last_key) if col.is_valid(idx) => { + let this_key = col + .get_tag_identity_key(idx) + .ok_or_else(|| PartitionKeyError::TagValueNotTag(col.type_description()))?; + + // Update the "is identical" tracking key for this new, + // potentially different key. + *last_key = Some(this_key); + + out.write_str(encode_key_part(col.get_tag_value(this_key).unwrap()).as_ref())? + } + Template::TimeFormat(t, fmt) => fmt.render(t[idx], out)?, + Template::Bucket(col, bucketer, last_key) if col.is_valid(idx) => { + let this_key = col + .get_tag_identity_key(idx) + .ok_or_else(|| PartitionKeyError::TagValueNotTag(col.type_description()))?; + let this_value = col.get_tag_value(this_key).unwrap(); + let bucket = bucketer.assign_bucket(this_value); + + // Update the "is identical" tracking key for this new, + // potentially different key. + *last_key = Some(this_key); + + write!(out, "{bucket}")? + } + // Either a tag that has no value for this given row index, or the + // batch does not contain this tag at all. + Template::TagValue(_, last_key) => { + // This row doesn't have a tag value, which should be carried + // forwards to be checked against the next row. + *last_key = None; + out.write_str(PARTITION_KEY_VALUE_NULL_STR)? + } + // Either a tag that has no value for this given row index, or the + // batch does not contain this tag at all. + Template::Bucket(_, _, last_key) => { + // This row doesn't have a tag value, which should be carried + // forwards to be checked against the next row. + *last_key = None; + out.write_str(PARTITION_KEY_VALUE_NULL_STR)? + } + Template::MissingTag => out.write_str(PARTITION_KEY_VALUE_NULL_STR)?, + } + + Ok(()) + } + + /// Returns true if the partition key generated by `self` for `idx` will be + /// identical to the last generated key. + fn is_identical(&mut self, idx: usize) -> bool { + match self { + Template::TagValue(col, last_key) if col.is_valid(idx) => { + let this_key = match col.get_tag_identity_key(idx) { + Some(key) => key, + // This is an error, but for the purposes of identical checks, + // it is treated as not identical, causing the error to be + // raised when formatting is attempted. + None => return false, + }; + + // Check if the key matches the last key, indicating the same value is going to + // be rendered. + last_key.map(|v| v == this_key).unwrap_or_default() + } + Template::TimeFormat(t, fmt) => { + // Check if the last value matches the current value, after + // optionally applying the precision reduction optimisation. + fmt.equals_last(t[idx]) + } + Template::Bucket(col, fmt, last_key) if col.is_valid(idx) => { + // To perform an equality check for `idx` when it is a + // `Bucket` template part we must check in order: + // + // 1. If this dictionary key is the same as the + // previous + // 2. If the assigned bucket is the same as the + // previous + // + // While just checking the bucket is correct, checking + // the dictionary key first avoids unnecessary throwaway + // hashing work. + let this_key = match col.get_tag_identity_key(idx) { + Some(key) => key, + // This is an error, but for the purposes of identical checks, + // it is treated as not identical, causing the error to be + // raised when formatting is attempted. 
+ None => return false, + }; + + match last_key { + Some(v) if this_key == *v => true, + Some(_) => { + col.get_tag_value(this_key) + .map(|this_value| { + // Grab the last assigned bucket, assign + // a bucket for the current value and + // check for equality. + fmt.last_assigned_bucket() + .map(|last_bucket| last_bucket == fmt.assign_bucket(this_value)) + .unwrap_or_default() + }) + .unwrap_or_default() + } + None => false, + } + } + // The last row did not contain this key, and neither does this. + Template::TagValue(_, None) | Template::Bucket(_, _, None) => true, + // The last row did contain a key, but this one does not (therefore + // it differs). + Template::TagValue(_, Some(_)) | Template::Bucket(_, _, Some(_)) => false, + // The batch does not contain this tag at all - it always matches + // with the previous row. + Template::MissingTag => true, + } + } +} + +fn encode_key_part(s: &str) -> Cow<'_, str> { + // Encode reserved characters and non-ascii characters. + let as_str: Cow<'_, str> = utf8_percent_encode(s, &ENCODED_PARTITION_KEY_CHARS).into(); + + match as_str.len() { + 0 => Cow::Borrowed(PARTITION_KEY_VALUE_EMPTY_STR), + 1..=PARTITION_KEY_MAX_PART_LEN => as_str, + _ => { + // This string exceeds the maximum byte length limit and must be + // truncated. + // + // Truncation of unicode strings can be tricky - this implementation + // avoids splitting unicode code-points nor graphemes. See the + // partition_template module docs in data_types before altering + // this. + + // Preallocate the string to hold the long partition key part. + let mut buf = String::with_capacity(PARTITION_KEY_MAX_PART_LEN); + + // This is a slow path, re-encoding the original input string - + // fortunately this is an uncommon path. + // + // Walk the string, encoding each grapheme (which includes spaces) + // individually, tracking the total length of the encoded string. + // Once it hits 199 bytes, stop and append a #. + + let mut bytes = 0; + s.graphemes(true) + .map(|v| Cow::from(utf8_percent_encode(v, &ENCODED_PARTITION_KEY_CHARS))) + .take_while(|v| { + bytes += v.len(); // Byte length of encoded grapheme + bytes < PARTITION_KEY_MAX_PART_LEN + }) + .for_each(|v| buf.push_str(v.as_ref())); + + // Append the truncation marker. + buf.push(PARTITION_KEY_PART_TRUNCATED); + + assert!(buf.len() <= PARTITION_KEY_MAX_PART_LEN); + + Cow::Owned(buf) + } + } +} + +/// Returns an iterator of partition keys for the given table batch. +/// +/// This function performs deduplication on returned keys; the returned iterator +/// yields [`Some`] containing the partition key string when a new key is +/// generated, and [`None`] when the generated key would equal the last key. +fn partition_keys<'a, T>( + batch: &'a T, + template_parts: impl Iterator>, +) -> impl Iterator>> + 'a +where + T: Batch, +{ + // Extract the timestamp data. 
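+    // Note that `time_column()` fails for an empty batch, which is what makes
+    // the "row 0 always exists" reasoning further below hold.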
+ let time = batch.time_column().expect("error reading time column"); + + // Convert TemplatePart into an ordered array of Template + let mut template = template_parts + .map(|v| match v { + TemplatePart::TagValue(col_name) => batch + .column(col_name) + .map_or_else(|| Template::MissingTag, |v| Template::TagValue(v, None)), + TemplatePart::TimeFormat(fmt) => { + Template::TimeFormat(time, StrftimeFormatter::new(fmt)) + } + TemplatePart::Bucket(col_name, num_buckets) => batch.column(col_name).map_or_else( + || Template::MissingTag, + |v| Template::Bucket(v, BucketHasher::new(num_buckets), None), + ), + }) + .collect::>(); + + // Track the length of the last yielded partition key, and pre-allocate the + // next partition key string to match it. + // + // In the happy path, keys of consistent sizes are generated and the + // allocations reach a minimum. If the keys are inconsistent, at best a + // subset of allocations are eliminated, and at worst, a few bytes of memory + // is temporarily allocated until the resulting string is shrunk down. + let mut last_len = 5; + + // The first row in a batch must always be evaluated to produce a key. + // + // Row 0 is guaranteed to exist, otherwise attempting to read the time + // column above would have caused a panic (no rows -> no time column). + let first = std::iter::once(Some(evaluate_template(&mut template, &mut last_len, 0))); + + // The subsequent rows in a batch may generate the same key, and therefore a + // dedupe check is used before allocating & populating the partition key. + let rest = (1..batch.num_rows()).map(move |idx| { + // Check if this partition key is going to be different from the + // last, short-circuiting the check if it is. + if template.iter_mut().all(|t| t.is_identical(idx)) { + return None; + } + + Some(evaluate_template(&mut template, &mut last_len, idx)) + }); + + first.chain(rest) +} + +/// Evaluate the partition template against the row indexed by `idx`. +/// +/// # Panics +/// +/// This method panics if `idx` exceeds the number of rows in the batch. +fn evaluate_template( + template: &mut [Template<'_, T>], + last_len: &mut usize, + idx: usize, +) -> Result { + let mut buf = String::with_capacity(*last_len); + let template_len = template.len(); + + // Evaluate each template part for this row + for (col_idx, col) in template.iter_mut().enumerate() { + // Evaluate the formatter for this template part against the row. + col.fmt_row(&mut buf, idx)?; + + // If this isn't the last element in the template, insert a field + // delimiter. + if col_idx + 1 != template_len { + buf.push(PARTITION_KEY_DELIMITER); + } + } + + *last_len = buf.len(); + Ok(buf) +} + +/// Takes an iterator of [`Option`] and merges identical consecutive elements +/// together. +/// +/// Any [`None`] yielded by `iterator` is added to the range for the previous +/// [`Some`]. 
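+/// For example (illustrative values only, mirroring the unit tests below):
+///
+/// ```text
+/// input:  Some(5), None, Some(5), Some(7)
+/// output: (5, 0..3), (7, 3..4)
+/// ```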
+fn range_encode<T, I>(mut iterator: I) -> impl Iterator<Item = (T, Range<usize>)>
+where
+    I: Iterator<Item = Option<T>>,
+    T: Eq,
+{
+    let mut last: Option<I::Item> = None;
+    let mut range: Range<usize> = 0..0;
+    std::iter::from_fn(move || loop {
+        match (iterator.next(), last.take()) {
+            // The iterator yields a NULL/identical value and there is a prior value
+            (Some(None), Some(v)) => {
+                range.end += 1;
+                last = Some(v);
+            }
+            // The iterator yields a value, and the last value matches
+            (Some(cur), Some(next)) => match cur == next {
+                true => {
+                    range.end += 1;
+                    last = Some(next);
+                }
+                false => {
+                    let t = range.clone();
+                    range.start = range.end;
+                    range.end += 1;
+                    last = Some(cur);
+                    return Some((next.unwrap(), t));
+                }
+            },
+            // There is no last value
+            (Some(cur), None) => {
+                range.end += 1;
+                last = Some(cur);
+            }
+            (None, Some(next)) => return Some((next.unwrap(), range.clone())),
+            (None, None) => return None,
+        }
+    })
+}
+
+/// An error partitioning a batch.
+#[allow(missing_copy_implementations)]
+#[derive(Debug, Error, PartialEq, Eq, Clone)]
+pub enum PartitionWriteError {
+    /// An error deriving the partition key from the partition key template.
+    #[error("{0}")]
+    PartitionKey(#[from] PartitionKeyError),
+
+    /// An error accessing the time column.
+    #[error("{0}")]
+    TimeColumn(#[from] TimeColumnError),
+}
+
+/// A [`MutableBatch`] with a non-zero set of row ranges to write
+#[derive(Debug)]
+pub struct PartitionWrite<'a> {
+    batch: &'a MutableBatch,
+    ranges: Vec<Range<usize>>,
+    min_timestamp: i64,
+    max_timestamp: i64,
+    row_count: NonZeroUsize,
+}
+
+impl<'a> PartitionWrite<'a> {
+    /// Create a new [`PartitionWrite`] with the entire range of the provided batch
+    ///
+    /// # Panic
+    ///
+    /// Panics if the batch has no rows
+    pub fn new(batch: &'a MutableBatch) -> Result<Self, PartitionWriteError> {
+        let row_count = NonZeroUsize::new(batch.rows()).unwrap();
+        let time = batch.time_column()?;
+        let (min_timestamp, max_timestamp) = min_max_time(time);
+
+        // This `allow` can be removed when this issue is fixed and released:
+        //
+        #[allow(clippy::single_range_in_vec_init)]
+        Ok(Self {
+            batch,
+            ranges: vec![0..batch.rows()],
+            min_timestamp,
+            max_timestamp,
+            row_count,
+        })
+    }
+
+    /// Returns the minimum timestamp in the write
+    pub fn min_timestamp(&self) -> i64 {
+        self.min_timestamp
+    }
+
+    /// Returns the maximum timestamp in the write
+    pub fn max_timestamp(&self) -> i64 {
+        self.max_timestamp
+    }
+
+    /// Returns the number of rows in the write
+    pub fn rows(&self) -> NonZeroUsize {
+        self.row_count
+    }
+
+    /// Returns a [`PartitionWrite`] containing just the rows of `Self` that pass
+    /// the provided time predicate, or None if no rows pass
+    pub fn filter(&self, predicate: impl Fn(i64) -> bool) -> Option<PartitionWrite<'a>> {
+        let mut min_timestamp = i64::MAX;
+        let mut max_timestamp = i64::MIN;
+        let mut row_count = 0_usize;
+
+        // Construct a predicate that lets us inspect the timestamps as they are filtered
+        let inspect = |t| match predicate(t) {
+            true => {
+                min_timestamp = min_timestamp.min(t);
+                max_timestamp = max_timestamp.max(t);
+                row_count += 1;
+                true
+            }
+            false => false,
+        };
+
+        let ranges: Vec<_> = filter::filter_time(self.batch, &self.ranges, inspect);
+        let row_count = NonZeroUsize::new(row_count)?;
+
+        Some(PartitionWrite {
+            batch: self.batch,
+            ranges,
+            min_timestamp,
+            max_timestamp,
+            row_count,
+        })
+    }
+
+    /// Create a collection of [`PartitionWrite`] indexed by partition key
+    /// from a [`MutableBatch`] and [`TablePartitionTemplateOverride`]
+    pub fn partition(
+        batch: &'a MutableBatch,
+        partition_template: &TablePartitionTemplateOverride,
+    ) ->
Result, PartitionWriteError> { + use hashbrown::hash_map::Entry; + let time = batch.time_column()?; + + let mut partition_ranges = HashMap::new(); + for (partition, range) in partition_batch(batch, partition_template) { + let row_count = NonZeroUsize::new(range.end - range.start).unwrap(); + let (min_timestamp, max_timestamp) = min_max_time(&time[range.clone()]); + + match partition_ranges.entry(PartitionKey::from(partition?)) { + Entry::Vacant(v) => { + v.insert(PartitionWrite { + batch, + ranges: vec![range], + min_timestamp, + max_timestamp, + row_count, + }); + } + Entry::Occupied(mut o) => { + let pw = o.get_mut(); + pw.min_timestamp = pw.min_timestamp.min(min_timestamp); + pw.max_timestamp = pw.max_timestamp.max(max_timestamp); + pw.row_count = NonZeroUsize::new(pw.row_count.get() + row_count.get()).unwrap(); + pw.ranges.push(range); + } + } + } + Ok(partition_ranges) + } +} + +impl<'a> WritePayload for PartitionWrite<'a> { + fn write_to_batch(&self, batch: &mut MutableBatch) -> mutable_batch::Result<()> { + batch.extend_from_ranges(self.batch, &self.ranges) + } +} + +fn min_max_time(col: &[i64]) -> (i64, i64) { + let mut min_timestamp = i64::MAX; + let mut max_timestamp = i64::MIN; + for t in col { + min_timestamp = min_timestamp.min(*t); + max_timestamp = max_timestamp.max(*t); + } + (min_timestamp, max_timestamp) +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use super::*; + + use assert_matches::assert_matches; + use chrono::{format::StrftimeItems, DateTime, Datelike, Days, TimeZone, Utc}; + use data_types::partition_template::{ + build_column_values, test_table_partition_override, ColumnValue, + }; + use mutable_batch::{writer::Writer, MutableBatch}; + use proptest::{prelude::*, prop_compose, proptest, strategy::Strategy}; + use rand::prelude::*; + use schema::{Projection, TIME_COLUMN_NAME}; + use test_helpers::assert_error; + + #[test] + fn return_err_if_no_time_column() { + let batch = MutableBatch::new(); + let table_partition_template = Default::default(); + assert_error!( + PartitionWrite::partition(&batch, &table_partition_template), + PartitionWriteError::TimeColumn(TimeColumnError::NotFound), + ); + } + + fn make_rng() -> StdRng { + let seed = rand::rngs::OsRng.next_u64(); + println!("Seed: {seed}"); + StdRng::seed_from_u64(seed) + } + + /// Reproducer for https://github.com/influxdata/idpe/issues/17765 + #[test] + fn test_equals_last() { + let ts = [ + 1686756903736785920, // last_eq=false, render, set last_ptr + 42, // last_eq=false, render, set last_ptr + 1686756903736785920, // last_eq=false, re-use, don't change last_ptr + 1686756903736785920, // last_eq=false, re-use, don't change last_ptr + 42, // last_eq=true (wrong), re-use + ]; + + let mut batch = MutableBatch::new(); + let mut writer = Writer::new(&mut batch, ts.len()); + + writer.write_time("time", ts.into_iter()).unwrap(); + writer.commit(); + + let keys = + generate_denormalised_keys(&batch, TablePartitionTemplateOverride::default().parts()) + .unwrap(); + + assert_eq!( + keys, + &[ + "2023-06-14", + "1970-01-01", + "2023-06-14", + "2023-06-14", + "1970-01-01", + ] + ); + } + + /// Generates a vector of partition key strings, or an error. + /// + /// This function normalises the de-duplicated output of + /// [`partition_keys()`], returning the last observed key when the dedupe + /// [`partition_keys()`] process returns [`None`]. 
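+    ///
+    /// This keeps the fixture assertions readable: every row maps to an
+    /// explicit key string rather than a `None` "same as previous" marker.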
+ fn generate_denormalised_keys<'a, 'b: 'a, T: Batch>( + batch: &'b T, + template_parts: impl Iterator>, + ) -> Result, PartitionKeyError> { + let mut last_ret = None; + partition_keys(batch, template_parts) + .map(|v| match v { + Some(this) => { + last_ret = Some(this.clone()); + this + } + None => last_ret + .as_ref() + .expect("must have observed prior key") + .clone(), + }) + .collect::, _>>() + } + + /// A fixture test asserting the default partition key format, derived from + /// the default partition key template. + #[test] + fn test_default_fixture() { + let mut batch = MutableBatch::new(); + let mut writer = Writer::new(&mut batch, 1); + + writer.write_time("time", vec![1].into_iter()).unwrap(); + writer + .write_tag("region", Some(&[0b00000001]), vec!["bananas"].into_iter()) + .unwrap(); + writer.commit(); + + let template_parts = + TablePartitionTemplateOverride::try_new(None, &Default::default()).unwrap(); + let keys: Vec<_> = partition_keys(&batch, template_parts.parts()) + .map(|v| v.expect("non-identical consecutive keys")) + .collect::, _>>() + .unwrap(); + + assert_eq!(keys, vec!["1970-01-01".to_string()]) + } + + #[test] + #[should_panic(expected = r#"error reading time column: NotFound"#)] + fn test_zero_sized_batch() { + let batch = MutableBatch::new(); + + let template_parts = test_table_partition_override(vec![ + TemplatePart::TimeFormat("%Y-%m-%d %H:%M:%S"), + TemplatePart::TagValue("region"), + TemplatePart::TagValue("bananas"), + ]); + + let keys: Vec<_> = partition_batch(&batch, &template_parts).collect::>(); + assert_eq!(keys, vec![]) + } + + #[test] + fn test_range_encode() { + let collected: Vec<_> = + range_encode(vec![5, 5, 5, 7, 2, 2, 3].into_iter().map(Some)).collect(); + assert_eq!(collected, vec![(5, 0..3), (7, 3..4), (2, 4..6), (3, 6..7)]) + } + + #[test] + fn test_range_encode_sparse() { + let collected: Vec<_> = + range_encode(vec![Some(5), None, None, Some(7), Some(2), None, Some(3)].into_iter()) + .collect(); + assert_eq!(collected, vec![(5, 0..3), (7, 3..4), (2, 4..6), (3, 6..7)]) + } + + #[test] + fn test_range_encode_fuzz() { + let mut rng = make_rng(); + let original: Vec<_> = std::iter::from_fn(|| Some(rng.next_u32() % 20)) + .take(1000) + .collect(); + + let rle: Vec<_> = range_encode(original.iter().cloned().map(Some)).collect(); + + let mut last_range = rle[0].1.clone(); + for (_, range) in &rle[1..] 
{ + assert_eq!(range.start, last_range.end); + assert_ne!(range.start, range.end); + last_range = range.clone(); + } + + let hydrated: Vec<_> = rle + .iter() + .flat_map(|(v, r)| std::iter::repeat(*v).take(r.end - r.start)) + .collect(); + + assert_eq!(original, hydrated) + } + + #[test] + fn test_partition() { + let mut batch = MutableBatch::new(); + let mut writer = Writer::new(&mut batch, 5); + + writer + .write_time("time", vec![1, 2, 3, 4, 5].into_iter()) + .unwrap(); + + writer + .write_tag( + "region", + Some(&[0b00001010]), + vec!["west", "east"].into_iter(), + ) + .unwrap(); + writer + .write_tag( + "device", + Some(&[0b00001110]), + vec![ + "97c953a1-70e6-4569-80e4-59d1f49ec3fa", + "f1aac284-b8a1-4938-acf3-52a3d516ca14", + "420bb984-4d1e-48ec-bbfc-10825fbf3221", + ] + .into_iter(), + ) + .unwrap(); + + let template_parts = [ + TemplatePart::TimeFormat("%Y-%m-%d %H:%M:%S"), + TemplatePart::TagValue("region"), + TemplatePart::Bucket("device", 10), + TemplatePart::TagValue("bananas"), // column not present + ]; + + writer.commit(); + + let keys: Vec<_> = partition_keys(&batch, template_parts.clone().into_iter()) + .map(|v| v.expect("non-identical consecutive keys")) + .collect::, _>>() + .unwrap(); + + assert_eq!( + keys, + vec![ + "1970-01-01 00:00:00|!|!|!".to_string(), + "1970-01-01 00:00:00|west|6|!".to_string(), + "1970-01-01 00:00:00|!|4|!".to_string(), + "1970-01-01 00:00:00|east|5|!".to_string(), + "1970-01-01 00:00:00|!|!|!".to_string() + ] + ); + + let record_batch = batch.to_arrow(Projection::All).unwrap(); + + let keys: Vec<_> = partition_keys(&record_batch, template_parts.into_iter()) + .map(|v| v.expect("non-identical consecutive keys")) + .collect::, _>>() + .unwrap(); + + assert_eq!( + keys, + vec![ + "1970-01-01 00:00:00|!|!|!".to_string(), + "1970-01-01 00:00:00|west|6|!".to_string(), + "1970-01-01 00:00:00|!|4|!".to_string(), + "1970-01-01 00:00:00|east|5|!".to_string(), + "1970-01-01 00:00:00|!|!|!".to_string() + ] + ); + } + + #[test] + fn test_bucket_fixture() { + let mut bucketer = BucketHasher::new(10); + assert_eq!(bucketer.assign_bucket("foo"), 6); + assert_eq!(bucketer.last_assigned_bucket(), Some(6)); + assert_eq!(bucketer.assign_bucket("bat"), 5); + assert_eq!(bucketer.last_assigned_bucket(), Some(5)); + assert_eq!(bucketer.assign_bucket("qux"), 5); + assert_eq!(bucketer.last_assigned_bucket(), Some(5)); + } + + #[test] + fn test_sparse_representation() { + let mut batch = MutableBatch::new(); + let mut writer = Writer::new(&mut batch, 7); + + writer + .write_time( + "time", + vec![ + 1, + 1, + 1, + 1, + 1685971961464736000, + 1685971961464736000, + 1685971961464736000, + ] + .into_iter(), + ) + .unwrap(); + + writer + .write_tag( + "region", + Some(&[0b01111111]), + vec![ + "platanos", "platanos", "platanos", "platanos", "platanos", "platanos", + "bananas", + ] + .into_iter(), + ) + .unwrap(); + + writer + .write_tag( + "device", + Some(&[0b01111111]), + vec!["foo", "bat", "qux", "bat", "foo", "foo", "foo"].into_iter(), // `bat` and `qux` both go to bucket 5, so those 3 values should yield the same key + ) + .unwrap(); + + let template_parts = [ + TemplatePart::TimeFormat("%Y-%m-%d %H:%M:%S"), + TemplatePart::TagValue("region"), + TemplatePart::Bucket("device", 10), + TemplatePart::TagValue("bananas"), // column not present + ]; + + writer.commit(); + + let mut iter = partition_keys(&batch, template_parts.into_iter()); + + assert_eq!( + iter.next().unwrap(), + Some(Ok("1970-01-01 00:00:00|platanos|6|!".to_string())) + ); + assert_eq!( + 
iter.next().unwrap(), + Some(Ok("1970-01-01 00:00:00|platanos|5|!".to_string())) + ); + assert_eq!(iter.next().unwrap(), None); + assert_eq!(iter.next().unwrap(), None); + assert_eq!( + iter.next().unwrap(), + Some(Ok("2023-06-05 13:32:41|platanos|6|!".to_string())) + ); + assert_eq!(iter.next().unwrap(), None); + assert_eq!( + iter.next().unwrap(), + Some(Ok("2023-06-05 13:32:41|bananas|6|!".to_string())) + ); + } + + #[test] + fn partitioning_on_fields_panics() { + let mut batch = MutableBatch::new(); + let mut writer = Writer::new(&mut batch, 5); + + writer + .write_time("time", vec![1, 2, 3, 4, 5].into_iter()) + .unwrap(); + + writer + .write_string( + "region", + Some(&[0b00001010]), + vec!["west", "east"].into_iter(), + ) + .unwrap(); + + let template_parts = [TemplatePart::TagValue("region")]; + + writer.commit(); + + let got: Result, _> = generate_denormalised_keys(&batch, template_parts.into_iter()); + assert_matches::assert_matches!(got, Err(PartitionKeyError::TagValueNotTag(_))); + } + + #[test] + fn bucketing_on_fields_panics() { + let mut batch = MutableBatch::new(); + let mut writer = Writer::new(&mut batch, 5); + + writer + .write_time("time", vec![1, 2, 3, 4, 5].into_iter()) + .unwrap(); + + writer + .write_string( + "region", + Some(&[0b00001010]), + vec!["west", "east"].into_iter(), + ) + .unwrap(); + + let template_parts = [TemplatePart::Bucket("region", 10)]; + + writer.commit(); + + let got: Result, _> = generate_denormalised_keys(&batch, template_parts.into_iter()); + assert_matches::assert_matches!(got, Err(PartitionKeyError::TagValueNotTag(_))); + } + + fn identity<'a, T>(s: T) -> ColumnValue<'a> + where + T: Into>, + { + ColumnValue::Identity(s.into()) + } + + fn prefix<'a, T>(s: T) -> ColumnValue<'a> + where + T: Into>, + { + ColumnValue::Prefix(s.into()) + } + + fn year(y: i32) -> ColumnValue<'static> { + ColumnValue::Datetime { + begin: Utc.with_ymd_and_hms(y, 1, 1, 0, 0, 0).unwrap(), + end: Utc.with_ymd_and_hms(y + 1, 1, 1, 0, 0, 0).unwrap(), + } + } + + fn bucket(bucket_id: u32) -> ColumnValue<'static> { + ColumnValue::Bucket(bucket_id) + } + + // Generate a test that asserts the derived partition key matches + // "want_key", when using the provided "template" parts and set of "tags". + // + // Additionally validates that the derived key is reversible into the + // expected set of "want_reversed_tags" from the original inputs. + macro_rules! test_partition_key { + ( + $name:ident, + template = $template:expr, // Array/vec of TemplatePart + tags = $tags:expr, // Array/vec of (tag_name, value) tuples + want_key = $want_key:expr, // Expected partition key string + want_reversed_tags = $want_reversed_tags:expr // Array/vec of (tag_name, value) reversed from $tags + ) => { + paste::paste! { + #[test] + fn []() { + let mut batch = MutableBatch::new(); + let mut writer = Writer::new(&mut batch, 1); + + let template = $template.into_iter().collect::>(); + let template = test_table_partition_override(template); + + // Timestamp: 2023-05-29T13:03:16Z + writer + .write_time("time", vec![1685365396931384064].into_iter()) + .unwrap(); + + for (col, value) in $tags { + let v = String::from(value); + writer + .write_tag(col, Some(&[0b00000001]), vec![v.as_str()].into_iter()) + .unwrap(); + } + + writer.commit(); + + // Generate the full set of partition keys, inserting the + // last observed value when the next key is identical to + // normalise the values. 
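+                    // Each case writes a single row, so exactly one key is
+                    // expected back.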
+ let keys = generate_denormalised_keys(&batch, template.parts()) + .unwrap(); + assert_eq!(keys, vec![$want_key.to_string()], "generated key differs"); + + // Reverse the encoding. + let reversed = build_column_values(&template, &keys[0]); + + // Expect the tags to be (str, ColumnValue) for the + // comparison + let want: Vec<(&str, ColumnValue<'_>)> = $want_reversed_tags + .into_iter() + .collect(); + + let got = reversed.collect::>(); + assert_eq!(got, want, "reversed key differs"); + } + } + }; + } + + test_partition_key!( + simple, + template = [ + TemplatePart::TimeFormat("%Y"), + TemplatePart::TagValue("a"), + TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 5), + ], + tags = [ + ("a", "bananas"), + ("b", "are_good"), + ("c", "for_test_strings") + ], + want_key = "2023|bananas|are_good|1", + want_reversed_tags = [ + (TIME_COLUMN_NAME, year(2023)), + ("a", identity("bananas")), + ("b", identity("are_good")), + ("c", bucket(1)), + ] + ); + + test_partition_key!( + non_ascii, + template = [ + TemplatePart::TimeFormat("%Y"), + TemplatePart::TagValue("a"), + TemplatePart::TagValue("b"), + ], + tags = [("a", "bananas"), ("b", "plátanos")], + want_key = "2023|bananas|pl%C3%A1tanos", + want_reversed_tags = [ + (TIME_COLUMN_NAME, year(2023)), + ("a", identity("bananas")), + ("b", identity("plátanos")), + ] + ); + + test_partition_key!( + single_tag_template_tag_not_present, + template = [TemplatePart::TagValue("a")], + tags = [("b", "bananas")], + want_key = "!", + want_reversed_tags = [] + ); + + test_partition_key!( + single_bucket_template_tag_not_present, + template = [TemplatePart::Bucket("a", 10)], + tags = [("b", "bananas")], + want_key = "!", + want_reversed_tags = [] + ); + + test_partition_key!( + single_tag_template_tag_empty, + template = [TemplatePart::TagValue("a")], + tags = [("a", "")], + want_key = "^", + want_reversed_tags = [("a", identity(""))] + ); + + test_partition_key!( + single_bucket_template_tag_empty, + template = [TemplatePart::Bucket("a", 10)], + tags = [("a", "")], + want_key = "0", + want_reversed_tags = [("a", bucket(0))] + ); + + test_partition_key!( + missing_tag, + template = [ + TemplatePart::TagValue("a"), + TemplatePart::TagValue("b"), + TemplatePart::Bucket("c", 10) + ], + tags = [("a", "bananas")], + want_key = "bananas|!|!", + want_reversed_tags = [("a", identity("bananas"))] + ); + + test_partition_key!( + unambiguous, + template = [ + TemplatePart::TimeFormat("%Y"), + TemplatePart::TagValue("a"), + TemplatePart::TagValue("b"), + TemplatePart::TagValue("c"), + TemplatePart::TagValue("d"), + TemplatePart::TagValue("e"), + ], + tags = [("a", "|"), ("b", "!"), ("d", "%7C%21%257C"), ("e", "^")], + want_key = "2023|%7C|%21|!|%257C%2521%25257C|%5E", + want_reversed_tags = [ + (TIME_COLUMN_NAME, year(2023)), + ("a", identity("|")), + ("b", identity("!")), + ("d", identity("%7C%21%257C")), + ("e", identity("^")) + ] + ); + + test_partition_key!( + truncated_char_reserved, + template = [TemplatePart::TagValue("a")], + tags = [("a", "#")], + want_key = "%23", + want_reversed_tags = [("a", identity("#"))] + ); + + // Keys < 200 bytes long should not be truncated. + test_partition_key!( + truncate_length_199, + template = [TemplatePart::TagValue("a")], + tags = [("a", "A".repeat(199))], + want_key = "A".repeat(199), + want_reversed_tags = [("a", identity("A".repeat(199)))] + ); + + // Keys of exactly 200 bytes long should not be truncated. 
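+    // (These fixtures assume PARTITION_KEY_MAX_PART_LEN is 200 bytes: the
+    // truncated cases below keep 199 bytes of encoded payload plus the
+    // trailing `#` marker.)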
+ test_partition_key!( + truncate_length_200, + template = [TemplatePart::TagValue("a")], + tags = [("a", "A".repeat(200))], + want_key = "A".repeat(200), + want_reversed_tags = [("a", identity("A".repeat(200)))] + ); + + // Keys > 200 bytes long should be truncated to exactly 200 bytes, + // terminated by a # character. + test_partition_key!( + truncate_length_201, + template = [TemplatePart::TagValue("a")], + tags = [("a", "A".repeat(201))], + want_key = format!("{}#", "A".repeat(199)), + want_reversed_tags = [("a", prefix("A".repeat(199)))] + ); + + // A key ending in an encoded sequence that does not cross the cut-off point + // is preserved. + // + // This subtest generates a key of: + // + // `A..%` + // ^ cutoff + // + // Which when encoded, becomes: + // + // `A..%25` + // ^ cutoff + // + // So the entire encoded sequence should be preserved. + test_partition_key!( + truncate_encoding_sequence_ok, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}%", "A".repeat(197)))], + want_key = format!("{}%25", "A".repeat(197)), // Not truncated + want_reversed_tags = [("a", identity(format!("{}%", "A".repeat(197))))] + ); + + // A key ending in an encoded sequence should not be split. + // + // This subtest generates a key of: + // + // `A..%` + // ^ cutoff + // + // Which when encoded, becomes: + // + // `A..% 25` (space added for clarity) + // ^ cutoff + // + // Where naive slicing would result in truncating an encoding sequence and + // therefore the whole encoded sequence should be truncated. + test_partition_key!( + truncate_encoding_sequence_truncated_1, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}%", "A".repeat(198)))], + want_key = format!("{}#", "A".repeat(198)), // Truncated + want_reversed_tags = [("a", prefix("A".repeat(198)))] + ); + + // A key ending in an encoded sequence should not be split. + // + // This subtest generates a key of: + // + // `A..%` + // ^ cutoff + // + // Which when encoded, becomes: + // + // `A..%2 5` (space added for clarity) + // ^ cutoff + // + // Where naive slicing would result in truncating an encoding sequence and + // therefore the whole encoded sequence should be truncated. + test_partition_key!( + truncate_encoding_sequence_truncated_2, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}%", "A".repeat(199)))], + want_key = format!("{}#", "A".repeat(199)), // Truncated + want_reversed_tags = [("a", prefix("A".repeat(199)))] + ); + + // A key ending in a unicode code-point should never be split. + // + // This subtest generates a key of: + // + // `A..🍌` + // ^ cutoff + // + // Which when encoded, becomes: + // + // `A..%F0%9F%8D%8C` + // ^ cutoff + // + // Therefore the entire code-point should be removed from the truncated + // output. + // + // This test MUST NOT fail, or an invalid UTF-8 string is being generated + // which is unusable in languages (like Rust). + // + // Advances the cut-off to ensure the position within the code-point doesn't + // affect the output. 
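+    // (🍌 percent-encodes to the 12-byte sequence %F0%9F%8D%8C, so with 194 to
+    // 196 leading "A"s the encoded emoji would straddle the 200-byte cut-off
+    // and is therefore dropped in full.)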
+ test_partition_key!( + truncate_within_code_point_1, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}🍌", "A".repeat(194)))], + want_key = format!("{}#", "A".repeat(194)), + want_reversed_tags = [("a", prefix("A".repeat(194)))] + ); + test_partition_key!( + truncate_within_code_point_2, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}🍌", "A".repeat(195)))], + want_key = format!("{}#", "A".repeat(195)), + want_reversed_tags = [("a", prefix("A".repeat(195)))] + ); + test_partition_key!( + truncate_within_code_point_3, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}🍌", "A".repeat(196)))], + want_key = format!("{}#", "A".repeat(196)), + want_reversed_tags = [("a", prefix("A".repeat(196)))] + ); + + // A key ending in a unicode grapheme should never be split. + // + // This subtest generates a key of: + // + // `A..நிbananas` + // ^ cutoff + // + // Which when encoded, becomes: + // + // `A..நிbananas` (within a grapheme) + // ^ cutoff + // + // Therefore the entire grapheme (நி) should be removed from the truncated + // output. + // + // This is a conservative implementation, and may be relaxed in the future. + // + // This first test asserts that a grapheme can be included, and then + // subsequent tests increment the cut-off point by 1 byte each time. + test_partition_key!( + truncate_within_grapheme_0, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}நிbananas", "A".repeat(181)))], + want_key = format!("{}%E0%AE%A8%E0%AE%BF#", "A".repeat(181)), + want_reversed_tags = [("a", prefix(format!("{}நி", "A".repeat(181))))] + ); + test_partition_key!( + truncate_within_grapheme_1, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}நிbananas", "A".repeat(182)))], + want_key = format!("{}#", "A".repeat(182)), + want_reversed_tags = [("a", prefix("A".repeat(182)))] + ); + test_partition_key!( + truncate_within_grapheme_2, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}நிbananas", "A".repeat(183)))], + want_key = format!("{}#", "A".repeat(183)), + want_reversed_tags = [("a", prefix("A".repeat(183)))] + ); + test_partition_key!( + truncate_within_grapheme_3, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}நிbananas", "A".repeat(184)))], + want_key = format!("{}#", "A".repeat(184)), + want_reversed_tags = [("a", prefix("A".repeat(184)))] + ); + test_partition_key!( + truncate_within_grapheme_4, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}நிbananas", "A".repeat(185)))], + want_key = format!("{}#", "A".repeat(185)), + want_reversed_tags = [("a", prefix("A".repeat(185)))] + ); + test_partition_key!( + truncate_within_grapheme_5, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}நிbananas", "A".repeat(186)))], + want_key = format!("{}#", "A".repeat(186)), + want_reversed_tags = [("a", prefix("A".repeat(186)))] + ); + test_partition_key!( + truncate_within_grapheme_6, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}நிbananas", "A".repeat(187)))], + want_key = format!("{}#", "A".repeat(187)), + want_reversed_tags = [("a", prefix("A".repeat(187)))] + ); + test_partition_key!( + truncate_within_grapheme_7, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}நிbananas", "A".repeat(188)))], + want_key = format!("{}#", "A".repeat(188)), + want_reversed_tags = [("a", prefix("A".repeat(188)))] + ); + test_partition_key!( + truncate_within_grapheme_8, + template = 
[TemplatePart::TagValue("a")], + tags = [("a", format!("{}நிbananas", "A".repeat(189)))], + want_key = format!("{}#", "A".repeat(189)), + want_reversed_tags = [("a", prefix("A".repeat(189)))] + ); + test_partition_key!( + truncate_within_grapheme_9, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}நிbananas", "A".repeat(190)))], + want_key = format!("{}#", "A".repeat(190)), + want_reversed_tags = [("a", prefix("A".repeat(190)))] + ); + + // As above, but the grapheme is the last portion of the generated string + // (no trailing bananas). + test_partition_key!( + truncate_grapheme_identity, + template = [TemplatePart::TagValue("a")], + tags = [("a", format!("{}நி", "A".repeat(182)))], + want_key = format!("{}%E0%AE%A8%E0%AE%BF", "A".repeat(182)), + want_reversed_tags = [("a", identity(format!("{}நி", "A".repeat(182))))] + ); + + /// A test using an invalid strftime format string. + #[test] + fn test_invalid_strftime() { + let mut batch = MutableBatch::new(); + let mut writer = Writer::new(&mut batch, 1); + + writer.write_time("time", vec![1].into_iter()).unwrap(); + writer + .write_tag("region", Some(&[0b00000001]), vec!["bananas"].into_iter()) + .unwrap(); + writer.commit(); + + let template = [TemplatePart::TimeFormat("%3F")] + .into_iter() + .collect::>(); + let template = test_table_partition_override(template); + + let ret = partition_keys(&batch, template.parts()) + .map(|v| v.expect("non-identical consecutive keys")) + .collect::, _>>(); + + assert_matches!(ret, Err(PartitionKeyError::InvalidStrftime)); + } + + #[test] + #[should_panic( + expected = "partition template contains 9 parts, which exceeds the maximum of 8 parts" + )] + fn test_too_many_parts() { + let template = test_table_partition_override( + std::iter::repeat(TemplatePart::TagValue("bananas")) + .take(9) + .collect(), + ); + + let _ = partition_batch(&MutableBatch::new(), &template); + } + + // These values are arbitrarily chosen when building an input to the + // partitioner. + + // Arbitrary tag names are selected from this set of candidates (to ensure + // there's always some overlap, rather than truly random strings). + const TEST_TAG_NAME_SET: &[&str] = &["A", "B", "C", "D", "E", "F"]; + + // Arbitrary template parts are selected from this set. + const TEST_TEMPLATE_PARTS: &[TemplatePart<'static>] = &[ + TemplatePart::TimeFormat("%Y|%m|%d!-string"), + TemplatePart::TimeFormat("%Y|%m|%d!-%%bananas"), + TemplatePart::TimeFormat("%Y/%m/%d"), + TemplatePart::TimeFormat("%Y-%m-%d"), + TemplatePart::TagValue(""), + TemplatePart::TagValue("A"), + TemplatePart::TagValue("B"), + TemplatePart::TagValue("C"), + TemplatePart::TagValue("tags!"), + TemplatePart::TagValue("%tags!"), + TemplatePart::TagValue("my_tag"), + TemplatePart::TagValue("my|tag"), + TemplatePart::TagValue("%%%%|!!!!|"), + TemplatePart::Bucket("D", 10), + TemplatePart::Bucket("E", 100), + TemplatePart::Bucket("F", 1000), + ]; + + prop_compose! { + /// Yields a vector of up to [`MAXIMUM_NUMBER_OF_TEMPLATE_PARTS`] unique + /// template parts, chosen from [`TEST_TEMPLATE_PARTS`]. + fn arbitrary_template_parts()(set in proptest::collection::vec( + proptest::sample::select(TEST_TEMPLATE_PARTS), + (1, MAXIMUM_NUMBER_OF_TEMPLATE_PARTS) // Set size range + )) -> Vec> { + let mut set = set; + set.dedup_by(|a, b| format!("{a:?}") == format!("{b:?}")); + set + } + } + + prop_compose! { + /// Yield a HashMap of between 1 and 10 (column_name, random string + /// value) with tag names chosen from [`TEST_TAG_NAME_SET`]. 
+ fn arbitrary_tag_value_map()(v in proptest::collection::hash_map( + proptest::sample::select(TEST_TAG_NAME_SET).prop_map(ToString::to_string), + any::(), + (1, 10) // Set size range + )) -> HashMap { + v + } + } + + prop_compose! { + /// Yield a Vec containing an identical timestamp run of random length, + /// up to `max_run_len`, + fn arbitrary_timestamp_run(max_run_len: usize)(v in 0_i64..i64::MAX, run_len in 1..max_run_len) -> Vec { + let mut x = Vec::with_capacity(run_len); + x.resize(run_len, v); + x + } + } + + /// Yield a Vec of timestamp values that more accurately model real + /// timestamps than pure random selection. + /// + /// Runs of identical timestamps are generated with + /// [`arbitrary_timestamp_run()`], which are then shuffled to produce a list + /// of timestamps with limited repeats, sometimes consecutively. + fn arbitrary_timestamps() -> impl Strategy> { + proptest::collection::vec(arbitrary_timestamp_run(6), 10..100) + .prop_map(|v| v.into_iter().flatten().collect::>()) + .prop_shuffle() + } + + enum ExpectedColumnValue { + String(String), + TSRange(DateTime, DateTime), + Bucket(u32), + } + + impl ExpectedColumnValue { + fn expect_string(&self) -> &String { + match self { + Self::String(s) => s, + Self::TSRange(_, _) => panic!("expected string, got TS range"), + Self::Bucket(_) => panic!("expected string, got bucket id"), + } + } + + fn expect_ts_range(&self) -> (DateTime, DateTime) { + match self { + Self::String(_) => panic!("expected TS range, got string"), + Self::TSRange(b, e) => (*b, *e), + Self::Bucket(_) => panic!("expected TS range, got bucket id"), + } + } + + fn expect_bucket_id(&self) -> u32 { + match self { + Self::String(_) => panic!("expected bucket id, got string"), + Self::TSRange(_, _) => panic!("expected bucket id, got TS range"), + Self::Bucket(bucket_id) => *bucket_id, + } + } + } + + proptest! { + /// A property test that asserts a write comprised of an arbitrary + /// subset of [`TEST_TAG_NAME_SET`] with randomised values, that is + /// partitioned using a partitioning template arbitrarily selected from + /// [`TEST_TEMPLATE_PARTS`], can be reversed to the full set of tags + /// and/or hash-bucket IDs via [`build_column_values()`]. + #[test] + fn prop_reversible_mapping( + template in arbitrary_template_parts(), + tag_values in arbitrary_tag_value_map(), + ts in 0_i64..i64::MAX, + ) { + let mut batch = MutableBatch::new(); + let mut writer = Writer::new(&mut batch, 1); + + let template = template.clone().into_iter().collect::>(); + let template = test_table_partition_override(template); + + writer + .write_time("time", vec![ts].into_iter()) + .unwrap(); + + for (col, value) in &tag_values { + writer + .write_tag(col.as_str(), Some(&[0b00000001]), vec![value.as_str()].into_iter()) + .unwrap(); + } + + writer.commit(); + let keys: Vec<_> = generate_denormalised_keys(&batch, template.parts()) + .unwrap(); + assert_eq!(keys.len(), 1); + + // Reverse the encoding. + let reversed: Vec<(&str, ColumnValue<'_>)> = build_column_values(&template, &keys[0]).collect(); + + // Build the expected set of reversed tags by filtering out any + // NULL tags (preserving empty string values). + let ts = Utc.timestamp_nanos(ts); + let want_reversed: Vec<(&str, ExpectedColumnValue)> = template.parts().filter_map(|v| match v { + TemplatePart::TagValue(col_name) if tag_values.contains_key(col_name) => { + // This tag had a (potentially empty) value wrote and should + // appear in the reversed output. 
+ Some((col_name, ExpectedColumnValue::String(tag_values.get(col_name).unwrap().to_string()))) + } + TemplatePart::TimeFormat("%Y/%m/%d" | "%Y-%m-%d") => { + let begin = Utc.with_ymd_and_hms(ts.year(), ts.month(), ts.day(), 0, 0, 0).unwrap(); + let end = begin + Days::new(1); + Some((TIME_COLUMN_NAME, ExpectedColumnValue::TSRange(begin, end))) + } + TemplatePart::Bucket(col_name, num_buckets) if tag_values.contains_key(col_name) => { + // Hash-bucketing is not fully-reversible from value to + // tag-name (intentionally so, it makes it much simpler to + // implement). + // + // The test must assign buckets as they are when the + // partition key is rendered. + let want_bucket = BucketHasher::new(num_buckets).assign_bucket(tag_values.get(col_name).unwrap()); + Some((col_name, ExpectedColumnValue::Bucket(want_bucket))) + } + _ => None, + }).collect(); + + assert_eq!(want_reversed.len(), reversed.len()); + + for ((want_col, want_val), (got_col, got_val)) in want_reversed.iter().zip(reversed.iter()) { + assert_eq!(got_col, want_col, "column names differ"); + + match got_val { + ColumnValue::Identity(_) => { + // An identity is both equal to, and a prefix of, the + // original value. + let want_val = want_val.expect_string(); + assert_eq!(got_val, &want_val, "identity values differ"); + assert!( + got_val.is_prefix_match_of(want_val), + "prefix mismatch; {:?} is not a prefix of {:?}", + got_val, + want_val, + ); + }, + ColumnValue::Prefix(_) => { + let want_val = want_val.expect_string(); + assert!( + got_val.is_prefix_match_of(want_val), + "prefix mismatch; {:?} is not a prefix of {:?}", + got_val, + want_val, + ); + }, + ColumnValue::Datetime{..} => { + let (want_begin, want_end) = want_val.expect_ts_range(); + match got_val { + ColumnValue::Datetime{begin, end} => { + assert_eq!(want_begin, *begin); + assert_eq!(want_end, *end); + } + _ => panic!("expected datatime column value but got: {:?}", got_val) + } + }, + ColumnValue::Bucket(got_bucket_id) => { + let want_bucket_id = want_val.expect_bucket_id(); + assert_eq!(*got_bucket_id, want_bucket_id); + } + }; + } + } + + /// A property test that asserts the partitioner tolerates (does not + /// panic) randomised, potentially invalid strftime formatter strings. + #[test] + fn prop_arbitrary_strftime_format(fmt in any::()) { + let mut batch = MutableBatch::new(); + let mut writer = Writer::new(&mut batch, 1); + + // This sequence causes chrono's formatter to panic with a "do not + // use this" message... + // + // This is validated to not be part of the formatter (among other + // invalid sequences) when constructing a template from the user + // input/proto. + // + // Uniquely this causes a panic, whereas others do not - so it must + // be filtered out when fuzz-testing that invalid sequences do not + // cause a panic in the key generator. + prop_assume!(!fmt.contains("%#z")); + + // Generate a single time-based partitioning template with a + // randomised format string. 
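+            // The single fixed timestamp written below keeps the fuzzing
+            // focused on the format string rather than on the time value.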
+ let template = vec![ + TemplatePart::TimeFormat(&fmt), + ]; + let template = test_table_partition_override(template); + + // Timestamp: 2023-05-29T13:03:16Z + writer + .write_time("time", vec![1685365396931384064].into_iter()) + .unwrap(); + + writer + .write_tag("bananas", Some(&[0b00000001]), vec!["great"].into_iter()) + .unwrap(); + + writer.commit(); + let ret = partition_keys(&batch, template.parts()) + .map(|v| v.expect("non-identical consecutive keys")) + .collect::, _>>(); + + // The is allowed to succeed or fail under this test (but not + // panic), and the returned error/value must match certain + // properties: + match ret { + Ok(v) => { assert_eq!(v.len(), 1); }, + Err(e) => { assert_matches!(e, PartitionKeyError::InvalidStrftime); }, + } + } + + // Drives the strftime formatter through the "front door", using the + // same interface as a user would call to partition data. This validates + // the integration between the various formatters, range encoders, + // dedupe, etc. + #[test] + fn prop_strftime_integration( + times in arbitrary_timestamps(), + format in prop_oneof![ + Just("%Y-%m-%d"), // Default scheme + Just("%s") // Unix seconds, to drive increased cache miss rate in strftime formatter + ] + ) { + use std::fmt::Write; + + let mut batch = MutableBatch::new(); + let mut writer = Writer::new(&mut batch, times.len()); + let row_count = times.len(); + + let template = test_table_partition_override(vec![TemplatePart::TimeFormat(format)]); + + writer + .write_time("time", times.clone().into_iter()) + .unwrap(); + + writer.commit(); + + let fmt = StrftimeItems::new(format); + let iter = partition_batch(&batch, &template); + + let mut observed_rows = 0; + + // For each partition key and the calculated row range + for (key, range) in iter { + let key = key.unwrap(); + + observed_rows += range.len(); + + // Validate all rows in that range render to the same timestamp + // value as the partition key when using the same format, using + // a known-good formatter. + for ts in ×[range] { + // Generate the control string. + let mut control = String::new(); + let _ = write!( + control, + "{}", + Utc.timestamp_nanos(*ts) + .format_with_items(fmt.clone()) + ); + assert_eq!(control, key); + } + } + + assert_eq!(observed_rows, row_count); + } + } +} diff --git a/partition/src/strftime.rs b/partition/src/strftime.rs new file mode 100644 index 00000000000..bd5230035d7 --- /dev/null +++ b/partition/src/strftime.rs @@ -0,0 +1,415 @@ +use std::fmt::Write; + +use chrono::{format::StrftimeItems, TimeZone, Utc}; + +use crate::PartitionKeyError; + +use super::encode_key_part; + +/// The number of nanoseconds in 1 day, definitely recited from memory. +const DAY_NANOSECONDS: i64 = 86_400_000_000_000; + +/// The default YMD formatter spec. +const YMD_SPEC: &str = "%Y-%m-%d"; + +/// A FIFO ring buffer, holding `N` lazily initialised slots. +/// +/// This is optimised for low values of `N` (where N*T covers a few cache lines) +/// as it performs an O(n) linear search. +#[derive(Debug)] +struct RingBuffer { + buf: [Option; N], + + /// Index into to the last wrote value. + last_idx: usize, +} + +impl Default for RingBuffer +where + T: Default, +{ + fn default() -> Self { + Self { + buf: [(); N].map(|_| Default::default()), // default init for non-const type + last_idx: N - 1, + } + } +} + +impl RingBuffer +where + T: Default, +{ + /// Return a mutable reference to the next slot to be overwrote. This method + /// initialises the slot if it has not been previously used. 
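+    ///
+    /// Slots are recycled in FIFO order, so the slot handed back is always the
+    /// oldest (or a not-yet-used) entry in the buffer.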
+ /// + /// This is like an "insert" operation, but allows the caller to re-use the + /// contents of the slot to minimise allocations. + /// + /// This is an O(1) operation. + fn next_slot(&mut self) -> &mut T { + // Advance the next slot pointer + self.last_idx += 1; + self.last_idx %= N; + + let v = self.buf[self.last_idx].get_or_insert_with(Default::default); + + v + } + + /// Drop the last buffer entry. + /// + /// This may cause spurious cache misses due to the short-circuiting search + /// observing an empty element, potentially before non-empty elements. + fn drop_last(&mut self) { + self.buf[self.last_idx] = None; + } + + /// Find the first initialised slot that causes `F` to evaluate to true, + /// returning the slot contents. + /// + /// This is a O(n) linear search operation, which for small N can be as + /// fast, or faster, than a hashmap lookup by key. + fn find(&self, f: F) -> Option<&'_ T> + where + F: Fn(&T) -> bool, + { + for v in &self.buf { + let v = v.as_ref()?; + if f(v) { + return Some(v); + } + } + None + } +} + +/// A strftime-like formatter of epoch timestamps with nanosecond granularity. +/// +/// # Deferred Errors +/// +/// If the provided stftime formatter is invalid, an +/// [`PartitionKeyError::InvalidStrftime`] error is raised during the formatting +/// call to [`StrftimeFormatter::render()`] and not during initialisation. This +/// is a limitation of the underlying library. +/// +/// # Caching +/// +/// It is very common for batches of writes to contain multiple measurements +/// taken at the same timestamp; for example, a periodic scraper of metric +/// values will assign a single timestamp for the entire batch of observations. +/// +/// To leverage this reuse of timestamps, this type retains a cache of the 5 +/// most recently observed distinct timestamps to avoid recomputing the same +/// formatted string for each repeat occurrence. +/// +/// In the best case, this reduces N row formats down to a single format +/// operation, and in the worst case, it changes the memory overhead from "rows" +/// to "rows + 5" which amortises nicely as batch sizes increase. If more than 5 +/// timestamps are observed, the existing buffer allocations are reused when +/// computing the replacement values. +/// +/// # `YYYY-MM-DD` Reduction Specialisation +/// +/// The default (and therefore most common) formatting spec is "%Y-%m-%d", as +/// this is the IOx default partitioning template. The vast majority of writes +/// will utilise this format spec. +/// +/// Because this spec is so common, a special case optimisation is utilised for +/// it: for any given timestamp, first normalise the value by reducing the +/// precision such that the timestamp is rounded down to the nearest whole day +/// before further processing. +/// +/// This removes all the sub-day variance (hours, minutes, seconds, etc) from +/// the value, without changing the formatter output (it still produces the same +/// string). This in turn causes any timestamp from the same day to be a cache +/// hit with any prior value for the same day, regardless of "time" portion of +/// the timestamp. +/// +/// Combined with the above cache, this raises the cache hit rate to ~100% for +/// write batches that span less than 6 days, effectively amortising the cost of +/// timestamp formatting to O(1) for these very common batches. +#[derive(Debug)] +pub(super) struct StrftimeFormatter<'a> { + /// The strftime formatter definition. 
+ /// + /// NOTE: the value below is UNVALIDATED - if the input strftime format + /// contains invalid formatter directives, then the error is deferred until + /// formatting a timestamp. + format: StrftimeItems<'a>, + + /// As an optimisation, when this formatter is using the default YYYY-MM-DD + /// partitioning template, timestamps are normalised to per-day granularity, + /// preventing variances in the timestamp of less-than 1 day from causing a + /// miss in the cached "values". + /// + /// This optimisation massively increases the reuse of cached, pre-formatted + /// strings. + is_ymd_format: bool, + + /// A set of 5 most recently added timestamps, and the formatted string they + /// map to. + values: RingBuffer<5, (i64, String)>, + + /// The last observed timestamp. + /// + /// This value changes each time a timestamp is returned to the user, either + /// from the cache of pre-generated strings, or by generating a new one, and + /// MUST always track the last timestamp given to + /// [`StrftimeFormatter::render()`]. + last_ts: Option<i64>, +} + +impl<'a> StrftimeFormatter<'a> { + /// Initialise a new [`StrftimeFormatter`] with the given strftime-like + /// format string. + /// + /// The exact formatter specification is [documented here]. + /// + /// If the formatter contains an invalid spec, an error is raised when + /// formatting. + /// + /// [documented here]: + /// https://docs.rs/chrono/latest/chrono/format/strftime/index.html + pub(super) fn new(format: &'a str) -> Self { + let mut is_default_format = false; + if format == YMD_SPEC { + is_default_format = true; + } + + Self { + format: StrftimeItems::new(format), + is_ymd_format: is_default_format, + values: RingBuffer::default(), + last_ts: None, + } + } + + /// Format `timestamp` to the format spec provided during initialisation, + /// writing the result to `out`. + pub(super) fn render<W>(&mut self, timestamp: i64, mut out: W) -> Result<(), PartitionKeyError> + where + W: std::fmt::Write, + { + // Optionally apply the default format reduction optimisation. + let timestamp = self.maybe_reduce(timestamp); + + // Retain this timestamp as the last observed timestamp. + self.last_ts = Some(timestamp); + + // Check if this timestamp has already been rendered. + if let Some(v) = self.values.find(|(t, _v)| *t == timestamp) { + // It has! Re-use the existing formatted string. + out.write_str(&v.1)?; + return Ok(()); + } + + // Obtain a mutable reference to the next item to be replaced, re-using + // the string buffer within it to avoid allocating (or initialising it + // if it was not yet initialised). + let buf = self.values.next_slot(); + + // Reset the slot value + buf.0 = timestamp; + buf.1.clear(); + + // Format the timestamp value into the slot buffer. + if write!( + buf.1, + "{}", + Utc.timestamp_nanos(timestamp) + .format_with_items(self.format.clone()) // Cheap clone of refs + ) + .is_err() + { + // The string buffer may be empty, or contain partially rendered + // output before the error was raised. + // + // Remove this entry from the cache to prevent there being a mapping + // of `timestamp` to an empty or partially rendered string. + self.values.drop_last(); + return Err(PartitionKeyError::InvalidStrftime); + }; + + // Encode any reserved characters in this new string. + buf.1 = encode_key_part(&buf.1).to_string(); + + // Render this new value to the caller's buffer + out.write_str(&buf.1)?; + + Ok(()) + } + + /// Reduce the precision of the timestamp iff using the default "%Y-%m-%d" + /// formatter string, returning a value rounded down to the nearest whole day.
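A short, self-contained sketch (not part of this patch) of the day-reduction arithmetic described above, using the 2023-05-29T13:03:16Z timestamp from the earlier test to show how every timestamp within a day collapses to the same cached "%Y-%m-%d" entry:

```rust
/// Nanoseconds per day, mirroring the `DAY_NANOSECONDS` constant above.
const DAY_NANOSECONDS: i64 = 86_400_000_000_000;

/// Round a nanosecond-precision timestamp down to the start of its UTC day,
/// matching the behaviour of `maybe_reduce` for the default "%Y-%m-%d" spec.
fn reduce_to_day(timestamp: i64) -> i64 {
    if timestamp < DAY_NANOSECONDS {
        return timestamp;
    }
    timestamp - (timestamp % DAY_NANOSECONDS)
}

fn main() {
    // 2023-05-29T13:03:16Z, as used in the partitioner test above.
    let ts = 1_685_365_396_931_384_064_i64;
    // Reduces to 2023-05-29T00:00:00Z, so every row from that day re-uses the
    // same cached "2023-05-29" string.
    assert_eq!(reduce_to_day(ts), 1_685_318_400_000_000_000);
}
```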
+ /// + /// If the formatter is not this special-case value, `timestamp` is returned + /// unchanged. + fn maybe_reduce(&self, timestamp: i64) -> i64 { + if !self.is_ymd_format { + return timestamp; + } + // Don't map timestamps less than the value we would subtract. + if timestamp < DAY_NANOSECONDS { + return timestamp; + } + timestamp - (timestamp % DAY_NANOSECONDS) + } + + /// Returns true if the output of rendering `timestamp` will match the last + /// rendered timestamp, after optionally applying the precision reduction + /// optimisation. + pub(crate) fn equals_last(&self, timestamp: i64) -> bool { + // Optionally apply the default format reduction optimisation. + let timestamp = self.maybe_reduce(timestamp); + + self.last_ts.map(|v| v == timestamp).unwrap_or_default() + } +} + +#[cfg(test)] +mod tests { + use assert_matches::assert_matches; + use data_types::partition_template::{TablePartitionTemplateOverride, TemplatePart}; + use proptest::prelude::*; + + use super::*; + + #[test] + fn test_default_formatter() { + let template = TablePartitionTemplateOverride::default(); + let expect = template.parts().collect::>(); + + // If this assert fails (and it probably shouldn't!) then you may want + // to consider changing the special case optimisation above. + assert_matches!(expect.as_slice(), &[TemplatePart::TimeFormat(YMD_SPEC)]); + } + + #[test] + fn test_never_empty() { + let mut fmt = StrftimeFormatter::new(""); + + let mut buf = String::new(); + fmt.render(42, &mut buf).expect("should render string"); + assert!(!buf.is_empty()); + assert_eq!(buf, "^"); + } + + #[test] + fn test_incomplete_formatter() { + let mut fmt = StrftimeFormatter::new("%"); + + let mut buf = String::new(); + let got = fmt.render(42, &mut buf); + assert_matches!(got, Err(PartitionKeyError::InvalidStrftime)); + } + + #[test] + fn test_incomplete_formatter_removes_bad_mapping() { + let mut fmt = StrftimeFormatter::new("%s"); + + let mut buf = String::new(); + fmt.render(42, &mut buf).unwrap(); + + assert_matches!( + fmt.values.buf.as_slice(), + [Some((42, _)), None, None, None, None] + ); + + // This obviously isn't possible through normal usage, but to trigger + // the "failed to render" code path, reach in and tweak the formatter to + // cause it to fail. + fmt.format = StrftimeItems::new("%"); + + // Trigger the "cannot format" code path + fmt.render(4242, &mut buf).expect_err("invalid formatter"); + + // And ensure the ring buffer was left in a clean state + assert_matches!( + fmt.values.buf.as_slice(), + [Some((42, _)), None, None, None, None] + ); + } + + #[test] + fn test_uses_ring_buffer() { + let mut fmt = StrftimeFormatter::new("%H"); + let mut buf = String::new(); + + fmt.render(42, &mut buf).expect("should render string"); + fmt.render(42, &mut buf).expect("should render string"); + fmt.render(42, &mut buf).expect("should render string"); + fmt.render(12345, &mut buf).expect("should render string"); + fmt.render(42, &mut buf).expect("should render string"); + + // Assert the above repetitive values were deduped in the cache. + assert_matches!( + fmt.values.buf.as_slice(), + [Some((42, _)), Some((12345, _)), None, None, None] + ); + assert_eq!(fmt.values.last_idx, 1); + } + + const FORMATTER_SPEC_PARTS: &[&str] = &[ + "%Y", "%m", "%d", "%H", "%m", "%.9f", "%r", "%+", "%t", "%n", "%A", "%c", + ]; + + prop_compose! { + /// Yield an arbitrary formatter spec selected from + /// [`FORMATTER_SPEC_PARTS`] delimited by a random character. 
+ fn arbitrary_formatter_spec()( + delimiter in any::(), + v in proptest::collection::vec( + proptest::sample::select(FORMATTER_SPEC_PARTS).prop_map(ToString::to_string), + (0, 10) // Set size range + )) -> String { + v.join(&delimiter.to_string()) + } + } + + fn default_formatter_spec() -> impl Strategy { + Just(YMD_SPEC.to_string()) + } + + proptest! { + /// The [`StrftimeFormatter`] is a glorified wrapper around chrono's + /// formatter, therefore this test asserts the following property: + /// + /// For any timestamp and formatter, the output of this type must + /// match the output of chrono's formatter, after key encoding. + /// + /// Validating this asserts correctness of the wrapper itself, assuming + /// chrono's formatter produces correct output. Note the encoding is + /// tested in the actual partitioner module. + #[test] + fn prop_differential_validation( + timestamps in prop::collection::vec(any::(), 1..100), + format in prop_oneof![arbitrary_formatter_spec(), default_formatter_spec(), any::()], + ) { + let mut fmt = StrftimeFormatter::new(&format); + let items = StrftimeItems::new(&format); + + for ts in timestamps { + // Generate the control string. + let mut control = String::new(); + let _ = write!( + control, + "{}", + Utc.timestamp_nanos(ts) + .format_with_items(items.clone()) + ); + let control = encode_key_part(&control); + + // Generate the test string. + let mut test = String::new(); + if fmt.render(ts, &mut test).is_err() { + // Any error results in the key not being used, so any + // differences are inconsequential. + continue; + } + + assert_eq!(control, test); + } + } + } +} diff --git a/partition/src/traits.rs b/partition/src/traits.rs new file mode 100644 index 00000000000..439e2a67ef9 --- /dev/null +++ b/partition/src/traits.rs @@ -0,0 +1,61 @@ +mod mutable_batch; +mod record_batch; + +use thiserror::Error; + +/// An error accessing the time column of a batch. +#[allow(missing_copy_implementations)] +#[derive(Debug, Error, PartialEq, Eq, Clone)] +pub enum TimeColumnError { + /// The batch did not have a time column. + #[error("No time column found")] + NotFound, +} + +/// The behavior a column in a batch needs to have to be partitioned +pub trait PartitioningColumn: std::fmt::Debug { + /// The type of a thing that can be used to identify whether a tag has changed or not; may or + /// may not be the actual tag + type TagIdentityKey: ?Sized + PartialEq; + + /// Whether the value at the given row index is valid or NULL + fn is_valid(&self, idx: usize) -> bool; + + /// The raw packed validity bytes. + /// + /// The validity mask MUST follow the Arrow specification for validity masks + /// (). + fn valid_bytes(&self) -> &[u8]; + + /// Get the identity of the tag at the given row index. + /// + /// The return value is only valid if `is_valid(idx)` for the same `idx` + /// returns true. 
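To illustrate what these traits enable (an illustrative sketch only, not code from this patch; the import path and visibility of `Batch`/`TimeColumnError` are assumptions), partitioning helpers can be written once and then used with both `MutableBatch` and Arrow `RecordBatch` via the impls added below:

```rust
// Hypothetical consumer; assumes the traits are re-exported at these paths.
use partition::{Batch, TimeColumnError};

/// Return the (min, max) timestamps of a batch, generically over any type
/// implementing `Batch` (e.g. `MutableBatch` or `RecordBatch`).
fn time_range<B: Batch>(batch: &B) -> Result<Option<(i64, i64)>, TimeColumnError> {
    let times = batch.time_column()?;
    let min = times.iter().copied().min();
    let max = times.iter().copied().max();
    Ok(min.zip(max))
}
```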
+ fn get_tag_identity_key(&self, idx: usize) -> Option<&Self::TagIdentityKey>; + + /// Get the value of the tag that has the given identity + fn get_tag_value<'a>(&'a self, tag_identity_key: &'a Self::TagIdentityKey) -> Option<&'a str>; + + /// A string describing this column's data type; used in error messages + fn type_description(&self) -> String; +} + +/// Behavior of a batch of data used by partitioning code +pub trait Batch { + /// The type of this batch's columns + type Column: PartitioningColumn; + + /// How many rows are in this batch + fn num_rows(&self) -> usize; + + /// The column in the batch with the given name, if any + fn column(&self, column: &str) -> Option<&Self::Column>; + + /// Return the values in the time column in this batch. Return an error if the batch has no + /// time column. + /// + /// # Panics + /// + /// If a time column exists but its data isn't the expected type, this function will panic. + fn time_column(&self) -> Result<&[i64], TimeColumnError>; +} diff --git a/partition/src/traits/mutable_batch.rs b/partition/src/traits/mutable_batch.rs new file mode 100644 index 00000000000..981740df4dc --- /dev/null +++ b/partition/src/traits/mutable_batch.rs @@ -0,0 +1,60 @@ +use super::{Batch, PartitioningColumn, TimeColumnError}; +use mutable_batch::{ + column::{Column as MutableBatchColumn, ColumnData}, + MutableBatch, +}; +use schema::TIME_COLUMN_NAME; + +impl PartitioningColumn for MutableBatchColumn { + type TagIdentityKey = i32; + + fn is_valid(&self, idx: usize) -> bool { + self.valid_mask().get(idx) + } + + fn valid_bytes(&self) -> &[u8] { + self.valid_mask().bytes() + } + + fn get_tag_identity_key(&self, idx: usize) -> Option<&Self::TagIdentityKey> { + debug_assert!(PartitioningColumn::is_valid(self, idx)); + match self.data() { + ColumnData::Tag(col_data, _, _) => Some(&col_data[idx]), + _ => None, + } + } + + fn get_tag_value<'a>(&'a self, tag_identity_key: &'a Self::TagIdentityKey) -> Option<&'a str> { + match self.data() { + ColumnData::Tag(_, dictionary, _) => dictionary.lookup_id(*tag_identity_key), + _ => None, + } + } + + fn type_description(&self) -> String { + self.influx_type().to_string() + } +} + +impl Batch for MutableBatch { + type Column = MutableBatchColumn; + + fn num_rows(&self) -> usize { + self.rows() + } + + fn column(&self, column: &str) -> Option<&Self::Column> { + self.column(column).ok() + } + + fn time_column(&self) -> Result<&[i64], TimeColumnError> { + let time_column = self + .column(TIME_COLUMN_NAME) + .map_err(|_| TimeColumnError::NotFound)?; + + match &time_column.data() { + ColumnData::I64(col_data, _) => Ok(col_data), + x => unreachable!("expected i64 got {}", x), + } + } +} diff --git a/partition/src/traits/record_batch.rs b/partition/src/traits/record_batch.rs new file mode 100644 index 00000000000..57f0dff9cea --- /dev/null +++ b/partition/src/traits/record_batch.rs @@ -0,0 +1,82 @@ +use super::{Batch, PartitioningColumn, TimeColumnError}; +use arrow::{ + array::{Array, DictionaryArray, StringArray, TimestampNanosecondArray}, + datatypes::{DataType, Int32Type}, + record_batch::RecordBatch, +}; +use schema::TIME_COLUMN_NAME; +use std::sync::Arc; + +impl PartitioningColumn for Arc { + type TagIdentityKey = str; + + fn is_valid(&self, idx: usize) -> bool { + Array::is_valid(&self, idx) + } + + fn valid_bytes(&self) -> &[u8] { + self.nulls() + .expect("this RecordBatch's Array should be nullable") + .validity() + } + + fn get_tag_identity_key(&self, idx: usize) -> Option<&Self::TagIdentityKey> { + 
debug_assert!(PartitioningColumn::is_valid(self, idx)); + match self.data_type() { + DataType::Utf8 => self + .as_any() + .downcast_ref::() + .map(|col_data| col_data.value(idx)), + DataType::Dictionary(key, value) + if key.as_ref() == &DataType::Int32 && value.as_ref() == &DataType::Utf8 => + { + let dict = self + .as_any() + .downcast_ref::>() + .expect("should have gotten a DictionaryArray"); + + let values = dict + .values() + .as_any() + .downcast_ref::() + .expect("should have gotten a StringArray"); + Some(values.value(dict.key(idx)?)) + } + _ => None, + } + } + + fn get_tag_value<'a>(&'a self, tag_identity_key: &'a Self::TagIdentityKey) -> Option<&'a str> { + Some(tag_identity_key) + } + + fn type_description(&self) -> String { + self.data_type().to_string() + } +} + +impl Batch for RecordBatch { + type Column = Arc; + + fn num_rows(&self) -> usize { + self.num_rows() + } + + fn column(&self, column: &str) -> Option<&Self::Column> { + self.column_by_name(column) + } + + fn time_column(&self) -> Result<&[i64], TimeColumnError> { + let time_column = self + .column_by_name(TIME_COLUMN_NAME) + .ok_or(TimeColumnError::NotFound)?; + + Ok(time_column + .as_any() + .downcast_ref::() + .expect("time column was an unexpected type") + .values() + .inner() + .typed_data()) + } +} diff --git a/predicate/Cargo.toml b/predicate/Cargo.toml index ffd92b17078..5e5c828deb9 100644 --- a/predicate/Cargo.toml +++ b/predicate/Cargo.toml @@ -5,18 +5,21 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] -arrow = { workspace = true, features = ["prettyprint"] } +arrow = { workspace = true } chrono = { version = "0.4", default-features = false } data_types = { path = "../data_types" } datafusion = { workspace = true } datafusion_util = { path = "../datafusion_util" } -itertools = "0.11" +itertools = "0.12" observability_deps = { path = "../observability_deps" } query_functions = { path = "../query_functions"} schema = { path = "../schema" } -snafu = "0.7" -sqlparser = "0.37.0" +snafu = "0.8" +sqlparser = { workspace = true } workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] diff --git a/predicate/src/delete_expr.rs b/predicate/src/delete_expr.rs index de7b1916464..fc241f0a032 100644 --- a/predicate/src/delete_expr.rs +++ b/predicate/src/delete_expr.rs @@ -126,9 +126,9 @@ pub(crate) fn df_to_scalar( #[cfg(test)] mod tests { - use std::{ops::Not, sync::Arc}; + use std::ops::Not; - use arrow::datatypes::Field; + use arrow::datatypes::DataType; use test_helpers::assert_contains; use super::*; @@ -194,24 +194,17 @@ mod tests { #[test] fn test_unsupported_scalar_value() { - let scalar = datafusion::scalar::ScalarValue::List( - Some(vec![]), - Arc::new(Field::new( - "field", - arrow::datatypes::DataType::Float64, - true, - )), - ); + let array = datafusion::scalar::ScalarValue::new_list(&[], &DataType::Float64); + let scalar = datafusion::scalar::ScalarValue::List(array); let res = df_to_scalar(scalar); assert_contains!(res.unwrap_err().to_string(), "unsupported scalar value:"); } #[test] fn test_unsupported_scalar_value_in_expr() { - let expr = col("foo").eq(lit(datafusion::scalar::ScalarValue::new_list( - Some(vec![]), - arrow::datatypes::DataType::Float64, - ))); + let arr = + datafusion::scalar::ScalarValue::new_list(&[], &arrow::datatypes::DataType::Float64); + let expr = col("foo").eq(lit(datafusion::scalar::ScalarValue::List(arr))); let res = df_to_expr(expr); 
assert_contains!(res.unwrap_err().to_string(), "unsupported scalar value:"); } diff --git a/predicate/src/lib.rs b/predicate/src/lib.rs index 16fa16d026c..5dd9591ce60 100644 --- a/predicate/src/lib.rs +++ b/predicate/src/lib.rs @@ -24,9 +24,9 @@ use datafusion::{ common::tree_node::{TreeNodeVisitor, VisitRecursion}, error::DataFusionError, logical_expr::{binary_expr, BinaryExpr}, - prelude::{col, lit_timestamp_nano, Expr}, + prelude::{col, Expr}, }; -use datafusion_util::{make_range_expr, AsExpr}; +use datafusion_util::{lit_timestamptz_nano, make_range_expr, AsExpr}; use observability_deps::tracing::debug; use rpc_predicate::VALUE_COLUMN_NAME; use schema::TIME_COLUMN_NAME; @@ -188,8 +188,8 @@ impl Predicate { // time_expr = NOT(start <= time_range <= end) // Equivalent to: (time < start OR time > end) let time_expr = col(TIME_COLUMN_NAME) - .lt(lit_timestamp_nano(range.start())) - .or(col(TIME_COLUMN_NAME).gt(lit_timestamp_nano(range.end()))); + .lt(lit_timestamptz_nano(range.start())) + .or(col(TIME_COLUMN_NAME).gt(lit_timestamptz_nano(range.end()))); match expr { None => expr = Some(time_expr), @@ -301,7 +301,7 @@ impl Predicate { /// Add an exprestion "time > retention_time" pub fn with_retention(mut self, retention_time: i64) -> Self { - let expr = col(TIME_COLUMN_NAME).gt(lit_timestamp_nano(retention_time)); + let expr = col(TIME_COLUMN_NAME).gt(lit_timestamptz_nano(retention_time)); self.exprs.push(expr); self } @@ -458,19 +458,14 @@ impl TreeNodeVisitor for RowBasedVisitor { | Expr::Not(_) | Expr::OuterReferenceColumn(_, _) | Expr::Placeholder { .. } - | Expr::QualifiedWildcard { .. } | Expr::ScalarFunction { .. } | Expr::ScalarSubquery(_) - | Expr::ScalarUDF { .. } | Expr::ScalarVariable(_, _) | Expr::SimilarTo { .. } | Expr::Sort { .. } | Expr::TryCast { .. } - | Expr::Wildcard => Ok(VisitRecursion::Continue), - Expr::AggregateFunction { .. } - | Expr::AggregateUDF { .. } - | Expr::GroupingSet(_) - | Expr::WindowFunction { .. } => { + | Expr::Wildcard { .. } => Ok(VisitRecursion::Continue), + Expr::AggregateFunction { .. } | Expr::GroupingSet(_) | Expr::WindowFunction { .. } => { self.row_based = false; Ok(VisitRecursion::Stop) } diff --git a/predicate/src/rpc_predicate/column_rewrite.rs b/predicate/src/rpc_predicate/column_rewrite.rs index c58914fa957..a4cdf72016c 100644 --- a/predicate/src/rpc_predicate/column_rewrite.rs +++ b/predicate/src/rpc_predicate/column_rewrite.rs @@ -6,7 +6,10 @@ use schema::{InfluxColumnType, Schema}; /// Logic for rewriting expressions from influxrpc that reference non /// existent columns, or columns that are not tags, to NULL. -pub fn missing_tag_to_null(schema: &Schema, expr: Expr) -> DataFusionResult> { +pub(crate) fn missing_tag_to_null( + schema: &Schema, + expr: Expr, +) -> DataFusionResult> { Ok(match expr { Expr::Column(col) if !tag_column_exists(schema, &col)? 
=> Transformed::Yes(lit_null()), expr => Transformed::No(expr), diff --git a/predicate/src/rpc_predicate/field_rewrite.rs b/predicate/src/rpc_predicate/field_rewrite.rs index bcf0299196f..94cc4db4138 100644 --- a/predicate/src/rpc_predicate/field_rewrite.rs +++ b/predicate/src/rpc_predicate/field_rewrite.rs @@ -7,7 +7,7 @@ use arrow::record_batch::RecordBatch; use datafusion::common::tree_node::{TreeNode, TreeNodeVisitor, VisitRecursion}; use datafusion::common::DFSchema; use datafusion::error::{DataFusionError, Result as DataFusionResult}; -use datafusion::optimizer::utils::split_conjunction_owned; +use datafusion::logical_expr::utils::split_conjunction_owned; use datafusion::physical_expr::create_physical_expr; use datafusion::physical_expr::execution_props::ExecutionProps; use datafusion::physical_plan::ColumnarValue; diff --git a/query_functions/Cargo.toml b/query_functions/Cargo.toml index 5fec6b2e46d..4585bad1218 100644 --- a/query_functions/Cargo.toml +++ b/query_functions/Cargo.toml @@ -5,18 +5,21 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] -arrow = { workspace = true, features = ["prettyprint"] } +arrow = { workspace = true } chrono = { version = "0.4", default-features = false } datafusion = { workspace = true } once_cell = "1" regex = "1" -regex-syntax = "0.7.4" +regex-syntax = "0.8.1" schema = { path = "../schema" } -snafu = "0.7" +snafu = "0.8" workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] -itertools = "0.11.0" -tokio = { version = "1.32", features = ["macros", "parking_lot"] } datafusion_util = { path = "../datafusion_util" } +itertools = "0.12.0" +tokio = { version = "1.35", features = ["macros", "parking_lot"] } diff --git a/query_functions/src/coalesce_struct.rs b/query_functions/src/coalesce_struct.rs index 0892e721e14..b33920db4d9 100644 --- a/query_functions/src/coalesce_struct.rs +++ b/query_functions/src/coalesce_struct.rs @@ -40,7 +40,7 @@ //! d: {a: 2, b: 3}, //! } //! ``` -use std::sync::Arc; +use std::{any::Any, sync::Arc}; use arrow::{ array::{Array, StructArray}, @@ -49,10 +49,8 @@ use arrow::{ }; use datafusion::{ common::cast::as_struct_array, - error::DataFusionError, - logical_expr::{ - ReturnTypeFunction, ScalarFunctionImplementation, ScalarUDF, Signature, Volatility, - }, + error::{DataFusionError, Result}, + logical_expr::{ScalarUDF, ScalarUDFImpl, Signature, Volatility}, physical_plan::ColumnarValue, prelude::Expr, scalar::ScalarValue, @@ -62,11 +60,25 @@ use once_cell::sync::Lazy; /// The name of the `coalesce_struct` UDF given to DataFusion. pub const COALESCE_STRUCT_UDF_NAME: &str = "coalesce_struct"; -/// Implementation of `coalesce_struct`. -/// -/// See [module-level docs](self) for more information. 
-pub static COALESCE_STRUCT_UDF: Lazy> = Lazy::new(|| { - let return_type: ReturnTypeFunction = Arc::new(move |arg_types| { +#[derive(Debug)] +struct CoalesceStructUDF { + signature: Signature, +} + +impl ScalarUDFImpl for CoalesceStructUDF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + COALESCE_STRUCT_UDF_NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { if arg_types.is_empty() { return Err(DataFusionError::Plan(format!( "{COALESCE_STRUCT_UDF_NAME} expects at least 1 argument" @@ -83,10 +95,10 @@ pub static COALESCE_STRUCT_UDF: Lazy> = Lazy::new(|| { } } - Ok(Arc::new(first_dt.clone())) - }); + Ok(first_dt.clone()) + } - let fun: ScalarFunctionImplementation = Arc::new(move |args: &[ColumnarValue]| { + fn invoke(&self, args: &[ColumnarValue]) -> Result { #[allow(clippy::manual_try_fold)] args.iter().enumerate().fold(Ok(None), |accu, (pos, arg)| { let Some(accu) = accu? else {return Ok(Some(arg.clone()))}; @@ -106,11 +118,11 @@ pub static COALESCE_STRUCT_UDF: Lazy> = Lazy::new(|| { return Ok(Some(ColumnarValue::Scalar(scalar_coalesce_struct(scalar1, scalar2)))); } (ColumnarValue::Scalar(s), ColumnarValue::Array(array2)) => { - let array1 = s.to_array_of_size(array2.len()); + let array1 = s.to_array_of_size(array2.len())?; (array1, Arc::clone(array2)) } (ColumnarValue::Array(array1), ColumnarValue::Scalar(s)) => { - let array2 = s.to_array_of_size(array1.len()); + let array2 = s.to_array_of_size(array1.len())?; (array1, array2) } (ColumnarValue::Array(array1), ColumnarValue::Array(array2)) => { @@ -123,14 +135,16 @@ pub static COALESCE_STRUCT_UDF: Lazy> = Lazy::new(|| { })?.ok_or_else(|| DataFusionError::Plan(format!( "{COALESCE_STRUCT_UDF_NAME} expects at least 1 argument" ))) - }); - - Arc::new(ScalarUDF::new( - COALESCE_STRUCT_UDF_NAME, - &Signature::variadic_any(Volatility::Immutable), - &return_type, - &fun, - )) + } +} + +/// Implementation of `coalesce_struct`. +/// +/// See [module-level docs](self) for more information. +pub static COALESCE_STRUCT_UDF: Lazy> = Lazy::new(|| { + Arc::new(ScalarUDF::from(CoalesceStructUDF { + signature: Signature::variadic_any(Volatility::Immutable), + })) }); /// Recursively fold [`Array`]s. @@ -181,10 +195,7 @@ fn scalar_coalesce_struct(scalar1: ScalarValue, scalar2: &ScalarValue) -> Scalar /// /// See [module-level docs](self) for more information. 
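The recurring pattern in this upgrade is replacing `ScalarUDF::new` plus the `ReturnTypeFunction`/`ScalarFunctionImplementation` closures with a type implementing `ScalarUDFImpl`, wrapped via `ScalarUDF::from`. A condensed sketch of that shape (illustrative only; the "identity" UDF below is made up and not part of this patch):

```rust
use std::{any::Any, sync::Arc};

use arrow::datatypes::DataType;
use datafusion::{
    error::Result,
    logical_expr::{ScalarUDF, ScalarUDFImpl, Signature, Volatility},
    physical_plan::ColumnarValue,
};

#[derive(Debug)]
struct IdentityUdf {
    signature: Signature,
}

impl ScalarUDFImpl for IdentityUdf {
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn name(&self) -> &str {
        "identity" // hypothetical UDF name
    }

    fn signature(&self) -> &Signature {
        &self.signature
    }

    // The return type is computed from the argument types, replacing the old
    // `ReturnTypeFunction` closure.
    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
        Ok(arg_types[0].clone())
    }

    // The body replaces the old `ScalarFunctionImplementation` closure; here
    // it simply passes the single argument through unchanged.
    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
        Ok(args[0].clone())
    }
}

fn identity_udf() -> Arc<ScalarUDF> {
    Arc::new(ScalarUDF::from(IdentityUdf {
        signature: Signature::any(1, Volatility::Immutable),
    }))
}
```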
pub fn coalesce_struct(args: Vec) -> Expr { - Expr::ScalarUDF(datafusion::logical_expr::expr::ScalarUDF { - fun: Arc::clone(&COALESCE_STRUCT_UDF), - args, - }) + COALESCE_STRUCT_UDF.call(args) } #[cfg(test)] @@ -193,13 +204,13 @@ mod tests { datatypes::{Field, Fields, Schema}, record_batch::RecordBatch, }; + use datafusion::prelude::SessionContext; use datafusion::{ assert_batches_eq, common::assert_contains, prelude::{col, lit}, scalar::ScalarValue, }; - use datafusion_util::context_with_table; use super::*; @@ -217,9 +228,9 @@ mod tests { assert_case_ok( [ - ColumnarValue::Array(ScalarValue::UInt64(None).to_array()), - ColumnarValue::Array(ScalarValue::UInt64(Some(1)).to_array()), - ColumnarValue::Array(ScalarValue::UInt64(Some(2)).to_array()), + ColumnarValue::Array(ScalarValue::UInt64(None).to_array().unwrap()), + ColumnarValue::Array(ScalarValue::UInt64(Some(1)).to_array().unwrap()), + ColumnarValue::Array(ScalarValue::UInt64(Some(2)).to_array().unwrap()), ], &DataType::UInt64, ["+-----+", "| out |", "+-----+", "| 1 |", "+-----+"], @@ -228,7 +239,9 @@ mod tests { assert_case_ok( [ColumnarValue::Array( - ScalarValue::Struct(None, fields.clone()).to_array(), + ScalarValue::Struct(None, fields.clone()) + .to_array() + .unwrap(), )], &dt, ["+-----+", "| out |", "+-----+", "| |", "+-----+"], @@ -237,7 +250,11 @@ mod tests { assert_case_ok( [ - ColumnarValue::Array(ScalarValue::Struct(None, fields.clone()).to_array()), + ColumnarValue::Array( + ScalarValue::Struct(None, fields.clone()) + .to_array() + .unwrap(), + ), ColumnarValue::Array( ScalarValue::Struct( Some(vec![ @@ -246,9 +263,14 @@ mod tests { ]), fields.clone(), ) - .to_array(), + .to_array() + .unwrap(), + ), + ColumnarValue::Array( + ScalarValue::Struct(None, fields.clone()) + .to_array() + .unwrap(), ), - ColumnarValue::Array(ScalarValue::Struct(None, fields.clone()).to_array()), ColumnarValue::Array( ScalarValue::Struct( Some(vec![ @@ -263,7 +285,8 @@ mod tests { ]), fields.clone(), ) - .to_array(), + .to_array() + .unwrap(), ), ], &dt, @@ -302,7 +325,11 @@ mod tests { ]), fields.clone(), )), - ColumnarValue::Array(ScalarValue::Struct(None, fields.clone()).to_array()), + ColumnarValue::Array( + ScalarValue::Struct(None, fields.clone()) + .to_array() + .unwrap(), + ), ], &dt, [ @@ -323,21 +350,21 @@ mod tests { .await; assert_case_err( - [ColumnarValue::Array(ScalarValue::Struct(None, fields.clone()).to_array()), ColumnarValue::Array(ScalarValue::Struct(None, fields_b.clone()).to_array())], + [ColumnarValue::Array(ScalarValue::Struct(None, fields.clone()).to_array().unwrap()), ColumnarValue::Array(ScalarValue::Struct(None, fields_b.clone()).to_array().unwrap())], &dt, "Error during planning: coalesce_struct expects all arguments to have the same type, but first arg is" ) .await; assert_case_err( - [ColumnarValue::Array(ScalarValue::Struct(None, fields.clone()).to_array()), ColumnarValue::Scalar(ScalarValue::Struct(None, fields_b.clone()))], + [ColumnarValue::Array(ScalarValue::Struct(None, fields.clone()).to_array().unwrap()), ColumnarValue::Scalar(ScalarValue::Struct(None, fields_b.clone()))], &dt, "Error during planning: coalesce_struct expects all arguments to have the same type, but first arg is" ) .await; assert_case_err( - [ColumnarValue::Scalar(ScalarValue::Struct(None, fields.clone())), ColumnarValue::Array(ScalarValue::Struct(None, fields_b.clone()).to_array())], + [ColumnarValue::Scalar(ScalarValue::Struct(None, fields.clone())), ColumnarValue::Array(ScalarValue::Struct(None, fields_b.clone()).to_array().unwrap())], 
&dt, "Error during planning: coalesce_struct expects all arguments to have the same type, but first arg is" ) @@ -391,7 +418,8 @@ mod tests { RecordBatch::try_from_iter(cols.into_iter())? }; - let ctx = context_with_table(rb); + let ctx = SessionContext::new(); + ctx.register_batch("t", rb).unwrap(); let df = ctx.table("t").await?; let df = df.select(vec![coalesce_struct( vals.iter() diff --git a/query_functions/src/gapfill.rs b/query_functions/src/gapfill.rs index fe288f10320..a47fececf77 100644 --- a/query_functions/src/gapfill.rs +++ b/query_functions/src/gapfill.rs @@ -22,11 +22,11 @@ use std::sync::Arc; use arrow::datatypes::{DataType, Field, TimeUnit}; use datafusion::{ - error::DataFusionError, + error::{DataFusionError, Result}, logical_expr::{ - BuiltinScalarFunction, ReturnTypeFunction, ScalarFunctionImplementation, ScalarUDF, - Signature, TypeSignature, Volatility, + BuiltinScalarFunction, ScalarUDF, ScalarUDFImpl, Signature, TypeSignature, Volatility, }, + physical_plan::ColumnarValue, }; use once_cell::sync::Lazy; use schema::InfluxFieldType; @@ -34,6 +34,35 @@ use schema::InfluxFieldType; /// The name of the date_bin_gapfill UDF given to DataFusion. pub const DATE_BIN_GAPFILL_UDF_NAME: &str = "date_bin_gapfill"; +#[derive(Debug)] +struct DateBinGapFillUDF { + signature: Signature, +} + +impl ScalarUDFImpl for DateBinGapFillUDF { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn name(&self) -> &str { + DATE_BIN_GAPFILL_UDF_NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)) + } + + fn invoke(&self, _args: &[ColumnarValue]) -> Result { + Err(DataFusionError::NotImplemented(format!( + "{DATE_BIN_GAPFILL_UDF_NAME} is not yet implemented" + ))) + } +} + /// (Non-)Implementation of date_bin_gapfill. /// This function takes arguments identical to `date_bin()` but /// works in conjunction with the logical optimizer rule @@ -45,19 +74,48 @@ pub(crate) static DATE_BIN_GAPFILL: Lazy> = Lazy::new(|| { // We don't want this to be optimized away before we can give a helpful error message signatures.volatility = Volatility::Volatile; - let return_type_fn: ReturnTypeFunction = - Arc::new(|_| Ok(Arc::new(DataType::Timestamp(TimeUnit::Nanosecond, None)))); - Arc::new(ScalarUDF::new( - DATE_BIN_GAPFILL_UDF_NAME, - &signatures, - &return_type_fn, - &unimplemented_scalar_impl(DATE_BIN_GAPFILL_UDF_NAME), - )) + Arc::new(ScalarUDF::from(DateBinGapFillUDF { + signature: signatures, + })) }); /// The name of the locf UDF given to DataFusion. pub const LOCF_UDF_NAME: &str = "locf"; +#[derive(Debug)] +struct LocfUDF { + signature: Signature, +} + +impl ScalarUDFImpl for LocfUDF { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn name(&self) -> &str { + LOCF_UDF_NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + if arg_types.is_empty() { + return Err(DataFusionError::Plan(format!( + "{LOCF_UDF_NAME} should have at least 1 argument" + ))); + } + Ok(arg_types[0].clone()) + } + + fn invoke(&self, _args: &[ColumnarValue]) -> Result { + Err(DataFusionError::NotImplemented(format!( + "{LOCF_UDF_NAME} is not yet implemented" + ))) + } +} + /// (Non-)Implementation of locf. /// This function takes a single argument of any type and /// produces a value of the same type. 
It is @@ -66,18 +124,48 @@ pub const LOCF_UDF_NAME: &str = "locf"; /// an implementation since it will be consumed by the logical optimizer rule /// `HandleGapFill`. pub(crate) static LOCF: Lazy> = Lazy::new(|| { - let return_type_fn: ReturnTypeFunction = Arc::new(|args| Ok(Arc::new(args[0].clone()))); - Arc::new(ScalarUDF::new( - LOCF_UDF_NAME, - &Signature::any(1, Volatility::Volatile), - &return_type_fn, - &unimplemented_scalar_impl(LOCF_UDF_NAME), - )) + Arc::new(ScalarUDF::from(LocfUDF { + signature: Signature::any(1, Volatility::Volatile), + })) }); /// The name of the interpolate UDF given to DataFusion. pub const INTERPOLATE_UDF_NAME: &str = "interpolate"; +#[derive(Debug)] +struct InterpolateUDF { + signature: Signature, +} + +impl ScalarUDFImpl for InterpolateUDF { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn name(&self) -> &str { + INTERPOLATE_UDF_NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + if arg_types.is_empty() { + return Err(DataFusionError::Plan(format!( + "{INTERPOLATE_UDF_NAME} should have at least 1 argument" + ))); + } + Ok(arg_types[0].clone()) + } + + fn invoke(&self, _args: &[ColumnarValue]) -> Result { + Err(DataFusionError::NotImplemented(format!( + "{INTERPOLATE_UDF_NAME} is not yet implemented" + ))) + } +} + /// (Non-)Implementation of interpolate. /// This function takes a single numeric argument and /// produces a value of the same type. It is @@ -86,7 +174,6 @@ pub const INTERPOLATE_UDF_NAME: &str = "interpolate"; /// an implementation since it will be consumed by the logical optimizer rule /// `HandleGapFill`. pub(crate) static INTERPOLATE: Lazy> = Lazy::new(|| { - let return_type_fn: ReturnTypeFunction = Arc::new(|args| Ok(Arc::new(args[0].clone()))); let signatures = [ InfluxFieldType::Float, InfluxFieldType::Integer, @@ -107,34 +194,35 @@ pub(crate) static INTERPOLATE: Lazy> = Lazy::new(|| { ] .into(), )]), + TypeSignature::Exact(vec![DataType::Struct( + vec![ + Field::new("value", influx_type.into(), true), + Field::new( + "time", + DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())), + true, + ), + ] + .into(), + )]), ] }) .collect(); - Arc::new(ScalarUDF::new( - INTERPOLATE_UDF_NAME, - &Signature::one_of(signatures, Volatility::Volatile), - &return_type_fn, - &unimplemented_scalar_impl(INTERPOLATE_UDF_NAME), - )) + Arc::new(ScalarUDF::from(InterpolateUDF { + signature: Signature::one_of(signatures, Volatility::Volatile), + })) }); -fn unimplemented_scalar_impl(name: &'static str) -> ScalarFunctionImplementation { - Arc::new(move |_| { - Err(DataFusionError::NotImplemented(format!( - "{name} is not yet implemented" - ))) - }) -} - #[cfg(test)] mod test { use arrow::array::{ArrayRef, Float64Array, TimestampNanosecondArray}; use arrow::record_batch::RecordBatch; use datafusion::common::assert_contains; use datafusion::error::Result; - use datafusion::prelude::{col, lit_timestamp_nano, Expr}; + use datafusion::prelude::{col, Expr, SessionContext}; use datafusion::scalar::ScalarValue; - use datafusion_util::context_with_table; + use datafusion_util::lit_timestamptz_nano; + use schema::TIME_DATA_TIMEZONE; use std::sync::Arc; fn date_bin_gapfill(stride: Expr, source: Expr, origin: Expr) -> Expr { @@ -150,13 +238,18 @@ mod test { #[tokio::test] async fn date_bin_gapfill_errs() -> Result<()> { - let times = Arc::new(TimestampNanosecondArray::from(vec![Some(1000)])); + let times = Arc::new( + TimestampNanosecondArray::from(vec![Some(1000)]) + 
.with_timezone_opt(TIME_DATA_TIMEZONE()), + ); let rb = RecordBatch::try_from_iter(vec![("time", times as ArrayRef)])?; - let ctx = context_with_table(rb); + let ctx = SessionContext::new(); + ctx.register_batch("t", rb).unwrap(); + let df = ctx.table("t").await?.select(vec![date_bin_gapfill( lit_interval_milliseconds(360_000), col("time"), - lit_timestamp_nano(0), + lit_timestamptz_nano(0), )])?; let res = df.collect().await; let expected = "date_bin_gapfill is not yet implemented"; @@ -175,7 +268,8 @@ mod test { async fn locf_errs() { let arg = Arc::new(Float64Array::from(vec![100.0])); let rb = RecordBatch::try_from_iter(vec![("f0", arg as ArrayRef)]).unwrap(); - let ctx = context_with_table(rb); + let ctx = SessionContext::new(); + ctx.register_batch("t", rb).unwrap(); let df = ctx .table("t") .await @@ -198,7 +292,8 @@ mod test { async fn interpolate_errs() { let arg = Arc::new(Float64Array::from(vec![100.0])); let rb = RecordBatch::try_from_iter(vec![("f0", arg as ArrayRef)]).unwrap(); - let ctx = context_with_table(rb); + let ctx = SessionContext::new(); + ctx.register_batch("t", rb).unwrap(); let df = ctx .table("t") .await diff --git a/query_functions/src/lib.rs b/query_functions/src/lib.rs index 34586b8dc11..658a5fafb5b 100644 --- a/query_functions/src/lib.rs +++ b/query_functions/src/lib.rs @@ -12,6 +12,7 @@ clippy::dbg_macro, unused_crate_dependencies )] +#![allow(unreachable_pub)] // Workaround for "unused crate" lint false positives. use workspace_hack as _; @@ -34,6 +35,9 @@ mod regex; /// Selector Functions pub mod selectors; +/// Sleep function. +mod sleep; + /// window_bounds expressions mod window; @@ -41,10 +45,12 @@ pub mod gapfill; /// Function registry mod registry; +mod to_timestamp; pub use crate::regex::clean_non_meta_escapes; pub use crate::regex::REGEX_MATCH_UDF_NAME; pub use crate::regex::REGEX_NOT_MATCH_UDF_NAME; +pub use crate::sleep::SLEEP_UDF_NAME; /// Return an Expr that invokes a InfluxRPC compatible regex match to /// determine which values satisfy the pattern. 
Equivalent to: @@ -117,7 +123,7 @@ mod test { record_batch::RecordBatch, }; use datafusion::{assert_batches_eq, prelude::col}; - use datafusion_util::context_with_table; + use schema::TIME_DATA_TIMEZONE; use std::sync::Arc; use super::*; @@ -132,7 +138,8 @@ mod test { )]) .unwrap(); - let ctx = context_with_table(batch); + let ctx = SessionContext::new(); + ctx.register_batch("t", batch).unwrap(); let result = ctx .table("t") .await @@ -165,7 +172,8 @@ mod test { )]) .unwrap(); - let ctx = context_with_table(batch); + let ctx = SessionContext::new(); + ctx.register_batch("t", batch).unwrap(); let result = ctx .table("t") .await @@ -187,14 +195,18 @@ mod test { async fn test_make_window_bound_expr() { let batch = RecordBatch::try_from_iter(vec![( "time", - Arc::new(TimestampNanosecondArray::from(vec![Some(1000), Some(2000)])) as ArrayRef, + Arc::new( + TimestampNanosecondArray::from(vec![Some(1000), Some(2000)]) + .with_timezone_opt(TIME_DATA_TIMEZONE()), + ) as ArrayRef, )]) .unwrap(); let each = WindowDuration::Fixed { nanoseconds: 100 }; let every = WindowDuration::Fixed { nanoseconds: 200 }; - let ctx = context_with_table(batch); + let ctx = SessionContext::new(); + ctx.register_batch("t", batch).unwrap(); let result = ctx .table("t") .await diff --git a/query_functions/src/regex.rs b/query_functions/src/regex.rs index f153a432149..2e3feae1239 100644 --- a/query_functions/src/regex.rs +++ b/query_functions/src/regex.rs @@ -201,11 +201,11 @@ mod test { record_batch::RecordBatch, util::pretty::pretty_format_batches, }; + use datafusion::prelude::SessionContext; use datafusion::{ error::DataFusionError, prelude::{col, lit, Expr}, }; - use datafusion_util::context_with_table; use std::sync::Arc; use super::*; @@ -338,7 +338,8 @@ mod test { ]) .unwrap(); - let ctx = context_with_table(rb); + let ctx = SessionContext::new(); + ctx.register_batch("t", rb).unwrap(); let df = ctx.table("t").await.unwrap(); let df = df.filter(op).unwrap(); diff --git a/query_functions/src/registry.rs b/query_functions/src/registry.rs index a4f920db1c6..609b83f93c1 100644 --- a/query_functions/src/registry.rs +++ b/query_functions/src/registry.rs @@ -7,7 +7,7 @@ use datafusion::{ }; use once_cell::sync::Lazy; -use crate::{gapfill, regex, window}; +use crate::{gapfill, regex, sleep, to_timestamp, window}; static REGISTRY: Lazy = Lazy::new(IOxFunctionRegistry::new); @@ -24,11 +24,13 @@ impl IOxFunctionRegistry { impl FunctionRegistry for IOxFunctionRegistry { fn udfs(&self) -> HashSet { [ + to_timestamp::TO_TIMESTAMP_FUNCTION_NAME, gapfill::DATE_BIN_GAPFILL_UDF_NAME, gapfill::LOCF_UDF_NAME, gapfill::INTERPOLATE_UDF_NAME, regex::REGEX_MATCH_UDF_NAME, regex::REGEX_NOT_MATCH_UDF_NAME, + sleep::SLEEP_UDF_NAME, window::WINDOW_BOUNDS_UDF_NAME, ] .into_iter() @@ -38,11 +40,13 @@ impl FunctionRegistry for IOxFunctionRegistry { fn udf(&self, name: &str) -> DataFusionResult> { match name { + to_timestamp::TO_TIMESTAMP_FUNCTION_NAME => Ok(to_timestamp::TO_TIMESTAMP_UDF.clone()), gapfill::DATE_BIN_GAPFILL_UDF_NAME => Ok(gapfill::DATE_BIN_GAPFILL.clone()), gapfill::LOCF_UDF_NAME => Ok(gapfill::LOCF.clone()), gapfill::INTERPOLATE_UDF_NAME => Ok(gapfill::INTERPOLATE.clone()), regex::REGEX_MATCH_UDF_NAME => Ok(regex::REGEX_MATCH_UDF.clone()), regex::REGEX_NOT_MATCH_UDF_NAME => Ok(regex::REGEX_NOT_MATCH_UDF.clone()), + sleep::SLEEP_UDF_NAME => Ok(sleep::SLEEP_UDF.clone()), window::WINDOW_BOUNDS_UDF_NAME => Ok(window::WINDOW_BOUNDS_UDF.clone()), _ => Err(DataFusionError::Plan(format!( "IOx FunctionRegistry does not contain function 
'{name}'" diff --git a/query_functions/src/selectors/internal.rs b/query_functions/src/selectors/internal.rs index 63c6f042bb2..a136cc1dc32 100644 --- a/query_functions/src/selectors/internal.rs +++ b/query_functions/src/selectors/internal.rs @@ -214,7 +214,7 @@ impl Selector { let time_arr = arrow::compute::nullif( time_arr, &arrow::compute::kernels::cmp::neq( - &self.value.to_array_of_size(time_arr.len()), + &self.value.to_array_of_size(time_arr.len())?, &value_arr, )?, )?; diff --git a/query_functions/src/sleep.rs b/query_functions/src/sleep.rs new file mode 100644 index 00000000000..1995c405ce2 --- /dev/null +++ b/query_functions/src/sleep.rs @@ -0,0 +1,94 @@ +use std::{any::Any, sync::Arc}; + +use arrow::datatypes::{DataType, TimeUnit}; +use datafusion::{ + error::{DataFusionError, Result}, + logical_expr::{ScalarUDF, ScalarUDFImpl, Signature, Volatility}, + physical_plan::ColumnarValue, +}; +use once_cell::sync::Lazy; + +/// The name of the "sleep" UDF given to DataFusion. +pub const SLEEP_UDF_NAME: &str = "sleep"; + +#[derive(Debug)] +struct SleepUDF { + signature: Signature, +} + +impl ScalarUDFImpl for SleepUDF { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + SLEEP_UDF_NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Null) + } + + fn invoke(&self, _args: &[ColumnarValue]) -> Result { + Err(DataFusionError::Internal( + "sleep function should have been replaced by optimizer pass to avoid thread blocking" + .to_owned(), + )) + } +} + +/// Implementation of "sleep" +pub(crate) static SLEEP_UDF: Lazy> = Lazy::new(|| { + Arc::new(ScalarUDF::from(SleepUDF { + signature: Signature::uniform( + 1, + vec![ + DataType::Null, + DataType::Duration(TimeUnit::Second), + DataType::Duration(TimeUnit::Millisecond), + DataType::Duration(TimeUnit::Millisecond), + DataType::Duration(TimeUnit::Microsecond), + DataType::Duration(TimeUnit::Nanosecond), + DataType::Float32, + DataType::Float64, + ], + Volatility::Volatile, + ), + })) +}); + +#[cfg(test)] +mod tests { + use datafusion::{ + common::assert_contains, + logical_expr::LogicalPlanBuilder, + physical_plan::common::collect, + prelude::{lit, SessionContext}, + scalar::ScalarValue, + }; + + use super::*; + + #[tokio::test] + async fn test() { + let ctx = SessionContext::new(); + let plan = LogicalPlanBuilder::empty(true) + .project([SLEEP_UDF.call(vec![lit(ScalarValue::Null)]).alias("sleep")]) + .unwrap() + .build() + .unwrap(); + let plan = ctx.state().create_physical_plan(&plan).await.unwrap(); + let err = collect(plan.execute(0, ctx.task_ctx()).unwrap()) + .await + .unwrap_err(); + + assert_contains!( + err.to_string(), + "sleep function should have been replaced by optimizer pass" + ); + } +} diff --git a/query_functions/src/to_timestamp.rs b/query_functions/src/to_timestamp.rs new file mode 100644 index 00000000000..4df6c0bcb52 --- /dev/null +++ b/query_functions/src/to_timestamp.rs @@ -0,0 +1,85 @@ +//! Implementation of `to_timestamp` function that +//! overrides the built in version in DataFusion because the semantics changed +//! upstream: +//! +//! +//! 
See for more details +use std::sync::Arc; + +use arrow::datatypes::DataType; +use arrow::datatypes::TimeUnit; +use datafusion::common::internal_err; +use datafusion::error::Result; +use datafusion::logical_expr::ScalarUDFImpl; +use datafusion::logical_expr::Signature; +use datafusion::physical_expr::datetime_expressions; +use datafusion::physical_expr::expressions::cast_column; +use datafusion::{ + error::DataFusionError, + logical_expr::{ScalarUDF, Volatility}, + physical_plan::ColumnarValue, +}; +use once_cell::sync::Lazy; + +/// The name of the function +pub const TO_TIMESTAMP_FUNCTION_NAME: &str = "to_timestamp"; + +#[derive(Debug)] +struct ToTimestampUDF { + signature: Signature, +} + +impl ScalarUDFImpl for ToTimestampUDF { + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn name(&self) -> &str { + TO_TIMESTAMP_FUNCTION_NAME + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + if args.len() != 1 { + return internal_err!("to_timestamp expected 1 argument, got {}", args.len()); + } + + match args[0].data_type() { + // call through to arrow cast kernel + DataType::Int64 | DataType::Timestamp(_, _) => cast_column( + &args[0], + &DataType::Timestamp(TimeUnit::Nanosecond, None), + None, + ), + DataType::Utf8 => datetime_expressions::to_timestamp_nanos(args), + dt => internal_err!("to_timestamp does not support argument type '{dt}'"), + } + } +} + +/// Implementation of to_timestamp +pub(crate) static TO_TIMESTAMP_UDF: Lazy> = Lazy::new(|| { + Arc::new(ScalarUDF::from(ToTimestampUDF { + signature: Signature::uniform( + 1, + vec![ + DataType::Int64, + DataType::Timestamp(TimeUnit::Nanosecond, None), + DataType::Timestamp(TimeUnit::Microsecond, None), + DataType::Timestamp(TimeUnit::Millisecond, None), + DataType::Timestamp(TimeUnit::Second, None), + DataType::Utf8, + ], + Volatility::Immutable, + ), + })) +}); + +// https://github.com/apache/arrow-datafusion/pull/7844 diff --git a/query_functions/src/window.rs b/query_functions/src/window.rs index db6058efd18..7196b0aefc4 100644 --- a/query_functions/src/window.rs +++ b/query_functions/src/window.rs @@ -1,7 +1,7 @@ mod internal; pub use internal::Duration; -use schema::TIME_DATA_TYPE; +use schema::{TIME_DATA_TIMEZONE, TIME_DATA_TYPE}; use std::sync::Arc; @@ -158,7 +158,9 @@ fn window_bounds(arg: &dyn Array, every: WindowDuration, offset: WindowDuration) }) }); - let array = values.collect::(); + let array = values + .collect::() + .with_timezone_opt(TIME_DATA_TIMEZONE()); Arc::new(array) as ArrayRef } @@ -264,26 +266,20 @@ mod tests { #[test] fn test_window_bounds() { - let input: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![ - Some(100), - None, - Some(200), - Some(300), - Some(400), - ])); + let input: ArrayRef = Arc::new( + TimestampNanosecondArray::from(vec![Some(100), None, Some(200), Some(300), Some(400)]) + .with_timezone_opt(TIME_DATA_TIMEZONE()), + ); let every = WindowDuration::from_nanoseconds(200); let offset = WindowDuration::from_nanoseconds(50); let bounds_array = window_bounds(&input, every, offset); - let expected_array: ArrayRef = Arc::new(TimestampNanosecondArray::from(vec![ - Some(250), - None, - Some(250), - Some(450), - Some(450), - ])); + let expected_array: ArrayRef = Arc::new( + TimestampNanosecondArray::from(vec![Some(250), None, Some(250), Some(450), Some(450)]) + .with_timezone_opt(TIME_DATA_TIMEZONE()), 
+ ); assert_eq!( &expected_array, &bounds_array, diff --git a/schema/Cargo.toml b/schema/Cargo.toml index 2b5a49c8c5e..0e595b344ec 100644 --- a/schema/Cargo.toml +++ b/schema/Cargo.toml @@ -6,10 +6,14 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] -arrow = { workspace = true, features = ["prettyprint"] } +arrow = { workspace = true } hashbrown = { workspace = true } -indexmap = { version = "2.0", features = ["std"] } +indexmap = { version = "2.1", features = ["std"] } observability_deps = { path = "../observability_deps" } -snafu = "0.7" +snafu = "0.8" workspace-hack = { version = "0.1", path = "../workspace-hack" } +once_cell = "1" diff --git a/schema/src/lib.rs b/schema/src/lib.rs index c7c814813e8..08fc697efa9 100644 --- a/schema/src/lib.rs +++ b/schema/src/lib.rs @@ -33,6 +33,7 @@ use arrow::datatypes::{ use hashbrown::HashSet; use crate::sort::SortKey; +use once_cell::sync::OnceCell; use snafu::{OptionExt, Snafu}; /// The name of the timestamp column in the InfluxDB datamodel @@ -44,16 +45,21 @@ pub const INFLUXQL_MEASUREMENT_COLUMN_NAME: &str = "iox::measurement"; pub const INFLUXQL_METADATA_KEY: &str = "iox::influxql::group_key::metadata"; /// The Timezone to use for InfluxDB timezone (should be a constant) +// TODO: Start Epic Add timezone support to IOx #18154 +// https://github.com/influxdata/idpe/issues/18154 #[allow(non_snake_case)] pub fn TIME_DATA_TIMEZONE() -> Option> { - // TODO: we should use the "UTC" timezone as that is what the - // InfluxDB data model timestamps are relative to. However, - // DataFusion doesn't currently do a great job with such - // timezones so punting for now - //Some(String::from("UTC")); - None + _TIME_DATA_TIMEZONE + .get_or_init(|| { + std::env::var("INFLUXDB_IOX_TIME_DATA_TIMEZONE") + .map_or_else(|_| None, |v| Some(v.into())) + }) + .clone() } +// TODO: refactor TIME_DATA_TIMEZONE() into a lazy static +static _TIME_DATA_TIMEZONE: OnceCell>> = OnceCell::new(); + /// the [`ArrowDataType`] to use for InfluxDB timestamps #[allow(non_snake_case)] pub fn TIME_DATA_TYPE() -> ArrowDataType { @@ -783,7 +789,7 @@ macro_rules! assert_column_eq { pub(crate) mod test_util { use super::*; - pub fn make_field( + pub(crate) fn make_field( name: &str, data_type: arrow::datatypes::DataType, nullable: bool, diff --git a/schema/src/sort.rs b/schema/src/sort.rs index 52a8ad2652b..7d4c412a53c 100644 --- a/schema/src/sort.rs +++ b/schema/src/sort.rs @@ -251,6 +251,18 @@ impl SortKey { } } +impl From for Vec { + fn from(val: SortKey) -> Self { + val.columns.iter().map(|(id, _)| id.to_string()).collect() + } +} + +impl From> for SortKey { + fn from(val: Vec) -> Self { + Self::from_columns(val) + } +} + // Produces a human-readable representation of a sort key that looks like: // // "host, region DESC, env NULLS FIRST, time" @@ -288,20 +300,26 @@ pub fn compute_sort_key<'a>( let primary_key = schema.primary_key(); let cardinalities = distinct_counts(batches, &primary_key); + let sort_key = sort_key_from_cardinalities(&cardinalities); - let mut cardinalities: Vec<_> = cardinalities.into_iter().collect(); + debug!(?primary_key, ?sort_key, "computed sort key"); + sort_key +} + +/// Given columns and their cardinalities (the number of distinct values in the data), sort the +/// columns by cardinality and turn that ordering into a [`SortKey`], with the time column always +/// appearing last. 
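A worked example of the ordering `sort_key_from_cardinalities` produces (a sketch under assumed crate paths, not code from this patch): columns are sorted by ascending cardinality with ties broken by name, and `time` is always appended last.

```rust
use std::collections::HashMap;

// Assumed import paths; the function and `SortKey` live in schema/src/sort.rs.
use schema::sort::{sort_key_from_cardinalities, SortKey};

fn main() {
    // Hypothetical distinct-value counts for a table's primary-key columns.
    let cardinalities: HashMap<String, usize> = HashMap::from([
        ("region".to_string(), 10),
        ("host".to_string(), 250),
        ("env".to_string(), 3),
    ]);

    let key: SortKey = sort_key_from_cardinalities(&cardinalities);

    // Lowest cardinality first, time forced to the end, using the
    // `From<SortKey> for Vec<String>` conversion added in this patch.
    let columns: Vec<String> = key.into();
    assert_eq!(columns, vec!["env", "region", "host", "time"]);
}
```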
+pub fn sort_key_from_cardinalities(cardinalities: &HashMap) -> SortKey { + let mut cardinalities: Vec<_> = cardinalities.iter().collect(); // Sort by (cardinality, column_name) to have deterministic order if same cardinality cardinalities.sort_by_cached_key(|x| (x.1, x.0.clone())); let mut builder = SortKeyBuilder::with_capacity(cardinalities.len() + 1); for (col, _) in cardinalities { - builder = builder.with_col(col) + builder = builder.with_col(col.as_str()) } builder = builder.with_col(TIME_COLUMN_NAME); - let sort_key = builder.build(); - - debug!(?primary_key, ?sort_key, "computed sort key"); - sort_key + builder.build() } /// Takes batches of data and the columns that make up the primary key. Computes the number of @@ -310,7 +328,7 @@ pub fn compute_sort_key<'a>( fn distinct_counts<'a>( batches: impl Iterator, primary_key: &[&str], -) -> HashMap { +) -> HashMap { let mut distinct_values_across_batches = HashMap::with_capacity(primary_key.len()); for batch in batches { @@ -324,14 +342,7 @@ fn distinct_counts<'a>( distinct_values_across_batches .into_iter() - .map(|(column, distinct_values)| { - let count = distinct_values - .len() - .try_into() - .expect("usize -> u64 overflow"); - - (column, count) - }) + .map(|(column, distinct_values)| (column, distinct_values.len())) .collect() } @@ -404,7 +415,7 @@ pub fn adjust_sort_key_columns( let existing_columns_without_time = catalog_sort_key .iter() .map(|(col, _opts)| col) - .filter(|&col| TIME_COLUMN_NAME != col.as_ref()) + .filter(|col| TIME_COLUMN_NAME != col.as_ref()) .cloned(); let new_columns: Vec<_> = primary_key .iter() diff --git a/service_common/Cargo.toml b/service_common/Cargo.toml index a4d83333b0a..ec328aa0b67 100644 --- a/service_common/Cargo.toml +++ b/service_common/Cargo.toml @@ -5,19 +5,12 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # In alphabetical order -async-trait = "0.1.73" -bytes = "1.5" +arrow = { workspace = true } datafusion = { workspace = true } executor = { path = "../executor" } -iox_query = { path = "../iox_query" } -iox_query_influxql = { path = "../iox_query_influxql" } -iox_query_influxrpc = { path = "../iox_query_influxrpc" } -flightsql = { path = "../flightsql" } -metric = { path = "../metric" } -parking_lot = "0.12" -predicate = { path = "../predicate" } tonic = { workspace = true } -trace = { path = "../trace" } -tracker = { path = "../tracker" } workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/service_common/src/error.rs b/service_common/src/error.rs index 7e0924e70d3..f9a5b2ecf3c 100644 --- a/service_common/src/error.rs +++ b/service_common/src/error.rs @@ -26,8 +26,8 @@ pub fn datafusion_error_to_tonic_code(e: &DataFusionError) -> tonic::Code { match e { DataFusionError::ResourcesExhausted(_) => tonic::Code::ResourceExhausted, // Map as many as possible back into user visible (non internal) errors - DataFusionError::SQL(_) - | DataFusionError::SchemaError(_) + DataFusionError::SQL(_, _) + | DataFusionError::SchemaError(_, _) // Execution, ArrowError and ParquetError might be due to an // internal error (e.g. some sort of IO error or bug) or due // to a user input error (e.g. 
you can get an Arrow error if @@ -37,7 +37,7 @@ pub fn datafusion_error_to_tonic_code(e: &DataFusionError) -> tonic::Code { // classify them as InvalidArgument so the user has a chance // to see them | DataFusionError::Execution(_) - | DataFusionError::ArrowError(_) + | DataFusionError::ArrowError(_, _) | DataFusionError::ParquetError(_) // DataFusion most often returns "NotImplemented" when a // particular SQL feature is not implemented. This @@ -99,7 +99,7 @@ mod test { ); let e = ParserError::ParserError(s.clone()); - do_transl_test(DataFusionError::SQL(e), tonic::Code::InvalidArgument); + do_transl_test(DataFusionError::SQL(e, None), tonic::Code::InvalidArgument); do_transl_test( DataFusionError::NotImplemented(s.clone()), diff --git a/service_common/src/lib.rs b/service_common/src/lib.rs index 23c420663ee..5b055ec8602 100644 --- a/service_common/src/lib.rs +++ b/service_common/src/lib.rs @@ -18,37 +18,8 @@ use workspace_hack as _; mod error; -pub mod planner; -pub mod test_util; - -use std::sync::Arc; - -use async_trait::async_trait; -use iox_query::QueryNamespace; -use trace::span::Span; -use tracker::InstrumentedAsyncOwnedSemaphorePermit; - -/// Trait that allows the query engine (which includes flight and storage/InfluxRPC) to access a -/// virtual set of namespaces. -/// -/// The query engine MUST ONLY use this trait to access the namespaces / catalogs. -#[async_trait] -pub trait QueryNamespaceProvider: std::fmt::Debug + Send + Sync + 'static { - /// Abstract namespace. - type Db: QueryNamespace; - - /// Get namespace if it exists. - /// - /// System tables may contain debug information depending on `include_debug_info_tables`. - async fn db( - &self, - name: &str, - span: Option, - include_debug_info_tables: bool, - ) -> Option>; - - /// Acquire concurrency-limiting sempahore - async fn acquire_semaphore(&self, span: Option) -> InstrumentedAsyncOwnedSemaphorePermit; -} pub use error::datafusion_error_to_tonic_code; + +// Included to avoid arrow in workspace-hack crate +use arrow as _; diff --git a/service_grpc_flight/Cargo.toml b/service_grpc_flight/Cargo.toml index b78a03b670b..73386f61658 100644 --- a/service_grpc_flight/Cargo.toml +++ b/service_grpc_flight/Cargo.toml @@ -5,6 +5,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # Workspace dependencies, in alphabetical order authz = { path = "../authz" } @@ -14,21 +17,24 @@ flightsql = { path = "../flightsql" } generated_types = { path = "../generated_types" } observability_deps = { path = "../observability_deps" } iox_query = { path = "../iox_query" } +iox_query_influxql = { path = "../iox_query_influxql" } +iox_query_params = { path = "../iox_query_params" } service_common = { path = "../service_common" } +tower_trailer = { path = "../tower_trailer"} trace = { path = "../trace"} trace_http = { path = "../trace_http"} tracker = { path = "../tracker" } # Crates.io dependencies, in alphabetical order -arrow = { workspace = true, features = ["prettyprint"] } -arrow-flight = { workspace = true, features=["flight-sql-experimental"] } +arrow = { workspace = true } +arrow-flight = { workspace = true } bytes = "1.5" futures = "0.3" -prost = "0.11" +prost = { workspace = true } serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.107" -snafu = "0.7" -tokio = { version = "1.32", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] } +serde_json = "1.0.111" +snafu = "0.8" +tokio = { version = "1.35", features = 
["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] } tonic = { workspace = true } workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/service_grpc_flight/src/keep_alive.rs b/service_grpc_flight/src/keep_alive.rs index 38f470e515c..0a1836cd9ab 100644 --- a/service_grpc_flight/src/keep_alive.rs +++ b/service_grpc_flight/src/keep_alive.rs @@ -136,23 +136,29 @@ use arrow::{ ipc::writer::{DictionaryTracker, IpcDataGenerator, IpcWriteOptions}, record_batch::RecordBatch, }; -use arrow_flight::{error::FlightError, FlightData}; +use arrow_flight::FlightData; use futures::{stream::BoxStream, Stream, StreamExt}; use observability_deps::tracing::{info, warn}; use tokio::time::{Interval, MissedTickBehavior}; /// Keep alive underlying response stream by sending regular empty [`RecordBatch`]es. -pub struct KeepAliveStream { - inner: BoxStream<'static, Result>, +pub(crate) struct KeepAliveStream +where + E: 'static, +{ + inner: BoxStream<'static, Result>, } -impl KeepAliveStream { +impl KeepAliveStream +where + E: 'static, +{ /// Create new keep-alive wrapper from the underlying stream and the given interval. /// /// The interval is measured from the last message -- which can either be a "real" message or a keep-alive. - pub fn new(s: S, interval: Duration) -> Self + pub(crate) fn new(s: S, interval: Duration) -> Self where - S: Stream> + Send + 'static, + S: Stream> + Send + 'static, { let mut ticker = tokio::time::interval(interval); ticker.set_missed_tick_behavior(MissedTickBehavior::Delay); @@ -194,8 +200,11 @@ impl KeepAliveStream { } } -impl Stream for KeepAliveStream { - type Item = Result; +impl Stream for KeepAliveStream +where + E: 'static, +{ + type Item = Result; fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { self.inner.poll_next_unpin(cx) @@ -203,9 +212,12 @@ impl Stream for KeepAliveStream { } /// Inner state of [`KeepAliveStream`] -struct State { +struct State +where + E: 'static, +{ /// The underlying stream that is kept alive. - inner: BoxStream<'static, Result>, + inner: BoxStream<'static, Result>, /// A [`Schema`] that was already received from the stream. /// @@ -274,13 +286,13 @@ fn build_empty_batch_msg(schema: Option<&SchemaRef>) -> Option { } #[cfg(test)] -pub mod test_util { +pub(crate) mod test_util { use std::time::Duration; use futures::{stream::BoxStream, Stream, StreamExt}; /// Ensure that there is a delay between steam responses. 
- pub fn make_stream_slow(s: S, delay: Duration) -> BoxStream<'static, S::Item> + pub(crate) fn make_stream_slow(s: S, delay: Duration) -> BoxStream<'static, S::Item> where S: Send + Stream + Unpin + 'static, { @@ -296,7 +308,9 @@ pub mod test_util { #[cfg(test)] mod tests { use arrow::{array::Int64Array, datatypes::Field}; - use arrow_flight::{decode::FlightRecordBatchStream, encode::FlightDataEncoderBuilder}; + use arrow_flight::{ + decode::FlightRecordBatchStream, encode::FlightDataEncoderBuilder, error::FlightError, + }; use datafusion::assert_batches_eq; use futures::TryStreamExt; use test_helpers::maybe_start_logging; @@ -376,6 +390,6 @@ mod tests { s }; - (panic_on_stream_timeout(s, Duration::from_millis(250))) as _ + panic_on_stream_timeout(s, Duration::from_millis(250)) } } diff --git a/service_grpc_flight/src/lib.rs b/service_grpc_flight/src/lib.rs index d842556aa58..5e345c7287c 100644 --- a/service_grpc_flight/src/lib.rs +++ b/service_grpc_flight/src/lib.rs @@ -17,15 +17,19 @@ )] use keep_alive::KeepAliveStream; +use planner::Planner; +use tower_trailer::{HeaderMap, Trailers}; // Workaround for "unused crate" lint false positives. use workspace_hack as _; mod keep_alive; +mod planner; mod request; use arrow::error::ArrowError; use arrow_flight::{ encode::FlightDataEncoderBuilder, + error::FlightError, flight_descriptor::DescriptorType, flight_service_server::{FlightService as Flight, FlightServiceServer as FlightServer}, Action, ActionType, Criteria, Empty, FlightData, FlightDescriptor, FlightEndpoint, FlightInfo, @@ -35,20 +39,24 @@ use authz::{extract_token, Authorizer}; use data_types::NamespaceNameError; use datafusion::{error::DataFusionError, physical_plan::ExecutionPlan}; use flightsql::FlightSQLCommand; -use futures::{ready, Stream, StreamExt, TryStreamExt}; +use futures::{ready, stream::BoxStream, Stream, StreamExt, TryStreamExt}; use generated_types::influxdata::iox::querier::v1 as proto; -use iox_query::{exec::IOxSessionContext, QueryCompletedToken, QueryNamespace}; +use iox_query::{ + exec::IOxSessionContext, + query_log::{QueryCompletedToken, QueryLogEntry, StatePermit, StatePlanned}, + QueryNamespaceProvider, +}; use observability_deps::tracing::{debug, info, warn}; use prost::Message; use request::{IoxGetRequest, RunQuery}; -use service_common::{datafusion_error_to_tonic_code, planner::Planner, QueryNamespaceProvider}; +use service_common::datafusion_error_to_tonic_code; use snafu::{OptionExt, ResultExt, Snafu}; use std::{ fmt::Debug, pin::Pin, - sync::Arc, + sync::{Arc, Mutex}, task::Poll, - time::{Duration, Instant}, + time::Duration, }; use tonic::{ metadata::{AsciiMetadataValue, MetadataMap}, @@ -63,13 +71,27 @@ use tracker::InstrumentedAsyncOwnedSemaphorePermit; /// /// See /// for discussion on adding support to FlightSQL itself. -const IOX_FLIGHT_SQL_DATABASE_HEADERS: [&str; 4] = [ +const IOX_FLIGHT_SQL_DATABASE_REQUEST_HEADERS: [&str; 4] = [ "database", // preferred "bucket", "bucket-name", "iox-namespace-name", // deprecated ]; +/// Trailer that describes the duration (in seconds) for which a query was queued due to concurrency limits. +const IOX_FLIGHT_QUEUE_DURATION_RESPONSE_TRAILER: &str = "x-influxdata-queue-duration-seconds"; + +/// Trailer that describes the duration (in seconds) of the planning phase of a query. +const IOX_FLIGHT_PLANNING_DURATION_RESPONSE_TRAILER: &str = + "x-influxdata-planning-duration-seconds"; + +/// Trailer that describes the duration (in seconds) of the execution phase of a query. 
+const IOX_FLIGHT_EXECUTION_DURATION_RESPONSE_TRAILER: &str = + "x-influxdata-execution-duration-seconds"; + +/// Trailer that describes the duration (in seconds) the CPU(s) took to compute the results. +const IOX_FLIGHT_COMPUTE_DURATION_RESPONSE_TRAILER: &str = "x-influxdata-compute-duration-seconds"; + /// In which interval should the `DoGet` stream send empty messages as keep alive markers? const DO_GET_KEEP_ALIVE_INTERVAL: Duration = Duration::from_secs(5); @@ -127,7 +149,7 @@ pub enum Error { Planning { namespace_name: String, query: String, - source: service_common::planner::Error, + source: planner::Error, }, #[snafu(display("Error while planning Flight SQL : {}", source))] @@ -488,82 +510,80 @@ where { /// Implementation of the `DoGet` method async fn run_do_get( - &self, + server: Arc, span_ctx: Option, external_span_ctx: Option, - permit: InstrumentedAsyncOwnedSemaphorePermit, - query: RunQuery, - namespace_name: String, - is_debug: bool, - ) -> Result>, tonic::Status> { - let db = self - .server + request: IoxGetRequest, + log_entry: &mut Option>, + ) -> Result, tonic::Status> { + let IoxGetRequest { + database, + query, + params, + is_debug, + } = request; + let namespace_name = database.as_str(); + + let db = server .db( - &namespace_name, + namespace_name, span_ctx.child_span("get namespace"), is_debug, ) .await - .context(DatabaseNotFoundSnafu { - namespace_name: &namespace_name, - })?; + .context(DatabaseNotFoundSnafu { namespace_name })?; + + //TODO: add structured logging for parameterized queries https://github.com/influxdata/influxdb_iox/issues/9626 + let query_completed_token = db.record_query( + external_span_ctx.as_ref().map(RequestLogContext::ctx), + query.variant(), + Box::new(query.to_string()), + ); + + *log_entry = Some(Arc::clone(query_completed_token.entry())); + + // Log after we acquire the permit and are about to start execution + info!( + %namespace_name, + %query, + trace=external_span_ctx.format_jaeger().as_str(), + variant=query.variant(), + "DoGet request", + ); let ctx = db.new_query_context(span_ctx); - let (query_completed_token, physical_plan) = match &query { - RunQuery::Sql(sql_query) => { - let token = db.record_query( - external_span_ctx.as_ref().map(RequestLogContext::ctx), - "sql", - Box::new(sql_query.clone()), - ); - let plan = Planner::new(&ctx) - .sql(sql_query) - .await - .context(PlanningSnafu { - namespace_name: &namespace_name, - query: query.to_string(), - })?; - (token, plan) - } - RunQuery::InfluxQL(sql_query) => { - let token = db.record_query( - external_span_ctx.as_ref().map(RequestLogContext::ctx), - "influxql", - Box::new(sql_query.clone()), - ); - let plan = Planner::new(&ctx) - .influxql(sql_query) - .await - .context(PlanningSnafu { - namespace_name: &namespace_name, - query: query.to_string(), - })?; - (token, plan) - } - RunQuery::FlightSQL(msg) => { - let token = db.record_query( - external_span_ctx.as_ref().map(RequestLogContext::ctx), - "flightsql", - Box::new(msg.to_string()), - ); - let plan = Planner::new(&ctx) - .flight_sql_do_get(&namespace_name, db, msg.clone()) - .await - .context(PlanningSnafu { - namespace_name: &namespace_name, - query: query.to_string(), - })?; - (token, plan) - } + let physical_plan = match &query { + RunQuery::Sql(sql_query) => Planner::new(&ctx) + .sql(sql_query, params) + .await + .with_context(|_| PlanningSnafu { + namespace_name, + query: query.to_string(), + })?, + RunQuery::InfluxQL(sql_query) => Planner::new(&ctx) + .influxql(sql_query, params) + .await + .with_context(|_| 
PlanningSnafu { + namespace_name, + query: query.to_string(), + })?, + RunQuery::FlightSQL(msg) => Planner::new(&ctx) + .flight_sql_do_get(namespace_name, db, msg.clone(), params) + .await + .with_context(|_| PlanningSnafu { + namespace_name, + query: query.to_string(), + })?, }; + let query_completed_token = query_completed_token.planned(Arc::clone(&physical_plan)); let output = GetStream::new( + server, ctx, physical_plan, namespace_name.to_string(), &query, query_completed_token, - permit, ) .await?; @@ -572,7 +592,7 @@ where let output = output.map(move |res| { if let Err(e) = &res { info!( - %namespace_name, + %database, %query, trace=external_span_ctx.format_jaeger().as_str(), %e, @@ -582,7 +602,7 @@ where res }); - Ok(Response::new(Box::pin(output) as TonicStream)) + Ok(Box::pin(output) as TonicStream) } } @@ -613,9 +633,12 @@ where request: Request, ) -> Result, tonic::Status> { let external_span_ctx: Option = request.extensions().get().cloned(); + // technically the trailers layer should always be installed but for testing this isn' always the case, so lets + // make this optional + let trailers: Option = request.extensions().get().cloned(); let span_ctx: Option = request.extensions().get().cloned(); let authz_token = get_flight_authz(request.metadata()); - let mut is_debug = has_debug_header(request.metadata()); + let debug_header = has_debug_header(request.metadata()); let ticket = request.into_inner(); // attempt to decode ticket @@ -625,15 +648,12 @@ where info!(%e, "Error decoding Flight API ticket"); }; - let request = request?; - let namespace_name = request.database(); - let query = request.query(); - is_debug |= request.is_debug(); + let request = request?.add_debug_header(debug_header); - let perms = match query { - RunQuery::FlightSQL(cmd) => flightsql_permissions(namespace_name, cmd), + let perms = match request.query() { + RunQuery::FlightSQL(cmd) => flightsql_permissions(request.database(), cmd), RunQuery::Sql(_) | RunQuery::InfluxQL(_) => vec![authz::Permission::ResourceAction( - authz::Resource::Database(namespace_name.to_string()), + authz::Resource::Database(request.database().to_string()), authz::Action::Read, )], }; @@ -642,51 +662,49 @@ where .await .map_err(Error::from)?; - let permit = self - .server - .acquire_semaphore(span_ctx.child_span("query rate limit semaphore")) - .await; - - // Log after we acquire the permit and are about to start execution - let start = Instant::now(); - info!( - %namespace_name, - %query, - trace=external_span_ctx.format_jaeger().as_str(), - variant=query.variant(), - "DoGet request", - ); - - let response = self - .run_do_get( - span_ctx, - external_span_ctx.clone(), - permit, - query.clone(), - namespace_name.to_string(), - is_debug, - ) - .await; + // `run_do_get` may wait for the semaphore. In this case, we shall send empty "keep alive" messages already. So + // wrap the whole implementation into the keep alive stream. + // + // Also note that due to the keep alive mechanism, we cannot send any headers back because they might come + // after a keep alive message and therefore aren't headers. gRPC metadata can only be sent at the very beginning + // (headers) or at the very end (trailers). We shall use trailers. 
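A minimal client-side sketch of consuming these duration trailers (not part of this change): it assumes a `tonic::Streaming<FlightData>` obtained from the Flight client's `do_get`, and that the tower_trailer layer is installed on the server so the values actually arrive as gRPC trailers.

    use arrow_flight::FlightData;
    use futures::StreamExt;
    use tonic::{Status, Streaming};

    // Drain the DoGet stream, then read the duration trailers defined above.
    async fn read_query_durations(mut stream: Streaming<FlightData>) -> Result<(), Status> {
        while let Some(msg) = stream.next().await {
            let _flight_data = msg?; // decode / collect as usual
        }
        // gRPC trailers only become available once the stream has ended.
        if let Some(trailers) = stream.trailers().await? {
            for key in [
                "x-influxdata-queue-duration-seconds",
                "x-influxdata-planning-duration-seconds",
                "x-influxdata-execution-duration-seconds",
                "x-influxdata-compute-duration-seconds",
            ] {
                if let Some(value) = trailers.get(key) {
                    println!("{key}: {}", value.to_str().unwrap_or("<non-ascii>"));
                }
            }
        }
        Ok(())
    }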
+ let server = Arc::clone(&self.server); + let mut log_entry = None; + let response = Self::run_do_get( + server, + span_ctx, + external_span_ctx.clone(), + request.clone(), + &mut log_entry, + ) + .await; if let Err(e) = &response { info!( - %namespace_name, - %query, + %request.database, + %request.query, trace=external_span_ctx.format_jaeger().as_str(), %e, "Error running DoGet", ); } else { - let elapsed = Instant::now() - start; debug!( - %namespace_name, - %query, + %request.database, + %request.query, trace=external_span_ctx.format_jaeger().as_str(), - ?elapsed, - "Completed DoGet request", + "Planned DoGet request", ); } - response + + let md = QueryResponseMetadata { log_entry }; + let md_captured = md.clone(); + if let Some(trailers) = trailers { + trailers.add_callback(move |trailers| md_captured.write_trailers(trailers)); + } + + let stream = response?; + + Ok(Response::new(Box::pin(stream) as _)) } async fn handshake( @@ -919,7 +937,7 @@ fn cmd_from_descriptor(flight_descriptor: FlightDescriptor) -> Result Result { let mut found_header_keys: Vec = vec![]; - for key in IOX_FLIGHT_SQL_DATABASE_HEADERS { + for key in IOX_FLIGHT_SQL_DATABASE_REQUEST_HEADERS { if metadata.contains_key(key) { found_header_keys.push(key.to_string()); } @@ -982,25 +1000,32 @@ fn has_debug_header(metadata: &MetadataMap) -> bool { .unwrap_or_default() } -/// Wrapper over a FlightDataEncodeStream that adds IOx specfic -/// metadata and records completion -struct GetStream { - inner: KeepAliveStream, +struct PermitAndToken { #[allow(dead_code)] permit: InstrumentedAsyncOwnedSemaphorePermit, - query_completed_token: QueryCompletedToken, + query_completed_token: QueryCompletedToken, +} + +/// Wrapper over a FlightDataEncodeStream that adds IOx specific +/// metadata and records completion +struct GetStream { + inner: BoxStream<'static, Result>, + permit_state: Arc>>, done: bool, } impl GetStream { - async fn new( + async fn new( + server: Arc, ctx: IOxSessionContext, physical_plan: Arc, namespace_name: String, query: &RunQuery, - query_completed_token: QueryCompletedToken, - permit: InstrumentedAsyncOwnedSemaphorePermit, - ) -> Result { + query_completed_token: QueryCompletedToken, + ) -> Result + where + S: QueryNamespaceProvider, + { let app_metadata = proto::AppMetadata {}; let schema = physical_plan.schema(); @@ -1017,22 +1042,45 @@ impl GetStream { tonic::Status::new(code, e.to_string()).into() }); - // setup inner stream - let inner = FlightDataEncoderBuilder::new() + // acquire token (after planning) + let permit_state: Arc>> = Default::default(); + let permit_state_captured = Arc::clone(&permit_state); + let permit_span = ctx.child_span("query rate limit semaphore"); + let query_results = futures::stream::once(async move { + let permit = server.acquire_semaphore(permit_span).await; + let query_completed_token = query_completed_token.permit(); + *permit_state_captured.lock().expect("not poisened") = Some(PermitAndToken { + permit, + query_completed_token, + }); + query_results + }) + .flatten(); + + // setup encoding stream + let encoded = FlightDataEncoderBuilder::new() .with_schema(schema) .with_metadata(app_metadata.encode_to_vec().into()) .build(query_results); - // add keep alive - let inner = KeepAliveStream::new(inner, DO_GET_KEEP_ALIVE_INTERVAL); + // keep-alive + let inner = KeepAliveStream::new(encoded, DO_GET_KEEP_ALIVE_INTERVAL).boxed(); Ok(Self { inner, - permit, - query_completed_token, + permit_state, done: false, }) } + + #[must_use] + fn finish_stream(&self) -> Option> { + 
self.permit_state + .lock() + .expect("not poisened") + .take() + .map(|state| state.query_completed_token) + } } impl Stream for GetStream { @@ -1052,27 +1100,78 @@ impl Stream for GetStream { None => { self.done = true; // if we get here, all is good - self.query_completed_token.set_success(); + if let Some(token) = self.finish_stream() { + token.success(); + } } Some(Ok(data)) => { return Poll::Ready(Some(Ok(data))); } Some(Err(e)) => { self.done = true; + if let Some(token) = self.finish_stream() { + token.fail(); + } return Poll::Ready(Some(Err(e.into()))); } } } } } + +/// Header/trailer data added to query responses. +#[derive(Debug, Clone)] +struct QueryResponseMetadata { + log_entry: Option>, +} + +impl QueryResponseMetadata { + fn write_trailer_duration(md: &mut HeaderMap, key: &'static str, d: Option) { + let Some(d) = d else { return }; + + md.insert( + key, + d.as_secs_f64().to_string().parse().expect("always valid"), + ); + } + + fn write_trailers(&self, md: &mut HeaderMap) { + let Some(log_entry) = &self.log_entry else { + return; + }; + + Self::write_trailer_duration( + md, + IOX_FLIGHT_QUEUE_DURATION_RESPONSE_TRAILER, + log_entry.permit_duration(), + ); + Self::write_trailer_duration( + md, + IOX_FLIGHT_PLANNING_DURATION_RESPONSE_TRAILER, + log_entry.plan_duration(), + ); + Self::write_trailer_duration( + md, + IOX_FLIGHT_EXECUTION_DURATION_RESPONSE_TRAILER, + log_entry.execute_duration(), + ); + Self::write_trailer_duration( + md, + IOX_FLIGHT_COMPUTE_DURATION_RESPONSE_TRAILER, + log_entry.compute_duration(), + ); + } +} + #[cfg(test)] mod tests { use arrow_flight::sql::ProstMessageExt; use async_trait::async_trait; use authz::Permission; use futures::Future; + use iox_query::test::TestDatabaseStore; use metric::{Attributes, Metric, U64Gauge}; - use service_common::test_util::TestDatabaseStore; + use test_helpers::maybe_start_logging; use tokio::pin; use tonic::metadata::{MetadataKey, MetadataValue}; @@ -1111,10 +1210,13 @@ mod tests { .to_vec() .into(), }; - let streaming_resp1 = service + let mut streaming_resp1 = service .do_get(tonic::Request::new(ticket.clone())) .await - .unwrap(); + .unwrap() + .into_inner(); + streaming_resp1.next().await.unwrap().unwrap(); // schema (planning) + streaming_resp1.next().await.unwrap().unwrap(); // record batch (execution) assert_semaphore_metric( &test_storage.metric_registry, @@ -1132,10 +1234,13 @@ mod tests { 1, ); - let streaming_resp2 = service + let mut streaming_resp2 = service .do_get(tonic::Request::new(ticket.clone())) .await - .unwrap(); + .unwrap() + .into_inner(); + streaming_resp2.next().await.unwrap().unwrap(); // schema (planning) + streaming_resp2.next().await.unwrap().unwrap(); // record batch (execution) assert_semaphore_metric( &test_storage.metric_registry, @@ -1154,7 +1259,13 @@ mod tests { ); // 3rd request is pending - let fut = service.do_get(tonic::Request::new(ticket.clone())); + let mut streaming_resp3 = service + .do_get(tonic::Request::new(ticket.clone())) + .await + .unwrap() + .into_inner(); + streaming_resp3.next().await.unwrap().unwrap(); // schema (planning) + let fut = streaming_resp3.next(); // record batch (execution) pin!(fut); assert_fut_pending(&mut fut).await; @@ -1176,7 +1287,7 @@ mod tests { // free permit drop(streaming_resp1); - let streaming_resp3 = fut.await; + fut.await.unwrap().unwrap(); assert_semaphore_metric( &test_storage.metric_registry, @@ -1227,6 +1338,7 @@ mod tests { }; } + #[track_caller] fn assert_semaphore_metric(registry: &metric::Registry, name: &'static str, expected: 
u64) { let actual = registry .get_instrument::>(name) @@ -1262,6 +1374,8 @@ mod tests { #[tokio::test] async fn do_get_authz() { + maybe_start_logging(); + let test_storage = Arc::new(TestDatabaseStore::default()); test_storage.db_or_create("bananas").await; diff --git a/service_grpc_flight/src/planner.rs b/service_grpc_flight/src/planner.rs new file mode 100644 index 00000000000..9c6caf49e7f --- /dev/null +++ b/service_grpc_flight/src/planner.rs @@ -0,0 +1,113 @@ +//! Query planner wrapper for use in IOx services +use std::sync::Arc; + +use bytes::Bytes; +use datafusion::{ + arrow::datatypes::SchemaRef, error::DataFusionError, physical_plan::ExecutionPlan, +}; +use flightsql::{FlightSQLCommand, FlightSQLPlanner}; +use iox_query::{exec::IOxSessionContext, frontend::sql::SqlQueryPlanner, QueryNamespace}; + +pub(crate) use datafusion::error::{DataFusionError as Error, Result}; +use iox_query_influxql::frontend::planner::InfluxQLQueryPlanner; +use iox_query_params::StatementParams; + +/// Query planner that plans queries on a separate threadpool. +/// +/// Query planning was, at time of writing, a single threaded affair. In order +/// to avoid tying up the tokio executor that is handling API requests, IOx plan +/// queries using a separate thread pool. +#[derive(Debug)] +pub(crate) struct Planner { + /// Executors (whose threadpool to use) + ctx: IOxSessionContext, +} + +impl Planner { + /// Create a new planner that will plan queries using the provided context + pub(crate) fn new(ctx: &IOxSessionContext) -> Self { + Self { + ctx: ctx.child_ctx("Planner"), + } + } + + /// Plan a SQL query against the data in a namespace, and return a + /// DataFusion physical execution plan. + pub(crate) async fn sql( + &self, + query: impl AsRef + Send, + params: StatementParams, + ) -> Result> { + let planner = SqlQueryPlanner::new(); + let query = query.as_ref(); + let ctx = self.ctx.child_ctx("planner sql"); + let params = params.into_df_param_values(); + + planner.query(query, params, &ctx).await + } + + /// Plan an InfluxQL query against the data in `database`, and return a + /// DataFusion physical execution plan. 
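A small usage sketch of the relocated planner (not part of this change): it reuses the imports at the top of this file, `ctx` and `params` are assumed to come from the surrounding request handling as in `run_do_get`, and the `$host` placeholder is purely illustrative.

    // Plan a parameterized SQL query on the planner's child context, which
    // keeps the planning work off the request-handling executor.
    async fn plan_sql_with_params(
        ctx: &IOxSessionContext,
        params: StatementParams,
    ) -> Result<Arc<dyn ExecutionPlan>> {
        Planner::new(ctx)
            .sql("SELECT temp FROM cpu WHERE host = $host", params)
            .await
    }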
+ pub(crate) async fn influxql( + &self, + query: impl AsRef + Send, + params: impl Into + Send, + ) -> Result> { + let planner = InfluxQLQueryPlanner::new(); + let query = query.as_ref(); + let ctx = self.ctx.child_ctx("planner influxql"); + let params = params.into(); + + planner.query(query, params, &ctx).await + } + + /// Creates a plan for a `DoGet` FlightSQL message, as described on + /// [`FlightSQLPlanner::do_get`], on a separate threadpool + pub(crate) async fn flight_sql_do_get( + &self, + namespace_name: impl AsRef + Send, + namespace: Arc, + cmd: FlightSQLCommand, + params: StatementParams, + ) -> Result> { + let namespace_name = namespace_name.as_ref(); + let ctx = self.ctx.child_ctx("planner flight_sql_do_get"); + let params = params.into_df_param_values(); + + FlightSQLPlanner::do_get(namespace_name, namespace, cmd, params, &ctx) + .await + .map_err(DataFusionError::from) + } + + /// Creates a plan for a `DoAction` FlightSQL message, as described on + /// [`FlightSQLPlanner::do_action`], on a separate threadpool + pub(crate) async fn flight_sql_do_action( + &self, + namespace_name: impl Into + Send, + namespace: Arc, + cmd: FlightSQLCommand, + ) -> Result { + let namespace_name = namespace_name.into(); + let ctx = self.ctx.child_ctx("planner flight_sql_do_get"); + + FlightSQLPlanner::do_action(namespace_name, namespace, cmd, &ctx) + .await + .map_err(DataFusionError::from) + } + + /// Returns the [`SchemaRef`] to be included in the response to a + /// `GetFlightInfo` FlightSQL message as described on + /// [`FlightSQLPlanner::get_schema`], on a separate threadpool. + pub(crate) async fn flight_sql_get_flight_info_schema( + &self, + namespace_name: impl Into + Send, + cmd: FlightSQLCommand, + ) -> Result { + let namespace_name = namespace_name.into(); + let ctx = self.ctx.child_ctx("planner flight_sql_get_flight_info"); + + FlightSQLPlanner::get_schema(namespace_name, cmd, &ctx) + .await + .map_err(DataFusionError::from) + } +} diff --git a/service_grpc_flight/src/request.rs b/service_grpc_flight/src/request.rs index 35e10e4c368..eeafd6f6519 100644 --- a/service_grpc_flight/src/request.rs +++ b/service_grpc_flight/src/request.rs @@ -2,13 +2,17 @@ use arrow_flight::Ticket; use bytes::Bytes; + use flightsql::FlightSQLCommand; use generated_types::google::protobuf::Any; use generated_types::influxdata::iox::querier::v1 as proto; use generated_types::influxdata::iox::querier::v1::read_info::QueryType; + +use iox_query_params::StatementParams; use observability_deps::tracing::trace; use prost::Message; use serde::Deserialize; + use snafu::{ResultExt, Snafu}; use std::fmt::{Debug, Display, Formatter}; @@ -18,12 +22,18 @@ pub enum Error { Invalid, #[snafu(display("Invalid ticket content: {}", msg))] InvalidContent { msg: String }, + #[snafu(display("Unknown query type. 
Expected 'sql' or 'influxql', got {}", query_type))] + InvalidQueryType { query_type: String }, #[snafu(display("Invalid Flight SQL ticket: {}", source))] FlightSQL { source: flightsql::Error }, - #[snafu(display("Invalid Protobuf: {}", source))] - Decode { source: prost::DecodeError }, + #[snafu(display("Protobuf decoding error: {}", source))] + DecodeProtobuf { source: prost::DecodeError }, + #[snafu(display("JSON parse error: {}", source))] + DecodeJson { source: serde_json::Error }, + #[snafu(display("Invalid params: {}", source))] + DecodeParams { source: iox_query_params::Error }, } -pub type Result = std::result::Result; +pub(crate) type Result = std::result::Result; /// AnyError is an internal error that contains the result of attempting /// to decode a protobuf "Any" message. This is separate from Error so @@ -90,15 +100,34 @@ enum AnyError { /// "query_type": "influxql" /// } /// ``` +/// +/// ## Query parameters +/// +/// You can bind parameters to the query by using `$placeholder` syntax within the query and +/// supplying the parameter values via the `params` object. For example: +/// +/// ```json +/// { +/// "database": "my_db", +/// "sql_query": "SELECT a, b, c FROM my_table WHERE id = $id AND name = $name", +/// "query_type": "sql", +/// "params": { +/// "id": 1234, +/// "name": "alice" +/// } +/// } +/// ``` +/// #[derive(Debug, PartialEq, Clone)] -pub struct IoxGetRequest { - database: String, - query: RunQuery, - is_debug: bool, +pub(crate) struct IoxGetRequest { + pub(crate) database: String, + pub(crate) query: RunQuery, + pub(crate) params: StatementParams, + pub(crate) is_debug: bool, } #[derive(Debug, PartialEq, Clone)] -pub enum RunQuery { +pub(crate) enum RunQuery { /// Unparameterized SQL query Sql(String), /// InfluxQL @@ -110,7 +139,7 @@ pub enum RunQuery { } impl RunQuery { - pub fn variant(&self) -> &'static str { + pub(crate) fn variant(&self) -> &'static str { match self { Self::Sql(_) => "sql", Self::InfluxQL(_) => "influxql", @@ -134,16 +163,23 @@ impl IoxGetRequest { "type.googleapis.com/influxdata.iox.querier.v1.ReadInfo"; /// Create a new request to run the specified query - pub fn new(database: impl Into, query: RunQuery, is_debug: bool) -> Self { + pub(crate) fn new(database: impl Into, query: RunQuery, is_debug: bool) -> Self { Self { database: database.into(), query, + params: StatementParams::default(), is_debug, } } + /// Merges result of the gRPC debug header into the is_debug field of this request using boolean or logic + pub(crate) fn add_debug_header(mut self, debug_header: bool) -> Self { + self.is_debug |= debug_header; + self + } + /// try to decode a ReadInfo structure from a Token - pub fn try_decode(ticket: Ticket) -> Result { + pub(crate) fn try_decode(ticket: Ticket) -> Result { // decode ticket IoxGetRequest::decode_protobuf_any(ticket.ticket.clone()) .or_else(|e| { @@ -170,19 +206,23 @@ impl IoxGetRequest { } /// Encode the request as a protobuf Ticket - pub fn try_encode(self) -> Result { + pub(crate) fn try_encode(self) -> Result { let Self { database, query, + params, is_debug, } = self; + let params: Vec = params.into(); + let read_info = match query { RunQuery::Sql(sql_query) => proto::ReadInfo { database, sql_query, query_type: QueryType::Sql.into(), flightsql_command: vec![], + params, is_debug, }, RunQuery::InfluxQL(influxql) => proto::ReadInfo { @@ -191,6 +231,7 @@ impl IoxGetRequest { sql_query: influxql, query_type: QueryType::InfluxQl.into(), flightsql_command: vec![], + params, is_debug, }, 
RunQuery::FlightSQL(flightsql_command) => proto::ReadInfo { @@ -201,6 +242,7 @@ impl IoxGetRequest { .try_encode() .context(FlightSQLSnafu)? .into(), + params, is_debug, }, }; @@ -217,8 +259,10 @@ impl IoxGetRequest { } /// See comments on [`IoxGetRequest`] for details of this format - fn decode_json(ticket: Bytes) -> Result { - let json_str = String::from_utf8(ticket.to_vec()).map_err(|_| "Not UTF8".to_string())?; + fn decode_json(ticket: Bytes) -> Result { + let json_str = String::from_utf8(ticket.to_vec()).map_err(|_| Error::InvalidContent { + msg: "Not UTF8".to_string(), + })?; /// This represents ths JSON fields #[derive(Deserialize, Debug)] @@ -229,6 +273,8 @@ impl IoxGetRequest { // If query type is not supplied, defaults to SQL query_type: Option, #[serde(default = "Default::default")] + params: StatementParams, + #[serde(default = "Default::default")] is_debug: bool, } @@ -236,18 +282,15 @@ impl IoxGetRequest { database, sql_query, query_type, + params, is_debug, - } = serde_json::from_str(&json_str).map_err(|e| format!("JSON parse error: {e}"))?; + } = serde_json::from_str(&json_str).context(DecodeJsonSnafu)?; let query = if let Some(query_type) = query_type { match query_type.as_str() { "sql" => RunQuery::Sql(sql_query), "influxql" => RunQuery::InfluxQL(sql_query), - _ => { - return Err(format!( - "unknown query type. Expected 'sql' or 'influxql', got {query_type}'" - )) - } + _ => return InvalidQueryTypeSnafu { query_type }.fail(), } } else { // default to SQL @@ -257,6 +300,7 @@ impl IoxGetRequest { Ok(Self { database, query, + params, is_debug, }) } @@ -276,7 +320,7 @@ impl IoxGetRequest { /// See comments on [`IoxGetRequest`] for details of this format fn decode_protobuf(ticket: Bytes) -> Result { - let read_info = proto::ReadInfo::decode(ticket).context(DecodeSnafu)?; + let read_info = proto::ReadInfo::decode(ticket).context(DecodeProtobufSnafu)?; let query_type = read_info.query_type(); let proto::ReadInfo { @@ -285,6 +329,7 @@ impl IoxGetRequest { query_type: _, flightsql_command, is_debug, + params, } = read_info; Ok(Self { @@ -320,30 +365,26 @@ impl IoxGetRequest { RunQuery::FlightSQL(cmd) } }, + params: params.try_into().context(DecodeParamsSnafu)?, is_debug, }) } - pub fn database(&self) -> &str { + pub(crate) fn database(&self) -> &str { self.database.as_ref() } - pub fn query(&self) -> &RunQuery { + pub(crate) fn query(&self) -> &RunQuery { &self.query } - - pub fn is_debug(&self) -> bool { - self.is_debug - } } - #[cfg(test)] mod tests { + use super::*; use arrow_flight::sql::CommandStatementQuery; use assert_matches::assert_matches; use generated_types::influxdata::iox::querier::v1::read_info::QueryType; - - use super::*; + use iox_query_params::{params, StatementParams}; #[test] fn json_ticket_decoding_compatibility() { @@ -369,22 +410,52 @@ mod tests { impl TestCase { fn new_sql(json: &'static str, expected_database: &str, query: &str) -> Self { + Self::new_sql_with_params( + json, + expected_database, + query, + StatementParams::default(), + ) + } + + fn new_sql_with_params( + json: &'static str, + expected_database: &str, + query: &str, + params: impl Into, + ) -> Self { Self { json, expected: IoxGetRequest { database: String::from(expected_database), query: RunQuery::Sql(String::from(query)), + params: params.into(), is_debug: false, }, } } fn new_influxql(json: &'static str, expected_database: &str, query: &str) -> Self { + Self::new_influxql_with_params( + json, + expected_database, + query, + StatementParams::default(), + ) + } + + fn 
new_influxql_with_params( + json: &'static str, + expected_database: &str, + query: &str, + params: impl Into, + ) -> Self { Self { json, expected: IoxGetRequest { database: String::from(expected_database), query: RunQuery::InfluxQL(String::from(query)), + params: params.into(), is_debug: false, }, } @@ -518,6 +589,55 @@ mod tests { "my_otherdb", "SHOW DATABASES;", ), + // query parameter cases + TestCase::new_sql_with_params( + r#" + { + "bucket": "my_db", + "sql_query": "SELECT $1, $2, $3, $4, $5;", + "query_type": "sql", + "params": { + "1": null, + "2": true, + "3": "string", + "4": 1234, + "5": 12.34 + } + }"#, + "my_db", + "SELECT $1, $2, $3, $4, $5;", + params! { + "1" => (), + "2" => true, + "3" => "string", + "4" => 1234_u32, + "5" => 12.34 + }, + ), + TestCase::new_influxql_with_params( + r#" + { + "bucket": "my_db", + "sql_query": "SELECT $1, $2, $3, $4, $5;", + "query_type": "influxql", + "params": { + "1": null, + "2": true, + "3": "string", + "4": 1234, + "5": 12.34 + } + }"#, + "my_db", + "SELECT $1, $2, $3, $4, $5;", + params! { + "1" => (), + "2" => true, + "3" => "string", + "4" => 1234_u32, + "5" => 12.34 + }, + ), ]; for TestCase { json, expected } in cases { @@ -557,6 +677,32 @@ mod tests { assert_matches!(e, Error::Invalid); } + #[test] + fn json_ticket_decoding_invalid_params() { + let ticket = make_json_ticket( + r#" + { + "bucket": "my_db", + "sql_query": "SELECT $1, $2, $3, $4, $5;", + "query_type": "influxql", + "params": ["foo", "bar"] + }"#, + ); + let e = IoxGetRequest::try_decode(ticket).unwrap_err(); + assert_matches!(e, Error::Invalid); + + let ticket = make_json_ticket( + r#" + { + "bucket": "my_db", + "sql_query": "SELECT $1, $2, $3, $4, $5;", + "query_type": "influxql", + "params": null + }"#, + ); + let e = IoxGetRequest::try_decode(ticket).unwrap_err(); + assert_matches!(e, Error::Invalid) + } #[test] fn proto_ticket_decoding_unspecified() { let ticket = make_proto_ticket(&proto::ReadInfo { @@ -564,6 +710,7 @@ mod tests { sql_query: "SELECT 1".to_string(), query_type: QueryType::Unspecified.into(), flightsql_command: vec![], + params: vec![], is_debug: false, }); @@ -580,6 +727,7 @@ mod tests { sql_query: "SELECT 1".to_string(), query_type: QueryType::Sql.into(), flightsql_command: vec![], + params: vec![], is_debug: false, }); @@ -595,6 +743,7 @@ mod tests { sql_query: "SELECT 1".to_string(), query_type: QueryType::InfluxQl.into(), flightsql_command: vec![], + params: vec![], is_debug: false, }); @@ -610,6 +759,7 @@ mod tests { sql_query: "SELECT 1".into(), query_type: 42, // not a known query type flightsql_command: vec![], + params: vec![], is_debug: false, }); @@ -627,6 +777,7 @@ mod tests { query_type: QueryType::Sql.into(), // can't have both sql_query and flightsql flightsql_command: vec![1, 2, 3], + params: vec![], is_debug: false, }); @@ -642,6 +793,7 @@ mod tests { query_type: QueryType::InfluxQl.into(), // can't have both sql_query and flightsql flightsql_command: vec![1, 2, 3], + params: vec![], is_debug: false, }); @@ -657,6 +809,7 @@ mod tests { query_type: QueryType::FlightSqlMessage.into(), // can't have both sql_query and flightsql flightsql_command: vec![1, 2, 3], + params: vec![], is_debug: false, }); @@ -682,6 +835,7 @@ mod tests { sql_query: "SELECT 1".to_string(), query_type: QueryType::Unspecified.into(), flightsql_command: vec![], + params: vec![], is_debug: false, }); @@ -698,6 +852,7 @@ mod tests { sql_query: "SELECT 1".to_string(), query_type: QueryType::Sql.into(), flightsql_command: vec![], + params: vec![], is_debug: 
false, }); @@ -713,6 +868,7 @@ mod tests { sql_query: "SELECT 1".to_string(), query_type: QueryType::InfluxQl.into(), flightsql_command: vec![], + params: vec![], is_debug: false, }); @@ -728,6 +884,7 @@ mod tests { sql_query: "SELECT 1".into(), query_type: 42, // not a known query type flightsql_command: vec![], + params: vec![], is_debug: false, }); @@ -745,6 +902,7 @@ mod tests { query_type: QueryType::Sql.into(), // can't have both sql_query and flightsql flightsql_command: vec![1, 2, 3], + params: vec![], is_debug: false, }); @@ -760,6 +918,7 @@ mod tests { query_type: QueryType::InfluxQl.into(), // can't have both sql_query and flightsql flightsql_command: vec![1, 2, 3], + params: vec![], is_debug: false, }); @@ -775,6 +934,7 @@ mod tests { query_type: QueryType::FlightSqlMessage.into(), // can't have both sql_query and flightsql flightsql_command: vec![1, 2, 3], + params: vec![], is_debug: false, }); @@ -797,6 +957,7 @@ mod tests { let request = IoxGetRequest { database: "foo_blarg".into(), query: RunQuery::Sql("select * from bar".into()), + params: StatementParams::default(), is_debug: false, }; @@ -812,6 +973,7 @@ mod tests { let request = IoxGetRequest { database: "foo_blarg".into(), query: RunQuery::Sql("select * from bar".into()), + params: StatementParams::default(), is_debug: true, }; @@ -827,6 +989,7 @@ mod tests { let request = IoxGetRequest { database: "foo_blarg".into(), query: RunQuery::InfluxQL("select * from bar".into()), + params: StatementParams::default(), is_debug: false, }; @@ -847,6 +1010,7 @@ mod tests { let request = IoxGetRequest { database: "foo_blarg".into(), query: RunQuery::FlightSQL(cmd), + params: StatementParams::default(), is_debug: false, }; diff --git a/service_grpc_testing/Cargo.toml b/service_grpc_testing/Cargo.toml index 3f3ef9279f7..659799e759a 100644 --- a/service_grpc_testing/Cargo.toml +++ b/service_grpc_testing/Cargo.toml @@ -5,6 +5,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] generated_types = { path = "../generated_types" } observability_deps = { path = "../observability_deps" } diff --git a/sharder/Cargo.toml b/sharder/Cargo.toml index e402d971f74..66d88536e29 100644 --- a/sharder/Cargo.toml +++ b/sharder/Cargo.toml @@ -5,6 +5,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] data_types = { path = "../data_types" } mutable_batch = { path = "../mutable_batch" } diff --git a/sharder/benches/sharder.rs b/sharder/benches/sharder.rs index 515052b0015..303a9b254a2 100644 --- a/sharder/benches/sharder.rs +++ b/sharder/benches/sharder.rs @@ -97,7 +97,7 @@ where } fn benchmark_scenario( - group: &mut BenchmarkGroup, + group: &mut BenchmarkGroup<'_, WallTime>, bench_name: &str, table: &str, namespace: &NamespaceName<'_>, diff --git a/sqlx-hotswap-pool/Cargo.toml b/sqlx-hotswap-pool/Cargo.toml index a7dda82ee08..a85ee6edf48 100644 --- a/sqlx-hotswap-pool/Cargo.toml +++ b/sqlx-hotswap-pool/Cargo.toml @@ -9,8 +9,11 @@ license.workspace = true # Prevent this from being published to crates.io! 
publish = false +[lints] +workspace = true + [dependencies] -sqlx = { version = "0.7.1", features = ["runtime-tokio-rustls", "postgres", "json", "tls-rustls"] } +sqlx = { version = "0.7.3", features = ["runtime-tokio-rustls", "postgres", "json", "tls-rustls"] } either = "1.9.0" futures = "0.3" workspace-hack = { version = "0.1", path = "../workspace-hack" } @@ -18,4 +21,4 @@ workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] dotenvy = "0.15.7" rand = { version = "0.8", features = ["small_rng"] } -tokio = { version = "1.32", features = ["rt-multi-thread", "macros", "parking_lot"] } +tokio = { version = "1.35", features = ["rt-multi-thread", "macros", "parking_lot"] } diff --git a/test_fixtures/README.md b/test_fixtures/README.md new file mode 100644 index 00000000000..568e1f6ada1 --- /dev/null +++ b/test_fixtures/README.md @@ -0,0 +1,26 @@ +# Test fixtures + +This directory contains files that may be useful for testing purposes. + +If you add a new file to this directory, please add a brief description of it here. + +# Top-level files + +- `000000000000005-000000002.tsm.gz` - Used in testing the `influxdb_tsm` crate. +- `cpu_usage.tsm.gz` - Used in testing the `influxdb_tsm` crate. +- `cpu.parquet` - Parquet file generated by IOx to be imported in tests where data loaded is needed. + +# lineproto directory + +- `air_and_water.lp` +- `metrics.lp` +- `prometheus.lp` +- `read_filter.lp.gz` +- `temperature.lp` + +# parquet directory + +- `influxql_log_*.parquet` - Data exported from TSM for the purposes of testing bulk ingest. Notably + NOT generated with IOx. +- `sql_query_log_*.parquet` - Data exported from TSM for the purposes of testing bulk ingest. + Notably NOT generated with IOx. diff --git a/test_fixtures/parquet/influxql_log_1.parquet b/test_fixtures/parquet/influxql_log_1.parquet new file mode 100644 index 0000000000000000000000000000000000000000..8c4b04e31dc360148c674ce305b0c835e2a11c10 GIT binary patch literal 309561 zcmeF437i~Nwg1Dus{D~42*^0HlZ3jr_5qY+l0YCKAqo33)S3)QW`>yw5O`5!2W1iU ziK4QvBA|e%C?JAtB8sRWBBBT`JP;LJ(f{|}s=--0-8KE;^()dHBPUf|>#ck4*?(u= z)cIoK+7s{Ip1-pvKC$-137bxsaOJf-Uf~^n#k;<`^4Y5ty>!J@S1bDE=+oCL`uR1l zyg^a#$yYS}dsfl!9(LAqivHl}uRX8mamQWy8%5ve{PK5-w(o!9o%=#J ze{kSiie7W*$(mmA+9CI;?W=Bj=zc}_nzqgZiZ1{9*$*jdz4-jYik|<_-yc!*iXXH; zs^~A*zw*Wdpm!f}_05VtFmcB(DSFsF*WIG%GuLi&tD@Vk`NOX$+II7wG(F^BeP2`C zpL+ejH2v)fzrI6l&)sY*2di+HX|47k&&ff7UMJJ#6z%z<|^|H_XO3|(6t@EOyCw}#^ZyyK!w*4bbU%uSA zPi=b}T%+kRTb=M7wf(o_{#(;8@3!IrwJpB#Z%tqA+WbMaeaXgedsxxIIm*3{^-ZJ;-&DHj8uRQxMMNfTr>n#-h z;U5my^vIRryVdq-a@kgjUhv>{;rj6v@7nYeC&r4-`1Qk@x|dv@s_l=hvDG3)Uq1io z#fnb&$~wm=y7^td({y^@;8L~ieP^p>iXLQ7x&3X>!541)s-j!3`25!t-D%6Oe_hdc z9WnVEiq4$#eN8{RVA)-2`0)?KkHdMcp0mxK7dDUB9R4_ntWF2DQE6$1c`%yKfwOliGghp-*c1 zyItG&-+}FeUUKIt8lC=4O}nm|yg+U5`0xGf4pZ;MK5~WqP-N|;tv;S`plEz47GjG?%&t+p_hIzOKrdH z{@3?abkR@$YoVfZ?TyfB&W@9_WK=0 z*Z;yvo}$M*JJ_MsL;DP0?R`=fS@#dcoE&ysqdc zcX?vMROn5w{$w3RcbWBv^%VWYd9Q7#=q306_HBwzz2UL{r|46i8=Sci`p*jw)pYJR zW}dCKU%fk{i|6g9x4*RSuRo&ppSt#ontrc)laH$H*Y;TdJVpO}*T$MY@}U#gIgIUF z-n;d>icT`$qv?vtQ`cA9pZUpYns%IYnWh&+AK6^(pLxsWnto8euIbD3H{DY0?|t#6 zy{}z*#k+p?kEb>L$w5!dQrj=C7}!_Q-Tv4+ThZTMc-I_77fqZsSJC%QeMHmmtatSR zYWr&Q11I^s)oq!A~{) zypQeI_B>kC1K+duI%<2`^XqDQ?&$}wr?%Jq#z~s)f8FsLs_pOo_7Y7mxV7Ua53$|- z^^ZKM=#Qp<{wYNtKWe+5D|*q~E1y;Ls15)6D@DJs^_jm{bkgqs{2xX4e`)JKDmuO6 z-LES8`A_{}*B?O-o4VZ;MLYj+ou;qfeyO3h@7?r$LeYoTI8`dT<2fCcqCeZ?E=~V= z!}E^Xen)z$tLW_8_t@XbeCTuWg!kX)EvDc+1e|W9+H{I!wyD$Clsq1ahzRyK_U3SXp8*L#MKXLbE zr=IzawrOW=G4=A(&VJ`k^RGSRTbG~yk#`y40~bxb;*4{*^7?=C#J8?E^Zaf1I%WMW 
zr(Jp0$KJQkg*zR3&y{C?e8+j0@AK(tSDo|8T@U&4;wSI9>LZ`p?dW^X+OqTNkAB)* z{MfaJe*5ZkFLwKW`M{?;uQ~5?yPxpK-#q#4YtFyq11GP!!Mmqld%_YO`refNq_z0Z=bsV zrc3Vn(1f)%+%ViCBObW^@(*{JxKDVIK3d2flR0{pYuJ zp0o9=FJJlK$9GzA{Sg~5tP>`#y`?>2-A%UKV(r21Wy!<|n?JGk=FjiS-yMFr{SFhh z-lD?S@G9(67n(R_;*{AF*VKyhJGpdUe{XDv$;tA`QPMNmA1s|L_}-gM+-&&QerJWF zwy3m|3$DfgpICbh{y%Z!{HfPUdXCE;`loAb{?j#f*q3KMq|)iDxF%0MVd8|@>OZLa zkWD6T@}I{Wx+T91-Ef_xzrVLXyXCyi=k2gHH*}s@TfhJKuC#%^QfF_RkNTT&wS_DC zR&lkRChj!!Z8l*Tjns3J$V}5D_KYyJ$9|YiMjx`pm_3z0$vVCLi)KyFM(BYxHh*A^ z9j2>~GUn_`*W+_+I&ssX2VI97q`}ewUTO5SSx4{RA|sWbW0T&&E@oYnbPWW{`j#ef z_DE-}vH2Nm>@Y<=(x{7Y^R|g?LpR@;s~)>N>0i;`yL>R|=j_qXu<`Jpw-~Wi`H9x< z?_HXIo}1U${N^=w*h4+Vh!aO&hG%;F#J3MUlVX6ufn{SKWi9q@q5f0(#v2U`_Vx#h zlK)JP6E{C`hh5a&hn>m2F5RBLvnM_dSHKRNAI#$LG&+SDbC1d{9PDH!`|QKHf9;%C?`# z=@$8Qqx`r#^XvS4W_H}A`S^PrW%M^kXY_tAqbv5#=rJdp ztpBm|(b@Kq`($*3d|bBA-`DHO_P=~sM*r?-bnAQ^mULv>{rPymoIm$r`St#ufAOCKW@+G#ZLM6?#$=a^Z7hlbKvaC5^5?wg=atj{JLlC4bN3^Y!JhoFDGb$Km$;du@`BtC`bt z^XI-IAIJUj`4b@kLB~^)ckvVCLi}}re??Okn_nJ`E$+8 z*OeXe@9O94*K_&vzd9v5?-Tj;ui7iyzA?Z5;V)#{+x|AAFX!)hYCdoOo{#g+`MBPe zKcCZ|9lufjTm$+1x;6j4pUR)>#QeK`DxZhj=ihIe{QCRm^XKFF^?&JQ*Ikg$a{io`pa1Rry5Gsa|5;{s+>ZHqZ_nTRxtyO)$mi=B`Fy=Azt6+@`Tx2! zJAc3Yytn7_^^yGhot%F+FJCY0{P=h0&+))9+4=MH-{#eyq&jXY+jj1IK2^y-#LzmX*#%+;Ya8t=!&U-1{-}%>B37{rEpT_k&x`oV)D@J~UySO|7%$y|?qsO}3wZ>sbf9 zZ}0QlW?X2UJ%9WCK4m12-Fo(d9p_%M*Xf(u=PcZL!8P-)nt$6l2kv_CO-J8%q5YAA zCLex#->)CL?IQ>8)^*p(>%7zX=piDy|AOroeC4Bunu{L2bjHV>a}TqZ{N(23hhI7O zaJToFyH9_od)^Vg{rvYX+wYp5ZSFYkrJsN6BRlrK=bItoT_60{zt-4z zr@pptowDvb+Gk(e_ug-x@%C+}eRQW|+wVJP^WDPlUVH5O?mzE6A2?;>o%^?Y@WLJD zUOsyR^=aNPdf$kjv1_V3v5<5m1-XwSbmlE1Tu zVn>^Bqrf*@FBP^af=D`^qxOYP;25dpnxQ4aAhp##k@$||*{L7+u^pwU+UJF)>j>Ko z%_N9o>FIM_SEP|}4BIvW_NskpAUt2VL1OtyWCX6-XPHST!!UM)>sL-^8+UJYdaxIzo zX%eQf>8pK4D#9@3z}PWFl<0f-VQQLYEc}4C@jQK8C>`Iheb*4C6?%?-H#do7YK<+wTWRWAv4~vLFf6stb}Yw`uH#2moCrtXBjCj&#>EYz z)N}&`RAlDO?^;znc|?p|JeOcH=ZKQ>o64BEFC1S-y`Y z?#24t*owKe5vPXlg_fo7VMeiM@ilFk265~q>f9uZY>QzJ4BplW^|<)H5ju$#r;#5d zsip5>Cax`Q%MdOjA7t`+8DBVoFMzK4uslSg)3UO8gK)$_S-;)v9dN#gU|hJHRZ0G{ty!cP*$dI@)xz86@Mmlk@SGMtVX zxlUqQ27{K(aR$$DctKYLg27hLXPbulN^pb>V#ZeOb6iU?k(U^W;qnF5zA$q55DcLy zjWEgPPz*=I!d_s+LFkH%FMZF69AQ`vpTN)f!;TDQP8bAX9NWI9=an5sG72ouu`J=b z*}PA!D2)s^@N7Pe2=losQg|qIB3Fc#sh=fvV`eN1LK65XS5(iJI80KH=SwZq2yH!2 zLoanhn-zx96Hzt~xMRR>&oTp()k9w|5z=Hf#%2`qRDqs$hTJ#Q?`8$D%gSe%zMH67r^X;)^+_ySeH3o) zW<2Nyj_D>caP64ctJhdVvX;iK;|6?}FwpaW<$zNy>3Vj;qv*Njz|g{BlvzLFY<*uA zE0HqAy~J`X78G?~UoV>hpALrcZ9NYX;adUgXc8qXzlMHa7PBDrVGuq>C|rFmQ<>$- z<^vcpD~%olmYfh?j~!1sv72f>@&cEoJ#r=2hzv*Xb5iDHYC9&LJ61q_J;zfFm>>y) zP}+gM9@7rl!H0DaG;yS##kZO9FdLIRAW)~rz;kR{CaE8?Ao@%;^=?k+*@18Q7PE_I z&+h9AhKZFja#^wCY^m>*=-)3b`9m8a)QP;CQH*q7rkQ!|>n}2+u zAP^?QWl51{vWGPYVHfcrjGy%N9FJI;-M|hlmIR0AQP=ZeXxn6BrUBy|>2DP9BvBZ| z?BtEIZ-hJyB&-%%hR@BlY_x+oftiy~FwiX0>Uxon&`J3op5q!a&gP5FKP3W}NT*u% z^m|0e*w`~Y#CKvi`rQJ-dVx4$ihGgg=()>bd{_r5fCgZMdQ4^P@abG0gP~z2sq0B6 zK!gTH;7d48sxqM&t{bx{;8` z(ZtU9(vDb>B9|*TK^kTH20Ud4xPY10EWG-=2|M&)bC%yAag6Mnv$Qk1f#+I=EnSg~ zf#N7ua>HY#3mv`B2_#FBWZs*x%U?aFW{?K9WN8+uX9@j$tiXH=A)LTO#mL{6d7Ov< z4Z*P@C)0fb7H7p5reQ`hvGwyY4W)|+W|nj1Or}@~`U^rH8O>+N=Dq2A5sMD|Wq2~z zuMk+CFOl;WT*rcNo+(j-EZJ#}BT))|Qt*+BveO8ssyZo<@% zKGH4mTs?MVcaZ`Rr&VQt0PXdV=e+j-XJm`snpE^%52;&vfD_GFTMUxvtG!&{WhEjQmVz zO-z^>?Z-gzN=-ZCEyeO_f_O}gfPqox8n(qWiVzhRs%JK@B57Kz5oRDl(~JWBJrM2= z7vu^Kt9!;M+%z_nu4^$`tTT>&W;0~&*|vwY2oOqoJxF*a;Tb^`CK5eE?=zX-Y@~cUl=@6g zCsw2cwM^7Kl1!$+yP*|?mKUU9Y#N%+la!fe zV#CkY0}qWoVpR(qri;+y$9IfTA8pC3lsvh*uLoPA>Y%ov-6MYVGusFwbQe1{nRMCr z@O_3QP2ov&EOWst2K4oj{b@Ulz|lTV}X0jo-j<-qlj 
zQ<-jSM*+)7f)owCFmbe=7+6Y!K?_4qNDW7i3zt!|!}3ill76-h3C|X+S3ahe%eng9 z7<1D@ElVON=D+&g5|`O5T%-!TZf5!78kwQWl~zKpU4+|k2FFeAbrq2 z6NXXkL%N}5`&gYOEaqoxJHjtPbfXhE%y&I5n5ODmo6KA!h1B!Zve4$xty0&t5bgRn z7BfFZxk79UW{Bp&I1Z6wG5S6OD}_ETVjWK{rLmwOBMjBE*sRxFDs?=8OfvPjNY)1Q z06PpNYfq-@VUorsnn8%@cTHc9b7aB&XsGN!h=zLpA?_JLll3J)bMs^bFZ;8ENC^^nT;RaYp>Us{ktL??6 zWy8@32(>T39>X|FP!|}6T<3BzC>SdbT`GnX)o~cJF`75B!b?$T^|?-}OwN>9A7cu| z`upM%K-yb@VX;=&jvfQVrZOy0by&HQY`!31e6$~CX@LC5WEqAZvMII$lPQ8gQ1@kB zMN(qum^KDNw${b5n$MUdHu{M$^;lslnNp%Q`2nhHCO?$aXE2$fzDSrd>Uvg^pngS4 zj=>L^-7PHxB@g*&;(Lg#Z2V9#QI=y)^jV?xdcX`qsX$R;6cfyLbv-YJv4n$)YdEY| z`dLsI)Ijs(2nr(`Q`A_@e-A-{rkbz2%s&@_knrhP1v1-(p~YIj5rXpV#<_ju3)EIr zN5oN#Gey4#0-v8Tuv}%cXJZwr1)8PBOr(nes*d9w9A)mJe46lBw%#Wad1TsrT!}%b z&*ewX3Yk;V43jupo8pi+GfadMvM}R))Y%k%HOL(Z5a{}SmGdfs{ZpH78D#zqW>CsX zpmaYW6I0JW)*Rkn8mMq;8PojBznlcS7x|90nE6#4d>!x%o+}6ADbU{|!j^FbUZ)tX zD$eaKb(|_Wi+4qN2Gl;}t8G|;ivJ*79G=$=O!Q*Q)88B`D`0s>iDoS$i$a}?B9FLd%3$=OEbFmP`3l%4 za3_ucbX0X58lKWCaKX8(vYvhx4?W6_EVN=42HVu*!UBedk1d1xhrOnGA0y4f?SpA$ zyI$@K!oh&8hUOr!&rvtkJse)bfL)N97R;r;k@9J{R>+s|gv3*kJ#)yX#b`u6SuCyk zT+{Ru{9YKLc=aMZrZF}*7K9AoSVUBIoR%}VRB$-(KmBeI-#f(hhiMkEX6boqb217; z;;@V-yq>xTPk^O@%)%Zqos72xN`2r3I8~Ia6?)x3PlK;8ZWuF(i%*@)62eWf?oE_$ z79_PVl3ZGSauyy#WI7kWp*JGmS@F}{=EO2$2RC#jqA`<$d|MB`#KK|8$8Vy?07VUU z!C}WL&k(77GDS2wC?^Ti0GX(th2`7kSrK?X24uzyI16mFHy`dtQ`P4haU$Uw4@*q> zQ1#d&8d>6e*bjvU8(rTQlMZ_Ur8>axn2)o;bH!ZVK{AP4e@3n=Js0E657F|~xv&7T z2!#p963dyMrxr$&(wxi`>5yc0Z6eYDPmn|cxhDQ*buQ~DrcxAJo^lRn_eD`gKx2>z zysHR&b)1I;kT}?+hXWP1Rr^>Ow9W0}KMiqSs(lth-ic8zJO>$^%U|A!G51_v1{aJT z156R&CdfF{KTJ|BX9Cu^z`-Ixi$+z}?}3|@)rA3xVodim@09HkS)ihL`gEF&v)zJ1?r zYw!stta;dznl5|kw7044JI*^-ebxyVe)lhHEBfVNvHtzeQ#V^vZLjrNPt()e9#fkW z4q9-7{{6zu{-C$dTj}ZTN53gGm4CTc(@i(|q5ggIC%>xc8VB#7_s`$y08P6$-bmBm z|N0P3_xSKJn!f$T4fOqHY_N@{zkmEEnl8KGZ2g?!mv7O>4P11wrtf*zd-eW<=3Dx> zN&od#y}xJDg_{29gSTn=i64JT)An1R*L3b(**N@o;yrr%#ohi#(;0VNuIWo}ze>}S zzJ9i*|2ld{P2bo1c1<7OaaT=0*S?jerybv}=|5g-)AXT}uhev{kFBriWk+Vm^}i!K zf98h{)X&|$|6c2>0a|kB1N!-wKX$#QC;jSA`n=mNyI5~;xBCwI`y714`TD%O?|)wJ zf8l>tD4^O#1u{U zf3ic<3-`TG(?=gZMbq})Z2X_w@J#(4U%&n!eVsr2Z4nF2pYwxf^zW~4nV?jLTZ z=@-o9n%=(NHk!Wr*~c{9_2b$0Kc3F+^UYsR)cddf@#i%C@B6aruQ_{7y}i>N3-vhs z;lB>o+snW6oTkV9)6;a1O|tQQ{=ld7^KSd|4fOsSw%jHcu~`e4oD0wO@Qf({pY-S<|l^+@r_u{#!FXzVOhD&mKH1yzp|UA$ycw}zdw5OIhuZY^H1pg`=%LReSPxNdjFYcuGDnB zrBCbMullvGpX14`-l@0$cIhLUUj5hOHT~$>^E7?ps%*ae%zjC4KNwx6>C(Y$UR`(N(Dyra?QES~WBP~nbARKPTj>35)Bd85yXBQNHUB+5JG;-e`(}LI z`}WuM?;HL5H~RQxN8O~i@7m*T{oQu?`9^yC_xB#8>ADB4&@_1DB2Bw?&A-PMpVr%t z-@C7->tDFLrg#4?`yS_?v6a5Qd?g#l&;9sD{rhv4(Da4glQrG=wdXfhH{SW<@78kR zrkAeN^p7`Y-)rBE|DxaT(9gb4AHT!tzNY=VZKCNb_kCH@&1PoLb>t1d(A$q5oAJkA z?)sj7udP0qt#8{LnH~Sg6aUuxzw%JVPyc;D#t&N_lkw-@wttVl&S&n-#^aA4`k>x^ z;l+#(mVEp)z5SOpr)s+C!`o&M^H`=@>M zR!s*kzd+NepS)Sq&nGA8?{U@9+4ubW=YFsE|NHK5YWisW6HV_w@mNh?dhElRUjK<_ zHEsXfj{5vt-t$FG_uS?fz5h#=sp-#`Mw*_o<}&^J{o7~T+e|!dT{X~0t&^=2ryTMt z{rhv1GPz-2mwnGuugSjmUE4mRuXD2VNliEVb*4YO<7e4;edb4}>-}5L%D(@k`;XMy zublS@O_R1zkK2Q1OxE}L(vOePa_Z&5-|Fpeta-4ea?)*@zP!=Z`uJ0xSx>L?J$K)( zuXFdDz4g31C|}2SpEOJFf8nxB?tD}nptleB#kQLETzkHz4}SeoO|M-m(DbSYexv7E z*Pou$+ov6p$@_1gk@3kR_rF~~$7VsI>Aa3ZH1+q4_3{7u@lQ3izkQCT>;ENNCq8oG z#(KMwR=xfZ6K%2nkQ&4@sMqXXC!5LC?s9r9bbw`Jdk?*ctnAyow(T zRr;ax+tvTyNLf0JZ+KGsLJEP}JK80d-ee&sOLL)Nbx5cpv8(>Y^w#R$hNR0SCNM;x0YQla>9K~|U1-l2;*CbiE{r8VsxJP-8JL*xqqcJ5?ob+^wrSu6(;VWdGpe?w&=*_x7K_E&BV|0ZsaPqs93@d!pW$ zBf8Sw{$&BzZV&qUmUc(_J$5@`^0I+NeL=K@lcL_9K`NKJ2C1sa&W#o??^%*?^ZbeR zcGp1niTo}M{!D7;f%{i*^nw|)XLK%XJEDDha$I+m%H??!Y zUNgwa5;F)%O`kfIpAOz<#{3y=#|2B(JNGBSvYudB(za*Yj`M{)YWEo{@z${Y3t{lT&U 
zHM|Bd;2Ne$QV?NrVqf~tq;@g1pD+e)7(3g)a#H(1@ACdg4NLEm#e;)=yIE8GDP}uA z%ufdTdV2Ptku%X|1@f_+iY)04zq*=V(QF!DpY!WDxFcVZ7?IN}JsJ&^Qw=f5Y*x(5eGUSw$3 zu%puMWNADw?6HV8;Hl&x)b|}_x7(CCAGAuNG>be|0YT< zNk+FIbHc5}T!L(=5{fQ-`98wmmj&62WC!v945ZiL)%nYuLZpFM6`5h=JdGs?qZjnxfp;%A=gAPdIoh9rClr3aQwH#AM@c|v#v`O6}KoD7g6>;-EfuU%{W z>=llx8fhB#sx80B!MDo8U5%Ars6*5RNT)J)31yO%j=L!TAj=WKTFCTLH4G&5nl)`N zq!&rKP#vV`g(nI0;qt{{7nuuV0G zTY$T$jB-iNG~MwFuhFH9E$$lbH=JG~>#MQch3q1|$VJhrq#KfzRX*vZ91wU`3v!o* zzg5~rEH8lrRb`~vb#T|HqTr_Cu3D-gMl!i;0YEi-!B4Y^tFU zJUalw6jaVo8cb70SZ+d22M{ON+Pu!&{*0h^CcX zHfk1V39H~Oiz!?--l8@~3Zt;(0_DRLHJuuYyQm2T2f=j4&t0RjlUm$0oV#qu-R+r)~a5Mm%aO-dxjz%GCan}g$n%=zJ zMK%=R9TmBYD&P`YU8+W#;>My@mrW(JC2K{K;VzxKn4r)7G))pIC4#?1vW2>OdTrWXqe^}XN}-2r&(1( z9B(9P*p&?~@_I{Xa5;G4-NG6ni-TXp5sy#Nz?)G|{?ll<^A>Ln=dEeYYi?1MfihIl z+@gX?meAY++!bWzqRtGkxFjh!D_0&`%i039pIQLBjwsjYz5y-f8o^x68|7m1fuW7@ zQo)x&ku(ymbmo?c^TjV}Zc%qa#F@I2>P5*ITy#YM{u|8O@khBvci?Do*KqFYY~K6~ zobXui70bFL7z9k>vT+xfmh>aR5-j8>q(nvGHLhH#Ft1hxr~#&iF@lR5iHv77c^a0CX3ZaaMRdydW9SP;g4${DNfiUujoSF*k!^ z6eS$W##c50E-D}k@)iC>+qZ}wA6$#CRQ{JsXM;Li@T2=dwHRxNv0PbIAzXbK%d`|= zvR9F@RO_D-GL|rCJOf5=LC&H;i-KwoFX1n0vlhs~AljSbR|rS9Rcjj78kr71>4^3d z=k@nS^A!40*YvpyW=!vzI%n>I`!DR8ci^0PU2s{gL&%V-bqMNZtwWT{KgXj&dMZE< z;2x@1y}Zdxu7xAhpb`+34>D9GIFJ7&>4}$IsbcVQ1)C3PQbHzIzEyzBi}ELXyeJ$c zqHQKk3aEgp8Guzy%x%>*dHhB&O&&dq9?$e!4`P~}HVu#0mY!CVdY6brI4&?jmHf7* zK+h_l-xl42ZsCO!@@VXewx}xOtLF^a2DlS+(k6Lt{G650l-i>UJxa$5%kI?bU&M>u7%RoO4 z97Q+85U)SPxFjVWo*sXUKkWhePUvV`#vAgmPjOhMTRDk9hbDw&{qq)~Ae zP;B7z`gQn96F>4I8AQ?viN2&xi?ec%j){w)IXR1#$>e~lfZ|xrlFGqRKAGjw^%yk5 zqMT)@nmVUuq)~Gg4FqYvT!+0U!`QP!5!sQOnr_f?>6ETkMY(jEHPR?e+B&bI(o^~h z8LDi0sV~*%Dw<4y-AQX*lW<7&R6KaxF>%?_h_{YXPZ`H1$prX=Qa_5K=Fv;vLbfLm z&<&L1C9*xGYgrUpR@RVJ|CsXeIN0T;bc<#aP@Zl0E?(Q}r7D`m0RpnjdV0jTZsLg4 zkxAl{P++t|o*7I6>#JilX)2y{0TBSzrlNvOGC>IyWYC0FC-#C0GJYx2Rsr{^EjzIj zRGMEMHDDnlg>#Iu@gyOG$ZHzQ^Q|O1okp^FnPev=kYwV)$rWs1_`igZuAZY}m_=Ef zU^Wm8z%^8popy(Cr&t{60Sr@za|y_pFp$0zSxFpOY4ar8R&m>EkOcBP?FK5PB}x|( zD*aR2mtz@=fJhmUP!tTgX`n3^%3@deLPE9BsL5D1Ekq5Xo^{xrQJUIVoB@zxAtj%! 
z4A&v_5g4mUy|%(7w_3QBlXNUol(4wca;Ru2ULK-A@$5p31x_p-4eF6tu1s77mw@^U z+sDo0v@Ia98tFzUmBexky0%mfhu9>VmCubrU=$p`qN<=n^_ZoiA+;qI_8Ndeb{)|W zFTqcW2RE=JS+2rp8f&i>W4VUktP)Fgd$KFzWzkJT<(rl*20@{SSG=2=t+7PO9Iwe( z4(OUdr@3{czfmRZn33xw0D@BW)I8~L1sJQ5&Wln>EajXvE0<(oeU*{1fd3GMB^GHv z5~!dejD;fSfa6+6t&>9uq2Z8t=88a-R<$Hnj_^pg1g}Y*7o=n(u~y*~6c(vnWs_Kn zRb0sv7jj;JjOk%T7GW$n0w@eohvVNi4PVB-Ylkw3DbsB+)*Fl^nlzbHdDMyJRmRaED5gu18`4G8-h8ml%oR#(v9rG2|5l)4q{;US(1R zU7}cYX|5Q@QknbZlUQ_kC+oRro);CyBzhH51zpFXR@wCGu(84jzcSe$K-3vw60~fr zVT{$RaV)HL17}zz#u82`p;|1@r&zZzVAVoKN%16r_9BMoR;-DoL|esS%VuL)o@!@NG(b$^7#?(NtUs^4YJhF0!!W0~ z7@G=Mqz-p`$QVY{9a;{s628?uiOKV3cPjXRxcY zcX`iX_9!a`29ssey9bu2J1-x|o@3sDVs>v3Pd#o?S6?*xz=g@OzOGJv;NJZ~-{S6Q zpfl)?hwbj<{9&8(dXG=~7cB4VTe@P{-Wh|7<>;HJ6Nf+aoZi93Nq^Wo^78YOMco5~ z{VQh0yi0f59r0GZJ^F#T=$I#;9t;MfUT|pF(BllX3t5;f>hB&9V0=w6#8hJnnGj}G zRU$j#P3^n|&+-TNLxtucG?n5vt;wq;N3@d)MMjuGXjy_Dll@uATGYF|KT6s=+IyEQ z9vtl3&6?s*G28iJelpP4+cS_X=suC3RUaMo*%?hfVtdC)?R`s`GV$_$J)-R$v|a@N zys`zX(DF+dzNdREZOA~*D&$%sCkEZyCgbe2TI-%i+m zLaD|{1vEbLn>^Kk#T!3Ff*nRO3M`!k?7B|#m{P5k@2$J?RgELA<}05CsG4D}s+AOJ z!R1@{h{T|y7Dj(!HDjG5=<#ZGjnBdakgDNoJ*}itD@r574QRa)h9dM^8s8D^ElYQp zIbB9ew`yY2>b~w-FuElxx6t!26HB=68936=`3jr5Ko8K(VCBj`)m-?9A%i_oPcPgc z5~&~K--}!kqC>Y9zShDw!jD+Oh0m%Q)L6|IJ`%COlv051l}g|6)t7JK1F(;dr$r52 z<+cOiwt&8ex<~snkU7S0=caD#$UxX`68I@TzSgqWTJ}o6>{SI0U(J_2i`sD${E14- zo)o04m2cT2frG3=g^pD)v;n#dY4rsbK3*8W479cJFMNsPC%{KaI|bDzwA0qY*IM{W zzwlK}yIjo|K2ReCiy!vvSba}50w|%rXVHLzo$)Vx zp_jU$ZKM`Z2O?^s-YBie##=VBQN8`|YQFHpeH3w}8UBJ1ETDrn= zctO<^>MylCe85n6nEg&2*|7w31J*tQ+wu@p^HH#09 zCA-(Ehzw~NG@vYI>nhD~WHO7%nIp1UIAlaNi_$VjWwUU|(5|6u7FOOd*(}5=q|e${ z(AbsIq#2kCTu4ow4|prjHKkRD&X*7piRs>IMj?4ELGv^~Z|Qt*S)I>r=HLd5PWnb= zFCXr|G6Kren9L)Ex#+?NTrFtYJ_}#4)~X9P+z#nap^Lr3->~ZN^0};cjuShP4=R)6 zH4o~<$jXYLGJ$zhI~VLl9=@O~hL#3BLHbn!|n_e=$k5>-egtmeg!?%I}F*s$VT`{)dXm<&79GwgZ_dooUfAPK!zzgx{@GguDA48syLJF5nT8!E;c%wMK$d8Q{T z21bNO#cTQa%cuH}reOv7%X54@0Tg^(Rg=F!sx?p)>+lW+z8}SQY=$wt`+#R}ns{)q zH0WQJmCsUpn+(qGT{Po3s)Mp!g-dxm=gv{SoT2LAQHJ+Obri>AUw?0u3=HTy?L!%7 zZ~qEi2iu3fsq2bo^`yOe&pb+ksXiW|!XAt-!~#UNs&{M*&(EDdea8H@X@|BAb}viX zrY`7gn>}mJtc7g~H=NI1u=W&J3KJ67PgU2ILVJ)vL{!o(J(_fA9j=JjgRzxOHOhen zU-N@1k{{v5$`2ml%2JOo%9W*F_&zTr8eSY{x`a0Aod%4(=GmaHBbG zXnTlxhVj#|8poMv>gj+7}k`AG##R$m#hP#$+AN~3=I%M zs5%l>OWSLFJY07Vb7lz6TY%MRY~#xg1N0q4DdA z!icD26$uz7cqYTZrW6yFl6h_&$puyryS5R~G2Ts7|K_IgoYXSNO37jkjBK~+4u`J+w@IYG*JY+ZJht)1|L>chFQ)OyePC~CYf&h z4CcE5HD`$oTssZ{)@#Wyl_z2wofsR$*Q%ysHI%&?EWRuWgqSH*?&k?fw4Ct9e=sJo zcgl#a;aa%iMN#fj{bO+{){0%jT{JlMkYM#>VHwhP!-1${{H> zo}J*UZ?!MZo%L^suc~c;-oorfK&wU=nkX@~_mz9sk%4O$^l&J5Rh2O!_RKLO z;?BzC8x?mRwreON#;!3DF~k>meZ?gh002f2{Se@1&{xP+9KU|(P||_FG6}*kwxcFO z3#jVjC?SJ2SU(h44W^^Yi7M!a1WrpRzbp$^pItPA7zLOvKQ(h3m44`uqL06E{LDqO z6OqQ0dL^WqxM9>Z##cf0&}mxlh6!&uG@q}`Uf`jY&$~f)RNF6(bz`9NnLZx+n%)ig z3+Y91VI2$>SY~7g;JV6C_;&MvSr?ODW@(&<4VGTYpG@PeNi<@iokQSRVdruy+F|H#(84 zas$VB5Ah0kCDaZ*gb{Ef1^ElOIou9Fl2@;MQQD#J)8Wr`>yTb{90jRw#13dhp>SL3 z;b`|pRaCIIuzRCsmnY%lktS=Q5`STPl@S@oHGQh2i&|ggERyOAf7R9wX&_Fl-K)d5 zL4+fS$gd)GBaUU#Jh{Hbw8QdnSEIE<1IwQ_Y8APQIgHv^HtmoEMEZTR9u(3Jsqkh2 zA@)^0#~0~k0yaa_Yp)LJ<;OBKqd0bma>Q=hJgp>09LRYxe_%oGMpBJ zjVO#JIjDIV%mzfqa4WkvOh8adD8N7+@I~P&70SCT(mkq2moo+fCsLY*S4U;ElhQ&u zB@7UIM&z0;`)fFZ(R0kI+IDSqw!ciW?L1w&I97tu+_#L#188x+G>dZ>2}?9p#sWLM zO8$oCFwm@MYb2dI0$`N-5dKO6;RT_zgO&vbN>#2QmW#g{Y=IF-^5}4YH*PF{35qpJ z=%2@)5+Jn~R1ir{!N62iVAay2GWL?fvj~6H!Co}4c0&``ODD8M9JL(u!vz@NMN^wr zLo|^EG*YkZ;s8d59Hg?jIKX-EC^;_5VFJC)wreGDWE`gIF;PG3P!UNa3uR;_USb5E z6*o_UZxt8fFacJc{EbEnFe;-gfLTPs&pLYl7?#(<}jbE z4BI3_s#=U$GZ+FJU@ph4BZtEYJv*R$%%Ue4vF7G+A{G;1(@W!{Z?FKfC{+YoYZVq4 
zxg8~BFVz?oM5%%TjHs4_n-nK~O%u$bV89oqSqDY{swLGa#LyxaV3W92qB7b9YuXAh zVBl>vH!7-#!Y`!&qt~vA1{dWph1V|>2zPA`Qvmv8RMnv(I#Eb=FTDZiVonl$^O#^o z;>^m6SRNy+!Et8blT6ijy0Q?%IxC?Y5mgMWuV|c^n6ynO0yko99TDJEY*N3DJgLM7 zBUo{AN;iryY#tDfLOP;To+8Tz>xjTb=&Xp!5{$ft66%OR@{l-LJkCtEvXuVHm4{Z1 zf=M|A=Oh?6W*v~Dw&x~p1j3D~Wj33rJzs71QqWxG2{JdBz0kEB`m|I|qNgURUC4LfQ~0W%o%7*$1G^fYt#2{KntL-aZ_=7PHb`5u3k z6ay6m!lX{ck|Ir7NQfg0uu}0?gVjUw7O=m3q}13TGkT*2pI5;C%jgNv74GwEJ7kFEs_~cdS=h9WDQJLEXk(L?Q9X-Z zGZ@}=0Dml_4)J9fNfJa6SD*@vYL}*Q(H9e6-6iOvf+2q9iojzz(e_FdTgJO~YM9yc?F; z>CG&ShrYq$%QA3YlMGc+e3`TtD<6L$Z-J~)>Boh#=qUwo)Zw+f8;Zkh1U4^ShyTzs zoRlz|WuyUB%u!209IYUhM}jq&!vNtR6G~}YV--Y~h!BFJY!VCrUt(ZIB^ZUt^x6gE zrh1OA;x9tiDg~pC2v~r3J_>^v2E(!0L?AcC^uuYTkzWng4^`T@Z>V0AW4Q~n)GwiX zgI-ez8HEl~NPf|e$;Lxk#Qb78am>0hq#`2-$R>6}%kXg_x-A89c!m@csXPqUVDTle zz&sj=RAeyXOt|UFCcZ36Lvg(n_Y&u2u72-u}TOj7XXe9Q%sRmSK^qg2f^2l{3MF$LeJe2)~lwmz6v-uB(ao|h%c(K z904E(?Z0&ga6u&3MZcjY-UE8s%l5BCrt~iFkCOI|_TD9n2M7Chv!?h{%yxd5pA7W% z_6#Hox=-Zi+5l~qf+Sl703trc5^9GQ2#7?13u=dOi)(v@xJy+p8)iMZ8-YmpXr@8K zb1$MDQj5F(Q~L#Xl|?yhICn`&pC7!3m1LKcRvCML*(O-;0(G{K?d7=S|GOm4uQILo z4cX<9LIs9lU8Yw8?jgV(X+YRJY}Hnd2z*)7&9a1+aa|#(=vb@&OltpMmtORpQJ^n!-mT)Nu&bOs2!5iF3F%R827@JXCuE>uI#Gn*-+dCBO0{+sw1N}j&a7Pfk%XmtVPj0 z%3&ev%bsShuHp+<^Q86#0Mc|140cDr%$(HTw>0SKikJ7ZueYbYgQ*G*v{^ANN<~=< zGMG9n$GFJ>ZRk`o3NmVFdkCzC?IHY{AxN8HyM`S#1~N(k+UQ_-q0lm(LzCEEE&B@= z%knmmVB>di*l}pmp3pbE#J57Hd04D~^UxI2%j24FxB!#XRIUQCm-W1a&O?ALZ0cx= z3NUKEqyr*$^|V99T_#GWkVYMdHclYZ*p_J$nX${?rcn=z3NW`c3Sy%ZH(Visi>{ow zK_PP~6F0!b@QRjH;?<*6q=@-NML$q#u2+YDBQk?Duw|T*6y=fR*fjiAOnQ|^J#4u2 zA}5bvYDMoMVxoj4DVSfG+oh0hNG1jVGljT|1QwvM>nZpWfr8o~W)PTO>^4!_VIl6C zQ5G!h#wYbUluTlNRpKrZIwZDR*?fl}%>z$URCeJxRGAc2&<=-u^EgWBZb=+){FYZh zr{zQhW{l%8a?(~(?4yAuZFSD94pL-n#T4j=-Y70V|_JPewn1rgT%sY9h-P4 zFmg)hJA_;4SE=gM3YlNjVIiSwAOhIBn*)9K5RUaIh*skJVIkj^hv``*uuGy;74(d=5ZYsM~kt9wl^NRJ#o(FJJ z`M8TTcoiuwD!b725J}bDhMK!v!-6%aSsp)ksai@iA%)Bf3^F%bO}<8GhIUy{up2JB zXz*o_X;(RALpj4r$Xzgis~nw$yc!lcDRigkz*$Sy0tXJ z|76TtEN`Vrh-$Fx!j7l=u!D%IV0Y0{u7sK)*iXczigOq6tCDoygQ{fAXR?cW6jH$I z(F_eEj#3eYZfJ(mHe0f5gzR#f)@Miv)mL>GmADJ8lO-(ovccIX8t?)$*C-;lK-30T z#P{n!0&tx;RedQzvNvoaYMyXcA)Cu-UW_PUpl}aV<}6x@lu&NbX`Kv{qBfTcgg3Bg z1++peOyI?-f*!xk6<9J26FcUwABI88qcK8mx#bBZH&`naD22Xls`{!El4$v0GQ0!~ z`M?il^A{>?dC+Ex+Fa73_^OCU0}y%-Dd*Qwh5~#&oy(ob4ie9bWh)VC1b2DmDMD#9 zcac*>u14iVC_x*!5^|SM0*Q~SrI1I1qxwR6LyBic`#*WVk>3;Kg)UDM|-m@&O;>YTX;?!T~W-hp%Gb>Z2lbqE=W zwGP38t96J9UC*39cTU?8?R$&84xT!d;&E2MYL1n?#X-T+<;jA<{v=qoW82jIr?(Au zFH72vSZ)|*vS-_N#;AP}`$XH^`O|02Z<}^#cJR~%oo%yc&6%~Z%~s&`IJWxKQ0VGq zF!3W;xO8R)6+r!8P-7Je~rT|^iwpm${dD=uzW&PO5`FsVNn(%9mbU^Fz{?mG3t@H zMQ)^Dj|u2`p$I8Bi|AG5wZv%o#HfQ~x;e$D&6+||V&$wb4Ac^Od4zBHk|nwzlT(dO zEt?G%cr`?yOpe0g21-R8a+GSfGz#rR7y-F-O@vikTaE%BhzxCBIf|3fqv)-oF{*q( zB@9u3+GmfWEC&{&^@W2gwWc`=<_J}Qbp_Z%KcN{V_wfSL^}}X$rPi_%|1Zr^Ouec; zr#D$yE|EDZLHqYG@G2?GM8{QIn6mkE>Ah$=aCt#FN{A72pokci9A{YI+KpdVPDPp; zrU$gE!4*XF1S1QGQC~E*7?mnQt=%V;96q#eE+dDL)#(;49;a51Bv4g6epD@URO;ZR ztRrNSq;Mu#1yK@4GBlMpZ$YrEZ)p;@ccj76fn-wqjQ;-K{`QWZx~xY|Llo+V~*%bd;6CKT)RE!>s#6#=_~Jc!sKNGi~53S2`5FpJ%dTlVAtS^ zKGoG(?OD9MXUVW9*4teJx~;SJpiFA#f%{i*^nw|)XLK&aWjYQHt6D(*Cd;rAIedVg zBoe2p@NtRnkpPt)8Rwr4Nhd+ucJmN^^(X6J=2M!$Ow&Q zjY^hTQ6-DMN#iCa9#ZAXti z%Je4&7RRf@8cobZIl@#E1ve##jpo_Xm9yZeV|tl2+N8R&smg>Lg{U-ES5^@RgjMM{ zOaTSzjEO=iMpWREl&8P~pQ&n%n*J2m_5hi2L>+MxlHtuTO`O0^B;De9>t7oXCtgKM(552Bh&By0RW zUBkBMcM`=(U?t?!G*2s`Djuh4B<~T%b{ff8CGs?xKtwpYSa2(5`YGo{olj~l_UeTcY8sOR&RgewKFa-(1H5Fu!L@sIJb{;>AhvX-^ zagxv$*zmB0n})?}Dae8Xmjpj51A1)DT&x|;c5 zdDoVnoe$B5S_vhq_PD6&I4^}H0IvLN=G$Aes3diqsi 
zfmYu{Cfq5mG*!ORShbX5kTQZRAU%J5-X+2|CDf3~URKFQMOhrTFx}k>n4Z1^c#2Xa#xFe6 z*s#oqwIrbtN943J94Pkw!L+I@V4}0- zGd(2%SeyF7g5s0(CDQjDQvItLpY+@Dr`U9|a_Q+^2ev~((3N7hAPN)dC2{jGd3ECx z>B)-MK(9`TC_jZ?Mnqc5(@mOt6y+>ei9`iVRIS=O=PY0j9a?0KUo$mr znZi;j@P1?+h0O~1T;9|MrhscZ4dn3>d2TKGM-iK3v(~&*XHeCgLSjfLb zeuhO0I+A~@1wJz#S3M&Q>ITQp4`ehK+Y! z?xfb3NQ+Oe zflOW^@o9P<`3;rJ(}V=UuYBT@w0bN=Rqscv67fPF~ zK-wt_kb0Yj$7_mD!eb5OtP+V&8lnS{vI@*jQfNvT6(JD_fZ*cJTUYpGAdpyJ&3UUe zQ^&(XAw{2hHR-qcA@)LB}sY!755^-*j;?r9l`qkMs7e5}{E{?l-B9Y*|93 zF;xu1nkHhcd`vEhsp9KpT`I)nX!@pOnKi0UGbX28i0&A&juJUo+(J*b{4}LGXV^UT zGX<37MtX0{q$Gpr0k~J?yj$E)B{VwG_)m4iPTd;{}75nDkl(S2-szF?^rhJ=c*x8A-0TLfzB?R?jV*T7Z|s z0CcDbW2p>yn)%kDm;#6y`$lL>y6(~zyCt#mUYv@t8tJPn6Jxn95eXH-8yk?Ih)I+% z>IfPDEia2!j95ryN7CPDVJ%fq!4v@FQ3qeqX)$rVG!#|}T3*m4Pk~07nn8-t z1Kzdex_HBN(a@lEi4;L%PO87oDl9Bdl^>MNbwTE(0-`GDx^NVXqbTCKAidS|Y-u~= zXDqT&KtuNfFG)fXC9M`+Lz*DHTAH*hg3fSw*jB+RLbfJjk-Dh7B4!;DD{^gHdZB}-UB*c)TQ*j1V}W1Nq>M$T z0sm7qauu9{Otlg+7RdKtlwlPXl34U}#fG$2uKe@b6DFM0zOZ|l>iR2zLz^t*WN9um zNL=(fOlu1MQKcB0bvQ2&`EDHB%p$;rDUxj(#+o;~bHQLRm~y63Y>RcQQ40I3740^ie&zqyKJ-aswmd>Nk zvuf(x*}J@FFng301B1!3>D>cM)SZ_PWY010Kry>Fh^HR6sH-m;ec-}mSzlMDK5*~; zpl@+^G|(CJ$HR7aa{jQ*dA-Lc{R@`UXnMu4y)yMPS^69}~FzN+|b`3pF+syfM=g_WtVX~;ddqB8`J;e}H zjVWaR8dbs2vl9*qmM%{g4E86%vK`x|?mxY45cX+1Vmaa`*|TjsW7NKgeWGpd{OL31 zw@o`VJ9z4X&bHaJ=FD2y23{e5V2!?j`mX`i{@ZBp;F(DD$GhPe;K1lVliJ15e#(4Y z*Q9z*{aJ~>r+0aOq!_q&$>PDmzTK=T{uHyFALb_meZ4&c$pYQATSev7XJ<6|i0V## zOPMn9@_yY9yuHJuha-p-6^ouE$f+fag{Zy|wkR445uoZgLSh2fT=#G-5ma^R0Yo=b zQoYqvr7|*tw6*TBlC{>oVT)@S*S)Hm-s-;YfwzuUS5e&)q}`QJ-BUGShzK6JkiJI+ zic4>Xl`CJZrSGW*E7ZQ$gNNccG;9n=vZW7%3F=Tw-)pUVBb3TAu6tE|va9*JXOjmJ>WY?`VPOsuK!naV|NyKWu9 z#5e}S-wGlhNHL%6vev@aTKG!8@Kr_cS$5Qurva zv`V<{`Q#Pr6oG>Jo=JG!0Qj?}p{p>B@cfE8$_N50@coGN7)shx!)cznw^m@|tsB^= znuorcuX`$CMtG}eWQDVqwQrv8Ouj zkR)kEB(ygVn0Ra9d+RQIRSQN}^M#M>4FHu@)pdnL7RugAxbS(v1d%0H(7|Vt9!Akl z5q*!Ee9}-v9Uc5|7KlUyz7e=Cc5d?&3$&KKx9+l66}WdbU-nFT6I1m{nrVe)52LMw zr5plCaZ~zO-~|m`7t9K^@YM`#5Y?bWie8)J&p8Je&GZ5V^5Vq&w|R;=S_|J>cHt9M z@#0qZg^%zknP-(3K0zj783P+sr5UOlW}$_TScS<#SwP`)=_w0S3uTzNzl zd?dfa_NH)Uj+Aa43SU4YGb%PLFSSKX%|_EKe4~pmhDrqHP3>H;*9^8g%7>5W&!7w(4@kXq|$l|G8hH# zswisZ%EM~&7sCr!OdX!25D4r<*a=M%1Rd*}Cq;H-&C&?&G7Q`3l<*ea#H-cuHkP{x zk@yspsy_|jF5M~Kaw>wl0QGKD;8M1PB)W>wQ?IDM0w630X;*-|Ofr|K#5L)JRb%-swRUu_;aXztV4iN!XA?89!o0;Oebibc0WZKtZFUX zYAwJV*Czm58LXBr>ZJ^u7!>Iiw!d`4G{7>8sEDvG-o|?3#dO+{UYt6vq%s8HmzDvx zV6R~UtVI0PSoelXxp45IRuo`{WtOn+0M{$(nCkOfC@e{A6_8t#e2m&EqN9>AKsFnH zK88yLSQPjkIO}el_;%CqS5*l%%spSVxM%g2V4kXVuAJyeK%j&smw|X zNgSq#07LJjmdUHb2BWikoEUW1qi+x0)SIWGu_}WN6Jb@WhF5C_qh~SLtrfAcgypI> zjivK(&@3E7sUU-S2FUUftj(J7FMRoy07$Ejj&X_as0JaX59qo@zvZS8V8uN2CE~6I zD~JYw%Tj4uV?7+8vMXq&vT+wR0f4U;MV>=Co5YN(Iyi*UA;lDmGJNS`s*PVgw5$}I zCp%7ElSp>bl3pXTH>SE(Yamx==>^t>ivt@!Mg{(obZjgie<>x543459FhbFka~mXv z9=d85V$NZzX$Xm-?)W*(v&e&Y1I#e0<*B4^8u?XSKQucHR-C5w&tv$j7B>~$8wzfL zAIc`Z1Ub}XbQN`P;H@xy7w^Ww1(-|qJay~vZp3jI8Td6k18fB;nn!$9Qw~eVT8&i< zEmaIkFYAhe3%s=wCia@xQLb$j4Sms-#}o#dVU^sCOm6v-%6$TXb!di>A!G_fyb(u+ z5hTrnP*+uMz2P{V>Q*fhU!65Wq<~32P(|5A8#V>4T{hVTBoM||QOnEs;TQ>j)f8R6 z^nBAb>re~rNCeniDMqZ57-lPNUK?J9HG|Y%r)nYcYRy{Y4%1SjqR}PEcqpM-NS9G0 zMPZOZRdQpY6&_SeEA(m4BP{9E;gXjkq%JzK!%)V;ikl}bs+v|njRcuDx74qv>H8nG!u83L&6ntt;X_gB>k zGoj^HEh1m7g%%SEC^4s^)DnoZ5=t$pIw4TySyUzTOcU=1*@iV-g^IgC1g8?b4)%(y zC{jRc%JJ&Uuz6Z{*OXgjVy*_uExLA4WvI}t#`@t2f0Z%OP#Sasp#5=CSD^xIb$n`3 zx~eOMFqf-ZZ_#jS{C;@4R8#9^nt=$6C~W1*w~vl=O)F0&WrK6&G2#WvQ^j0fmUD%u zSvH3TW#xq6fCDI~9r`xaD=tkfYRWIdTn_lj^*{mGARGIBU?f&zfqvJ@md|_|_B0&Q zvP6j*4Q4Oe03r|Y5L6IgSS%%U7ozRqkuMxaw#f5xlybPrQWU?#lBzYT!Aj#hj$Ma8 zJ|+E$9^Ry9MJat|n 
z1QbH$T4l5_S;9;xP(gs%P#8OfGI?1)a0c4A4Oc0?-rz7^SOCH`eg!dg9mD?r?R^P+ zTxGfbq?8(@LRlsxQv8?L3}|U{_9ZHGvb0Un@@{AvE%ye>ExM0Nq$*U@PQ7O z2Pj}e;cVR$!Xsb)v{z7IyoJ|ZXfUbjBAl$z-WRW-=Ueb%8!(f>q~+K90*{CWi_MSr zf{&&F+L1@wI?>Yjgy8UoXlpiah9@%bS9=8|x>0!TrNMjxde2k7;0Z#l1xEH$K(m>Tdzl&#D=cbIz!2OJ8PUf0)fYT2 zOHI;;<2;}<+ zZJ?i!h#5uB0~Y0|js=aXSt@iF($hubFoRz{I2Fo;moB7*beV0|ImAU{qUxm#pBZ~x0AbJ~z z5WD~yhg74lkAvb6lNRz1x%8JT&jR|FM_yma19>P~25+hsZz-xN+le9t8%fxTc$@=p zfeQ)+K*4=4L4!AsdPy(!C7@U(hy)v~hl&5hvIejcV26IB3-L&i6%mZz_i|K=1I;{d z38HF9R(Lq(efVBty$~Q&RAEDhKo%-U(|U>*F!ii)O5l7i0gNcDTE<`KPzHgD0@UG{ z>}wR+V7d!&LkObCk6*eJHDSt{B}%er8k}B)^bYARAb&!uyMT}7MWCX*bQg+tgV;Pz zQ_;MErMrCz7wag5uA#o9iv=V#^cWq#bQ!vBOO9&W2+eb*UgS2+=$t3ki{XVF0<*yZ zC+;a;x)MZgLs;^FHv86H66?qd%P>2h7rG0SX{<+# zAO)90tGhrc0aj?$8?nI?A$HIVL!zS@M_WoLBgQMA(=ddl1~WXbHpAw;U9Y$XrXr8o%% zmZ7PD;85kQBCp|a!!SU2jRFTXkmXTOT;^pM3Yg3T7&ByT#Zi*w)9->^2$)kgO5Su( zLxONg8j6+2?{-N7>XSvu01DCs$xbolHDnmd;H3=?*s!qqE&f&fK%wwn2GHw5EfzwYe&g{7EuscHhu0<6h~g|sMXw=7 z);4~-PPxy*KCIG(s37VmSvNcn-36T< zK$syy=HsaUXC*f1{*ygoNV*FRiGZ{}9KjS!-7s`XDeA6_ zcs!&lyp|!r1q^)Zsk>ND;ot=|P=|&591;b-f*NqD(cZ+5Wrzk|-~q?k$8Q;$8fxKW z8~rDB-UeqvQFmqPF0C%SmLU*$Fo(RPi-Pq5lr99=fn)b6UEr<(5*fNH8w`qc7pmD% ztDO%iuVmYfs+pFlJ4mcqMb$M-b!nkhUA!*CGUSoe@^HJr@fW~UGPnmhPVyVnKuHOD z-+@{q8w|=7E)DgM2x{aJGDI3hvJwWmfGM(NN#UvO@)gw3!drcUs37KS&!C2Yo+tqf z$U|P>8{x+M8HVi5*ZmlVD8hk12aeJ48-@-#;Ua`Tudp>j*$n`lDh~_-pE;MW4G(k>isPM8C+bl{~ zrgX(a8;}UfR z@cb+m(qLv}9uQtZ)>vUOQp#FF2c|JJx@#;P)LedcEl^P4UsbYJcFpCN8=Y(IrTt0U z=sIayd{O!nA#Q(W+H`KjIuq;oX^Wfd8=9xpu5@>fFRh!lpmAYi%QR}Y zfG@w-NE%&zl{M#Aa+NjUJOcnn0w5jzdq?+a0L_CSfEj{)4qzJ;5$s~x81P*H3}tAC zpjDm#)1;wWiC_FKib)_Xpt}CmSbg#aB?~ki3Xo8FScP_ilT}MIdD}pPOg^kW*C}UK zp4XJ@F`L+n-B!PNX+wQmeBt6{i(1;6mMv^*gLRl|7pS;gyFgTP?ZPmPbD9?~oOWg< zWSk)#*-M_aTf_mDahVC~STYX80?UB0PiCtllJWn8R!6`_Ae-%}JOz%J)dDBu3MVSO z@8V5C69Fz!pgvo23b_iJd=#VQ5mx~pRf4x|kW!(}Tn$f7z=!tCiXlG0k`*BBiF_LL zC+8Vb0T~7(v<{kNY2Z>M7CK*MDMMBP%|9^k%iMCM>u43B=aCLTA+*j}oMuTjl&EBc z#w;D44#=g`{>93(kRl!+<%tG_u*z9S3J=c{!W6)$DO!eslH0H62`ml(kVvO5%O&Dq zFG2+Ahy^Aq!Rr*{H8=yv{cnpq(pKCZ4uGaKs?Fi(ZU+s#~@tbEnc(*RW`%Z z<;jQYR0=3q0dhM&7&+F zAQ|8QbU|?xQ5B_P&^CZUTjYWZDC0p=p2X9`2)4YGCpeFT7g}P)=1430rG7vlijx`j zLr(39Bo(;%xO||dvMop^=y||pDTrXV4oz$+r}ositZ0G*Y?cN{dkQEX039p>Hy)BD zgJC3qk_CN;CcJ(tByF zfJHn=%2N>-i==1$E~^L%AQ+EleZ>L)@Q)Wl)E4FO*$re-oh1pZOheCyP%~jAV1IF< zBkP8l5W<5O`oJV+SSmZLi5Ub`keu{1ka-Hp_!j9fIoQr$r%N4Dbvy&M+WVn;S^~HUAQfYR@OR^ z18kTENVI??z>Df#KP-UqIf#HE9&Da0*6b_ksmZd&i3sXscRiV4VLu&8D-RqyI+Qp% zFFA-H@q#UewR>HFv!1|$>4i+r1jzRU2&&YByLACurIe+@EM@;x}%Kgb8xD zD6`Bd;xJyZE2fX}>rND^0!%M(1}{3IY=#GH`vG!2RW7WqCwiF(gkSK8Aag``YKF|?kfvrt4xBI~ zKcfW;&7IW%#&pn+nn!NZr+Ey{MFpJWKEk%(gkxnpTt0rM;2477=x8Wm8jcZGdYwR< z$Dq_KU~nlw<}p$nAogX(=4cVe7Bzq#u&4qZr1>?Ec`%DW%FzTjdo3DVV3Zv*BJ@1W zV^b6i6tbbonZdaCwPMtEC~6zhN^Sv^8vwd^`kkm94`Mz9T@S(J=vS~bFfjr{=)>;> zg9upcG$oHBPut{q6TT;uv}2oAcm(SM>q zc945SPz#B~3sAySk{F+fcLPHtJjDPa@g(%gLO)%*@bQAa;b??wWAg2KgUjSjQ%bNf2g#?d`)e7g-SgOQv}P$W zB1;hR2Yzy(M8~hO9H2!L3DYuoCt+}EF?cZ~SXv?3Edr#mcog3N&?S%2NR4=rgC<$3 z3{)gKEBZ7B1sP1@sHDmcq9oZuwLU^4l9Gp;C4tS~OxUK&)2bCi?t`)gha#kI7O0a% zFzR}_S)g2D#YaICEf5k*NNoF*EEE)oD5dogaYtKtuwjW(9zYNc9O0@d8EB@8ibpX# z1qeCg7pP$lr(mIGR7P887=2#x3v{6go?x+tBCLJ8Psx%1zTo5>WwB6M;w%m}9j60U zHhyzZc6dWYUtL);(2IssirB?;hYD2+DRvP|`6w9FAwZ)g3rXT2^5~;WE`a}jRTk1q zJa82Q1NR&niJV{Ag62V$HI<-07H9IpOhIhn(1b>=k(5~)V?_ornlygF(i5C!ClC_^ zcu})7J2Zl2bQ;O7KI#f%^!3^O3wqic*4f?t?y8Xr4RryjRZH!4cCw>?W1CIK)-7JR zwBP8rQ;SUt>M|Ec0VKQa{`Eb{HH}RRjlMN>g47xzW7Q&er-gC!qsN7tjBdNLt*)o5 zYi)OjY4msWbhojKtaWcSwKq;cdoNnsm9UeGoqA(qi_yKt-KTDCGKptvTh?bMQ|AQr 
zl-+M9+vc^jG-aG^q0!OJj%!=cvwi^{wUc&Vy0Nd{?rK}uW4TYAzGnIs(-%!2?Ot`B zk+jwuNqfv|EKXW>a_A~n>XzNxnb*qviNLl-kwcXZqCwc_mO9C@MCZKl2% zcbArAM|*qfTbS3?G4c@h1S8iCK5hCAx>8?K--tU+eNE}FGu=(7ZsF3|OL<=B&@dI~ z!EA<5U02@$?HVQ`vg6%S2q9ji5|FE7J7lyJ5|Ba9SbqT9h6xyfE zDP38DV<@T+0JR(;T+|f=E&=il>~kQ63+;*Fa(Z`0K&y@bMxA+J`B1Rvw+s?3(|MGB zAfIz@E_+Cq0C*EbF%M{u0$dMb;gW==LaHN%huW%W4;Ag9LfS+Bni)Gt_7JG;b?}XP zCJNEUP!D1TUK1E_*=J-3xE4{71rUy|%xMs*=qM|o8~XS?Sk*$%RZ4);36(u)aU2>C zcBGFw%-u+f7oWB4?LfO5K{ALG=54TSyGM8kSdRc2MCdvUf6On^8KtTMh+GE-cIGsP zG$1!Y$Cig|NeS@yYDR+58FH2w8rdpZLq%)oUttXucDfFdHG~e!ERW5Xh&+h-PO_G zKl~)=RT+CZ9d@VH*JiCvQVh1TMuPc<+GC&Akc6onm5u`gIif@AERF-1P*=4)CTs)& zberiBOSMGZ3=b7SR%57zRcD~n3}T&GNwN-Na|pG{Xtd0GIvdRH4qyWliE&veesT0* z0fLnS2Ay5m%^`>v3d$77ZXVGgi!(S=H3dTllaAsP?IAb$GE(@8*|K7`Oc=~?=94Y^ zSIyW#vWU=*hLL1?0vtek#s|=Ym5}s6{jE=n2mv32h(K(ZBRqsD88P^r85=BGsKOW+I5T^;3^TS46e&1?@~Eqm;llE=V=IEi+o)0q&6GWIQXZBq z8#0Lato^$VVuJ|5NyNw)IC*rw3=JI_M{LmIuLuwEAY1U8Fw;d1ZB`&(bFxH;)EZ8j z1(Y@@AbO!N#teY4#T_;Ye&qlIVba8j^udK{c|)SWjju4N}+-$w1=*MTbu z88?&pAelFiacq3pu^x9W3?7)CK3nYeWJe#bvhp6jp2I(amKB5P9&kWG&Vs_gl+a|o z22kxt0-ayUAvuJ50^U*ri#2K^=yOUXKe!%^a1k$d*Zap|DlCG|UL*WEcq!PWdOPt7 z4kd+9JSZY+?TK=hFfsTHv@>gFcpgi_3_L86gK^)LHdB!df{OxDo&%tX*4yBW{sJBu%}UY<~in35XGSFf%iO$l!EM&^OyPm z!wpgeM-DSc(*|nV)bub&afly-Q~*&u;5vSxRU*jjfc6E6VqMOOR6~n)4mSGu!LWv4 zB9SPnwwMsXNfn;ZS#c&U&ZMcCG%Kgy2YV*XX^;XyCxaQq)3XO59?K>71q`8zuFD8T z`tQ;Va};&z0|T|WrcqgvG}K1tL*-%$NIbAkIy|Oro-6wGhSGjnSepnleVSHt{*gXHf~Js5%y}psX_2IcT$}%IRox3@h1JUj&we z2R#wc!8wymj3!N&u=DtRde!7Hhayj}qEy}Bi_>V)7#)rYIQWfGJuBkrAfHB6B;+|w z_w?y8|1daI(56wqAX&2BZ){XRHyVU_L7S8_$;9aTMcAWx@Ux<8lcrj%r(wbY$!mBT z%oNShqB-iCqk2}Rm4kd91%gPI(X!1Ooh>7eAHazebb;uAi&$}$Ho$T8}>!8_~|hF?fJDNtAgs;T(3M-i|TWkCbz>9bkt1=Si+nI$4R3cS9Q z59kR;$w1$Ot{hk~Cp=T>$jYR2!An!TZt3g>NtO8qkojxb3CEiZ^T6`?XCc@GodKF| zdsVtHm<5Vp*vEgROHMcpB1H)3^WJ$}^a>B4rGkP%bdzR1Z+)d8G}fz#gMei#$Rz=j zc(ne4ah3-@ThY-L!7&oLZ5Xhm!sGB|6)^BoaCHL8HS4*oSc!vKzyuz7JTTRIxdJFM z@LNe%NEN0dq=N1tKIIF&H9*}A`C64rfua0?v5ateM5Z~*7DPwZOdXzpV}^$wdU(a` zF!dEk=@Q}ij;X%JU8tE;eL=cJq{(4zjdrwoMPqsDpgG!#pz+Z1ex*x8b6CNb_5$pR zXN61S&!1Zf6R9ZHvsN=4B~k2nInq3=%@)f#sI`}XR*!fuyyOd2*{s+&X!4~qe7A3a zipGlqSk?V#FIksWpq=yRD66PCIyIxMELdACE<}W*zO+~N)|Um9t^#W>UPH1TU0S`Q z3vmQFfYJq~7HDzbAX;S41Rm0r+xL=z69eFO{K`w=09LRN)=(T(L6hsygo%B+UI7xX z0>?HO_z>2pXEs~lV21@zc?pnv<`en~Z2-;=?d8upWTlv(Y0X1>nTnwr7OX?d)DtLQ zFACU*;9)~ptwWIm_YUkWPyMBHK@88!L@@6+&8f*Ei@E!;4q1two>wWItU9Wp*cL;A zqxGd38u{{P9cImc9n99D03tsbp#o3o!rKPD7Bt@r=BzIG?tO(8#f%Esk9CM1EucTp z^Uz)r%WkNaB|4^R*;X+!PqB@m*r600`6{q^$b-2a9=)gc#o#RgG!KE$K{o-v+6#$I z1umu!`2uIHC?c_#hx=voj)88HKs6y}VCdmt56|j;4W*X(tUQVZ)n5gcFA&!w*CQf` z;Srn1`vS8uaQX|0ETVw$<$l3l1KX~Wy%(5TBMY;DBIXe3(ejWl4&XV%F)fI_pqRzj zVTSJoZlU_DaQ%WxSAk=PBFfWwR6Ti07XoQPlrCO|KZQH?C0(qGD)``G9%e^IG4aCp zfW()Shj>Y%X>y$6I6Mj;Bq=;4Pkz)_umr5o>I=R&3Y{7G?graj3$p7WdSI4y4xoSm zX@VdG3|hYeh9W;56c;%J45ZKip9vB#<$;XSEQ1q7Qx!B+(}+rVh=TkIm>AmVp(LXf zhw1nf&f9VE{L(WlDE&=?Z z6&`9>KLRF+LOiqq4ouoQJadE#JTw^m{QwFW&q)mB?l-~#z%IH5Vbyv z&|jzw3!r=z z=lQe{Lzdps9l+*Ii%Zx}c<_h*+^^tBSHaCghFFli!yBxhdGMr*wP@Eg$*+B=s)#s9 z=q!~}eSxY9;>GcKWWY2_*G)w*G)ppdTrf1=mml@T?8D#)Sb^TfC$iL9F3~S(>J+U`G)1@y9?=t^v;>@`j1vWq1@? 
zZhKJ@>*Mi&y1e$Kj9VNIUU_w_W^&IsNJ#Us*lgR`s9@l?bv$FNln}5-B5e`rPs}9y>IpN58Zy*m#fcyXiG)i z9hcuAuHJj+%7^dx!i{S0E05n$_q8wH^r;PR{pR_Hzjnn}Pun!%u#eW?dF8EjTaNz3 zsvq2W)opXHtZe*B{ashze){!7#|uBW>zc1EzNL2SM;pF=?OjXnY`$yNBVWJn?iITe zPkg1}?l0YQ#-8MFUwGv1FYju5@WR86pR@D&Z;-mN6_#ZtlKeYAux%Y0nzwbAD?>ghLdvE&A`Zr&B;>NkVZvM~n-+k-1FFv;G ztKa+g$RQt|I&b$a-~W8+F`qp1@!hvRe94rL&)+uh8`~ebtXw?jrN_T<+hbQutGje+ z<9$1xxaQ=gUqACOe6x}0F&3?``h|A_A;A_tzjlZ(S$g_vs`G5_M!=K;Ki$GLM%n|$Bw@K z4zsVa=KRXuPNO>$_ZcD+B04eJ8|_^XJt7)8IubdNX+3;qPjJWEUc*X!eq7=%KbjiN z)b=YuI(odbqh~LOPNrWk zeJA)v>2qq_eV)`)MJA=zr*GNgvC@a+d@MXF`_<}){c0acJ=Sug6V{UZT$fKMyL>|I z-{@BxeRi#M^g4R1W246|h)$r#Vz=G1W_5pmFK!s?Gheew?uPQzh&@xk0nFk->upkB z&)TG!dW|kGDZ@L&F~e=t!FXGyM5m;$oBrBHU$r#7seJIS;a5(-#zzdg8g)HuyHn=l zl_h0Ymc&k>Z)Eg|w82wmo*Z}fU={wqfQ7D@q&`2DqhlZY-t#o}@S8uve%^oX{{s64 zB1?{YmtKRnZdy&-Kl{|NB&#d`rqQ_OqP=Xl$f}EX()LT|zDdi^KBdv}CA>)E$&HWD zIQQ&lX!~X5_tW_HLor%^{co1g*x!CQjc3U#X?*JE=g@eK_F=l-w*C`nY&mg1jkABg zmG1KwfBh!ySNqzHG(IrrL|Xsq#S-nO{%sGf|J9{e)A-PH-=?vIyPn2VCcIAL+lRkE z<6Y-IMB~+4|3KrBf7wps>xtWGJnNWCX*_1vNi-h+lOt$+ZR-pgkL^94#=4g(X*|E- zL>d==V+V~h|LXoeuixbEH^0d}{;Z!Zqx=5R1+z)KJ~8 z(fZ!c{)NT|j;f~dn&Uo2;?# zZ9o2!)wI4j@n;&jb1(T2yYL-HE~MrAe>t7@`}3q9(KtzS-=jC3b7=eD8Yj^BsqIhF z_Rp>S4&C>?@9wAN-#+3#uhqPTmTweZrtLpk@1EzGcii`;WnMXL|KoFWX#DtH3uwIg zvG?5LDx);Mw|Om%mwomG8u#tnOXIfJ+~w7=t+d><*i|8)JoRQ;E~%`b@sHO{pt0@4 z?tAg(f1E+ft^a(LMkA@yID6GB8qaz9MY`WNR9B8p??0W^-*mxeXncC{?`eBygZn<$ zynYm|FZ+9<2B!DqVX3mO`-i>68}WYrLW&g&)avqa(LR-Q)vD7cDldANl%_d%inbBXguzTkJ4DmEvE7B z-#ePdV=7&n=aINOPqrYI!mjh+%nG`Xcfa{J+OKvFGgu;#b8dF8vv~X4wEo4j|C>hp zg5T2qpWEfi*R7i$WV=VMJ#jK^-%zoF#{PRY(zvqf1{(SE-QQz&(p~QU=X_ef>ls(> zUaKgj^IrMT@icBY^(Qnwe5re#6RzQD|0nO+L}N+S>yz1a|MPbrCB4v6eLIb}9C8P3 zzy0mM(erKC?8;sAX`R;pWy2I2fB*h{G=An*_r8vP|7BV}^wnc&oKW%wX`bU%Y^ppQv)@neXnuh?cM2 z?iw#=9Omx-?A9Y`z4n+Z4@dmnJ@4lZok8nwG^f(IKH|#HcMgA;mM4DNov)pL{FIiT zxg|ospZ%=+p1xD=%K5g*-L$>@26tZDeE%dr>u=1Y>wN9{ zHKeE1N8R`R&ljwq^6(+X~-kA}5rHX9n(Z7!7`HNK?HKC%sbjO`1 z-KL#!+j01_X3hF5-s-&VxM-xJEE1v3$Mhz9ERLUPubpWkoS8H_XYv*0UTf0_l~iCt zV*jxf`zPWwe%9blrf`m?*rw<>wxw~2gzT{@eG%+_C}m>PW5$Nbl!#6CpoTSRRg@1~ z%-HRa{=Tk?a`tb=%F^kdV8VolFE2f3!zTJOWAhY_vrLCa=iP*uK%XBSx9C{t8(_+k zZ1c8ZDr&~=rDf0Wp;O>&yZ*R?^iIa+*`68qaijz*%DqqFKWXn|+)n97YgC z=G0d9)udhUF*c`N*^JF87qcSmpk{1x^%BmG6Vud#KQlL{rx_`yH-2!nsVFGep)@4QEewx)J|P>c4b{(P<`+LZe9 zQ6qYF`YIPbRua80y>Rb4B|}^GUN$}Y%CyuW7e6t5=x%=Sql|^Op2}F*w|9E<%=ESQ zO^;fszrX(e^wf!?6Dr<6cHU$one18L)@QD^yNqZI;hd^S`NV$UH99XpYU-!)(D=?K zS4}8C#O_{K6mwlX(PO!jmRJO1##{+^`KZfBl%BJT3YRE0x&5_9kU@WFF;44#(lJ(-0ihwqxs zL#3}yyJd#T*f)eDGs8>ns^Os-_rj^TNao2#1T0d=4x4#d3anyk#rwy^{y1amV8D)k z6n9d7ZARH^Gh)B1ij^Wzm!4aPCS1*9T_jTW@)xV3Fik=gRJ-`i}+kH*kvp1DB#kO##&9 zn7$)rS6)#SEi2oyxhgt668kAXbkNY7G<4uLKDLQZ9e4*FxYIrG4PoSg?4=kwaE}n% zCZrC0fDZhrd*CK{=s@>cW~abBC#vg zq4SPgt=6+QG?wM>o0m;;N_+iG52SIvGiUdxG61WxGpa z4{EWCw5hGb6wz?Cgwt)hl6}h;T~ZZY6p7vOsiD(l%8+~3^UBVPJy#RkSu?ejsV~xD zIK!9d40~~gPex+<+lOAsk<#X#W#5dleKTT@8nKH$jn^}+?8*l#ns9J%JQtFK}@{GH24 zyYBmO=2bEc^31EiMT+ftl3gY3TMlJjy;HtrJLxL-T~1fI@ac@J47c&!%ed)RIA5{9 z{)Am7?W1O1X7oCk3Xjrd-gGas`#LjG>>(--*@Rcui`o|&>pz3YUn2(J8%SdGxqp% z>@sN&dnmH!9Xl$!@@n=QZ-+y1YNYIk&rXLk6f0$a9Lt>X^2zb$=EW;&T!*!M&*9bV z!^Ioo@%R$_$3CaW zz40ElJcI4Fa6JyITNcN;47Oa~j^%oeU2y}ubYnYCS8KB6Mz*||-AY{#{)s#6A9i=C ze>XIpS6#h{eFOt#j4us;lWQ+Tb39yNbu>HLtVg&gpBOw|wKu*5=-XAYo;_ zR^3!xO?%a{hh(3Ix;bo^d)|!dYMuS%e%zBTW(%g+yxN+(ynodSnOg#S-b6Px*0<=@ z)!H()ojo{xmMmjmiuRdK%xcDmH}%l@1O>)rQ%)b~N>TQ03yv53cel5tP< zOS0cjeSGkK<}Kn9;{4UCgyqZZ*4oZ^6MG-BykCoHJN+PLm)@89{@1YYYwYhozKm_B zgEF6A#QW7++qY_k`(!KF`?G=mPQTB!wM){!XFR@?ZC}q`wc2%Nmo>kceMyb=SefO1 
z=h1e|FY)-Q*7+OptnPDSIho^n-E_S**t53N?4G~QT(fB1s=4g?^(%4FEYDlt$hOzA zN13~PBfh7N_=~;8%h+UA-yUybzbihCsrin6|H1cl$ujo~VS8Xd9O4Rq`+Kjo+S$q_ zsg+vl6z%RZz2xk@aX)n(L#J}zryjOHy=qu?KkMD8Wcl)D+ydLbmNQoHm^awxGoOi+ kSM(Z5MAZ9dw)ae1XD9oRbC|}1L0}da*@n5kDc;QgFW+ZSv$TCqJ+s{#!ixnJp(b?9$M%?{xWuxBq$L?lWFIkLefh{2S9_e=X?cXYZJy z&u{8o!1UOsA7r}a+y7;{_)nkfqtCza#1EOCvRC87bo|sy)0pnD-+@dY|MyI$&mJ?2 z>B6-N(-%7|rp+hJ7N1|+%=F1GwlaNe{S6=f2d;O9{}rY$Tzd!8mp^nj(`%KdnI7Ev zJk#G^G38Ji|4L7a>E5e+rqibW57WC}?D{=@e(Fa~WBR>YPG`Dv=4__hul_mHhV~bk z?t0nFO!xP1`9J#Jh5LM+ssGIf{y@ji{_QzxubYFx~E}*PKhoT?gFCbiz@BzW105UpfHi4_&?Ovwe+JFCMI{*AW3e)@E6!eju zc6^J@?|AecOh4VZ57YmC?*OLP?|M4Z=^uQW>C_J%@HUPA+fJWiy2bOCG2QLYKV*8z z&-Xm>419jUgt<)5Ir|)@M=$vd(_g*sKBljH?GdIe6Mw*Tvi%dL);nuXqVJ#gi9V*g ze69CQIu75vj_Kdtyi3sEJ;bzYhliOSc=o@U-rl;!-)a0mZ{LsUIS=ruUtHFVmirf64UWZ@k3xjES!^-S3ynpEJGb_g~-Q zQ@GxroNXo&op_R0!J7k?H@N`X{FEm0uUhCj?=y31d`G4(8* zSLA;(U4O{dd(iRcJGNzd-pUly^M5BX-Tn1rnLd~4W4h&Ig1*%JSw6nG@%aC2v&Buj zJ6}GD>0_U}j%n-Cv;RxyzwzL94cmgYFTZ~R(KGwsvjtJ@smqw2dwIu1I{w#JUSoPy zr}UZaaJ=ht7Sn0JJ&NfK(zX}S`Gvb&&-BX^9$;QVqt~W7#<&Cd0eP;jrm~Q)U!$0ZsQ`T(7bgw5r%=D;BzsU67N58~$ z?h76NqHzn?e{=GVpdb6#GfW5lzcAhH6N3KpmcQ}wQFp$@^b31ToI>MoJnjQbFHW^E z)xLct(}TbE-7nGSdro+S=|6Yc`gS@#;DVi)PW_tB^w^_4rvEtMG^S^17c-qTdz&xQ z_@7PLnd#^540hiM$N%@e)l5J6zr9Qsesh57{8K;4^jj-WW2*h;Or|$Kcp=lh?|F&o zZ~yUYrtQZ*F-X^Y)91B!nEt?OIF-iz^5aXGzW6{p(<9&Tnf~8D zKgLwP=|4ZAaWh`u`KLsGbld`kbJ$S10 zG=2XM|5`O;XV62g>0;WI>1MkA*L_SczIK4=Y2hcB_U|yr^qF^l#&oY=edqxC{u}!p z#dPx4lkeOL|E*+$i~ibaasw={y^fxg9Wr%N)(C58XxL^+!xodK9(LVnr%#-;%k-lk zxbE~b-?LNW!pjf4{*1G>o2smN=7HFLdcn>|}@{p|J4=k7A&*zewY@r?^U+qmcpEw`<|`H0Jv^mE_6?UFApyKe4zd(QjP zrMI=;+IHKq-}};Kx39R<{r(r`-Tt|+eB>LM-#z!e+dqHDst3;4=6&qhK?|Jc|^WL}Mt5@CI_u7wd`^Y0-z51Ic z{^jNGU%B9pYwrK}+kgGtFCMw$+HZfdVdA!XExhx(2hW_e>j5htz4Q9-oU_xuhhMeu zt{cAh>8a|+o`3YN8y~qS)qMV5i|)SZ`TI-e5b;JQ;W{vYR_yr0sl|lKAP^tpnIK|?PC+BJT_tf7EJc&qVJWu z5+;g;wwPBNroHv#Ns}k>q;Pd!hBPa>A3n_9ZL1)Y>B)%uEuXS{|GjWGgGW?xv2FNb z%{_kfxJT%M9VhpexnRM2Q4GmeCB?UV{Ly8f|rtq7k zu7x+V%j8|MOK2-xJajtyf^zfQia(tyNbyW|>giwApK-ikRiD$<+Zp&`Qs+;Ya{h$< zXVG#gxBzaSnw-ksetTT;gn=M)a;9gXKgg7QO55iSnwoc1Jf|%)J)O}@x@W?adnW9E zFwGAo9~1P{+Xa2HP0+8sDd^6}2>Qnp1^uZj=xx#cj(&zjOhd!TKN9CVqwhByA&&3) z>lXZvkM)b=>yH)G_`0CS+$HFrqIu5zyf}VwfuPqMID!B1%1_1dou>=>#;Fkv1idwy z&($9l<9`}GuOOPoXCSud&x`BD_$kqI`fi6fzGuFmACBhNdH;v`#Gbo|-?#Ormh%s} z<s=hJkN>wsoL{}{{`bpV@!{tSdPS$8Ex#1>51$pZYptM-pCb9y&~V!_V%dg<>!SDD8$H*jBl`Xo z{Bgqr(et$q7svCq6yG~I`reA4i1Uk=3HpK6g5DhQ(E(?O`@OS;nD5Si7N2i9SI{pX zE$E}sy6lYR`B?Nk-yRU3zxaxHpI?jC<>Zzv_=Bq=e%L;mw{^3a-wDxrKR4o|kFFEf zxhnb{&z~WVooKy(FdBDWv|gTy=AlIQ-+OB@fBz-%y_2H%_WWhy{7#WvI3^UIr@txI zNx;_vz*i}52 z1C|MT@U!BJ9g*Dl0;dRk-WO>S#kWiAMCv+ z{p*m&w&oMJM!)B0(Ykypk`u=~DBimpt-G1a#5%cbrMS*((Q`d?7rtG?E$e=`6@7Bt zsoU~IzVfsf_vJ$c{qBbZZ9PfQxhHPH|7iKLIDR9NcNa&$>-k8|U)C%>Z*c^j8GU|f zG>?Bq>;9L~_{W|R<9-+M_ubJto^h}^e{Lja4*rZdzB!VE@3f2K^P}}S80oQ-BmRBn zF!8w^@#QPgI@u!{x7UZo=Z{D7z~~UqWm4}p{K1>i?|bYeG4AE#1+@?V5)WK>f;e6i ztzTFu|GO3bD>nBhWmU}7N!i7Lufn{qrHl8$>sl^W&N_VZfh+FxpSt_z37?;J+_z5K zX5Vo8E0pl@>lWP@?r^0s@%hCkPn>q)Rd)aHpSibri)HUWWUs&f<#)fH_M?3cJAKO? 
z8ap0+@&hgB?bhV}^wFpGU3l>am)vmaIrH{A;`37Q<9p6)pSJAU!_L}8Kfhx}`xjep zZu{E#N8SFuuO9c{`TMP%(xiW_^M!{lJL2cFj=TR8|K4KC;%6lHd!JcipFQ)6yMy%Q zdf(r-KUPcs#5(f!@Q-^MyF7bOSMztt={x0tvu;my?{#nH_Q$Sn{`}GxX0JQ&XZQ5I z_1@#oJgN7U*RL>7JMNM#-rV|`iLd_Vnr%Muk4dlnza#(SJ<@fD9Wrs7 zDH_X)As27Q;H1EVG21ebar)Ce-NGQgb=s6$r|rM~1F!_AmUuT-iF54jz4+0|FMf1r zzho!8LrJwQ*A2taH9W`k3_52RreZmgWcrHcShh^(?7;L?-3(ks^Fv(^>71^aN}yU^ zU}=hG1{$C9G|7-G*S0Lf&}^H|X|gH1mZ>U|ud0@*&^g7jU0-!=#WfwtQB6AMS)r!L z_&|lLV|o&uGkwRgP1|&B%~LHupmR>(OS+oDG*YZ>kSF}{$!VLHxuA}%` zXxV|LIg-Wmb6sCi9anN>*;7=@qH&UD1hyThQfSD!Z}a?&KzAM6QH=m+ZPlZ3UKr@Q z?mLPsE1u=>^<*RPj6hawUAHw^;<*H7AZv2y2byiknYP2AA+1K*HkRdPaprkJUt zyPg{aI)e4txmt;%Q4NGwpMYH+27;v0C zOL84Y^KD<_cNv7PiC^W|k`}0@&GRz@UkMZ+Kf%N_Ren~w=Y)=7nyP1OLG(L(T~R#K zmsHaa4ObG+-ZMhW2xZUELQT_jnv1U3nyVVBs(FSHxcp2l)xu-(WvmR(lMH@VR;X%f zC}YJLwyVoLmr%2v06)zLeZwB+W&_dxeh-N3grD+nb|@okf@XKS`= zsJf-AW}rKscyGF|yPBlBo{eAW@bxU!3$e&;S@A>LkobCz+7L~79 zx+9yOC7ylg`L3bsiYJ?arSe>iAg~O@GBh2FKvQ^}ugR{W(p!^EUDx>eg_wyWtDdQ0 zg*!H%(-c*9a3Kv#$;Jxi_pAG6pqRcK%Dx%|{9GhY$DI{$gRfeS!SnMqO}5>@bX-pX zKk@tZ@Q(0?G)Grd%@S+PQq3@MRs0xS)^T}_JF;Uqjwxf(;BJrCkm1OhAxo}~MeIuk z{|>xnTUA0Zy98D;#2T^<*Oz_6bag4Tb^fkt`I2S1TIf3Xy2%Sq&T$62f&|3F3gCOK9k_ z3C_R+^?gWBdVb(NJWD^Yd^~Q)=XWG&3UPv_`e0~VVfp7Mo@5%3I=-&jro`U`V_Bi& zLM}Pr9-Zfml_uGa=DCV%$c8NBh66?jY)Bhhhn(f-5(K7=_W*f-E9gR=8aCd!rMbSP zghmkZ`^7KwASS>wU>rFVbH-1QAzVyH4L!%^d+0uvU#Nr*M3ZId{61`~MMVvC%f^FO zLm_t^N3~=L?B&`J=KR}al_u?aJ|vi~iT4{QTA=$7Rf>sUChg!1Ems;*;N-ExGuz3g(r3 zKQ#Fs24sY%8?xogO6W;E7md`at(i~*&=t`-Q1GlmGn6De2`%8?8+y7T>z1K9fg{Nd zKW8upuC79X;ITSFCx?!zIW{DsWb0C(@OMoJJzNMs)o@4)(eH5l(6O*4><~{Q5VGC1 zEP7owq*&nKVbi$4#mlvPJG7zw!RmC*^+FGe&A{7)(iHR4G%J*BtVYOsO%l(=z`D~h zGfP$#=x!lT!32=wrf=x5Fl3geuJ1uZYnp*)VBF5o7c$Y&F*zxeA!A`4*t{n3#zQ7Db};9s#pUs z2%)8XP3TJ)W_bUq;!2pDuSWa^VQ9kwcHGdFgk2097Yjv$)V3`X_AIXh7`LVa6H#%o zkPJSjLaaa|$XLu8sZttehCU32K(cJZ0B`YgwmcUy9Sa=uFg#%sD$pjfuakxfv9S1^ zD?XmFid6x#(^T|mt-V+yD5ep*t)Q36w6a2 zBQy=$b_^^Bnjh>}IS3t6|1frheIMdY<2B(5t_sD^@=Ap)R>8YaAdq%4^Cb*0Ck((! 
zU>naAHmhaof#rFmSZu5u{w^LQ%n$Gp)*JRSc%6b*J**Q`f(K&z!Ulm3(o{TjmDt@@ z_?fzpAz*9RATZxy(y8mW4ZVaxZ3vd2I11sJrqR^6&PrN20p?XqjSFM z!N3fCUBjY*NTqWS0x+HfnG78XlcaO5>A~W0e0Vv51i``A(+wT(%zjT~rB0Q(Z62wIJ<=Xv-zl%ZqbZyCaOfqMcwR)z6| zWeJ0p#%aDJk^O*E&=$VneQ=NoiwPRo4WRmHoCW0rjbMhdf|Ve6S_3oUXWHOE&`>;k z0~RBc8^jEZYq2&|7-gXYugQfraK$|&H}njc@w$Z7g!M)DFauH?Dw*JTfWn@HPXO74 z?^-Z%O^fHOTcjsLhTQGHXbd6e)j#USUz~f z@WtVl&|DnIcv!3&Tw=?H=|$)0@xp^3n-U^h`0;p^8hJP1T@%&?&rgEF3OuYU9gKrz zL*F$>0J@sydN5?IfO!im$bmhAw?n3=!RHhM4xIun_hc}QkZa`P84A2&IHoGhFMd`U zxLLvPfWfUnn2Y(rB!s7+$Oa5vAv?)PlRO*70If-r?_t0i$17AJGO!i|&v=>-_Xvw0 z^F@e6*jKU*(<;Pj)**6)&lGAto*iVV1+A)tyw(sP0H!{@I1T%T97i>jmN3jymk>Zf*yrID`FmsaeMYDjnrG-nwur=@A33^OovguUj%D83Au}y2%>734@^(kCPI$Gvkpmy!Q8Qpke>?_ zScs1vWj61IJ6D0?#Udw1d$?=MZW`;0sm<8Lx6zA`!L#heHp*Ii>G{$q{0NQHdxco;A`G!iBYugPUZ~o5Qs6LNld!klAk-D8Ra1gr$=3_PZ-(bV zL_)0z8(4)`im(cv0E}u_HRbnLb}CdWQ+Fy<=$Lbj_gltwN?kT?Wq4IpSiz zzqRysrZ3<089qPr%JX=?^1r`b&l||M|K|Q3>4AKDx662=^YPz*B}K=^zSDOndi1V) zy`SiZRzCh7qBAG$xEs+QE&UVkZ=d<(_I#c5yzlVm?I(VYH{F&VylfX5ckdIoh~sPb z;_E;6)Tg$k^J|}2&f|AD;UM03{z=;{e4V`)PUZ2bNf&KL@#S@(D~+pSzXM-?!b3 z{QExLzc2r;2fLr(NtfAq5&_IT}azV0r+x`vqALHvh`lZYH_~wJf#`?@F=f)n1z%75#npWO=l;Eb z$Di>9kvOsSb}#Vxu6mKpNvzY#C!p(gCsCP_py#w`UQ{MZ~CQ7Z`*ST(`i$G&opS-ov(kU{t_SG za*@bK>0fpqAOG)pk2AgVgtK|xpWXd+zV3{P3e&@mdyzli>db4IzI^;PeE!F`-_FOs zo+c7O9(-;cpC7zv9@E`+*o*0Pf4qmsU3tYFOusen2Tb4E;bErlIb1x)*I#{zkN5sX z$n=6qPxJe{>UJR?e{=0O`F!(-?q$00;fZ|x_1iqj$6cB4GX43n;`hDt{x9=!<1SY) z{pt_iX1djn#dEs-`*-v4cW!x*=_TiAe7&b;uIKwcx8_Qw?=!ZZLIckD$`|?g0`qW| zA6tH3;^PDQ{>;zq^*vr>e!Jzeefj*H8(-qcNlGta)n^ZDI3y7~Llo|wq%tsGY z>(Gzzv3kH8e4k}czQf~xxzAVm^G{rLA|DU_s-KVNUM%F_v}->tuKUWZOfS6XdZx|o z86Nk}t{3oeZ^v3b{`My(@%4Z4=wp2Tt3N!J=@YMu`yTS@T0TDLYGKDLyz6Bi|NK=) zGyUp8%fx)Tci?e12I6~G`1c+= zOYrLn+xGJLj&)D)^?#ia_Ji?-Q~180XcE7(yj?vhyHux6%DzloX|o+&`_h!{dpn)( z{G_F~51jJ$P>D+Mi?K?+EEv=0e|BH`e<*F~5-gIBH+G=(rEzwnf&gBVtTZWlhosMz zrP+!-OH$#_BlSi;wsB_Tvdn6yJADc#Pc3SpZxobLU`z*<9Po#Q=}`EvT~s18&gRmE znT_+PtfX;v_dsXo%*G?B+Mq2!Jwac8D&1(&i7`5bxyi-RbSBE z*OOVbPM%eC1~C65H|i@;bAl0cX3aqN+5nS`eiPw%3&t(aJW;Sx61V*>dTRgA1Bh?i+ z-P2b#wy#O|_6C_$`x+6swj{N(_-C$coK<*s zWn*f^!g+1;FdR=Z-P?s}E_J#BoIfNrt$iUBmFw>rZ=q zv>b>DR{2rtUgK;81`$=$@WMn=U(QFeWzqBQ!`B1){j0p5fo{A>8RtBwyF2h#t#dlD z%o=CYu;`R~a({Ft6F7Mz__!~~q@&M+u5^EY{zbBhL+UFfVvBXTl&T;>?9T&s`T>C_BJbZAq?_u@SgwtIGsjiS1Zq-)U8FWkGq& zX0EIQu0;CGyMrr{`Uu7>r7LA5t#8^~d1vb zFY}dU^PE~JOQh0`JX?-Q*atb$rmC0XO9lDrn>Js@M`Mw*3Gt&>p)CuGL^m^LDaIV- zXe=_z-W_adpkh}stdeZ0pk!f_b}cWCRhI?|S&;EOe05n^roEXfOL1jR$R}S}&Y4>a zWr@nHk>|=Wi1!$%-9jm5DW*hL>bpMPW1>6}h3{1uv4ubho4K+MxKfvtcLi6XuoZQf z(1^twmni9x;Bs!lQP}v6OVs9~%1N(+D+{4YHgjbuuB_l%)+P>XBWt7_VztJ6;^B^U{LKivkOTk8`6Am zA5Q8iIyrh5Qev-x*5YL0(W_x<5+V zS+H=K;IEa9{XM;DPo(lRFI&1?Hq54uxvfp}=C&=EHlt)(53ngvQAdBONb6aX&Fbmt z4n`!aM^$68dH^YaqEQXQO7I}UJ9W56X`^&gg3w5m+oKZJ)ZkzeW<(t;Kx&5XQli){ z52b2TLpePor7#ifkI|R_JRt1Fk~@_Yl}zYQLgN-B z!cs~?UM4Jtqac+fj0CHWL=3iZWr5AP2CLgj@Vq;dW zA|tjCvvQm>DU!}3tCeinjMLr6uQYSeBBb*m556Z;nK_Zz-UpMHGq2Yvu{N#K*BA6J z^7E|Pti9{?bRj1kDcp;DL;}7T)ru_h;N(<0Iuoo(bvSE-rh#50%_C_&n>oL)QpO&p zGf&UroT=z{tZM z=I~W;A^Pn2=59(}pWoSYVhV90FX-z_Ek~(|<95b)sO5Rufld%y*|K=i(s^0V&Ex78 zwJ9~1^!yUI@6%H3sIF|}A=HioUr!w}jc3)8UYGWR%tkOT9Nvh#3+(5v88cB7lXFu# zEFoZqMzKudsurpyO7{%(&A{3jW*rNlZ{ifV48S{6f;udAc?nY}p_L6-jv^Hf;J^YM z&ZFYAIOQTH(BcICCz%Nu9nfFI=;y{VIwIh}E3~VKLKQ+vk8egF%_x-8w4|f0sbyYS zw%w%MUSLuJuU#sgV?_F@{@h-2)O8sXOQv)Qb#id}zhuQ>~wo79dk^EV=KpE}%2B?N?QFlzezn5b}hCw0SSYFo?DX6gpd z6~b0ZE$Q(GI)jq_Yx|r9sVK)G&x0gL1(A>##htR>F5yGs zwTx#QK{Sk1Y&t%8g_JE|Ntaa9GFm3op_&FVD4L-pw4(u1N?Q68@pd%8`%S3-;Y)+U 
zwnnv+2E)+On8cVSShNLQJ^evyVc>Kk{=1~7JB^q$6`HosS(X!;&i_RSGF1ddQRNyW z{mOmW42qG87|Vd%0Yg=UMnNez(7m;iiA>Z@168HTTv*zpDcBWKDak)?TZjh5MUq+wK1HG{?=c`d|h$A6oGHlCBtAa0DPF}8P7 z(_7mY)lMNxY9)o-+&R$itx3^*+E4BS%1>7D=7e`i;nILMv;<3IpSgnG6}U#jPmmh< zoPH%0zZ08Zz;j$i|Ns0bE2>+B);^lO&n-j&p`&Pvwj%N~MHR4JSCM8duRU5o-Lpcs4~V zMf8EHf=>&ZWRGt?UD(<_Z_b$Vsi`z!5|v7XHY($J45|`zbQ-rQFS^(PruL`Aesu<8 zh$ISIPn?|rGXYVp|0Lj1^qZ8cD5NdySwFs6w5FDJZf-N}B1I+aMY~8FXG}MbFJl@> zk!WmbQn5i1_BYx(CJli>X%^^doXHDqXOFWA7{^(ioK@2~r*cLWscEz(mgS^%Nk9c6 z2@sQ?Mb5T`KHRomg_hPmfUv_;SJA2w&HKXfEdmybvZS#yrddh9n{>1Uq+!V@8KlD%z={ z3$do@|V78AA}wWJkkcx8=alLD{!Q+DImggY4x zC@3cC%TKt<$*jV976()k>KlBz>(ssN*uDku_hE&59l*Ipmbwdf@- zTXq#$mAYc1$v5DA&{f$8Tx)!DZQ*|Js0WW>?^4B31>BmA(yb#Nl#;2wzM^iG;f4VM zm9imS3?m`oyox8dhxpwJnO=F1zyiFBw2BO!hRme>hd^`87*Eg zw_Pj9n;LB%H5WJ0EH;(OTiFErRn(#iiuZNZh9g?cr6zp4gqH58YEcx^s>vazY$Sul zn!3!Uuz+-}n$$+kb^==pjnFqd)AYu-8U=sIW6Y&CF{!-CFllTU3HBY@8xq7zF_Y?A z;vO8M%A_PbNXIAArLv8|gM+`Wob1R2dYM~B|_yGI)*!ltDmzbk2EoLVO( zVo_`|6FPPkNIkY@yN0SG@zo4;$5Y0)C9+~}>(ab@%~57i0QF!Bn@-$KrgZ7bF1-z~ zDZnz9;!y=akqLQ}_NMcBw6G2tBjWa)NtGwqbGDS>fJg%+8!Y-*tQG~tA8jXA(HGv( zecjb0)%EPaSDbO~(dT4-jLNoi9+D+@T`-5TILmBFNBFcI4!}0wZHTmRj!_IqNLIx< zmq0xd@-q#~=VzR$Q6L?eXuL_D$oZ%AG;BuyB$l5of(k*4hT&Vqmg)Nv7hrnVO z=g$Aw^Rp2(;VcOUOE3cY{G02q8O?yE&&Mj%g zyj6IbqlYUXRI^OMcC~3!@p0q%5?~)F9Vp3{NQJ6LbqkLYGfv2t2tkp${a2j2MGpvx z8bJ)7Eo~2BJYSJQ-8C%79j}Sholajrg5bH@cEkbo3ag#UVdh2fnaQ96)G>q#$b%CQ zDtUBq15n;$Ynp-j;bOInn&i4v%ZBJ;nn*2!sYxwwQ(>!3o-OIKEFCE;mTpwB)#itR z?K`&RIIe3d#yIz$dlyzNm#BKLYeh$EYTttT#%v;Xo70>2(fr{gG;8Tzo44JyVJj^6 zqjG+ArpFCZbIMxW=q3sWv_e8Rw!^XmN+)!~16|pe$MqGlG)Gq$n1)Q1fv~ZQg@HjE z+{wb-qiLf9MnEFmgk-{icCQ8pw}=%6mTE%uRf9{fxrS%Bs^*6>x>tnG_>OjusxdGW zG%LX9+qlx_ipD{f*GsorEH4a(1E)KpA^>o?G2wK8iJ|=2F5Mv&ts0UFfsxtsBD*9( zKwYOSgrs|3D$E*4fsa=32!u&sqpnglt4K5t&}ZG(6h#S;Vg~@h8VZ4+YeToS!uiJXYX~B;_x3lrvc#C|jiC5v~gs*iBbL)f4KKa;|#%HAIR2scEJY3Pzsa_0(pUP(3Q zXmYj&?>~8n8-6zw=aX@K4M+k=fU1y#LR;2lPf;9{fk5Ppb6+~wd>$LV)`yJhavrnj zl-dz=_ff-|q0MOd-NDm3up|x3EDgLXQ}S#5xx^+4qZ8MQU?+;g<3En&Y1stU2=EAo zA0NkF6?!gep(*<=ipXkd4Ws+#26H%T0Ou!MkC{0fC5ewcPbbYTLHi3o)!seezU#Oe z?aZO4;h>eY!BKfa&TE3a&e7D=@I_JhoG{QurPOL=v|M&^flooXHdPdXDP|yh5-?5- zH-rIhgRR<8_Rtyljk80ivoDz0I4_gw$u!RHhK88gc;rBkIk^pv77`j7XMeo0C$rk= zPM_lRLu;%`<4cXG8iix7_?C}9y6oeMu+)Hx%PR-OgLOTta)aXi=t!0IdQ-(k48Rnr(j-Pj1_z#p8G@$w zH)A9)tkJDakE_qrC0)i!FWu6KAnr^;KuE%ymW6v2WPq76g^Kf%z<3y+$)fA!t3Di4pJ@D zun}?E8Mz_jK6s!#nN{m#2Ib3*geybZ6qgA264_1RQ&R`cIzZKBW7Yu&F-H#FBO8%o5J{_&wh%w8?o(r-;hR?4Ez+lzIZ1T=LDjemFGI0ok#=e?A*Z51)IUO*TaIX%cB9OsGjDN9Ipq;U++p>7k(s_Bdb2hiDMQutA@%6M+q^1klIuE0> z1n;FjfO|Cebo*%vI8ZCs)`70xl$^a|@~>13h|^sbrdO}o(8My?Z5DN-*2$i;Di{M zFHSee&8##nZ6cMBrLYE*14#jdhwP(Dy>BB~ie!AVDx~8OTYZkKdPu6YRmUE6Sy~iX z98<}fYaflxW?#}x{3TAh8;Q1SgQkJr4SNGhS+=L{$PrhUOOylLeHpgT30aIp5HhwN z2Kl6jQ|dA}v^WVCdx3SC5f2!d#HcKa!a$BMRESv!#zw}1h7R;q7)cs>@j3`rXkOsi zelY5cJ=R82E-Tlx24=`E+F;4#YDHKh!)lw!K9sD$c`6^Dtd^Cd@FY) zIKK$46XmCvr4#U|iZhuOr(vU2U~ZcbJV_%P25qtr4n7yl;0n-R>}o1rblo=t#q{M+ z_SGO5uN7}~v!kChOJ`|MrhiS2=FTZJ-?vi2PPIHg24&``G6$AAYE??9Z&jPeV^pfrh}sUj;~sdF<$rfBla#rbsXAt7*3>8 zFC|0Rnz&z*EHF71JiLm#kPZSj@n*R(|<(7#m1GxRq6z~ z5oQ9XC}j6WlDP(>6AjRj^+4SWN=XToYJKa^SwN-`ORr=Cf?nF-;8yVh0?i9GQ?8;u z-U9$6;2$+dS5RWFO7Q8%ajGD159CUTMc9<^oc^9nZbCynm=?5|F{d|!tp52Z(5nCv ztww#$Du6_nkMAYn{Gl;X-bQ}3e{=)^z^jlMR9X<|^%)j~K2MGWOF*y?1ZT%a4K&L{ zA%Iy$SP)h9Vc@DrNJR9&an;%h0@?`zHsXd_fLTRnaPzS(kB!`mwxFvAg{?@o?}Tr@ z1e=o(1++ogK4)2uM9822P)1Wx{t$T>?F)e$D6IYB{9(kAva$Bz2IEU6ERA}69Iq&dsm+cS7l_|gf~*l9zK)$gSy8L-@EzGP9LJQAniTkwH_p2i zzRMwNHj>{(kYY#$Cy(8aie5&}pRFrpcI!}|moUSL%=pm?=9>VQp#q0w_Bjx&QOWZ_ 
ztXkF~6=C*QgZ}Ku8t&riLFoFDG0xKzM>xSL>(i26ZdUR#nJA%nWn;m9oe1WY+kUHA z0u8gb2ruLt*Tdg{L;F8OyDF#B$SCrGH(7EE4%^O_Y9oA?Br_4&%O%PLVD@GMZrLOm zF)%I$mpqjQrLYR_k%)&?73t1{gEv&hiMO$Uds}?O%0OH;}XYS&8sk!qOccfATTonTjzy7lOriNCp>l)3B&Z}I5tmZi6ga>)CgD;T z-BuO2Pyl0hec3lmSC>LtAFpP2xuV$`*342jt%`Z4Wdc-VUo&f1u{&9m=8weGk}+ze zVxbFn!YDOx*o*97IEnZ+b7fYS@~}8QOQaBP0NQYceT)UkmK)-^ZoDKSoO%~{m&`N`_Iwa*msO$$R zG5lMSX>Qop2$=mNR&$$%YN-&2!!P?$5GD&9+jIg2A#91jDaPX-R*8(aO)0-MHBDA3 z;kj13fjz7om!T?^kr57Yash53O#0G&R}|o;lKWh5P&X3#l0(cSVty4e3Cv%41Dost z6hlJ<8daF>rsm3~;{n-EwL;Z5YRLSXN(uhFlh!x`ef_BgXsFzZtkM+v`Q{i3xej&i zPa1Y@emk}ApkiLyt18~?;(EId^A)P z8iBOO1=tiFIby(@pq#RzVTLH95S6~H*VcD}; z8-VJ8i;MtG!bY=IToE> zhH6j?;^q1Es#!5ms+<=A_@&5Y;VN796x3RzyCUU1$yQn3m>LB8V$0kV#F{Ooj9nlD z?}W_%Iz-a1x0F6$}cy;O01|z<^U@XqHxmQ-H?f5Tuf0py`9@o3*qH^76r; zGsZgx-liMR!zk3e91_Q`bbm`>RznPL$QaF;ydQU01RkZF9Za7^B4}MWD^rnCFAg zzDo+tu}2LJEE%P3bYzIsp#}y#Clc8bayEL%kOsafRs$=RY%6*-jaEWz|C=H5Tvf;S zHD~jNT0RObnv~WIg?7``zzmMw4%;@}*Pr(K2#vV6)9GI24+w0~#@W~`r2WD&J;>34 z7o8rGYT)AZ6qEgT*0&pW)G%!1G%$9((WwdcLm~}~K4s)MS8CUy%q9g4^iJEfszHmk zB@I9jeqi|kd~y6*f>_qGrCcI>Uf)PB9Y~@F!b?t_cuYg`Y9r`lP${mMsPZ(_(wQ#U zJFZLp3k>+gk#P-sc+kH}g@BkkTe4;$P>9Ctx?V+T7XV_=s0E7N4^xSnzOu0u zJIn_evyd3U#o^OecDK)4Jg>PUm4{oq49wJ;0q!NhEGa@hryU_}e063j5)4uFI>Mff z1|U0asNy^)mY#=$SsvB4d~lm!fGVZLAO#{CPyody_07mB`dHaLBei1TytaAxEi_dD zn46+}nh2_I=&Y9_ts0%BN0t+WFFYGiXnq3-Tv6zs0%+CtXrm%TD96esfQbxG8Jtln zf{uJ?9XdIziRpi$=iid zPG@Rz&Y1AoPAj>MCuu3xfg;Adx7f zSJAFQbFszlxH7P3LKV2#<5V>GkxcM{>5*x`HEJ6G??+G=)JI?SYGr{6Yh*+5O5U~_ z9h4OdLs>XjVZ-c$5i1B3;Ik+QfkU|$@4m}+T_Vh((J_jKlL`VI`-6jn7sc8LHnq;v ztHGc`17*o}G|yFB?9j zBUQ%bm_HD}=OC!-Mcw^CY)uvVAjsc|E($Ls-Myj7#*h=jBidVAO1mM$Z?|z|`w^yf zz^t+mlN(_dM6w{Jr3QoU=ejNYOe4odjxvpM&Uskv(OK~rgo`61YF2#OlWmM1@|Z@} zZ^5QLWJ*Q9MK++%0a{aX^H9-Dz}cXgOftX4z+NH6Ir9r+3Fw;KqCCGP8k*;~aQL*y zZQ;WG=xo+)kzpAXyDjL`ZX|A{MWB}`3plmHW^Rw9N=gokejB?0Ii)Vv_(N zy?XQwus2C@!Nh?Bg#!nbU}u>FseTWQysBxw5d@|Uh&k7B;3MeXctw)hLa~3fzv!vQZr<{7+Oh8{9Y`iBl`C&QG zWO`(%l37L0@hr{l+`} zLnUfRhTVW2UoN+HLEvOk?Y$Z22;~cVRlHEa3TQ#zPD(9 z@2WUQS=I0)s6KG`iX1Mo4G?|@4s+-^b}eIfc`6YqyPopmBMDJ+MP3OTeiKvSax82+ zf?G(;0~T>f9gf&r!mL@^i_4E|!OzVLX<?N-(%pmt7RF)rMYD z36(Y~)oDZg_E-i-h)~DANfmCl;y5bsYz;$qZQE6AnQFTU7}d+ut|-<3xMK(Qp89x4 zF>}yHGyG70LUOz3wjI%i^28V)Ih*O0LIdmaI@_+2%f}!yUdC!RHHDfM6|*bYxE}Sf zj_h-4N1sR^YX$?dHdKg`34IJ$I&?cwtI)?NEkFW`>-m-!>e_fmSRXYTn|$GN;p~kI zgjE8I*a+Sp(#~~;tB!u?i2o>7X&HbGYuL$khYvvjc<{*+$pBS_92*?`MZC(kblcLc zD)1o!MGJJ_w|xb0JCFwBG?}$Tq<~C#Nd4 zNv%|Tz7b+zwD+r#QVG9Wl}lRW3Oa|@BYRr_pH9l&)B`OMdt(Soy^*Sl{BsC{;YZX$phCIGQL{myAv1jTHWlA-Tvv4sP4Q(}+05Q!%ic*bp~iu6 zj_M5n5^~LWDeYaC$P}Y9LSbqv*4q|k)t zFuoalsGO_L%E?8)vk3j6RJA5SJTGouRGY|p^61-$?S`W;G_HgTyf15L$ydsFHvxi} z&^Ipy+zd8FkUVcsoQK6_x=^}JVbNBJOc%6=g1wzk1}IeXJ2?3BxCoqS$tbHwqsQSZ zc~}RU?zurA1!z93jaR_tO)di0nn`yRmy7Hx*G3AAdhiX-t8U3U__$cV15%|P&3BZo zlck-IwUL5CY3Q%Sv$l!u6evR(K5Ki}p8(>c|XsGRu)fiO7wi zSK3>PRpZd&%jdV$hJVLGi&J02%7xe{azuir1$A4@k$3@AOkeiNI?^y>-dRdVf!E+7oprylrMEO?2ZX+_^0O z>cH`$KV>_gnn-Jo2IOb4$!oG`a-&uU2JAFT1Nd6WA`b=K2ooCEG;Fl;P07R#2<>J)UZf-yBe0Dzn3x}hFuzE;EV86oumhICb5(B0RQS+!1{RdfcfJr{m} zj%m#%+9UP?Z>0_B2mL9L%njI;Ak)iSn!}#?zaW<*yL$Yfb7kWpr=|{?bpW?MDxXpo z!*Y4=UNp1-tp%yS8buG;I zZnGz{$=u{s7l4wGica9%BKcC56Qi%9u7YyDKUN{fOsGbHy4bp=RFPjoVL}~1B!No3 z3B6h>$2tZi=_nPlG6PshSp--7@(2jY$buUUo$}C2I|4kS2Ov<8 z5W*=u=fNhd6$Fx@lGLbdcuNu$a2lnnyiH_Z0~Y?w>oRE12?+Aejvf|)7uF=nQ?Vg-E>qgY#YrbTQenJlZ+NQ6+sik)r( z9U_aN&bpeZNG^pvl$6m8RLcww{vnppVMoC!Q>-c&T~Tz^BcKe$w;j2bk+odjs5j_E z=Oj4D!jvCzPET-I!RlTLo*XkS!J`eaE_(;2tu|9@RBne}^ic~wno-VosEWA-3v<<3 zo>d`OTrl=Vz^dwKBM_t76|`irQL$WdXG}&32-=h28c8@DraUxpf%l+cpQ&(rObpCZ 
z>wdHn95|*28l!6B;V2lWmQMaMP z=P8-Ej=o1pV+UF>uqnF*dBFo3l*jocf(JRLvTR?~CL!m}ylW=r$Bhy;LUkiw z1&(TEuN>*!GFq3G%t)6}Wr=1?$uzfyFkF%5M$#*ls^xPp4~*qsvICNY{}da@rLYbX zaWF;_2VWY`!74Vks>*G#B^P$O>I3o3_d~OW(ZBp?bzMi`0Yuz$qMusab0WH^hzLBk ztx_(K3<0Fh!M;>;(Ak$B$VFDn|AAoGp$)P;LsxKe`$FLLmSD9Qxk?L)APVVe+{A~Q%qc&qB>Luq%jMCAsjnK655imssFp*qxYa4w>&7}4* z1sc7n5&Rb<&BhH1RZ=R?KZ%P*&S+ zMB2$lDpZ*QO_dcYJU$f(78qBoP&L>P33s{4EY9b|qVz8eiPNSuiv-#fEy-nrVql3_ z(L|v*(mXFH$BM>m>J_2Sk|B|RpkP)}WNpH*cSAFjB-C-~0@(C~%ag z7@>bfcRt#G9tmV$MTTctksoc1!oz0Cz>zB2+V>^c8XBpjlY|V398yW5PGX4|QbU(P zM2kP4fFa>C30mb6>ztAR#a>ypt7@HST);6<)dR&SPLQ>_0| ziWO{{r1W6~1HFs~R#D5MKRxU`8(dLr^dh4vX%vY}&;qL$3D5-r-l4tPoQ;>Pk(*7- zB6Xj@?nPq|llGJnX;~G)U3j3^Ct+p8cB^6K&`~wVwmlnF6H=hm(CjT@+VKdXzo8e7 zTxO3Z36Op~BM&L$d<%Kx^(|60W6WV)w+(E`kQrKp5Tqurfhdml7RGuLgnBU~5tEvD z!bvkCA%}=b*T*s`_Fr_(swPeZL1(}zMe4QTSb<(cCLM-vS~PUb(kr5(rKl)^z18+L z4iYD`?cXS2vTRWV9B|IL$p1YJwJ*Hsz`!h9XAjVv9^irHY7=nZ4ur`!y|sN&j5`X3rJiI^h>A#k)Y?Wq6&UO-_{kr7Qf&;?dB4~q*QQ_JWiDq2A*b8;0;g(8E4ZSleYRR-&g3oL`av;c@+=fVH z(AVEM`{Rv0nbl5r`V^->-P64)jW0E#&8H3f-?x1H(PhfI@LinW6wGK!be!MWbK=Ok z#X}NcPLcOZc-ABv=EzN{A@jNtj}y=jHH1xdtjP@Y^~ViOHfjZ4WcGsPws}X!0T(meyHh*;LvGOsN)_ZY5`NX=<)#+n*t%_SQFH4L=nTfcR^DV1yIi_EKv_Wh7~QO1%-xw{Yn#fE(KTTyc9 z%I6Wb?PHTPbnHYSb+2TS20GLx4U!czVj&l1kFxO$yRw&tM#~$deCe^YyXc(%lXy{nSxJ1^m(0cRJL8o6i z4Wqi)7J@z_uMtO{En_2FsA~uIM;WeEliH!gy!mNSIKu{Kj%LdBOI{hzl7@ujHMI(WS@d;sEY~%Cbd*E4 z2p5Zg90!=jFdb^?JfuS{LpoGyx;&#yYkiPAI^F`JTJB03h;qxp)wKm@20YD+@2 zgZ28^1nfyq!mffnUEd3&5S}YC`O$r1y!UWv9c}1HQ)%jGZJJj;OHtK;=u|oV3GspK zfP9Z~43HyD#BDUabPlnM+*@d+dQ+6xNkW6O#5_smi-Uv5#q%V9v=DcxB1_R%03>DT zj*jgU8~ZY&tun{jKJ|mXPGr}w>hm%|(7no+>0jNe(A!x?#m(Zm^!J*cO!|~nnJz9; zO&vrmLmSVBVgYXkol*J*zs0t2aKvu{p1zRD&Xsw~vD{}reX+Lp8 z6;0XDX?1R5P4+TIrK~1Cm}3B-n`&>0`L#~_3=YF{Jil^g0x_wDbbGyuxT<2gimU0a zq&ThxRK?NOf<^fXV+x#H!_0)1vY81jYQ1D8lz@eyY>3u$H}X!h>5|!mh#Ux)6piAH zj+XSgv>#;3IKEW9Hxfk#Rp&Thbq_nEfiPCvk>ZGGUfS!UY@A*I_O9{=&}0^mrExa) z`OzJK(y5AN51`Pv6?hVB9KK~yELkfP9O@yQ!!%BBT7Du|~P3 z0$EO~2HX{fhN}mL82|_l!DVZFi=!MQ*VOd+Bb+svw57^?Xob|gZfqrG2D(cLf)aT~ zlyIe00Dp^@0wa+cNmk*Ar8dR7462O{B{UH#B?c7oZR*cu09rGnef8^dv)YSSej^h{z!_mXg>2 zBqT;_H`xS&rIyqSLEj{5l-e7s`_W&?G9XxnpGl?pK&g`Pm*rzS%c>#w4Bj76F$0e#*ic70P>eQyNSscr98eo-dqfBX%glK;Ku^i+8YLb)koJMGOgM$~x z^Bj6KBaoO54K*x_#9jdnRQ}FlFU>UPYRvnn?PTETf8kO1kc7 zrlI+&V>*f&)DYXIh|@NqxOr}#f6eQ8YZ5sh3#OD&IEGpitn}hw6}kaz;#?V443slh z8VJb&a%%YIDH0tiIm=SW&JX2L-!HDONO^8(YQ*5n5!DpMdc`Q^LuGuqsOA*H60u+k zIJKnvyfsc{b&waW%7&{;MX4xkovq*F?j@CQ(i6;}C{#A)ad7cOHx^f&0MlcF;2Kh{ zbXh970f)>FIFHHHy=qWZalQ)-lerMggNKb&5FAKy-0&e$7)+t`n!&*vDjMY^X)N?a z0G^*Zd`aV@7)do0&yhXTFdcJzhryP#%v(675U@xdTY{v{O(uIR9uig~q;i0Hl`;Yh zSgrUVNwA7yS_PbnttjToY+(b2Z0!R^6&L;?up}q$T%w{6A-MFs zcrHaZRp1X*;X0c};NY8J)X*Uq%JI#mLn+rehCxpDO1|;JfI3w}G8=Z&+*Ylry+Sr8 zUVmKsgK{XbW01|M8Gu7qY}&>xN;WenO)D8Y64&OlBhKV;WA-@9XE@HXUX~kkPUVa$ z;zr9NKO$j*H8M04G9!AbSm>KLe3fZwjtZ=?Pzg;lRP4?54=1JjpfpE;8Lg&{BPym= zZ&bCznrzf+^Z;OW83h4a$&DNtxTau`B)XA?jT$qemM<|Zg@OcMXvTNqHKT^co#@;- zd}h>49TE*jJ8(9w&>OGJ7$INf@L=teIHEGZm5sUCS@3d8r*M_>i7{9m!_19ZBhev!OLFVxJKhP!5F>PN9G+j5O z8uI@z;@4xnhg7y#!koF4cah?@b1R`_)OW9hSYFhiixs(a6Cni|hIOcm4J4r_*2RQZ z1zr5!*r+v(V&LBBRm759CDaLl-w!Rc0&{B!5tZ-af!*p5z49DznH{2E1ZDN(7CXL&2R$q@6Y3)j=ZM6l5V>YOHg~m3_$!Z9NP$NQ7}7 zU0yJ^T}5ZkIdGPb9vE! z5hGaxp|(VP3g{e4hrA@7Pst!a)9v9~(I$W$0Tuw>zbQDnqIK@rTD8cVNYFaBp6ps=!^fmKDWNs-fZIjU6buqFw{IK$EOADSC&rL4%?R5~`D{*sC>kbfnjO+tw8~ z1Y$|;G-zo~9W#?P+lZ*=MAQ+Ce9~lMgn<%e{@C?lHJy!6mD(AiY{}*x1k9-Nfz0dH zfv(=vP@}TsUm1#nmHP^Z=~}ZLW0KuwQMb1y)sjiC3*vqQ8qlN!IASJtXdd!G!XKQ! 
z*CxIc`;(WW*-aWeaMO4LrklO;>Y}1%ph~8piHQn>a;h$FznyHv+mN9ZM#VL*>O9mlbV$!oH1%T;L7013FRX9nn79!kgu zA8n_kemQmw9hEstThPZW*GL8b3iEnn7`L*4T*4`-l$3^rGE7J@v;&Q?lXc64F=v#F zGRV}^C?P{L8c_jD1Ffv6n2JD)@@!nHZfeLjMxGb8N-T9WNPk~98pYxa7thz zgR#+v)~W`2n}QT*yXYE@0xManp^03?!gZJcgIw*UR82LC6CNuP%dtK=-g=2_)P@(~$ZJzKkiKQfBF>B}cZbT1C9x z;inkNw(2Md-Pf>b`KG{^qsWg)HksQhS1O;{#40ur1HrCOX?9dllT2o9owX=O=On12 zgv?0!=m^ONm9s>_jNn-`n5d?G43O|cR5N0mK(c*2&&}3v7I<$ot=~DLgl$lMqoBeJ z-mTaM+)T8hy0=VSN9T%?6(IoLwk4SJNdyeG)`}c#v>b(a$_rWZusmx#$MViv;}nxC zFvc;e$QVb#H0oJk-6zVNp*mD({TOH?7Bseu3fn5&Y{z#bC6thP8p@`p)KKeh$^mQ9 zI)~3YOF!hx$LUa_~z-Lq`?hf+&I> z{x*^d++qdQTAJw~(Nx`xyxEPcQfU6(pflBml9B#lvSHl*kgAl2_=90cf6xV|Tw~)h zap#dwR*Q{0qZ1A=ZYh{&Oc zcJAg(Tpy9oZj{w%-iBKq(C&-+zX(%n_vRb7+nbjU=UudDj2qr2<<-s^dv z*I_tg_>2*Xa#bI44whuV@C1#TM|mX~;oN%Eog!>oYWNe%v?YcFn6~!@OENM|BG^r; zSE7n(O1Pyo8tG(GoEc-ctJx6H>KxDI0BIr)lIhI%<_E_tn<8*^&xU|RZxNuaC{f(I zXL2D?T9AhmXm$@AwJ)bZ_fz@lJler{{ z0Al(#g@U(LkEU@I&gi)zBvJuPK0?`}Bq0H3WE9jqqAQ(J9|DP0VMY;Dkt!U6yil2Q z*n+&xTCpHT^g{W(%h)wOG_)+ap|7jEFWIpz$*$4f-x-7LDxqG8N4p?DEhtcL@A}h% zJa{7|Ev)fMB{Jf~#3jK(r-K_oh+@2oQbtOF;W{O8l7)MVPYuonY_MxMpN@d@Siy!%<{+nz=Yc!Ktd7PW!b^x|!>$ z2p(i#i@= z19SkytpS)`Uf~h4@o$Mh`X2fyjMYk#SzJO>3bx0LB5)73T_leS?kPauyBzG8xzfJ{h_4^7s*7o|D++gB&c5kiOKoJ4yhx9PVa z%N-RGCLl7!U-hVkkjc>LwwtN%ZAH)@)&mU?H(xLjXyU@EUjZ^IH>;$N(ZgtnG|Wuz)vwsd>jhr!Tk*&Oxb}g&CPDE~)qVhoelV{tvx{L%LP>5*;dNhEBK?PRb&pwvx zGsFb<5fT8+9ty6&!tWCkzYigqfb4R51Rgut5EKDTgH1~TyQCZ=61Q08!5FTmy11EW z(lAK|Q&QO)RwuK08;y~3%1AR(3(_pp#}JIl>KIh;Dri#tB%X-i>oBN{?&OfKkOD7I zr{sx=`+@}?SY1uyBe=%0nNic6q6&&`>2fMY*LXGveD~H>Z5uk;HiE&bE!n+$hcS-6 z-XLD0*i|ZP?V8oSnfGY~=<0`6;n z3m}-NLv1Vp(;GTiM%ELw@eLuw8bs2lIjY_>M*jxh${3P><26Ih%x$smDPg79uoLyS zLqo|e#!f4f%?&5}*7YZs_6_Iba*bv)$m%^L=K*KQo>-rUukTnsc4V5IVEc{+};P+J;$RpE*RRpgwv9MJ%kFnr`ACh9eUeUx`Ku(1ePTPb)2cUaHCZ$=mXHFWgk=IR7oSMZ2A_4h@rwnO zZ>9J}MMtJbs%Xc81`O3Fk+dsc`%I63)>#}1v;0Jmm`(A#*P9)S+X6x~tbznVIbr+% zU!a0SkXx4(DGGj3l`YLQRWp+Utc;thcB~UY>Kh+n%?l2gdU&c7wXbUuP+Sx*BK$?U zTu)JPs$b;R?zWH}qDBh`*bo2~pe^qu;pA{_J}2YMo;vjD2qy<4n9=Z4K~UCj1wcwQ zEHH}}iegi_)J}G{3w+dwo~)<>SP>S40c0^*$idne9e}~^qDH83MLD{Z;UrT-*H4Ah zW9-L$P#qVQ6F9{^l^S~<5b|QuBCc6I0b1GO5%LWy0ghdd>R6Mi2>haGm!e9audI#L z<%GEaqB4TFnifz(Ks~}U(8g&!Mnsgy1)ZhE1Zze{YEewU#lwL{+S59Bd~kFaY#8f- z_5yU%=7gB8m{YJ9%fTL{emOo~EQja4{!D17;;>$9du2c~iK-&NY>tw86@ge3LFFQ* z4K1Y$bKH$3K=Z1h?5^5&F5}yh9h3=LH#js@iSLEXWlFFnH}1#*$`Ld*+^j9u)Crfd zu4!I4M+m4@<$ePNAh0qhj!6rAicy9Z_rdIVJ^~jM8W=*pD#pFwz@i5!C1ZS+4A88E zWjvKcIPN88H@Gig8nbw`8=+>?iXzmQQEGd$o~R`OYE1K_k&qY)&vPyk+g@O0y|aku z=(bgTEG(s`G(kzH`HUbM+FU1it4%D-X#~`^e#^$bwly)qo@;M>qbLr9`~I37S&+K4 zzA4!i7EJ|UBU#_~2V38$4OCP$0_;LGwTvQKrmUw>9yr%kdhhghBF*QmF}D|2yPx7N zP<2GGXve_fRjllOPA+O9#MCL&jcbV?fG!n@%B->~mC&6)(5b(qhawd$8RSZb319;^ z>CvQ`P^w7TtRh7U+*9W60bFM~y)ufd|+( zYK-7ZJvzOLhz9mp4nldplB`f~*(*W_97s}8%UHePYM?`p#j9v5i@{Rnc+|LOV@Gd) zjXP%{n@g9i)T(Ee!4lznBHK|WnFClN^i3FTNYiSS%z;VDI&y>(OOa=6G&hD2OAY<5 zqp;DareqrEA0!)!g<^#m_26ts=4>gI${Lr}Dm=)dW|4&Q0wOY%4OWmp$Y=!PXRyl1 zYu1toqriHtWU7L>gZeCoRGpB}kU1U|d?0ERNFtd0LFq5@DI*OISZR)f`};SZ*3r9V zGwSTNCEM)+_?qMUWucATMkQ*u1Jb3_9t}e4^#Dml?T8}7@$!>Y2`o_|m8sXU^!SyS z66954itA`r6jcs1-QlqjHvjj7!RizyE*iZ?2?&XD+K^RAPlI$(Q|35pf1ClK!sd2% z^o3G3YuBvru9U2C(mECD<_Pj$p(;lM4R>hxlQBJEA(V6l$SxdhRh3B4ykQ|(2~7_! 
zQS!Ff5=I-3()0=@1&J@#nlR#vQa8XALn>P0Nkwwy>JZY4+E^h(5OEN?MXam(QWXoka69 zl|@rqE}+{y3jW3Fqep4cOzC_kElX&L!|`*KZNe>EFNq!1SU_dIZDU3GtT)pada<&T z3f&^{I%zFWRt3S%umW0B(-+)`AaV$)I|U8)`^3aG!2+uANQLqdEMFvx(wu1VnM@j7 zT5+O+t4OGLrp|PR#i^TPO(1P@Pg$It z*5gNDlbQu3v@&Dmq{5Y+mJF@Q!7>zU}1!f0D&liWHKLU#R|NxYn&DVrC?eH7%4zt z%F7aHsp_Mirv8jp2DPhftp;L)W(n257U(!aBO#(gvkK~VyE3|MFdB~LtQb{ zBLM0KAfK-jiJs+P0w+NXVY;FQjt{9(0VGU^tiuxUY3%O-`9TV6tZh7XPEv-M>N z8&*KbM-e0FOU~pi$*@wAYI10AFvsCTJvJ_3QfT9MELBD{SfRn(0U5C~kSYPY9&oFq z#ypW&Q8Z@+F@2rA|@68DQ>$fL>O?pYNwnRdfRm zDWK5DB?1~JsE?Xug{B0BjwJ8&wqS*-ONt;y>7Oj6;Q%YJO!yRM0E0WXMRXIuIa`V} z>P*0I=`Lmy`ZmZf8Vjlf`X;O`p>!!4K~jxbKrxHH98!v! z%qY$%*a!iGlw|`66-!DTEHNvg!LPA0b?eP4G>WS)bsB8$0%xnB1G=9!73;K79AZFf zXk{uAxOrmY(aOq{$*7Kaj}jr&t0^Y>MR95dt~)N_HcsB(@)j^iLBhun#3!ZD@hf zjt32QeA7ax3JL2c*D{cWg75G-T%hi0HOK}lcCaRiSOcqvGjWQZwoEN0WaN~g)Jfxv zI)P45pNuVoBWYsaZtplS-~=2+Tdw`c11I6SlqH^;V7@B}ni6W{T1HOg$z<$w@rVW+ z(@3@oxFJ6!%;K)$lyEIYP7|u2AJgpEaR3uHQ^4RviJ*Q(H6>>ZSx%>{R5}%BSjq#V zpGDli1)XXbYKd~Pl?5;w&K^ysW`kG;()$7$#FYct6DGF)q%v&k6^`vy!TJwIfB@d^ z2>LITF%?rt%dpZV^m&WVv7OE0*pnRu32q-L#9A?NE3XN;1Pd+Vr_{{N>$3tuKFh0L zc7+g3K^D=!r+UFut&D0JqHdz+v6@z5%&=A=m~~>978Ihn_Gw3FW%U>Y?3FpENzIh` zX-%4+v&lXMv!tLZ6r1rtkr+or{b@o#E<{}p3vN(kG;W=kxHVYxfQAP^Ap*;$f{JZa zxuE1Mol;aSM$wxM>|$#HrsYd;y*19sI4xxO@{ryGO36%mk<-;d2F2Gw2nWyq zT2PEaDuE2v1bh!f+di#Ux&%N z@C4s4NxTpnLP_R@O$$oBut{N(FFdY*hG`*XR=2t5Lx3GLusd-PqCc9+sX8wiN=CE{ zQ=Q|CDE%G1-R*_-GI8kDwVjiV8B05*@*o66fp-OItv9_Ih#Z8sDIpQyXa0&vo`wtr z)mE=cAabD$Y$!mIV$0~g74@pfFIQCX>Z5)(LWy@;MNN{Du`;}BiWzl|+rORTxg6S- zK}oygo<)69S?<*BFwwhdT`%e-%I824;H|7{sre87yj&^y zWj-yJM#w~lot4jy=S-_*Wy|WEkugS$6+E|GT+X$Dzg^Q>CQfeYv+|?ak-XJEcpjz& zr&Un%0WAv%gpb>b*e>NuLSP$cTCFN*Bx0GtdJ;-%1skIDNU+qx)WC8OL25bh6$`3q zfqq+8&AARys-@JHq8Y}ANA2LP({ZIpSa~O~i}!5c5Fia(jQoz#A!B4L4mmDqaP06O z#U;mRjD{X1zUfi)3av{y)>NIt4Tq9r)cH~Lsj`Jk+Lsvff$@(Z#~MoWOmt69nL5ig z&v~!5xZ^||CUSuWE;W$>2PTKMf7TS&gdki&K?z{2eySF6*svm6Qxh3Bip0S<2qmK^ zYa=*tu&&iO5e-eFRI4IfWy%%@0ox5J$EXWP%pHgM8Zi_|=sQ$T5&>JK1ePN@c=8Rzi-IMaxJwjI#>zei|D`^0nc`E#mXDz+2XV^-2CdL79bB4+<@XGqk?oI zHCHQdTsnL}daax>HlDMROIIzc%;Ae;@|ZY`K=i;)VG_)=^sJWr;24w`brSCDnoRUe zg_Z!8;gajJ>G2`UHzw6#3=JjMp@=Ri6}@lkMmp^(%Sy9Sn~k~gkt*tRq|Wj(FX3Uc zgU;1vfJ>rTW$Ne3pgvI;C{p|m>4BFU?+ zO!&>#2h>OKVuaM5O#w{<6n*% z3VVeTtixf=?V85A(9v9#aL1vflChRgOnfrfM+VIVnuSD&M|1Fz6|g0ym5dJQoa22Q zV>8KV}V=@qKARpJiHLo5+iY25UMiXEFgRI=fPMN*RSK7tr}JZ)L=j2 z#i&jkH1jPh&u<|Z$v}D>aRvN@Ei1v6h=L%0uM`c>!bYul0!WkuoHZg)W$|S<;LK(P zMVu+`93tW@v$h~1MOT7virCfe%A(6J$8W%^&aNAPP_~;NmzO`g_py)s@kNzC)yp5R!V0Y{e<~<)Rn51~N0U(@6E5FV zQ)3q`+u$qW&O)h83Ew_s?}cF50|t_4jH{PD&{80VI@aRy13hK_~ zvbmO(Bj5o!x#e_HK7AH+P#!w%{FZEPyD>6&9$OLz2BEiFV6W&1GNHf^e=?uO8Ph53 zAGU@E^43T`n**J~@}ezF$&K3jZ$@>f147Q{i80CTzUah}3uDNIpM*7FEc`S@5#-@) z+8Uw|^81r3mY<03imR8OxFXrUzAJfFOXaI2&l<;XVApGouqZB7!ZwymlJcHQw(ulBSU8HiiiUKYwfK16Khw(Fu(e{_ z(P9=+_!1zN$j2h{O4939#ynID@}J>k5}lkq5{ct+;^L?jO)HW{^2SSrk(CUW#HFBp zJ(8$Uvq%w3K`2So;28l5c~`I`>gY*?;>qeIF{P;n&{UbUlCsjW5oakv;gejSdbzKF z8LA^^oMSOkr~~qj15W8`Qmmwo3tXb1H9bBlF7kCxPLgsxAb&ToOcyR0)vEiJ>FO%1 z35Zr0b&55jM3rJsk<_}|1vYZ2%wmeWt6-K#Vo~0QZv3l#7NMRs672{g4?{soOp#%NYG`K>6@e3#H0=1Vd5qDnqlq;}lMS7Y{ zgc3b1tF-o%=xNOJNImWx)p;S+G(y27^uaM~AT4y1H%H)El~R(f7?viYjfkntaW!4w zrxv=(E0pD$E=Mq&E#{89kYy&WouUANbyTwN^<2$OK5+0nD~Wk!U1d~|N0Wd5p!GBv z<0!2m5vVBVc%IP?ON4~_h`A0%4kTj3in;73=ZaWrkb|J7)FLksYS%rsXO)UN>m ztGt9N$gm2YU>-2Zca^Jim<*Naz^DPA6~#Fg(7n(=6~C2Mm5em!oMUUqGRKm`*_{L} zF#w$~wp&XP*6M{@d~%YPmz}sAM1sjoE<229*NBS|F?M5)uc$6d8hI#Z9Dj0hcyMHD zTK_1D6Uf*un68#eMLQp`RQ5u+YrWTnP5(NiKZ4C4hncC~R 
z7F0;(S$fd@P{;j-G&2Po(CW9OP=TU)Z<%IzF9tt5*Gf2CZF^FISOsnJtI`0B_@nrPZ@)bK2;?`pt4BVAq{(vNh3b(>R)VqRGF9D4)y^Ukb>GgKf z`-E1hKAK+a6ogjl(c|jjvR(vHBCXEW7JN8-ct=e`CllnQU>8{2+dc$m zXRG~q6gt{%Q~gv$eJ&s>4n#DW^bHLL|9clkK|)^LQ~6jl1bC0tzFe17Z#kg&L^%!8DIvT{^RBz1#&d0vq8( zlcv>*T@%etb#RV^ltT%rbY#5O1S)1V`84u<#jP1!T;#dLY2Y&R&%GbEXz4bsY{ssh2Jya5=Tt_0s8P zJzlyB_&AWbA=eUgN@txbE(}!a9M81Z;8w!B@)Tq%*60MjDUwG_DUwB>fSF&k%ts7)qW=la zHzH~b(wYyJLYUQnu}4wqmTXyyXsXns&5W}XI6)na!2-N_`BtPeZEqKJ9$nwn?e^q8 zu;yxDO1BFveQ<1%6=ZSoWKAf3fZRYmG5Sc5KJr-7R0Lg*mNbB-uvkNCKlEavRQN}O z#SZx~jaMU7K?6db;xgz<0}2DQzR>W*$Alf7ePUODU#oZFBU7{_apZu{YSxTfM|)9` z*ZnP20g2u^6i2EzLFIKwag@Q332_XxMbY{Jmb%785JzxXi)f$*77;Adpu>(&96iNV zbx}~cZg6CL%tE?n3bI`k2y-yo)$#AScrGH)3#QmxO^t>7M{~xW^7$odmW#jYNd^wM z$XS54s~G+j5uwC#UZ&6j=*y5|2-C8}8U-$-$QZ?pAY-(^K6RZ!?DG1A926Z+A1bc| zd~^;KZb^RsfXpM1D{h@vUz6OTmWVC*awYMK@# z))a&)fqnXkHDIZ-5LUU8fq?AIe+P;+&k3?DDG^|w#1@j0GNNH9oFzy^G!T=mTU2K> zTdFgrg}=aCq2W2LlR5?OfD-@?eT+H9ZBSC!p-UAI;iyimUR@Vi({?$>$w_jVmjW<$ zK!qL1{sbbv1>_i!+K?eVUf)(lP81}zgtQ7F_|)miBl%FFiYw@oAWt^2NcPFva3y_e z8i7#Vy{=lPYk2g!tbmY#mbg%g9fcUA*vryqGB7AF0x*KErCb#G&q$JqR2CYdYv$)gc{%XjWRJ6M#O+l2@-}?o?DJ;2{S5!;I)ZKO^58JLVx*}Szt6-K#JTWbl zV8Z9UwW3-kq14g+MiZh~+Z+e@mWf`8vWgn$IQushKbpiU1=wjEs(tC3^=M0X%`Y3jAfyjYknmIff8131$3Ysz5nKp`ZeTfYO!TpfOPc zhSRDvtvFeOJw=wZyImlG@yqEI!E&mLI;TXTxm7e(wo;OmGO7QYr5;qDezcl}a?0n& zz*w0cH;Mm|Wo-ySIq5J@i>NB)MKy}qYg@Q8pg(~NKu(gfmmaZ{DOWkG#j3=pTHNka9#gS-?rc_@M-QBq=V|n!Jpx ziYKycPc3pqd2)qIu*5R(gFd(;+UGHL0a1e295xz7r5P}LCn{nXVfLapNJAk40p| zRO6-YwQJUQ7tu<)ZNR%j^g`?!%$qxm+;*#|^OAcyKbC^GA%wkgM{Ybnc7n|mHeI*k z>Hl;fIVwQw>?GD?v4XAr3)(tOb!GJVm}JefcI)iRJWv`j6fHnHD=$X^HA`0f+f_(7 zKCGEwH=8M5iO^3D(XJvBo=q!Sg?ttvihQ{XlWXf^nr8umYs<`-P%bx-Thjt`O^`j0 z{!UQ)Sfr~bE5h=F_U4w(nHB&bv4W^)!2&c>^qz{^wy~ok`VnQVwxmYMH`kuuQxIVj zs49{CcujfK0@tI5&7<)+^2H(JVIqBqiHS=CWfM$+B%cv*AZf|q(*Vt&*2s_qMXw8A z8#ghdHi8qGUTYyFy)HYJ1<~g;VIoDrX}IWPx*rH8GX8iMDslsqCUw--IDhUSYP&{} ztT1ULujyW@F5w6iDN9)Uu3~f}(_zucybi{ru!s=QW|KTxA>qgk`bC@pZ03k1!_8#adX18HN7U{rSA6>>V& zm3Vc}WLM&`?C79rXJI=xtlx~JNE>xFM43SumvWYwp}YiA8+eQUW+-6@!|6Q;QthNs z1zLjY>%vJRJ|Lv=r-3mlNdz1;lG;N`n+J`pm{O?=1UJV!KoF9NR2}P>HvK`W&ay$O zPN6M130w|}JN2M!q2M;hhev~s=r)k2a>G@2s~MkYPqHwmRe9DH^!h0|A&V0^q=0my zj_lZuq}#+iA9RNsvtwZLZ@1aYl0AFUxh&bNWmYPJKnDV}>n!*Mtbc$V4_*&RMt>WE zWO>aCU@fVRA&ez2QkJb`O%wD~odP3~f1}v7mC*)d;aKQ%Y-EN)dG6O{9z#V|(*rV|aF9PI|KFWti;BjSnZO zD#A4K;O)W-qSucBb|P|WSw|qU7hWf@6C`-?w3RVNLiawer7?nQg)&3nc~|a1#gn8G zAsf0|RUf5BpY*gMXL!j1`6)bu_&o1oYZ}*KKrqkV84ZEWRFqm3+E#b;^KE6l0};rB z0F-t4&X2VZQnNO3g61oau$03F#cC;!=%0i5LP(*MRIsH6k!ah3se*t!3T;M4N~3C6 z5=||QD0e2#Cam>LzQ;ItSV($R4$YZuUd!p}1sQx+hf603YnRnhMCw;nmM&*1wpJ$i zR*p!YNA&IFqz``OfZ6ha5erHZD>95(+`6BcjNn}O9+0vodktj?uo>8~MQLJtVzH_0 zlPE@%+G{NPc9;Sf3@}wJPN5XAjJgTP`W->$G7v3@iRfYkat_JjGomF~XgY7994*Ga zHnZ7Ww%Ixgq&D{I3B=CDvL{mPGKy|+;0dT+L`9iTrz}xR zA^i+W$QVn6J#nQvj5c-k40LW5qye2D=tuL0WM_NFS|F~v*KSPq_jY%8RZc|@nb`r< z6jQcr7`>P$Gtyp+Rbc3-Y|A$A>G+_K8Lj(tNE~2{csBcyYC<4<(9E~2Jildh$QT(& zkLR#2JJPaJLE$eVoBn!HQIJ0krxL`kFACP98x5(FI(~4Y8Ka>??{|Uj+0hA_+ zGH{2XR3ysziP$-y#!y?%ljou6dIa2~z@^fYRda)^n6Oat3O|Tb5ivZFw?^{W+`vw1OW*)OSTR%bU92OUs*9^;fH8x-`-t^Q zXzvM35g>sTrBTuZl|xSuKMe@<9vTD?*eQ$>L^V4CFr@;CB0Y~pvWZ0^=|`KPBL7n$ zs&2BB_IXfQsN+MBB`h2n2rd%Kp9gA10KL$uijO3gi2taHkx84G6rYOIaxHwC>ytTW z)i~%eDl?TYi`+di#K}v8zG!QnO;elmN!gx-y~-gTN|12#kY5GGz`i|`Kvk}wBQm5j zjCA8n5WG_9MjcIQDWW~GNV-YYmlPt}v)sL|Y(s!OqRs2TRE2I?j)j1b5Z4BZwgAsu zM2Tbd7H?W1?u`sOl<}~5r5K|{vpKYDSM_fh97z-Qp|d>>_yeq_va#IA@CTNOiJKcX zkaPn*O?ky%Ze{9W9!_X$89NH=lENMRWku-}gyrbmfD#@tf;(y|uvd6fSCDi`qXr>H zb9fe@p$kCJa7LX{|FJPEw_*+C(lk&JRA?6$eWHL;VPA;!RBbB&iwq>K3YCzGnAHou 
zDQi!1eKtq%M&lf-ZG04vDRdHZaZ|_h2ZCB;6}waiSX}KeMJqd9d{=>ejF4ePKR{M~ z?@7;~)W06%!Kg@pRUg)@24@#YU?(Pi7A)RCH|mNUAtIat)2y69`9I=XfN0AHfZX()_^vkn+g1aP;o5GM96Mp;_6_@ z)>IMajzA&J@srq?fzuIh4%E6(Uy6OCThWnA3}s;xp@@YFMTQ3iikKvQ&5l^hY9fxZq=Vv^g?l}!5@6kW6h~1)sS(lC zhEkO>1&k*qZV90&(E?BrqlhD55sIlP8d!1_HC7z)EYOr_ZQDJw4R!(br~bNewd0$1 ztUpiMuwqgOm2-+BQ^#trK`Yj6e?D+C@nNG-qTQM3nTn;rOlx8yki`+|aAYyV%bAoU zSm}&vDsg6JXMvc;TngM(wiLL1d@Nfj21T{C2P=SUR{@O75E5_2+;`JqdZ1oEXjubA z9MFW+#=*O+@d~mDAv4|ZQ$49Lknxu0x!JU$MA!k;1M3KfhxQ2^Bjn;>BLs9=uva1k z;$-wev{GDJ0M{w_vExi|&w|yhrar-1@(IN9m4yeyJ5qa>K;$(zyejg@Ua16=ZaumL zx+DWE8P+4PtBo!S6h@V@0IC|HfWcBlF=J{OPy>sGtQzq}#b5v*mLVyL<$WdKDn z>w_o*3&M<6;U{f4$lB~KG-I(P(1C`CObC;m)=++MJOn2mP@Y9)vIOEvp3vWetV4w~ zaX_JcS+ul27=!@SPX3$Ov1kHmEyGt_C82y^>>BVSPKNjCrx``N3Lp%0i)6JJCXGGJ zE_F$1+-(4s_RWQZfRWQqg)&k&}y4iAqZRzM+T~AUMd%Sc?IE&Y(L_`j#d5wzs{k;6 z9Rjq_08s%!a!Ao6?wCLzr64aC6*L7!1cpX1Dxu~+x?x35k* zys@wOx;{xjA8L*G-Ml^}#Iy7WlN_fz*MMSZ>CUBWN<1q z{#9hjnBzp2lkuHe>rH`b;*np9g=4n%u^z1Q_)qm`ky6EaRGT27K}{kQeJH^MOo{~a zrpmz{Qk#gp!dQ*~ah4PVlrol(mQtdj88OmG&E{Z_&BSOkOcgnt$&Stfw1(#+Mri&S z3k##a=UMA05HzKr`8P8YBt-q|57>xX3MHrz(#ryc6qS)^b0SA@I~7UQWHTj#{mIb4 zG7z8J>3M=@QHx%$kggd)b;4*IfmyU$E+3hV%=`o~t+`C%fkei{*R+*qyG#b)rl2@s zB@l^ml$52Y8aP}84o_%i9euv&r`F~T!wlurBmqRmWi>%XZDM!TVLDW5r@i*tI1cbH6~~rIIWD= zN$I+fupbiHTIEryFk`O^He*#?C1S|xWl`Z#L~Fv1)wnb#>IZAx$IExHM3&a|Lj~7560>}B(1k83-85{~yb>c0JQZr}pmWk9Rz}oOK(c9yom+2>- zUN6&Co6{Z&dJQvDD55MvOnrqE87=1koEWLb@^*8ORh}`<@Hc10AAxt0zw0)K$l&0z_9cfKn5zr_O+!!&2b!LM}x@qQ1 z$U(b8cM)3+`J`~xjOYO|2_7Vdz6B)E1NCUE!d4YHvIjx|2xO=;kOUpGE$3#r4U2I)jEOtKqZt@roe*Np_yAn ze5Z1g8iACl4u29Q2-Qnz20|)i3`;jbY{MHd24QD|GVR$o({wD*rMQXLkSe7; zdZ0XL@9WUc1vL>Tg{=Z31~Y_x0>JD{M$uqWRU(v{ScavffoBJn&Z^#LGNHbl)rRi(v{86{mf;cr#SE7PA5gM= zeOGci;RiEO?WKUzjS#e6Rf*1M^=Qdfp(;NgtSV7}NtJumOC|Cp9K3T29VHAwFb|eV z`$E^f*w~n1>@w$YkW8~&)&Y!N(Roc$5M1$!DyUoQF-%*9s0LJgY0Gs^K}SVNjUcL= zfPPNEa-;=KHl#Rx%~@sD#*dIyd7_f#%hQV-PAbzThL zxLBZOQi1f9wk)bDi1?~kn`ty~&mcx@SfZt8gcykn4$5MoQ;BY$4zDv5a2eO~fYKu~ zAdR53l98_SSBQ0qoaIr6h0s*UFf5qwh$HWs%2q5BM^%zQc3HhR8d4f{-3zLqpb{s}z>l?B-t&JU!GF=`*XrKh{EJ6TSIOYnMVF3B2nSL{A? z0MXkS$gBvefckx3Zxoe`^{8XfLO~CmSP_U&Y+~XM!8#V*hlpUPdS?{W37Vj0gtV5S z_C<3%&q>|6R}E!%C1LZLRz9EHY~%)w)R0wKBFn2*1O4S}@bFglIzgn zx1x%De{#k06F2V2jpxUfpSWUKSzI4sJu5a4LrBn`u)1%lu=6Q|@YIwdpwE(lYMU40`HdGOF@=TrSAe09-f3QPJ2tH|4BkN^c)w;wsg!NM4 zp;o4QXDU(zSp>ZG>W$)zDI#r(g3lD65_t8Xhq}AQ$x~NfGPKc#*9_W7($MHxmwnMj z6cesP1`}SAB!$9V;R>>O74D=ig|cAL<``tX5dugl5e%GWM%Gg@3bV}^`M^B3>YTM; z8Jt1k+B$mq*eHc+A3zdyQQ=kD7n)nA0*MYLx`Y)Zlsxii^QEKoGn56Z!GskeXiMEl z872Z%DK(Qxt6Gc>rN=6rMHC4Ro`LEwdb?&&8AEVHCHn-gv89a|g#s>y*PzxejzzHLT zlg6LS(;3q#zesC6Tdy@1O+|+(E41}j@-0DwN@)_}Cox9S1q=%2yOKOGc9I4H4rfA1 zBBl1GELUQDzbYg0_9_n)Tf-KDMYlrCkV#pnV%^Tzb%1u;4bwhezFW`T$*%bVRoS zN^2W-8#FcCtSt!Tm7K5ubIY#TkwpvAjk)oW5tP^D123B$Glr7wJFT3t9SuzPq;uIk z@U#a~auX*+IJu#(tGmyI9VgrSJE1751PucX3`@8oY8d)i#(%zW3qb<`e#jM0;%TC3 zptV+dAhl=A0z?_ns8ZB0n^uGxCZE3qHB7`U?mibbjAh=7ld z7_Xm&fd48lU*hhS+pLsoV_J z!YWSxL*MF{By0w!78bq4)33Q2SbBtcPV855pBRO(fzb-`ht{xb^uYHjg5n}QK4iHTcafjb zIW#_I?nu&I_3z0;Z57S3E9OY}X+oc6TKG`8@;10iED%w$%hON4k7Dj1RnAipxj%`i z49=vGnq2j2a!{j|l;6>$AWdGN+$-sE33W(9STmY>WFTM=fXgB?8c-J$K1FCJWu!&R zPz=e+s7i`c4?>q~U0GoFc@r4Na0SBe1ELt7fl53mA+$6kq1bJi<6~G$Cu?VU9VnJ? 
zb~a5bc5|_ZNuq0syW6hJVcRvQ3#Pcc3TAmMY_x?0sY_V48PLFxXjZ1BV6i+!kvUzD zAes{OJ-0HVjw-yAsL%2K#gmZ90$aOs5qxx73W3_8zIKF>N53dVkSO@$5k^5`Et|qP zy|olZWPwFp2pw2QD;fynn*sytu(`qatwqRYNqi<{Wn{rpHIDA&*mnh)P=EZSi@5@l9FpM`-R#u711QbuZ6NU^8p|--TWGb z7YDQB*qnwSG#Kd-2Zj=>90mjZD6B1kc(Wz3C03#VsPLsU#tK(&zo1j^hFh1F0`qx!F_ zuea^BLmwsavPQspUqwyMc{ri0rJ~k=Zl)U!6~;oR3<_iYC@72t5WB`>DT|<|L0>JW zR1u{NA{-EPju$f&*vj!~>S^IKk@3`2aJszK)sU%)7O4WERFbhnQ7OyxXk2K~ z%E}fp4Jq-rzJXM(L!k!di^0G{3AKW}HH5nHu*Ulj#eZitK0UuNLH{houEZM_Z~f8X z711sSg`Egc$G~m|I1p?HbI16|IR&*>Y?8Bgd+L0^tiG!8VFd;~5H4w0@@Duc#7ifO2( z5}!3)1+K$06?ZkGEOGri21iFNWPMl*0=p?e2dem~MFFh?1(7k&y|a{5@!lY;cctdA zAk(g_i`k;Z(n=-bIRlLdWEnic{<@f_+R6GeuyISF)G#PD5{bmq^Bay>mzcMp`IH;? zUU0~QH!R({|HcbGaQMQOo@-9I>B0}au36ae^!}SZ@R4JZ9TzpX-~7Rg-?+4I-`0n2 z{?JF?#HAj)ru~)=f9yCd_rlW;-SUx7y!Dg|7cAZGmr?)b&Sw|(^L6=y$u(W1`JUHWNp$B*}I`}XHPcAYx<++&~a{QSp1^N!sw zzwnE1fBq9UynFAwBi`J#@3NaZFM7kWGalLZ$y-)m*0Sc>uG=ra?X+uz!Dk-1{ff_T zxUu8nH+SE02Ex3!nPpnGfV%eC9h}xcW;2`!77=xK(#v zbC3Dh<;%|e?wy~$Z~K!sueom37q9)wIZxj^_{;Bp@w%^OpMCh^<5u5w{nzp@{P^}W zAHC}{U*Gl8bB|xQ`b(ev=J~I@{NgVk{n8Ep_z#Igj$GVx_l*yIV8I(tJnON$Z~FE} z79M}<^*#68{GE?Bi|_sC$L_i1(N82hFIl|i-di8PVrl;!XB~lWHqm(0A~|u`!bQ!8 z=j__&#zfQ8^P672?|Axu>)n?f-x>sVz06%U>elh`;{(>jpN1gNjvon6_8}@OZ{hU3%-yRZ&I`?siGycfg|8TSYLVc4xKH=Z$ECk}>>+XHJp0A%&h|#< z{8u^S51h~Uh;u%{*}vDB|6ON%z#02bvtL~2e2!jc{uiD1FMYfH;)B1jZ{+O1vA4C8 z?eX_L_5r#7ZQt1cZLqgDzQx}Dj&q*Gx9u0le%C(1mz>Z4W9L3!=X}4b{??wi_}nJ; z;>r{3^WEUgyWaWypL4c5d3*j%_K>)0WI208I?y>ZwEL&FkBLEFPK* zzP*0og1hJ4kDUu%I`3Qik7-_T&QG3rzigWzg4R2%HFB$gxFFE_4 zcE%ml)Es~N1pCUI^Q=9~_9ha&&Up6e_Jy>6w0G=wuKR+sm^h-5xY60)<9v|h(f0f& z9a$Q6*1E@>`+wy~`~3$Fv&ZCOd%W8@PIJC5N7NIewx}eQIx_W=BO^PUc?o-IOuVwl z-m%}AcjMvCdDq(G?s@hH^dkH~Z&3kl+^W&3yaC z#JuLck0xH(i#O@7gZ^3Qe0zy`$0zn4o47KuFmc?68xk)yB-q!eHS{L-zU|3|`H8)! zoO&n((Ej%4eE~-%?#9lw>q^)e=hL@Ke|e`e)&>^FEwNu%&uROdZ3%gKE@lF}?HXraO;reR%;BQ}0)$E%5HE zAg1n9w9C1T<<2NOqw}jJoTogo+d05;9^;1$d*TP2-)fWd*tm~xr}J1GewTgVKRCyq zoV1^#zjGdw_dEN~cgCah_IWoQW?vcK4EuAWCE;8?aq@}w{@*&PdxbOaSB_XLbG9XC zJi_@Pw>sl9&N$+X+noC!aJGN$9QQ%zzV3JSZ+G@PQk?jK^F`g}?Eiab9CpTMo#WnZ z+B;ko{YU5hPq7e5BC%nwz2kXj!fHpnE_CKS?aVu#J&%dR8P0Xbo%8?7SuQ^A%sxjlJSe8kM>}G=*%$(;pO2h+zbkElcOR3} zrpBhVjq{j2PD|9f~uVh`)CJ8YdjVQyyIquY0|@yEv+Y`pHct!#Yd zKhtd7e(2e3-22cTHcmXXlZ|h`_5ZPZY5Ze58+V+!m5skU^945k{7xcaSZ{--WC*!Yg`LbKrSd)I!Q zjkn4g8&`hCw)g*X;TzfZ(C1HLPjw z^>45FJlpup7OQus_p|f-^-ISsr1L&}&kvam@^9a7 z9Hs4tesCUBl7D`9n2kMm|A^i9*N%OWDVyg{9AZi(_2va^{Lp7I>^#ptaMmI^{@%+r zzLCas<1OsCXI^@$nYM3P*2?DdAAFk4AGoxI-FMS_|B2aVZ+f4|=I{TbZL2+hs{MKI z%$&>acj*iJ*t~CLZegGA``?nD2cZNshl2mAYY@S-Ew`Ty|Ezp{D%wl>4gyZpXQ?0a4Ho^R2|NbI`o zhwQv(4c@@M&lAr-mMO2xo_&Ok#>Jc2_)kw{+5DfLeH7c?^`7I|c+ove+4$9u|8M(v z<-Kg)q1!HGW7GNXX7B&H{UP@J9R5f9ypLaJvG?_Bb2w{o)#Sofot3W7k=@>>jrNxaU5@w!iQ% z_IY3Z>Na+spZsiqjc+~uv+Vj0K4+Von{vP+d=i6-icN?~|@n3iSl>MIH z__Mt}uG?XUZ`NfVV(-7lfIh?DwPR1R?O*-m5jGzGpZ4c_-%k7fE_&$GZ2zt%`#MYd zu43DpK4HgQt~lvow!P%{5<8!A$`$Oo^B=gLja!cCW9!+^4%x@H&-;fTvUTChYdhHX z(ysqz-`gEO_#~6Noj>kp&&#T#?{1|Nf98-KY<)QPny)RP?c;_v>ondoaWV5fA9&)g zi)ef4zxSU)o#o*?REU_dd(U?_c}x?7mJr+LrepzG*&tPCoc0 z_%Zmq>sR)3cJ@X0vCp$x`Wf5*j)(u1&3o{WJK1@Tc>U>Yy#DM@&|!(DCC9P%f3xQp zHXif0_ptkX^{CZs`})7P_0s*@o@Jlo{dYgl-hch;Z28*$3tNwV=*<6S`)}WO5u5k9 z)PJ$@(0880uJehn?PTMf&kV8g1CRd~8~d)`%*OX@GT3;+%fDvlx$oUSWMlT3{cQh7 ze*Ox(pU-CZuou8j=>-{^O8*Gxob@yJx07hT}~;WZA`y%QqwCZwLZ6mS<~KMM)Uo(7QKGnqUHt1 zFXhjD<<(o>gqMF@vgyRv&%5?Q7V~BFKRB@ESPr4Sblg> zGyPkzbHQKPLX3~yykK?DNhXO!!P^T_M6SqEhRYa$UyB(5x0uODl8IKkoSatCl1wa^ zzThq2I*kby&i2L5bJ|PJc8V{UvkKtXqGtb7xa?=acu=YEZr{f4m3?+McwYFn^Dccg zZeXd*+pa%b@V4t>0`uYeu~U!Nq?dX~Em>Q|-LHxaoIG8ZSGHwB(}2 
zO;5HpUew^cesx9TO6TR3{N&5~g~^wDrK!pLmBzE2$xmpFyPcQMuWa1tyu9N*jh}L_ z^1Fu{8t->Eu6?Ot>YWQOTGIIN@y;Pn|71zyALlz8&;DjfFXE2ll?f2 zM=hDxe26u&b4g+?g~!b=B8iY-rD$giPoo=FP@Cw&`)ch_=$$5 zCmLFxY-_!=ZShvmQtVxvan?VuvtEBkYvbQ1TEEOqoz;Fo?DKAJ+T8l;iq>DRSgc^_ zPBeY^E3J)9O|Rmhw^Hwo@F=r$epc?LZbB%ed=WF`L|E-<%Xs&H?%&XwO*zz z-dd!Did7O$_a}C`KjC!i6Ri)vW9oE;(qo_X{Y~$0eST%@H&!m*N~(+LFP!1h`{{Rj z^(Iz7tyScu(s!@5F=YD!S%L?Wg(d!&Pp^RhparfUDra zyz%jdf*UOmT>BCaq?;Z{w_>eqeIbqCs8mZfdsHdT{~*r)=B7LEV!zSdf5Q1)Yo}-d zurFX6J6L~PANXDaK834bnbnfes&nqptTg(G5BZ~gIq zH5>#a9t0!;fN8S{6;gm5EVU}n?spK7m^oN|K{XBKDyem)+c@eEj?)q6k7SVs8~|Rc^hPUFC&86-`RTUD@%NnzfC{GnVOq!ysx!!Wuo=7e<`@(60gX< zV>^ip@3r-kSLu#jAAahU=CcoX{0DZOXK)=Dln?G}o%)f>hqvGuw?6z=x{MnOn2J66 zN4T0k`~dyR*T4b#yF}B=uPlMv*1CXRAlklYar05_eSI6Yth8P0<_BK4g8sPn?)LWf z)A4`$JFUHaS9|C7_ETvW#`g9OwjZ0=Zf{>hTiE4}*jR@>U8m!)_MvvXUbVdoJF>KW z5=~pT3x{=XYR9<@+V0wp?Jkb4xSKA$W;;&TVbb;*+TK97(wW6S?HT%q?#}slckg*C zR_tcm+t|N!D?8Y&b1QbPSixSgH|&FOSMBXReLK^uSLOS9Ht*TCwQn>fNZ8q>t>|5` zg3aon4@rOBovUbAecnkcR_OG~{H{T}T5qz8fhb+4NIe(?L#=kK7m z==iPpEaK_yYkSo)*Y7mTU%w;2ZHwHU-g+v2o1vX4_H%ZeQ~CNGo9O*cy8ddN+bZ_$ z*|uf<&{kpSoc2?@@cV1*`C<<_-TUclOd9au;){YD?6RMK&-cO3w^=%4%X%KqNx?mJ zonHBVy4ol2r)NEv5>MT6hOl{)wY6iYy_cSc3eVRD_CEVT%waw6eE;Xr_f`J;Z{I}k zvx5phzli5+Ye)W!E%qndLeI}`_S$`(J33Bxf6w;zetLfwJ!&00&EfQ^ee@-*>B7zm z_uIqX$NJLVe#X{Q_u#YIpA*};fa{I0>z#v{9Yf~GsXNVc*6%!HHC?}J8!lSm^LDMF z_dDpLtlqo_-_stvqNjKhEoNQY+k5GE#orody<@-sJmSbCQZ(nAYK z2Pskoq)AsmQHp%(J`XTDV!H?V{y!Z# zkH+7!^NEaZwE3Bg{&u(X8GY}>%e9~5_ieU(j?p8I`3s|uy}rIq`>%TVNk;Fy@-;@s zU$D%i{h3P_>`C+FKcM~L z~C(M?PD+8=r*D^ z$-6Q-{jBbXXnRcdcYodm^!i^Myprg|hj;#n=-a2YF#6)3?))chYfbNeOmxRnPW&%X z=g`F)?h5)$>*&phezthqEr_<=F|Jhs-Tti|=Mc4=CPr_6e{V(?jvYOh_7CYkhtbUs zesTeAH+Fw`AW`$z`?e9i){;`NNcJhn#uYJ1-I4`J#(oCi6(^p@=DR&!#Y%ZurH=bleNyaXuls=Svs~}Nhu)A($bkPB~kEiX&7XOXWBb%Bh(Dt#33nmi1_TleNBD&3=uAWTvi9;TpLUjB~ zzh-poTZw1x!}hO6OOFgeHop5>vtIa??w+Yde@-i&ZXZ!+wqq3h|XQ~pYw@MJlej9 z=mFmzd@<2aKiq}UYnA7B{3W(;|NgHSoxc42ooV}5w=ZlYy1aeLw~1~v?@2}*hfUdq zw%>U1az=N*?d@G@d&R!fhZF6bvDu_|K+l*moYBo17cu(iwv#5){(Jv4nbGHhBN=^r z^!fYK{$HC%9YFNLv&T&*dUoQb8AP{u_j{fty4Sh$pC|gG<#T^dbol$<{Kg3fuNd;^ z8S-aDpZ&pl>zxSt*l%}VN_79zE?P#k{iE+6N%VIU79UM?@c-?39MPut_d0>-ksD@C zCffGDkER~yt{C#mu}2U6KIpCsUf7=KHCy~CeI@Ak8+PaZ(iXl%1C%Z(yF=khv=K*?u{qcRXWvZ{N9M$hW1=B}CsH({&Wli@x{X(M0#V@0H7l z-uBDI%ZZN8Jov;8pvkGzeoA!H(?5Nd=+^%_{&}MJe)PtM|6H!rBwMN3tpKmF|n_l*So}yPohV zsQGjCWTL4*-+t#tC$AVXa@S+-BKp|Ahu%%J;nbskM0D^*gMYXo{%vCd7yWOm!3~hS zhAkKrL1^fp2t*AH4I58ySbxya{jNIpxUUWR#>fRfx$3wRzP?rCl#BPf`uLMJ9jYvP z^(R-KaLSg6@uvNiFm^u!J4 zu9$pI#eB0&sXaDf>jo*3dw&f2l`O9ygxoq<7S3Gp&Cm;Rsji+wE@{wa2 zzP8D*DL=gG(Gv!3vs>HKKfLGP z4W2W2&NL7bpSQ-TyE2`=EXhsX*`~nB=9-kgoeJ$-63|Xegl`W1X5lp0h)sj7`QZg4 zhvs(`oDo-fb%XWRJLhW+Pp`M$;LY-n^0gp+IDe88*WdQU^@n%iK^6#}kM4hP!!$M* z@lYEL9yE9myE)8IHc9F3=&JZJy55FCCexYWSDd%)yy3%eJ%f#?;ba@LlTGOK^B<#5 zC!DjaYZWJ)HF#F^)waU>v^6I*CEs&YL$`fRp?wDa(KJuDT}{?aO4bs~R*|5_qf^M&EoTF zL-y-XPWav6YxaK2tvYk5rz8Bp`(LtrBz+G)Ic#G(+ zzJ_z=vK_0snXlouq4dA_r8mvyIy26~pl|vdz3tJ%ccbeU9EGbDYtNsi>5>f%ulT|E6`byQ3#YH1vpyUD*nOPtb0?>_evSV=^gHb5h8Jh9$Nt#w8NPk>)0{pv z?i}{VWsmdigP-8^#2!v3zQ)hv{D2=ndk0Q88OisjEx!HAFn-)}W6?eug`zwU7Q!URq~I+WAdf9A*Ebt&JzD9h;sw{f~j z7pGUv=k(a^hp<1c{{`PZ@JN3A6C(&H4Gnu8#rGe70;fyz?`LuTeqLC@_qXStZ^&}K zJ!kUf><{M*zCGe1PKWZ>-muf%e0x&euedurU?J*B?`oW=JV}IQJ zXTE*-^PD#4&v*Uuo7j$fPUZXeI*^~|)UW@5?KtH(eE+fcaJuxn{P#ni{+RsvTljrFnc~}bKg7qKbup*wU(V@_H#psl!=YjN z96tWH`S)=7)kmu|;^Ck2agXfDf8YAoe0$PMoPPgAPTx**`p!5`A2^)T4G-dU=@?G6 zkNJ6eX7KGRHstsF&Yw6v`eeTU_5An#bQ<5D-p1+A+Bv;*4yWH5#h?F&`E7T9zW+Cu zaXO}%U+&JMP-^X$J`EX$efBr#PK5n_n&l7y@!^N~``5^xL?3MiY 
zS}$Z64|9l~GXXZ`V3f_Y#ZD*O`mDr0lr$Kws(}^<9Q4K$nTY>kPEvRhXZdIt>G^8p}90IX!u7PKevg zVIAz)Mv@sc+>CAv!3jN^8U+W^CK1ae9z)O~XeVY*pJGv0dRcuZ7xRk*2knGk7ZWt= z=x8k9?rwxkH$r~f%VJrf9SoHTF#GIy48h3;Sz&{l9@FJ9gq79!5oT9#T1RtvpGBI?0SzE;h^S$^yP&nkrFY z^6C6`X>#8*(o$u^IEyDcQg#k#tG~x(nnHS9a^A~H^HK}yyn~YpHaGjYtY>y7Xbo$J z1dRmf2TV=D6$z@@?ifNk?T!o^$OBLsM%he;!BS%)E@$*mag&A6%ckj|*3fpyn00jV z9b{In*f3pvshy35$%hz2(17DHgsrXdB0+d5L@kn(&T0?=B^gnHC^$$0>@ zXqznEa^Ga~8W?uyIV$U@kSCBfiC8Z27y^Y&*mMj50bvIs^QHPRRm0uY2$^oA{I>hb zAy()B18&D-2;suVV+c`?A%u%Rk0Csy#7E=y&-Bq$)%NA4cot7Rvl_-Vgaq_4Tqghx z3PA==9z%#^3_+CU#7vNN2uLB%*C8w{*Q%vjMRu93k}15&;OM-b4CeY3Ax=mbHFWb- zPb8>jyJHBwwhqAw88bBtf(_G+5Es>e3lu}(3%pFTP#Sm=vp!=8Q~{#7JUH8O#|rU0 zTO_D^)K4V+Awf0U?O3%KpTi0f=2QaA6`B9(L~zy?8FM{!j+H#~aIq4m%PJOu$Bb%J zojca81ZS~z!e|UjMf|R`UqtvDDjz0Dpun*5K+eyO}ks-=w8773cM z#j&c;IU_+eEk$;IxI$I7be60aGR&zsV8a9$I?Vr=fYAwuv~t$TMp(|lv0=KbVjXv^ z8gSgPZrQPt=bizV6XZB!ZG9-f2?^@!EXIozT#=xf?RKm^$XH=*fYnBra}16R(~Xd0 z)qvxUwIz-CTJ0z%kv@>3JB&a5LtoY?Qn|a=^#a!%= zVa_o)HcYqc9V>sYj48&N8j+GC$uwH2C;B`uMpYPdI~1C_Dp%CO1?uT{kD1B)BgyBezW!ctkOpbb3qH(7om?8BoW+Qw<#%1PafLkFU_ zwTy<^y1Br$*U+*B?=w`Be7Y=>q=K8=T#)Lyx61U0XY~Mv`4k7UU;@lAnEPpGkk!>f zSTTd!2-9U1>wJ(^H7M>?w-O3)?K7Io(}e;w1y`CxHADBRJ$Ej5uXt5=V3mIl{KT~9F7UG7j6n_Vz<5Fef%uGussi!0TSX->z-En|S&iw_^bQbJjy!19BM5gJo zi4{?KlT=IA1Zd-x-DHzy9GntSO-qrT?U$--X-$CS1r75O4&X2W1`l&UCS-KNIv?d| zAynLCBlL2RXqj1uJc!WHbY&k*msPCee3c;=h_Ui@tU-E>(HjTpGgOm&h8G7&i#_+6 zt{$PRUdS-7!aZxGII9L6cdXku6p$9(r<|02IT$a!%}9}n+L@RUl{ZOo&RRj6K-E@M zPiXc)hC5cn!D~9A?L3)aZ5CoMKr^X8j`i>OTV%{{kx`=oDc>i+stT}rQn!qN&Z^q{ z7MWDabxnZfpb^T~El;3rB$>B`n-M(#)&)G98rg;tTyeBX#B#}HK4Ot=)k|bNaAzLK zVMJvD3}rSSHw~vcVV(bLb)dn{09nq#p<}wNVjT}Ogn?UUk8W|XK!(6p(Cd+-Tf#uz zaH>gXPJay@enTCjZoh`JPAp@rdY5@zyW?0)zSs2-6h{|~wJ0TVJ-EdMKSO0q;X2nr z)3wabv{_X%Oi!rQ1R}Y7c^0L4z46E^&4UN`J+xB55;q zC3Ue_v5h1%XSm5j2aQ!lDM94b(o!7>9k}Dp9F_yzOn{-xd|OAHww(d8Vg@%srpqeU zai{HJudK`gaO0r^*n?P@Ij)cKuhtsnX1P`cz|ku#t8!7PRFs8^vaVEcuoZ7xw5Wv` z>JI~X!?c=*4&>$)rLG;2N^E_8dYVq5EoiG7P0k53Zc@gnth?Ve%(g9KtZ{A}jd#yR zJ#?V9cOa6>S9Cd%T)T-8rK+|gtn0LhYKBd6(;{Ci8`ao>J2IQ%GmNfvgooPMh`Ao7 zK&q>iFkM!$j`L8>qyjls{_0s<#|kg8w41z8_h@yj8S}}cCW=Be!*;Bxyu0~YrnAyEV2BI8(wJoFP%ou&J(;%VhW}TL<_ooRsqXASP>*h~-izxUMJi zo8nB26LPX!FTy#dXi1n`7=07L_HdBnY_}U8mxY79feLo+ftiCFHansq-k0eb>gYz6 zm`aF`Jw0(WP(({y$w=s;^T^BrK|^JNz^5vpeb#QF5yk5 z(wy%)pl9Kw6#5`0Ym^0GsH8`ibTR)h_fh=_a)lIsu%CyijO7|=#Xd}s& zF$mTz4-we~k@f(dQ(gUV8dcL-|dZM4DP^lx*Jit>t(T< z0*jGXJXnCof{9*FAl*lur8-J~eFpOMSCZmlX;7U)5Ygy;7H z*Y!kxQ=ExC@8?_8!rG!14sx9Bb_3fD2YXI;+f8>papZir#eA2wI48W@c-^j)%kuNb zk`BwR9m#$#5MEE@IpK|HbT=I2IN|LE!W$0uoba}r@Ot9N32%!DFKh8YgqJU7=&cBE zevB{P-IjdU3wTajGXJXnCof{9*FAl#SFbgb)|HQ zFJ{=5T-OtMPIe2AB}t|yM1@3xrlvK9|ScrWgb#m-GmPac~S z;w8B$!+T57g2qNuURjL`^kk~|5$&$FIiHFzm6b|KC$KJ8U@?KP@5JosQ^GL3QVjiv z{%AJFle}maBuB`H-qtc2l8epqy0U<8nAQ+dVe*N~OY+xVWrmBPCQ2_}xMy;9lI28J za2if#NY9DVxFn8?;_V{({}6ezFU`)n<`XMqq;XMF6=+`S>t3m*T2SZn<*1-~P(6BW zFsZ-geZf>xZ7A}Zk)$ZPzhFYHE>{#4Wd^X;`t&wy{VR%GDp!{mmTT2gtsi4U`kYxH)+JgXH5 zTF~Hw9Mx>MXY~Mv`4k6em;eKZxgXPrrxPw$OX@ZdXXZi0O#>diY#VNKun@CO*ZWn9 zTOhB>U#sriXVTs1RYmt5_SC63*e<=zEZ5|HOy*1FO;W8UtX3pw#$DuAJ&-Z`Bg>G- z1QfEO~%sgOEZuweoW9p-;b!03cS zC9GCtSkA$*VY;khZon}4FpnFiCKbrB^7XSVW2{L-P?pZA+8kp|C*1=9j(esoc5EZb z3>t3M>STUtvS=GM%O&R-sYR4FqZ*5~+z*1Dqq1qT>I$SyB9==ojgaXLEYrH0~2Cb>{uH_$!~jDEGu*X;5xse08`Vj;FHVs1zi>WjlHY2 zMj5?E38Zz;M;0o|x>B*28*BA2hwLz*kH~_8Q1Lg^(I5I9gt0r<6F2~9s7#Q7Q&l85 zZ1J6D)NTS%T^^iuCQEoBL6eBekZfwu#tMd6QJDgR@p7XvV?k zsZ`Ta$u!^sIaamEz;I>`t|Ok?4n~Asp-tbL`H&8y%*h;Tf=@j?uSa!`#~vuvhOgziT(uHkq;O zq}6d^Kz`2u66N@d#{8 
zkTQAqQ)lQz+JW@Y+syP#Gj5aBQF)UT4<+E;A0`X+fQAjs4&X2W1`i`{CS-KNJeCk= zt)VXSIVa4_Pft_NuRhvlQVz&*-`bLuUgCJF8+~g|>b*6zc2H2KZablL9pQ3E*msDKU2_<3*=k*VlQXj zYF(nOL2^)2Q##|4*L-YcN=|+f+N*Z$9w{AT| z>(FaxFd(==Q_Zl$2vRb`eJhNMF2h{se5(gE%&&0YIvS6ZZ>7j_-@3hhtNRNasb_I8 zUwWHGV@>qV^Z=X6o1|K}w7_AQqZtPu=BTEn$YDSJN@m=l;l4Ghr*Lb?ao@VVeQS>` zE$AiR>JAO}tHvkY9zm>^YPQ?6N?y<~x8eW}6JW7$ z=7CIMaGh{?`?zfB2g^CQoiJTiF}LMXOr9*XBC>8fv^KMkOG7S@Z{?)5$4IN>ekKfc zEDqL7Z!_C9c_5PkQ+bnA3!h{4jB8DMAj7G&!G`IwFh|GZtQv5E94kM2+sas_ z5mB0xa^u$5M^pD`G-nAbB$I{%C`^Z(QO&R)VI@ULbbW+nIe&!K-?G=-Mv@WP2%qWV z44+Mnf+J~@h~-iz&UEoao_Q^Y=?W8II5P)lnycu9Ly6-$qsVd&4j$8G73;X?4hOjo zpXw4eem!!kOE}mYjy&nmu`XeE07D(4Z^MS8P%LAu@!*n(oa^F@prJC9Is2hOs$wh7 zbzznhi16|dEhT)~!7l17DHAY~0HFsSNm#B`OSOvJq7bbM5;PuQNa$CZ|XE;M3mA2 zBMA_C&>v+<^J?>(z}fgJ%+E}nhF9LI1vxi4J$YM+9YDR#3KoDt^aS%bGsNJ^S;5sW4aOY+@3beN=<5PF;ynOEHo04 z1UoBah208%Jdz-5;<)(AZalPh>kmlNLjXxzdY9qp1foXH&N zYQXRI{t%I2$bu~}YM7Vr(ziWMqMKAm|Kxo`5;RGG|)%!D@?knlp!;1S;F5MDd@ zEX0(K)5bB^9Z8!+ESEUpZ4u$MLy_?py9l8>cubcQ-gXe)@bKe=w;Lk7Vt5^Z6JGft z1#H-U-Ie3a6;n`|#R5w1U*UiqsfbwX8>@ zx3!Fh+8p6UtGLJJV0pdY-N2ec6iy0v-wRB41`w)XX?uE`wY_NzfhoXFZl* zX@*IYA*Ir$m|DA9fiP;&Ii{xI0tD5x6xsQaC{-!eR#t0OrP_oCXs|eXs45qgN=3n& zukl6$b-gx~S6fkiDYZK=jEOlw!vq*OjI^0>(FupKRaa|gD=;i)QjZ|eFkM!$jz6nR ztGET)a>h?wq*F=Fm?d!}CC$N+2`O=06mJ*N|A&Oa%yA)3J-)HWkR^dAMA-SmYdjYo zP?L8sUV59EuF3tF?3c=$q*~-xU7(;D2Zw@cT8bR9U-g8Bc@+n6m;i%^IUo}AAm7Tj^ICi>jK^XMF;?tVefU;QA>Dc3s_i{=*mOo6i-Yyj z+st-N9>`?CRNf@jBG;-ltQ85GvC(=p1s5c!rlrUsyH+n`m{)PYh6ylqm;*8aqZ1Cp zvuX`(MTX@Z92=&~D%Nq&ssR_sv3@NUi_wXhpvDAl(=L#$c)E>YEToNhf@p%M4F#Zm z)!N4JVPQllSDQ*5f&#)&R=84Is}E_CK>oDv@n&tcTvR7kSCmS5ZRKcE&Pz*UrKORg zn3qPt;7DY)CGh$KBW{OAa59{V%{d@X>(ly1clJfFrrmFQ<;7ind#{c4?daRFbJy-Y z*X+G^-*wl&>ea8=zyCEiycYf4i2e@Uxcesjh5QeFzw5@M=%at&=D`EE96a>8U-?zx z*W$na`dbr2!-sFX{m97ZQBg{!(wXd7Efzf8fb~_SEk@{mch{_e1~u!@u{D-~Z?zJo~Yaf8vv$ z`WOH5xle!QUw!ruKliVn|NIyJ%@_ae3;*uLKl;*_zw*al{o2?6kJ$b{S%&?t>mO3f{p>Ds zefV1P`P@$R_ci2mzgoUQRWCO{u0OASe(OD|+!N&UyT3&~2j8oHs@M0czn^;txh|;J zpHr{L-c7DQspc0`KZn(FzFqzOlKS~e^;1;KxmPXss#>4tNOG~e)%zFK`sLL0zOKr{ zQ2)eE_=ku67#a5Wpke>wUJB*lUNtTB@ts|Ld#=4(*!7LOZ~A6o|1SJ*@1w#!PrZKX zZs+YaKt~4Mp-B8YzA5n<2{m69#AGt2^{59Zd2W{FQXLAjk$3*|cG_)o%4%Ft$ z_0{6Yat*Z|{#v~K_C4Bnko#aC{KKt2Oj`Zbm~0IXU!!Uc?jd@hbVuKC?B_fCcH9)Z z?wQMAUi0(H{4@LecAh_oyloH|Xkc*kp($ay!{bN#YacMD`6tl(5($W$%uqNJr zthTZuS4-!bG7w{UFmM(PcLxTqZM?N2s6(G7eB)}pRuZKVWo@JgqIJ14A{`jK0bPYx z-Hog@Mik(qBa$xt*GyG`EtX4#`tlk|{=ndl)A#A_?>=5Tsle4&!T(QG%F9=pQ`M!K zPGEmBD`qlMax5!KnPh$}2lXix)2Z~LlFpXorQ%XaU!UDea;2fbu#G=;>0FO1Q?IRo zw=NwRH2lGMP&dBwKQpdu>@yUl*fvqbgL8(H{TTKGj_S?Hid8kWxI~4cj zb1!|laP_4(qT(J>ZzpPlFFf*^J*YN=fBM%yZ~pnKaBc2VZ$I_k2mbhLzd3_$-}l1O z@4xe7{|##osJCCaGx=XW^Ovtfx3jbFIKS_u=k&Lsr)9InlsqOTm*9OqVj7THT1uCS zOX*w|pvfkSXPo8`{eeO12GG6X*nvX>KfJa7 z$vtS)f$u*$+W*Bq_4|iY{ZG^HpUw7vh<<-I-@mGgy!?j#b@l6q?q-^g9qWIydiklz z{)g4CPe0iI)7|uYqyH~q~V`+tth4UYMho9?+@hTgoffF@%ra(@EI$E~r!U1*|J zdE$mccR(9}$G!g69fNxp*NP7*&8g$J#-_Xg)IrO* z3+a9a(j6T5C8P@geeW1c*J6w$i^uy09`8#$b}aFoi9;93h^!9Gkk~hHA;10JgZ;@^ z;_(wKF>Jgf0dTHg>`Oc~nfUhcLl+H0ws8Q5)KXBWy%~iYv*^@VVWI zryGgq-+Czf?OnqUY?U?$wIu;IkP?apakSc6wFHC%A8HprJQ#BNQ0>Q_LC`<_+Xwrf z-4RRt^~d^H13UXn(imSD8u-Fc0*0-`bB{nd2mKhX!-Qu;HvmvW+kyM}G9LbfmBY=1_w{n;;c*}6D) zlIu5a8~Da;2^feI&;KRLm9nbAy_9MV3=G>`gYgr z;lA?rgN!s8s1`8?5}g3{oOt#hZUL*KQwYtg#q0wV17E_~Jogkf^XK0QxzI?nO(Xya zdE#&1anL>*Fz{D6sUJX6x!Az--@gS2Ut%x%0x`G)hX$`N%+8*^FizO};NM(#6#YpX zlZ8UzEc}Q5?kf~F3dfcU_o7SiStv~4>+lnP7YbA87hIyB@Z%I*Io<%Z!dnaQ{p7}R zxHyS^pFqD)FN4~#TmjO$g49l+--S!)cd7tc-@6RSP0XVbOrqa2CdQhFD2(8y$5qgkLJ)tQfTz$>{Jnb 
zKtUiKsMiL$MH@lQj%0n$y_I5WVtuhXQ<+azvX_a_rK3wnkDfq@po-$Z$rGfg_{S8w zKY?WLIU!xXcy9(>J$okQTK-e${zd~NC)P_R@?vr2#Dka5=T|QmCK@O`+z2>Pl-lHR z{QFe>cBmix#*lz2s@lZrV(Q9l>B4#G^8Bm>>4@@${Mvl-{DX_h%tOVEV>QS03X9Mw z^y$@TdS@3;o>*I4sjOWV?b0k1vbcUAUMN7rZH9v@$4!5`^q{bikctwQ}{cH zuAZ!{E0dsmaY8DcJU>~wc<+@GTwAQnR4yg+YnSgqU1y?p6m*^Oo4ADTN+W>11BKn3}zM>B3CqVru3Jv{U2q z1zeZH=`)asi+$aP?r)%?O{^DJO83s9N}N0n7hTGG0^i5=U%UV~pUgi5-CzooT*^C< zM)z@Joh)4~o}4+pn4GPk>FKhk)drZH^dV*+j{bjlS6#L=$DhSW!NbPEO-#C12fOO%KU7K|RGL9{r%lwGhc&ob0| z-la8&5f?`|twpSKbXSmoj~o^qE z>hObAl@Rx3cZoAXyxHSsx$1~}ZsK6M@Klx1gH=3k!HP{97< z>y&PcXgKU$e=o!Zo1fQdf_Ug~S{{8~Ci6UE=h1_n3V39W$?*$_u92~g?;uX|4LW=m(Z@b1{XXKUXC1b_MjW@& zy}=tqKIy^Bw}_ME{s$k_LtGLd;?~zXymll~{2V$WTExb$eiAXiZ@h?_oCFcC_DK|x z-k2m}cDLyh*W1pt3@tzToq!3$xR$#SZEDR76>!8dzfJ<$^|~7_phPtzQNTM-W(QihIi>B z0+u^iUc_^^qeYCWyZni8e3;|IifM>%7q)i-*O`{;=`}S1R^FfBC*Z`NUbYf&>(*>m zz^Ccv$pS9DH!M@Y@>8mh5U>Whc&dP1ZJRD`j<_wz^!P{u&zmN{D&W%zBm8m@y*)3@ z6Y$vm6^jL|lyaf+D8x%YO-T~avy1okTtwqX_s$B~VWP6}G{n8zJ+D<&%(R?dw%W;B zh^;qYJ|iI0|Jq#vJ1h!rSsQWYw&^Pbw0Oi>)j>Rc!M~e;{byfVQyLn+v$Phopso!3Qda2v{cIcDR7M-*#mMe6w?1oPgOKCw3EX1eH8cz)P{M z#t9f|;n~XKN}gq4D$`lO$6Xq80=mtck|JRJwKaMPc()TdNI*yHc_RhfyDVh9fVY=8 z-$|-)RKUS!pBD<)^vX+T})l6GTrltI7hFjj1N$=tVO`Tpw;% zML6!NS2*>~tUSx}6JjNI5jXCuU01+ihuYf-*l_k>5o^SccMv|i&r9|Ya7g-HZvpfD z{Cx#{QEi}~fN_Dn8tFatKI+p>L1!UOY}|ElAH*(q4{XRr z+}8ExPkj;JPg@=%U~by>L;=mt?iR6~A)O|C&dWV9P(b$z9kL@9=2^B0tlLk(iG{X< z1YBq5B%*aPF+})Wxct&|0aun&&JeJ3bd6a8O6f(j1)Tk=;D(5c%~v#9oM*Z4>?sl5 zDjBa4KHu_sD`H~fP7j37=X*@5>yAhgm6~`U=HIE10ElZ+>x^%V*ro62U7m>M<;<6q z#h;EM;S&C5Y_5mQt7cn+Vi&hED0WIcJ-rGo^hz06HJ;Wxv$TPUL)$IWGW(P_wr)DU z@${^|6|Kmq!Y$ML^sgfE8eru)BYU8!Lr8wxtuy)#Mmctm_sr`zw1!%Hqi}0p|KW8S zXBkxUnmJ%(y{0)1?Y7MvILbbFM$-vivj&Z+-*#Elsco|ck82RIZ9p~e*+VACqK@Xb z+dg~fBvs<&-4nd$44cv@{n3q6+vg0=h2EtqRBw_$V!HPL6UX)i`6KiEMp*k#Y%(`z zR`c;>^yz}Rqvixn^BP#)XWrc(y_9^E{%YjN)8UV5bsYc^dnZEGI`Q>Tu*mrUQ@-?(10$xWBe z*fH2j_S4zjOY?ROmv|4V>A!5|o>2~=^Ey^Qwd$Fdsix2?XI#yyOiFYdXRc>aSjOT} zef+VzY-^`i)9N$5q5jM<;YQ}W_3)Xjq0LShZR~iO1vwPpR-sg-R)-b;s~51s>GM6*5Li5K=TEYjf+8 zCR9!|A>&`4Xl4BCreI47%jlQht*y^Br~{Trq9%=FLd>wOTSJfnN2PKFoy7BBSeq687XIPND9YDg_>m43SOgDlO#`w z9?D5&8YM4h6e_ipWEt^1wNk34)D$o0c!g9d=TXqld+4tpVrBg6N9e@{a*ajW#JzdGwlJ9F0=p zah)bR{N^P>#H+#z(=D{Ho!c{KW0NG=(1bs05Ss$*&-we^jweIiijB${FG+&JXY<{?`i+8f z7j88(KhXu=9#ocDnUx63CmppvH!vPtpR;xo%O-$T<(tEQQ6xe0w8`!-jwZqFGjmqH zwA4bs$<)=Vfobrd_EoDy9^13|;G0mb792aQ4z{|M2nJOD(Wdj`q1VmoEe|}*fX1cl z8ZSe=*)lIJdK^!H=ndMOyZ-UOta|i*(u*WG6)`c$Gbsfc)zHiuKSB%6ZO!L&s}>8P zK~wy9IwgYh>*?oTtCFBz?&w@2yCfLjmUOE1(HY>@0A*TSNuecm0&OoC~OYAUH^ zGCT}WOkeJs1i35Ji|yL=0ZmM0^S*;);l_Y|6?R6Y!fM;YWQ{lR@V@Z%<|VZfA*Oq~ zJ3~@?faTz$w?gN|L+>SKO;6s5fw|31www%0f}S8ozb}hP0PkJRhYoy| z2xc*FCr@pe0#~M-k{g~(gyZA;XWqV;1huau-*>@&%DUWpdxIMpFuY|&izsaO#%0S| z4EITbHPj`OTMX*!r%Cxccg4enqiaqEC=y}R{n9fx&5wn~n;y<~i^ktKp9e!d5@AiN zsS8uuCPL=o_u5PQv~YIMq%f<@1c+KOsF57?oj<1gK1pN(cs=d3bf9-KD9dITTMkVG zMWn&Tb>)-b;)pk0WbL#t^LoWLBM2?*Nepe&1@&_$$J%$qo_~j#f_4Io)vRXUO5~t^6n33=Q8l%qJ8CQ7b3yAtJlf$ zZ8#WxedJH$f2Ltxz{rd^eHJY1?5}FebI|ns{^!~U45X&@+&Mgzg^Jnb{Zbcjz^_$h zy&K3vrK+u01pmZ9zOmDUYfB=)A+@s6oY_3wKH4={u84%FH9i|^`tne2%&MWj6+45Z zVJ=>y7D!d0toEK){*YJba@DlQUkIgU7QvM3S?;@bK)FyUe0Vvki8T{*Z=u!^}K z$wO!GIMC=43oiDRN3Ph*f#={Z#>WyEnAq(0rOx>@y#Kj+x-Ah2Eyq}1Zf%hQ<9yo9 zeuM2>yDsre&^!hPB+Ogh;35s7UA^Nv(F|-KefNe9wtL|?i%u>K4G*uE+tb=M9$X%b zoOxm^4dD+bXDoV&cF?)}=6m%b;C-c@`jyx4u(OHv?fEBYcuGtg@z$|3w41d|+Ua&U zwC>ZoXKkMtNM8^YzUB!95$hVPdF#YNf!nMG&mM+DO#Y6kJ_a;Mx|Hhnx+4dB>Z*S+ zsu2asJI3^xhzQ^x-VV7wA{T5f4Vi!D)+pG%@?rSTFE>N*^|6B$%cjBUnKjex$*Hh) 
zNBc_(el$#Y*CMHUm(j4sx#NY9k)uIEJ)i!%{3w`7q)pB~kOSd^-!zX#e>*+Gy_v@; z!9Sdr`mc%ys%ygDt9N7Iz?ONbjjzT*LVp({>gQOfGU{N?h>Lw7BTqJP-m5s^v`gJ= z21Y}t{$|Zy+9tq@t|PAzn+8DI@Y(HbEc(Izy}vy3?3D>grTdy$ZSD^5US;NIKgxvk zpygfp1KCi%T4{^5r@O?Rvr4t>=5JH|_^+{!ObI z*3W?&wyUA5sy9sUGGg0}cLO1Ei(lmFd0CJ+>%A)xlL@moMuneWKLC#R>+~QkJsT>X z@tXW>NnaS!@7?5)_WfW9nf}wJ%UQ6l@U{KuoJ^Q-uuI?c(gUD*c+~tF>$2h4onO@R zE@Zh{)voH=3fXYrxgA~mZZ-^SsCYWWC=&|w5(h3`(;JfSSu{5o z+#hbu3duZRl?5*>ZY0X2y&Frna-eAec9{!}H4r z2Em&L)5f)%(hJ5~wVA4p90;~uUWNp<>;uoXBvYzxS+Hbm>4Ef{0dT7PyiU!#_l2AO zPlkH8&4#k^LG|3WWP|HZKTp1+-v>s|SbBJgX>S;3u<6Q{T7zN5+`H;CbQZ*&$?0*b zQ8ol@yi$9(TOX)u-1S)RK3On4_vFo%rLrM%&a*JH!vjF8Ox7P`o&{qbtK=z1hd@Et zmd@t}^o52lM#?bbYv27zLBtX@be+ChtZr$cu1hSn=``mJo&4@R9| z<$me@FgTEE{9JLnJ1jg9J;F7A5cJQibidTbOz^E)t5!PE8#Y$o>b0?Xf0*x~Y_WTC zCOqFY&gqrMAV@j!g30me3xN$++F7HWwK1P3%WpOS_RhVRAH6Rd4q5G8?7k)w9?Cc9 z?OU7${vmGpiD_9NFQ2!r)S)a`7V>=cx{$trL~m7D{P}GaU{LJJRtCkvP)G6pe-Pv^ z8U!&cD*UUksah%}Iw3xkV-n)o9jm-s)B+MlsMCKrR8|4O@~ao*_Zs7BJByf4VdRUndF!JJ&4Y|*QVA_+R`fe8f(04-H%IvJBu&^MO z?PYBPaic2wJ=o9!=vudG&#l)0TIboA-0!LYBhSsH3-X#n-2tQC9k*%;1s1y=Jh36+ z_>mjlVX9z=HY`}UYhdl&7ZhN>*3xdvbWdnG>+*>zU3_718T!Sk3w7be zkheMMvB5y8$y+PsBn)}k-J@z}6>MHRVY$XR2%0>pn9x-_>Ae zFr;ras<}O11~R>ez7C5BC{+-?X{E0(%zx0W4dCuC- zY8YGE(#Y(D8ji_=9h_p_;g^05&WEI{;MEfP(ptg|Ds@_1e*7eNINZfJX2?5#$hr^^ zXWQHX{0@1xtu#vsOHS#-%Q-U0dwQbkR0Acn)(p{`o9+f%_SdHm-fs#$3$;X#tM#D9 zE`u}F+%QP1w0!%_NF`LYEjVFyT?MQ3wNp2>YYsCP)IT%Q)gMk*)$4xO+8!R?+)?#b zq8tvz)Ga%!ZZNbBtaJLgw;axoi|d!`5D1fRO7BumX&m$JJ|gw<%~3 zt?`+*O9?$PCYW?8^apKLi{lULg~5nMbFw83mEhfT(v*&7iDv8L(kJoEcLVa-<3D?2u58=%ZP9;@mYxcbV|?hp~Iwy_xM$A+wBl~aPE!WzzM zy|7Je>-E>tHijp*b+a3HATp)>Mt=1<^WvaZEjl3`M-CHlc%jMX!rKjFiyOvvIh8Tk zSbatUi!TW#j2P6wp8)eUrNy9h#A05T9ey>zY@^?>wo5HwtYSs_b!#&Scri9~X?0U5 zTz~SFvacCLdMw=DqLu}mdv-K%!O2=+)&C~Zd$t+03$y9i(%b?}hi5Nwb~S_0diT3F z3NZ({{xr?ctF1sDGWKCuxDDJLc4V&K6Eiq2yXA8+%oI%8tL&c~GKFR<)+V(zFo*F4 zo%i0;Sb^Dgo2FOY%;8LF_3~=jCNTBl<2lQhT7s|l&b+W~rtq-FeY?34<}gaODXsJ+ zONj9dS#vh38pvAJ_Pgt30WF>m@9FAn4(4Oe#cW<}2FCj9Jez6pxdux&b!p=SKi6FP z%Kz9v7;t8#kJZqcPa_=lFh;YxbJB$%?jG+o%C4t*c^lI^4;gBo5A5i@3=*CO`wn3|LEF%rf`o|ZO+?d z3JSX({dbnLgv<;3?o^34gI&>{trBA_flt%7uJDr?TsgkT^o$f;6urdYunZTuly}QS|S;E>QXDtV-ETChgYVd3PePokC zJx=AD!Otl+^5e(NVMm1~N|!Pn0Ny^>ba#1G*y(ZXj4Gdi%L6BlezM&XHcmXUhd5^j zvqoF;!`E3t*-Bf|#`~DTy(qJc2F*-h@#cyPUX8ZIk%7RgQ;sCVb(2!f?A4|)m(aT8 zIGID^?wxmr?Q($b?w*}WZL^05tFq5}T{MLjCXscFW|%_0X`R6fR+&Lag*uBb>@|Z5 z5mm;}b!$SEl#!FFHueI$7gqxI~V7pgA*|6>1 z;Z<&d?A%ies9!_5%L^=^N~0;MC+pRQ-5stw>?_v_!e%r|S?kpjYRH_>q;uxxwk9FKk}>9PO+XQwlcfNG6eN{;5R^IP&Q!^;FKEt)Ui3w!in$y$ysu`49wCIQG>z2AiAAvFf`dWcI^tS435^`R(?zcU+683oq5u^E%+^RUDe=WAXs*6;8beA zA9(prdmdCO5GpFCPVv>a!7TsA*|ArHVFqPNVQ< zsNU01D19||PJEqEXzVgvk67Ls-aR+ZuW_k8?6`OM*7kSJ!C*EQ*{x9kjLa(4Y^rZ7 z*l@To{;pvlbgMmJs{O)1DD(Q4TI89Q(9U!A$h;>lpi>GnZ1&5RQ2ypiwYEbG&@-&- zeDq>4@!+hzh(DeSXn;H|NetOD3{mjf)#J<8kE>2W)ABuhO{jwyFt)fOI(3318at_J5tki zRN&ufLD;t0Er3{~I`P(026MfgqH8~B4&4ga%uCj7!D!36m0o7SFmQDGi6>8jVA493 z>-*QOppDW=yIie=ptD&6xt1*;d($ubui^QZ>z%LK+b0ODeNRr^;MN>$vitQKzk}4k3*?&s|~}1PkZPxH|($Ni(-01FmQTz$|F$>qHz;BHgpbJy@lNzFque z<*nO8jIwZoWu!k`O&{6Z%%Cas3TyJ79q0$u&$bWps@Dd#s|@yS=;i_Pin8^?8g+zT z*Ph==*c<|ZZ*D&fVmpInPGVO-OybJz@Ae;h76Im_d3r0_dw@^v?ZYwyZIi!LVW_qsmo`tD2aanOj&|RjXd3X06(FBz3Lp+1T3IJ2*PkcXn}Y zASFndT%lB{H4PiNxqCokPcQE#KE8fU{hKupXb~6`+%hCItX1naZQHf)(6LkJ@Cb@# zSdNd3ivB4kHZDFPF-e=8lA6{fJ)>*4?mc?;>YbU@C%bRI{sRUM8a!m^u;C*{=8PIW zX6(4}6DCfYJY{O`wCOYQX3m;DCx7m|`3n{ z1v_@`+P!D*zWoOd9y)yF=&_%VpEy}~>hu|b$$TIjHIQ)lq;pKFP)U83a!iw0Oq1AM zt|3y6-|9`I9KSt`kN7p^=y(0`YKPg8km&q?irKY^bXY|WV){JVwsQ~*N4DwR3c^7_&OCnMx-vMJYSMhUr#2EYBIY=}dB}ll 
z&fTR%Sr%eu?$#en@enuuT4gU|4*GW+Iy;?UVZ$!dyOUqCFy&S0;j0RGcsjq4X9xXA zIGAA;czi_!JfATCz~oE@f?ZGE$;pa>K6~l2YiC4)JAe9I{{;;A7;mjkSC4|@D>sdP zW}F168V4$=^+b`S~+Tnt9Q%{K)JxkLPQlqCuYd22U1z zeFw};+dxB`h)nZ+5(Zk|o43H^2n&y^C#+h$IU1ba<-ID{$iw5q2Uf?pghP+QeK!lb zv0&V0^YS|+3spv5EggV^n+@kRT(mKJB&6y$kj0MXpx=WgwW?laVd^^jfcOIp>{>EF zS^hW+Rxx{?jA@C3$(7?z=Q~8exotU)b^CCzKctp=KnM-K7qbJ>hNIk9_Gd0KPXo)z z@4T9NQ!uK**mie(BA`r$)qvyu@xCXs?P?C>f!bnm^-X^Uk|w_7d+lT(!YDqmU;qow zou1hDKhD6uM;q=9%wpl#mZoC@aIksPvZyU7H|)Q*S#htr-c_DW_5M`Z92=(ZjiItkIr# zn8)}gbI|QV9(vSXS!Zzv8s40)bl~-|CE%`<_?bUupnm7dtCZm! zTdgm+QnPfko90Z*^ZT(tVd2+()Qu;3?;fOOYMbq?cTf9af7t5$4o z3+=YpcHjA+Gb|2sJa%t>1k9>i-N%;d4C#|pfrgJeLxcla+NE*_7&0!W@YTr9P*qhW zEiI`dG-SuO2oH?_&*PP+(eHZnHnp33@FT{0)dxu`G=shGAkVQEJ8)d+$ zT}H}z#+lHzjUvzQcnWNOo>SL)L>4TlaJyjLi*(T2u}QMCAPuskR-e8e*BkN;{9a#M z(FaZ_U3OVIr@?S%W}VNJK5%y1j_Nyybp^|+bBCW#O^04@9oOuilLakL$8L{U)*b5F z`No&I+7k|LsWRQJd@tBt#cf!2t8S2geSvFz_jEXxbh5FNOBQhbH=WI^+yf>}8alCM z-JX!~V08a$`+LE%hGz#HTi*?s6((2qn)U+cJ)S2#cXxxcN$Y!dZqfrL?9FaZxZ=xH z?ULvit|#2lq$I{xpADC?8a$iTvIiWSeg9<6jvlboNSe9Zv@6)N8}3%gN{4=XvR`e& zLDxp7%Bs%gXTU-0F=tvYNrT9VkE*GT_JH663p-xTPlv6>p7;8ujes@%k138G%Yf?z zH-CBFBomaD#vbb$;NbD;f`$1PvcRTwsd6*g^n_lo9fq1rONVMrTQnI|kOg;A;=5Ky zebim1l*+79Veh%bTRXWvup-K--GIAY;i$KZUhc{aSpMWqLC1wz5SozRc(HqLsQ$+P z;p=YQ!R^9fqlL|TLT*p*yLM*X;aI|iWo??Kfysl8FVFPPhUyT#vRkvRFl1}QpcQ?4 zLRzUlv9kiY!-<@hRjf931-{?%jJo)8;M|Dv(+%tNhI3OIA6$JI^*Q#=*4V&)koGLI zUa9^#I9@Kh;;P7AU~RYSUbU@V!D^WC@*`H6Fl_6%9*fI%gS2szPG5MH0)|6p>b>aF z4YdZ!{{H{X=EV|@VZeU0jgl`(O zbM2AdF#L(r(zS*u&@!x!vHVOQ=wAQ$;`OHKFuByiOy`_5cu}FDvPuoK%ey}}itx(- zLl>7C*{%D)+SQ##xk`ILwZaj3&I8h6!+!k<$M0vtj_T)}f=~8>kac4>`EeQ04}ckY zrz>nIrRg%KYd3hdwBnJgmAbC09L+k1YCaumD2l|ivX{*O|Q>PR_)W2yM~==jKQ zGmhV?O=KLuJuEVgB{j^K;2_5;-5|%SRWQkCqZ3$|cuh&Swu3X5&ox?F*9K^-a@VfB zvxB2l)w1;!4sg561{=pF^`Tkcz1CYz?BUUihWkCj9l(is713mp9XRVpy{W2ofx5MVw=i5LpmuZ9-dl_Ff(2X$MPEV;5c8U<0N2 zO?`6J5uo3_#Tf5u_ORr^jJPtdt>IpmliQwDu!X73*VP|!u^x1&AJKYMcUx#^dsptJ za)gR2TamlxT0{B1wT>O!s)ESe(JH5<_As&D+Gv}4cCgX#b-7KUw!n`lbD_dpM+m>y zCGf~tTiD>yqDtA3_E2rX&R*K5*6?utj+-@a*}$CZ1M65u*g~T%hckQ!*g?ykZh`5i zY#_Je%*bUj3AnyF(sE;M8yN4DR_;UvN6=S|I9!18wO=lcwQ#Y8)%{Wj**0?mGhb8N zd094KUwT&KS|gpn`m&4DrbSM$$+6|6bMIq8F~WR8r3wxPg1qWw! 
zYmY(~Sg`VJ#qfAL$hW@yb5fc;tm$FgP5-eY)EZXx?fCQd5RkBML%6>!TyB)UrR94Y zFz8vf$Gm~DaBSzRgWU>k!F0f@GLld`xG~{Lk0E|`U>urru0{hln4$Km`Dk(@;GA}y z={4OQI@?z&b>T&0m{fc4j3(1P;Flh6Hy?9$gPu*(=J1Eq@HF$d*5ZO1pwHwhH3kDr zv+HMx~Sqa^4zdbUxI9Yvc$UcU@btw6z3+PFB2gkhh2DkE>@m)FWWU?aBqGDFVXo z)HrO0S&3s*yDROb)Z0k1>v9%?5z!h{Mpr|h0& z3!^-mrDyM}3)R}qJwAG!3tYM2Ic7|@9b9;^Gl&hx=kI;y)nkSWG;iC-ck)m>Xxt{S z|HfGK!`+2@Osl&>weG>?-ZmoO?QJ_Rc^Lxss&|o{`nrJA;-_AN_BVi>aSJxnj6Ixf zG|WVAumc>5UX)#`O9N;XP`au%%^Ap=DT@xo5MZd8rQKui1Pu(fuAceY1=>~|PnMqP z3~qz_K2%?IfsQlG8+RR0A6y1gWnU$@LifcZhuq#rfYB?n7fDB)A%5!fbyFxQklS;s zmM0vc;V%s*U&wTTm-Q}WR>n6DNAq?(e*Mx0o}6lRB9F3$xLmT0va$;t?^`+Xxq~(I zXw9n5y>x(awmUYid|(SMSxvKRNNquRY3p`vs55lkJz@7NcSo4>)9m=!5mKmxg2pPHrTH-N)a4osSv?+lSw4|f}damXxmad(r&F3|mA zss16e379{9N{oL87ijU2Oz_($h5N^<+&|+{AAC&bUbdTN4PFh-uj?>G2BwdE2S_{| zKt z=?FQw?76Zr4sdJqs2%0kIl|Ap_rZwz4Zg)Rio>IoL2*O>VN8SgI^jFl;bV?0NwRXoM2NrhCYlGDO5Lc!50Ql^o~ z305vA#q)52zEaMrH8LqdYor=hIFICcm6W6@ysZ-N%?QURMnfuO*a|g^M>K?ROrxOX zJjatVHNnt?N}Lo-D%HZcl}bUXXc?;z+lTXj!4BnBO6)(; zt|*mU#^7QQv|X8!P>Scta41x&Vr2LXg5%Vp?s;BD%LtB_votE75al2h3XMW0S850u z&&U+wc`A~|^W->l9c>t!DCm@vQlyHbNjWK3s0o?azvx0}+ANyBl98c_i07#^N*S(8 zK|_<#Xjr;yS-iUpC%&U^l7y~*@dTxmQE3zk7Uj^%rY4jcmQ%{mF44NgXUNnXnt>b@ z#?$x@v3=;MDq8RxshVSyy8b{bQ&BXpW@xpNlFM~^Rw%HoXz)D8QH)xAF2(YcoR(>L zsglFV{G$F8Y8uhAwGjAIF_fCEX{Em4JuGL4_E7`RceyLWjN@XMaNiF3NPc7tV*t+8Bu>E zCFPYGHAxd{8LL$2Y>ku?oKlKrf!C9SC>x_7Wx|R9Dal~4;B>l2D^bfeDprQ-K{FJd z%c$kJDnVGVpk~kqMA^_xI2B7%3UnDY8me#}D`!aTa0Q{_*wH)GN=~8RIq|vZ(GP10gP+>J19PX#2azZ>0mutwRQbNw+Ni?kz+sCO@QeH-& zm8p4|f)jmOMJjk1b_M#4f}wT#)8K^+p(Y8G15I7jvz*5M$N0lYSyDlhq8tPolT@iu zkf<6Jp|ex963qz>gJpz>sTQA$;fv>KDbLb^Vl?9Y&~0$(S5RZ`* zqpn8EvPJQQQsFf!3?JC@8l9h0EGl1vd6DAOyi#XtQVrUpj8tMEljDLI@wr0BFf57I zhPFi)=@~^p_r)MckH*Ge@Cl}t@>v{He2Sf@W3b~n-m>?R63R3=_4RSS88p*V(@k~|~!FJ>$Z zFj91DDTQ_-wiy`&$Fd4S#wj!yLv_BTVrYiv6g=h(ra2P?wYY8bY zh>FIdkzkY6jv5v&|(EI7W<7-$)p&*1w)gfUPOPBV(wL-_hLThF`bL&u~JTo zzJwOT5R8fuxsRMB} zDaGUEEXF6Do)v2JUnB#_Kal+Bbbx7E#c34iLp&~H-xg zrc|+{s)*OAS)>#~Tod?!C>w!spJCBao@oQ4+r2(4YA#ONu< z(4*vZwnhqJ7oC*VAf3cSC)j}oBf3&p7>a8~RWz;3DVRM-R502;>X9t64IV=5(Sm$ zA5u&V7&HlN2mVgy=NN7njHNWvX_8||@jRIVYX?@&UKVq8L_#s!Sx{YVx`7)8lc!kS}VcP%Rf7zQf~2D7EE zK0>8Ql~@I#wPF&|-H*~BLBRFPc&$Kf#QRAx6U#Z|K0@3;E+RZb$V*BL+~|rt8l2c> zg+|3_G+0BaRV0bAMmUdWNg>#&keVPl)^R;_J0YR4tVT++N}ZoeF=|LzWKGybSnG*; z!F8G%8j~;D4T0Jb+rp9>xeDz=hTIct1o1p2g-)nI70?Pf<^i$KX{=*7jPydigkC4= zfWw%K)SbZKf`M08zp-d$B!T{{LcOYV+)>IgY9w(=8OPy+b<>J8sek~q|um2#QO>1TSg&&!`vdrh%Q`Y*U~~q%COMFOpDqT+o#0B2<=LS`4huFC+dn;N-|K)HSJLDV<+3xVBoQVi`G02_>Rve;7lt;z3`>5d`dZ z@qR*nVmT@D5G8VQU92OpT%v{b_ec{NjatVeIEv(0F*YN|((x-adh|CuiN*#qa`71q z)~ys81ybv%vQP09y z0(!m-twTjItf&K0h$FllhgGm>Lw$=n6=JT6k|8;!(NRQup>doA87dx^qHpT#1xX`~ zwTm2sjG7~K&%j`VYQX@Ay2JPmX#QWSy~uf673Yz1diAshh@+pPVSmc3f zWIbqb7}_aO_ZY=d(?S5msDKPecm|RyYzt-w4W=Ge7sIedM2d{XA*sL~(T(R|Uou$0 z%5lpIOc%O1iK$T-p2IqlQlgEB_d|Atp&u zj%g8`N<%4iF%~_X#+ZYWNfipYuAWiIr7E;K9`bO=KoRw^vkTnb&x!SMhbB*5V-S}xFfQO`(Wap(+39#Jm2O2?&6GTn&;}98*$ZEsbGbDz*;?9ty1uzoxIG9T#n-#A`289-jR)dX1U5f3)ID)|xM`clF>{ihZSS5j#GnybJ z=#deRVXcazL0E-iZlKYX#Wv&pCy#N26AkqkcV|+OIWI5%$1?%;z%>EtC?sOQc(?dEh!9- z=*Ac|m>4idV5iA(5nZ&&#v0aQ4QrRxS=im;yZQ+ykUm*nR`?rW zzC^bd%x3I8LrKw}P@e!F?=XqehXcMLK`kWKxM@!!hbse9wVd^bU$trK6XNsXldye~ z#KT<@pU}m@QQ{(rj>p|R(x_O8y;2gDkgB!+aHeNqlMny4PDx0NW~?Rtff5J%C@wZS zHLd|VCPpikcR|`mxX3Z_xZn`2#)Za+jEs_@lnSPSHr?Lw!(E=B2)dRV;V~BX7z(GO z;~hRWK_Q8Yj>q*s_RbQaox1-gr{I1i5_?QGQWxC9M@ocAr47ZO_V$hv&(NY%y@Of> zhB=@lcr00B9hQ)SLbae$m?$pU+Q?8;kdYz26?2Uf4jLJXdJ!)c^kFZs1{Z>KA=F`n zs-c`{NGRIdOHc{AQ`}0NLMlp}Vr1wS64WYK;?+(F 
zy$SK_cYCJ$->5I|C@MCVi;w*4ZSPYxhzesUd^AgG!-c)#GQx#^lVF@LQlbl;kdTbo z(S^a78s&n+hE!^7%J&%D`@bvEVZaYfh)wv*P3@!ZgvWi43OxQs8w%D13yDt}_Tyig z?0@5CDcV#{#}YngOLhMfWAgqRHR#C-suO7A-^d$&Uk3PG0-%bU{fR|>O2S(FjcW8N z3K#+ji~4a@`PpPz6zsyqVE2hWf5mEl`4HUxYzogLU_4G{IEUoaIFZRaOC2Q?l6hY* ziI1@5h_yt=<9Rw<+=Apw99|&k>V{JMk$#QCs~Zw>@syGjz666eeEC&TiK1#H`Hz$< z#UhIGq1~iNe37G~2PPK_i5-#>&KDj0x}*g0y^@LnOv_Qp3Gwdst>R;_42-ugA$|$z zM|^Ocul4gE5%uAgKNTJw&vNNPA7Byc=;p4_xrR{2km5f-CO)C65f3?VLaYpr zW|5~#;!|T|B?(%|$Aj_Mm9e;JN@8zY#3O8V*L}F`;~7Ohmt4d!cx^P&k=P7ddq+pZ zNZe>MQ9@%OS)xzY{m=D@hdUt`)&A|yBXwOS7NNS1{O|SN$1#Bty|?lId+*DxM<_q~ zp6XY=7m^xB$0E-XocN=s{iW{r*$!O$KiGll@AN)G_Obtd$k_SxyOQ~ToXf*FCMqBy z`9m5{!O!v0SRI7N;?b|?vm!e3J^4&1>~Zfme1*xe#iJx8^0=f#*g5g{2;QP-thI=v ziMjmG688&|ZTNqG{34Yp>5o5ZJU#za%Pf8y@P{boC;VFXG9ZLY&WL9|I3`kU;p5j$ z^!piR2@@^hd!v4Q3snejQ@&OvX&OrZD2e`24euXQ!zxhu-`6=M?5||-H~a^X&Og}# zv1HKM-M32yLi3Td{HV6-_92A)`7cdBjJSM!dn$~%;7e6&(popx^6{1F*CVB(C`Nqo zjp@fQBJ!J}w>jb)MBPYWae?e^pAb(cP+ArT0bIIpvG@fsoJ%65P04Ucle;QPR@1rx z1F?vACrfxhi3n`?U(%oKpQb;v&!m+(|P@MndP5~f-*+5MMXU0N*WhuI~6mYhW|#C2ir&kL2( z68b9{Dj$C^m#?|{A1RSPB${GgUOcoPnUD|}%L%1yJjbLI4+QIuh8G?Ev$R=4>XKGt z_;Fs+*TnuqWk~Vdr(&_i;r`+|I=TeQP*}y|J8(8!>}Kv86^ZQo^Q@4Hb+M_0C6ukfFrPdd{yx6b5 zIVkjdg#L_C;~c!04|L@(m}H1^5o0)EQdewBbYg5Yr*)QmWX>XO{ylGfNqHE70wXad zzg~ZrJf%b~`06CH?;5uCa3^)cw!aa~1LrS(90PYIOTJVWJ{A9WMil!37@%R^W#P8H`3872oh;v^TQ#WZIk4>Ob{$c@deh~2UOp*AHdNMjbK^vZigb-&!{?=uR3e9hl#6RAZ&SZ(S z=S=>kG2&MNL7aN}M`Ve(oGu=U z`_5E5@+hLD@gbpzyz|#XYJc|}ijN7bg!}*K)`%LH ztG-VBN_5Kacn`S#f6yBLz(CHgjrK>GeDL)u^*_qbU*zq`NJ{XmFH#Uim=H<6@gC_{ zMw=K*rSJ*bICp!DE82u;w&XOllGN)*f5|I|{*9Wvgy`QFME}U7iyzf)f8=@-JLdai zZHdyLIGtd<`J+#h{PWH8$Dbzoqrda~^ZVp?Oq2X4Otkx7>_AfbT|+i*?j`5B{-{C! zpFpy|QSu2%yTrWu?=1O#c)^K=@^1-F{1L%_=rsR(g#LV*{|EGIah?9985rar>enB0 z_NXAsx9W`#;+E*ql7+=TV-grjD*sk_@hwUJ84H;NNq=8x`ggqP6XgEJG(W-I$&X_~ z-qT-K`l=ZA;t zwoS%;nsD#a4+nJ{14|r4lJDgc*2c#t@ZJrinbPRC(-SUCH(GRr&0Vy zO8j!FuV}RBguhIqA^$cT#ZCP2OF6%y(V`Q+u2J8xwywAi`;Xe`zfP<8ov5hRuV}UC zgs*GWC#-3!&?aGjRI5S%He1E-wZSh4`HEJHPWZZ3z1q9`{!x{-{#U6~A$7&?fWU9` z`;tmOp70HoM#l@QRR5C@@UPP>e)l4Nwc1xSTXe!-uGxQAa7B~E@7YWXgs*E=ieFLr zA86I?LbCzCs3lq zl0+p56aE=3agw+Y5}yG4U9`j}u!&egMT=LYhR5M9!s0&@6STs1xJ5gMTl?ZJyAs^i zUmMBcp2kdcY;=n7)f-|}Wn}oJqAE-n$W=5XWv=*TRz<6!OO#Z?ONJ7^HZsH_%E-{c zIvT%|CR7p_6ebC56%Zf^3XxdrTK36t$D;GC`J(n^e+Z5v{(?&X{4cP%pZ)^x_pj|arM=_8k3Hw~clVHWiu6-m|1f5Riuv=+ z=xVK_GtA3zpgl+lr&FF9FIiew{!ME+M`1N+5%Ojusx>&&fn4uu+aQxTG zhL-Su?OhLS6W1BPbN*o3bPc6nc=j60NF=qKS2=s@Pg}`@VN~ z{)5lX)LBl3d`j$l?|bk2-uu1Z`@VN)`(44Oe+v6DrvGAH)md=;i{vvK;K9P#$tR2R zM!uVFd7J>!20&V}11XD(KxVsOQ-HJ}<+8g&MZEAp7QlzXFEsGmSO$3FxEiz4#CZb* zu1FW{#VC*jenAPCm<{mo^X!~J5>n(dCCZ$P{rA$oYVTu$&7>MO*}OULcW?S$rfg(jtqC zK+Xpwgymd75^*uuxLUG)@JlWcfpS))T$W&yB3^h#7UVvH8TP?28BFT8`UD6l331*4 z!OsY9+P=X(^hJDYS7?9-I9u%olTFnnn-k(i;YkWW66i53F-?Yg{E9;6y^b{T|Jwve zYka}_R+tZw7N&1-1GFHVw7?I*1%ptZWs;_d^T)}>eRCH!w*gEn28%`YD_iiBv7|U} znB0svcO@Y0xq+0$MIdu5$_bDrWMr1e0gP0{MIh$|5-4X?{oIC3ZIdi60+|Cya|3K7 z3O=&lsOlpvsp2A#^8#tl4J3*e#n)OF@#O|#Bsf8WROSklFj5g0895&#!3pLB62vBO zp+z9)15yA+B2rlxsfdd}&I=?`PW7AaWNHhtc;SI8z*QURu~{(KO!ZlrMa20G8h9&h zQTu<`wgeo!y)FwTDdL4E$)X3+4nSsE+98m#xCmrUw*^RngUJGh|mp2ieXVV+*HBDj@&EmGB zdDwOoHpjSFN^8&2l$q9)l$Mn*(JsB^){5Icb9?2oJC@(cuq!y-%DeP;fA(`#pRfAD z7w=icf9c-)zFd8OjiJ_Pf_1LNT2~KOb`1|axO&an#y;V$0UaZBx@*c0hp{ zZvbAu>J5DD>)-gMH?Z?tk372T+q<`XXU}){es9Y@5IweK>-V<}{JbJ` zPaHhNROqZ!d8Ljk+ddJD?O-VOVk!I1Z_4nWKYX%`r;2ogeQQ&GDp$B&ZYhc>WsHn* z1DWL1MRw=+~m8$R#4AxxLP$jhHO&Scy!(R~##>b+g@!+PR znV`giGc^D1qmO6f8$1ir038P+p{YeO2<+Z5m(lmY`&0B5R7$xYbX>7 z>bOc@z0Vr**4r&sv%k({GNu$~nxZ2k@X+&3g3!TN>XUsgkE!Lih5q 
zSTtZX)dVMM{7^LJjntTQ+%n8HLH0zv2KF2quQ8?Q*KAy`LL! zK?P}lEIKhBj0JRDS~8`WcF9M>&%|O|*8-rg{`wySumAIvwThV5aWsYhm~pXGf>>ec zPN;C?J>%b(zPg!In1iBZgwFoz{lOsDF67)ir;_q zr(dMY-?gkccIkz?QU3CM-B11X-;p$VCG2Tuv<0$sX<9zt zCor?h`C%(_R5`y^&veN}-nBC`^69D7$(pl`%tLba5eG9ZpI#ke4k~ueZD$(f?29{T z<_(RU_~RVSyj&_LPMt|f?0Y9AvG<&Yc~Q}I?NKdreyLpNcL%i0q*DKomN`LM4bJT2 zJJ(fsW3lMeK-@nV9Pu(dxVTjmSBiVAMfNSzKL`kbo2po);k3R9*o%C;t$7u-L9b*k zf!&^Q-lpc21V<-R(^_;&P0>IwHM4vqI6fFnt5+6{h10cHM#l$Wdi#R|aqq}jB$!sb zG!~5{RNev&yj|Ire^U@1Cl!cG%`OW<|6+=1ET6T-hy({D?Ym3Z-6i}727X_S-ksszxr`V9s^^I6U5B;IHi|!DPO4UzBx$}*v)5^U zpTIwB(z}yEfK!hXp!XEf`!?wHQ2a}ENxjHMN#{Wgdr-q4w(`dJy^s9H??o{2ObMs-$@%X_7|d)N201127^tMk_8Oq=DW8 z$8oUt{+Nu&-x`y8GiIM;VlU0^rTN)L{_VATw-oH;m{3TKEPK$==u@TgrA?gl4tz|kXe%F|uxkX-o-8k1J&DX2h>(%_#A%0&o z&|L|lTq3gCRPw;eMS{R(umW^{B$jNn# z|M{bIvTfP@i=^ptC40G&2M^DmjX^V&(4AOy?quiadsVW*=!q zW|fr&r&X38#B63Zm_03ZS2BD4a*Ek(4py=`TgA>+@emyO)928pvU=gP%94iI%*+Ox zuS;G13F+!@U(+Pnkm|N2JEsEdsQ?dgm)~=R#`$)`>W9l%B7uRcQm4QGc)*t)(TKQegcMU-s?bEa`D$6$EcfmRjGb0$Q}hV z9J~Kn;)X8$D1{g*mu9C9Y56w~YSk{!*wduF7eJAXVn6(32`mzLEuP$o%UP~Mbh$S6 zHApKy?k~62;$fO<5k;{Z{_$8Zic?}!SZv2Eco#(n$%iDQMX?Q2kfppsq62c8<4{f9 zF2cEUsu?m{Fx`sjjbW(PWD`NxAXaO|wAhbnvk0c!!=TR5gDq&mbSGLMmc;;`f;Kp? zjbglOrnYvPG!>2F;c#?=A^C`+e5xO_XME<~9sNQN=o_}x*4i*rYBZj7xB2l68UlKt zUsIAu0YNq=nO@f(@tYl!zR}J|j}WQzNQL@qL$$T7s02Goj+R!bsU*>catD@O*J|>( z+pU<}-DS?%{tlE+#i68QGSF%_`bS!aJ-zlZkLZY_J^}(!irQM5NxCCe4gG^O84{?9 zU2}~2%!95#U$4p2<1&FBqqomK(IfN@`vmL5{;8&DR(+xmNRg^9s_5(XIa?=uBasP@ zQB9{P){*`}aZv=|$>9*E@!jD>s=*14Jsv)Mfi_uO?!idExtFxhwGPJ2?HYqNZuazb zPWnb%Gh)*)_K%=0;SCkLJ4kwP5|kqR$Wu6+nCpzd!!n_++hGbgds_nT_Q3$;`68W> ze!)KBS%=hdL~EgL_VIJ{qntqP?A_rXvF||rfOj8~zd0;+A#dPlljjHF*E-C>qq~Eo z;mnDRbA<7o97_M8z9@Rs`rA2!{U!J%_|)B_@EsUS#(4lfaMJA@H;!3eS#~3=EMz~L6XDXdLk##dHSpZ%>UfwJBXYN z9q?n^*Vj7X!NG;W2fQ|lqD>y+u;hebFyT2Z4C?fL$(*2{o!(O4+xT#0N z7xeYW%5LNjPFmU3;aLOLqS76hqwaZ46aniK^aRYU}(sn`oxu-CKq zE}p&je)ir){hnmO(-Y5o-}Ae>|GnNV{x&m{$;@w_Hj`wwOB+AZOwVkq06$)48G2?~ zwY0Qe?G2v4XK9gzN$}KTdl8RM|L|cSVxK;P>`D<=-&x~x0P%3m^kF|EPA=;bbP%z1 z{aRf)G&F0`i^KU#ggpB9aqc#6d6C75yRS#iN8FVti7Y`B#ctuTzuBQ!?)XKna~y}0 z-517lII2~*z`XaU+4CV?%>o@OPDiH_1ph8uM{;U1-S;Q+M$t*RZ#CPG7mTQVEvaFmNwMHyS zKHOf0n4ats1&FEV4JNxH5C6yNu3(~mv1hzXdI-SzXEZt&yGzi5gpei-dTlcc|2!<)$~TIOsmvRHS24Ubisu`=#B^1x;vjNFg-tZ&2F9Lje(ZQ-z5&Pm4uh&BU9bmXw(l9By5JT)MVK8OhS zqXRi?IHy%rk6`V3f=jnPDPX4F3;}3@-cx-n| zdWJjRJT;HUiEW50+_7@Zn%H+$i!I!K%G|A0t=K|!sL^!}>nyGEgu@L-s|4sF+CP!U zawtjg9L8a{L3J-yLG;MFXJCl8pt#i8o9Xf=nvM#*dW^dr4D zaTpsKRLWtsT9P9iHg&d>qkHn#0KIHk8M_i9lRc ze}hFNVvyY6p%mLWJDa)biTFO`&BZ`Ovkfy&DiIYuKO7v0c;i*=J&A}5$K`IxM)bOS ze$5m_(}7;gN)V~Tz2~n%l*WxonAEMvq8;=IpNAL{v5H-dm~9%}cL!oE2g{%%i0xOn z`d>oq6FSrLKH{(%HQ_Dd2g6W_QTHMXot#M00&%hJ{k$PLsQ1$Y@-h$ucC=5MfT$Rq zH?$bBahiL~3dC%YB5EsQ^Tmxq4AZO05ciCEpHSM+s67kTb>B~xs1<>1DB0}+pQk64w6=yS$@Q8uFO``e4AAU506 zuDAs8b@wxq)^NvT60>(9HW^=(b__8sxPf&)cTDyBB=3teZWjI3%&^*g~e&y1I9g+q|*UTE;)Q zb!NxBaWf!E$EdFR{G3^C8MPdG?3|xlGVxQUp={f zZqL#gTN1T&jq4dAQ(9(vR$?s!Q!7io*wAp+Ov^k&&%DqUKekr|wp#TpD3$qFB~c^qBF&+6|_REmeW#^Sg4ecB*CSz1S?d?X^})q3Y8QssflT!5rgDmp(=qQ zPC0;$6?AaNIxNjBKOJnY(t6!$eo@lW^tTtm`ZDyib@pm&?bX(f*D?Dabo2$u});|@vWIwWMix27ld|Up&|WZJ4C1>zt*topOv!I=g+RRWd`AFY`;jA zhQjUI=G(PxyKwFLN)6bsncDdAGBf0Y_w z*3nVveq>)iGCHI}Z%^x(pVqP6%IobPsuX){V`lT|P^jf$F{~b5E;c%j{n`lVahaC- zd|6G)wA1ypbbb{H)5zFh)C?sX93u~p3S-NMV7j*XbZuK1Hw2$of$b2O;fz;@=e-)% z$y)P>-l~0C$nvuzO{-9=KJug&9T`?Gp{umbS83a};+pZ2G)X`HtOox}LGoMX$OopSyK{=hrjfRu?_v&A0k_I_5 z-M2R1odgY&s6w9yv*2;yh@SJW&4K5$kHimelLBq)y{aaiJQ^Mkr{8aNOog_yO1F%5 z%7DlflAbGLN5jLg;5j|Er@^5aRhtdY&VrXdKM!!NH3oY1&wBVEZUO{vJ~G84DHSq1 
z6g@9HodE&+@68zxfa{a69YP5p@o%^qR0Mvt=PfOl{l1Vcp1e2M3o(eA8jox~BWLzZ(lvt~V0jDoKL+(H-Mo2r_|QW|J#8 znF`0dJZ^ZxE(MEA(qQIHU(@HeMuU4N;{n&hQsCX* z#F__El3`O$=-y+Ub79{yH^tn``LNgc=KGV`X|P~;^QZdeX)vYcjnn;#(%@mqn~(`p zlOS!<&(`N&i~?T+@6?(hM z_QYO+aa+&vGvKfhPn!YCCit}Gw#e<>3N2|i?Re3J#oG9S&) z&Ci5gedsa${8E6pTzuznhb)Nu;D0EIFT!K`4+Ykgu2w5ng_(&DU9pfe_GuB!8R2wSk~rkh;> z^znXj_n0~l8djN+cVkUH_&7B0?RYdFUh1v0)eoKkGk3S}d9iv7SUzuRFI~F;bfe?I zepUwb$(R^vr~f-Hcu)#HE8vgPHd{rQ>1AsP@Zl_>X}BQMam($$bO9_op{VVuJreZx3eU?{PK4K4&l?eeV<7ZmtHR9<(xH7&$)sEQ z6G12a*_-Q-1=49Y<6d7&hgRq0*2X#$;MLJXj`!b;0oRzSlkas)hfb^8PLro6!SlB- zdbyP)!!M6&>}ujL2E6Rx=Z*4o7 z!twF%n%Vcv06|&lhSLp3LDo;!eYSMSho2lw?#;WE1kH?yGLQL5kP`eMrSSYPc=}V# z!!Oz=LAQE~A0*!$2EF&sQOYMK!u<5)Q)N^ttg99&+xcV!c*H49>`YRj^OTVJ&e91G zaw|F0@JJ>^+peLF9%ewT5e*hEQBQyX)0pLJ?=FEoBT|3a^3xQUc~<(arS2@q{b|mm z>uy)UretK^zLIQs_w$LF2_0v`rLfLZPe~FX>v*$k^EM5K4zqR^8kU`es}SJRICVal zn7;INb=(hoyZT0sB&R^g%f`E}ho^vf{V1nb)n-6!jbm@?oksbOsk(LL+wrjaopF7Y zT{<-I^IqiAemrCj^eCHgI1@6ePOmfQb_Trac)m))v=?-GQ8eh-gLJ5^ zJ5KDLJ09HM_8#J}C<~msw11F2BnMPk3q~#Pm;;T9+s()ojt2SlYU8hWn*zfHKXnXM zWCr=MvqHiw9;x-O&r$j#g5JLUldeM}!Bv&Ehr0;<5~n2>pe*K}Z{yyS(r ziVpm8=Z?7gLv2W3-~H#bC~$>!B~xP8`;=Q8~%{r1Go zek9xIQM@m#?<3j5SFP_YOL-!DDvQpYWcpmz?U7nH?0Rk3G%vi-WTOYNCX*#IEIgmf ztoD1=ei!>nR#sSLVsn#cvK|dyK77*ivFuuJtp#1izL3qZJKbm8il?$Crb}0bXFrkY zb=`XEUf@I7p~a(F;?Q&1w7RozM@@SsBd>Lwuw(Qi*_de3PqpT$Y}(rDX|>lsm7ULG zTD8P}M}-F--Y9-53m}hPIs4O7+43Z(*;M#l+3bihO&`mj$ku5kojz}NU$#_8!`r&g zWc?P*TDLX+xvXB$hG{EKJ(h(m$&4--_E6R-A-Bbr)AwbLc{F$_p31h3aFed?{#1s6 z%6UEfd>N`3R*15tVZ}u?AkO;mBw^4QSG2!B=?8KH{N#P`!}&8L5y zmw!n&Q|S<3h<896KNTUc07s>Krfn0z#>9q3gtX-+-Zp}W$XJ0oGA=?T!!kJI5wOwG zkVtp&fCDG5|;j1rS0A_XdWtXv_FVFl4F2COl$7!4bk)iJBXOG%FIr&He? zU3=M|tJ<<+q2Q~{$JX=nH|;uFs$U~^eZDsuCX@8clk{xwanlINU(+HSh9oQec$py| z+zP_2+hd~=A4W)Fm8wDR%xdkxJX}Ae^)NqJ_+;Iku(2Xo zylLa(LmN860;X$T&8{th&>yGRWvPP1%Oef*TepV2Yx)jMOZJ7+1-o-@*a(50MozL6 zcL1xqJ<1wx1-RDzz5n?UF0i@Z>Wi`Atzo^*Qloi$F-$6Qx{%2R?^{FXftO3Qp1H%6QPxHibzEVci%IqK zh3%mJz6et1cyp+lC+OO~l!W97?S{{7=ntJIt*E-bJq-&@x?j0Bt~0#eaZUL`t2eZz zTs!z4kb?7N7Xxy$ADkO+GRxD90If5`ill2IXgNG`(zBS3uxZt%I~yBxfd}uNwW^vX zh7JSTI6jtkhGlO|?CU*s0iR79XJ%}40Z+0ltySmZwL;L@r7pgQg`J2}h+ZcNO* zG^ErM40F2FGc;=ho_i&ZckYRR?i#7%w%G-|mpR4Qn6`xT&)*gFZrc*x#`mb{Ugi!V zeH-m#joN{9{D!rGy+~;EGCFqZd?Dz?)l8UY!axwNTjlK%!L!IN_AQ!vLY8ymq+EY{ zuxhco7t@l02ma5WKRxFTTayL{)(Mw@+tLJ4_XC~biQ0DPt49HFF}D3FukO3-{#Ut zNDZ%BFky8RRO{os&8dA9tQ_C0Z=;>DuxI|+%u~}Gf+%^asU~ zvylfRec@JF&e#j3-tc;Y;=E_)Nbt<-biQ|j8Xk(9p78$Q1NLK�EBrgt6A_=#x96$AG+0*iX=S3P=?(*7*g30$W0nKqriJ~?&9i-B`A;?~YDIsrw;DD1?!vwx z-{;n0dw*}pwX$e-CVv2AzcBncSnLUx{3LH5YPmvnHfj60C9cpft$82O@D8xM%9f)C zqg+9@tJMSj#T`NG__QtSWdZO@>FLdP+O`MFK>f#!7WzRC9jilUy#_*{QL+sUECj8% zaN9MsE8O$$*`Y(7FnFO#^y}O%0+#F;wRc7zA1KNbx6ii@2UwFcM83B#oW3|in&IFE zcAE>u`+N6;uBtNwURip=CU@E5!CO3G>8<*03_i32W723r;8}m@K08}KDZ>XoocnOT zxOZomvZJ=wu098#r8)j^d2^3u3F`)ctKq%gd-DR}?(3)1-t_Aa zwS7*`Idq1F!XwZ7O?%e`ZpE5x@x2lOMpvKj7VTET?2x3_2TQuZihVDfCvE5t!F2l4 z!ZS+P@vg|c?)@&XVWr~bbhl7=={*1Dyf|N|yP)Zw>+`~)YW z71OALvgee7>mmkldOvNWtP|GZDc^<+b#mbQ+rp1wg9LnGs= z)vDJpF|Apvb{#Ww3rnlI*7fQ)Xeel8)3`}fTRVG)X3ZU)oLdkiB@~GzG$VCs*{Zb+ zT-&sDbN6WH*}g-^PF|h8eY*Ji`3D3Bb?w%@N6%ip`vmuuE0ii$9nvqf|A4UYh{&jc z(J`@cg9gVZ3>i9X_=v=$5`HGdRR{ykS?Yi|FHg4LyW$U)>J9h5cy~nH~Sh5O@jK?+f zC|pT%uPkgTu5Vsk-*&bYI(y4Xm@$vHuOyYfq4BxLUcEj+w`_Gb?Cap=HfePinCd$B zVVxJ9VZ*(iC-e4phOTkz#PcTmfn@)O^br?)Vcv7w+0uem;PJ>VLB7BR0!j-mcIin& zVZp_V`bz_0&dymo?wWUk-FEGAYrpb`vqPhI=S*@3qigR5+K04+Cyh+sllOe#xq(1r zG(rZZ=LdERDCz{ute@sw?GOYvR_O;#KOF#IX-dT|385?F51@eHNb<1;K%8=8N3k`-0i(I}Y}5J43H! 
z_k=^`zEH(&q1NHHz7RksD!SYVgw5IAk}qU?!+_OY(_EA;;5D@Nxv`sk;K}yteJ{`P zht>=B%o=##9}4$Bd@fqz11l`XbZgSM3+TmDFZ&+(sm)$MR*yaa~XT;<-(e;MiOzW3)X?HkqVoSPVau0a) zHl&~Gj4zP4E@X^$_Jw5yyWWhm2n5sqwcfW5?gCazY9y^)*cFoA7p`g!hA z#tW7{&e}fBt{VhQfa`~D^oEUtB8FKX?E<4GnsiHh%W6ZS*NC;^`o5|+D7?<=V`araruFexWV~Z zq{tsK+nsOOXpk?|%kE-F-U|Xls?cK?%Kz4ePHTo?`Q0Zxoo6iV0@M0u%OaoS8fwYT zvukR~VPL<@LSOwrh)uLks?{bM+TAR&Io>x7oZcm5rs=0bs^htPzm$#y?UUPHXEYuO zXLYm=nWSgJ+#YVdTet;7%l4O_-ab?eqf_e~pY=;uupa0@C}I}?kz8kX$lSgl2{c+X zJKrB7GbdHwejCZp#h*s&t0N z4NLDgaT@~5O5fDHI#3Rq7H7SDdDahJ4Yh08dSe_Utfu{ko$3j8M!G{czxIN8Ez+mi ztRD!yT#5YpO_fmk%D}_DeIN|Zvg&m7iW^u?ZX+Jk5+m+$7VqRE`ohFh;?CDmZjRaS zyQH`?6dXyt*3H)RhxxFd`B(q8x9`?$yg4lh`p?02soELk3VY^J9Gdf%}Ny|=n9Xx1c}f=-BGC$^ys zs)o9)s%8gJl7G3>E5!vyI!!X(W-QQSQziR*nEPgE1Z9; zWuDc&DNqmAWTubV54rOK=2ZEm2=?{rKc(*ZJ@Dpf&x;dc$3n|z?K=>@?%?gXkPdt_YJV{FNw3}g&O9GDc|3Zu#imd& zIx@d+SK}bKxpplpvl$4@e=?bTX0rmSZ8kj8>!>e$IILRsf{BJkes}g;-;D>ULG6C= zAJ9G;?wGjkKps4vBG_nPRt%X@OXn6~c`K`Mfxsvlk{bu5%i1X+Ms&1&mAzdcp z{x9~}*66W4;#p-Jx5V*npZz5_oX`F;wGy8_d{<^l3nm>d4?JBzaOVg}vduVG-(fhk z+?r77TfR>!M%;BdIJZeMAe znenhUazwKI@)Ve9V03or`vlNCab`~4<%2;weC*s6=aa#>XiiAl+hK5X=@_j9NCCYJ zx78+A$#BH3>h{rd^Wk*odL#3@41>}^&-2WdCd2(SY2>jgcz+|@UU<3daJYWDW9J@` zDKL7-1iK4q!@;f5t_2z4iLhc|yS(S;l3-@*15>Pi8VVDedCLrU4hNh1?^c?4CPRB! z(AL>+hQpH=wa;EDMZR{G<+Sw~0R=yeu)df&1V%OM)OSU60*ra*-80fO6(+TCZw(W%0l?zcjj5k0H=^?dhycmvkTl zW_#Rp90sdanVZhKo&v5C-Igit@vv=9Vq@j9creYdZFRY2Dp-D4Tk^DJ4osulTz)it zI2gvh5EkSPhwV4a^E$2^1~YwDnQlBi2wE-p(CM%o?@xA%Kl!lb2xz{mj@H1PL!o5(pPURhsHyqS4yP!vXO9q5Y=|$kwH-J z)P*)<&kcsuQt|i;!&6{xglgWME5q>t%h_!Rb~prOS!+cG4Fgp{C!o!hI zhOF$h$*Y2sVBPyxxn1^VfrU#>arWJzU|B06Cua5#ID4gf%X-ffpm4y#mNjjLLLYzC z;bpgm!oXujS8q*C0mtHlVGs3EA>FmKor6^(JdSBKDfd)1SctAQw#56qik=hScZy1c zjfx(<6f4KTXN>??f%~I9 z2H8+3PB4G{AsMJ8ou4i7-85=<0}egW=Ar*z2l|gCViw(@DcRr9hh7Q-|@*lEAIWmO!KP z`7pOzqb6CFDe&T4*v-h@$k!?cH@?+*wz{(1ON z7<*{a<7&rZVUqVC2XfI+*zerM;OK@d7(L2A+xd7h^gdpl-C~=F?WOkS4a$by!6PQm z$V>$jrH++CmI7}_E!%uHIUA1Lp6u~KG8(S#Jh?s=@4M0&fyRyQWW)W8l*z|-#lVf2 zdvSR~;-If@q~-XqL6G7qoxH(w2-FE!ygF|L%Axfy(n4%&ekE{gKW+>-k=i4t7^c9*8OM9MElY$D(_E*s zI^)3X)yn%1F5`0pN2kx2rb>kMn~KREp4o8P{Jqf{PrUzsX+YxC@Fd8Q2B)1(9SNqP z;$w4HB!StTMO#lT$cNa%wSor1RCra>ZOZc5$xy|4)XuX5#zCR=p_;CGBcY+rhHirV zR5)Wjq;Hie1t8-MXBxC=|GZHllLQls^4?7oje(<{*Y^CpI0d?On9zMrR1(a#be}k^ zXBG@@l67+3+GN<~vbmetXrC5)tJh2dZcI0tU%II|1cvsc)cYEMX`DK+?};W* zGpOJ8CI=kh;g;KX)pQdSMV)Oq`6N)MY8Gwtv4e}VqIXS%y>Vfu=9r?1)&aNH|KN59w^ z`kL#aRVLA-|Gd24B1~T!@@nu0C%E`zT9y2bj!^YR^wa01jUm60U5CChN0{DR7~ZKn z0qgJCUf-@m!p#*CCTYD07?~bEZ&XKTIC*!jvq;}QgfJj>vh1MOgqSab9it^b6a?` z=R$j%*9{>w{$=BP3k5*A93QeexH;@38~oxNWeY1Jl&N<%H-n{#RHHUtPS9hb!KSqa zw$Ol0x;5Jud9TvKU__KXWXEN!Ng?e)E9gbLxO8V&Gx&5BU2`F9Ol`f(-naz_yC1Jn zuW@r|mA&ZN_`!CtvP<*C!=_E3Q-2?|$qQ%5SuVf-q=_^1at?J=HEj-!3#xZ*SKI=2 zyeSzlX@nz~_Ovjr+uR;*-Z@ejcC7`p@@(rgVSQ65n%=A1nJ`DNyYP#N{x39WGn?D? zJZ%SU{PN6pXS+i7M7KQK1I}=wtnHaOU7Ns?s=n4e8xv6G*8Cz{t1F~Wc##;v+JLpK zpEpc*fVr`2BP|o`;6UI##i9ev;N0P`6U&T6V7;LBb^9$suq#tH4*Iz<^ou@l_95PJ z5jiYqd@qTB{8c06Vfjwr`ux%@9lUX{JNL}(8M%(I>4;zCK5qtUK6o*^Wp5`)E={y~ zb(RGAmP{?SXA8LbTI#UHw16F9P$f`ZhlJ(rxnAcUw1A?)566r7hS zUR($#;kf?6rC}}vEKnHX%!+_lQ^OB#9!kOPQ*JY7csql2%ZY{c3n}RSRI${eIsq%% zn%QNc{+#pNqp!1zgdQK7d-v+&3|CD-dRyBOy7z8$}>U(bewerKca%};Fs!|qKG?jeORse|>EeM1RY+%@rmM_*^~A_T>)oSdP_(#$%I zhqi#c_YRhD+!c&0wjb(>)6R~=cMm%K)ETndDg^V?B&=h{g&ua7fPT1b|KKtbj!dt! 
z@Z4n*&NkaSHPDEFU=v@DCiMu|Rc-Y{pAi%s6Fqu3zoigb7Pze|J}QG*PZAd;j%@=i zC;7$fzS|bo3RuI&)7rq`$bJJSymW=0wn2g32im~8h-dN7rnG~@1)d`&I=2A_a&J!D zK@UhCW@J{c4*|vFdepzCShuR(7PQk2kiXD7Luu>Zw7a50kunx zy^TIifF$x!?btA9&|iGUY0x?Xo_M9-TCtmegCWzMt;GbqR>ZvD5bq2PBMn@821;Pb z;h!7&%qC#j*cyiOEGW3C?-SHveGAY_a^HSgLBi{OIxYc6Nw}ijNV5D9ma9Ib#seE=^3xB|@4|$fc}U%qVCsO+r$v zil9ifl%Ujr4SJ+8oMQCNrvRpScR0LM5Kb2Qc{J4;<-{OB}$Q+B^64B zrU@Zm9}S}Mf~#H>;-r6hbgf>f#qhE_>gl|qGYb8u}IBGE!p zN)vLiQbaI3FLFvOrIkWT$Vy4IT+Z`CGDx68BoQgZN;RS3PsGql?6XR!lF(utX|6s7 ziBZc{YKBqcSP6MKPz%*+k(v@pS%OiiG}lm|1W=@kVMsY%T*Il5q(w@CphPMbtE2d_ zq)B90#1J&ek`fie^P-}WVk8)al$A&nBA$B^g_Bar385PAVQKotC|HysLV}~KRHIOH zsp%U_Q%Z$MNHYxH zSCcA8t{jTCL?umLl zxlkykB^sSkD;O!y9)<5rDEMoj(u-9nRA`J8DslO{7$Qn8RH1sX1VIQze0@@sACg9% zSd?dcOM`0*%7I#`l!|c#)U=f4`>T*+kC1K27EyV#qQn~sl z78RXAvqUK6Dnia*LoQY*aJU&-BB8OJd^v0%zGcFSX>=AiqI@|5wU?pQ3N&wGu}Z@M zf>0q93016=rLdpeH8{o-aO`P~ZD$myXA;hsN<=DjCOlU{ z5ksLxP)aFQE@b%rs#TO!MJSM126bJj@jXg3x@xo#A{-D}q&XKYoRa2*2o0}F!nZ|? z`az*AsA!VJ@;uKX5%x>Wh*g|*mS|*%Wl2iGNtFoAG|g)4l?dfo!YPqhgf@%62A0Pm zQcFcRFoZ&5C#3{~hMq+Wi&h!^Bi9e4g{9DJ3q_Prj`oS~8@dyP6s@!f4KX&GOGEQ2 zM4=EN2|^XE*7zBb9L=plOmXtTs(J3wpK?}%Qqod}k!oza9QmLa^wuOgDeN47T?ML# z3JthifqKL6=W-e5hhP#WFBwy;}fK}Z>_mb(TDkVr^j zY(V2UQRVzmQi5bD^hvZxs8sP|Nuq7UU_pe4|0;Q%7NY4SrE-NDLn7=D&x=Y)NTn(j zIs%pX+BL!aUw*hB`TF1jXpm{7-*q)LEkIlyt78eX*H{+Ndg-~vRJQ%7Y4OfDnzG` zCCXT?4GAaclp)ssMm(KTBEa!%Y#jc4JNWMmAaMZ*!nh27h z2(^^w0M%PAMMI7umy#r;{Fp1{7#&E|5}Ht><79Xar0C>v)D#lbX$cxuuD=RCL?qB} z(<-G@6Nh0$L*eOabdC(i16K}53^jm2TZ+ShLc*UbCUK-$^s_XM0?q3$rD8DHP>2~6 zKTY2quCD)d?SKZc`<^6O%tqei!YK?Ea_kbL{p5~&moGdgZoDpYFH(6nMii2*k254 zg@g!^w?p_9Jp@{5R)`rIyQab@4Mzv%S&Blb88%4G1S4@QP|X!CN36}=*`iXvJ5%~R12I1tw)V7!!UxRROqP`Ds-J1J1#-1frY$NAdN4j! z$L>2Ax5jJu`lQt#3;NJPF7Hb`EqK8kgL^7bgCkZ3FN%oi4g)Qm4F!Kj2*OVmP@7^ef=H87^2C=!jFP=zx#&Ad!RqiZAx5?!H+q$CIh8@F<81_rFlQwgwDmZ_67xEU9*4on3)*r*P0=d8Z|HM`Xid?zVkn0un7b}A zpp;5+Dymcp(bV!WEBX$MY0#L62r23VZ#zkhL-8$jF(oF^nrr3|s2n(Nh#OVA)Fg;a^g8f6?O3_Mq;02EG0kUF#m zXwSI*V!VZOON?NosNFaj=h9GDNeq23s8lG0$PSl=^97oqx%dDBez8V&(Y>Rr;3aSt zj82F@S4Cq0fHspvw~6sGFO%qRXlx;d1{4MpY7Gah3TKHp;>f31t(lYHA&d1X#V9c% z38C>lYHmh|FB%glX(&bf7@^^#NEVH$gjAuC;M2r7`BRX1d5qqL<@!q~acW6P#bQ#0 zE=*%PF_OYqL8(AdzzGr0vyzpfgHh2m`ewCAGuM@(vBH%B2}!d!AJUAk5T~VTIm(t4 zo5_eYb56BFuA!o;CW^^M}n3e?YRhj7|#nTpo+rDKaEQWlJYoFj#`Gk2<3w_ z#~N8@bxFlw=qGsRMk zUMj^>B}VtS8iLU~PNz7oR2a+|m0T~KiIP?UNp{^87r z=MUW_8c%f6O5D2vHI}PS&4^f`9G7U67#X57;nT#}PgD_%T5&3`k$YUiL9xPE9M?-o zv}9a4N-DysDGn&N!h2yQ~Hv9VYgEtF&6$ttnsG_RLv`NTN) z!Pp)rIY=T`AFl7<AF`GtC)k<1INO9`Vx0zxITxvm5(4S*e!=(|(u>_r?R3*Z7AByK5rztqk zRihtc6c`HdZB{WXPSqI#*BEf(DB*RBMO`3pVF1HRWLjf4lz3sZQzD#(p>}HY4VM70 zE$GxyjV0)Pxa;yu8B(lMjxLT?@NJRci~z+H<8*}z)s5!`cmI&1874S;j3FP-JX#Or^2|6+hO7>ua=pj?ti9+mZrZw8QX&UOZ<4%?2o@{RG*41?$66?t2Y03lZUpJmcNIWhg(2(V}XF%TE>z4V(-?*-vi&H z@$eQ%@YOL|5Wsy66*Ln#kpd4d%nKEGc)L3Y+V~4X`7`2z!*O2{{%cfZbZl^_s=9GI zKcCJ5n|8QeiNLL&JUWCGw391C!$M;dY^ob$<$fO3jk!w-+Ij?Z^}q|*QRoYtNXm&4 z1rTBvl5!D>oy9cO!%on~+x_!w$*0-XjWNHvvAs>Gji7_Sz}qK4;2r4YCGhbR*l1ez zS-L}cc^h?kd$74!z*}H15q!)m?}jGbjMqwr6Ne5czBN-%sRe45Kt&TjR3P2Mfv_u;6=mUts`&?iG71a+VO}*9j@l{%(Yz zmvuP%Yu^S1hlfTu3J~P+9LNWSIPx9-4?+tW_u~6tdk2A&AT$EEaKbI@1a_oYfHmWL zXfYM$<;zHbYfDAP>}~j_d>%t^!3CcokFPufoBMeN);Ids-ZlHHdtnnxeBPdatr$t{ z{@L-ENaBK48Sep9u3DdDS4iNGz_!vEf{5mf_ScF0HT&-;@-NXgju5dkDWtz5S-)m~ zH$pf=`LFF8T9SW^L&COH4BG!;hxAD={vL<)cc=*c4-zA6%l8+fzrX;{E)>ZbHwjZ| zMEY5FMF;=1+r%}VzrwhE*7z^yai3*>zrLX){N?fJC(`$Y`u~I1&?LO@Nu2Q!fXY0p zW`N4GzuN%)8y6#aB1!UhiLA)}LFDhOSZIjkL{9oui6qOj|9&F>68|6;yATp*imG%8 zhsG(_$L#M$2(JJClc$>8Jd^9RP(oBLB%fq|H%|VYW(%1@yM&KwewE0I>>ot_uG!*< 
z5cy3aIgzXMW|W4=^6Vc(ev3#UCy`$#vLgHM zC-Sd~NQ4-X2!Dr=^6c+M2)7*f51vTjOi9W)QVCV*9ZTlZ?C-|OzcC`w5Gg?-D-Cxw zM3!g&Ao6$27EdI4%HJcB&&InoKZyJ$k(|g?dS8o|NUA*h2a(?*5^* z-%sRU;7Bz+mN?UrO4m9*%l>YJ{6+V*K5P7!OTV9Gf4{!rvi3hXNMKwDp$kel!&IsO z(vR7=82jBg`8Ng$8X_e~WTng28Y0WH|9&F>0!yOlG3|_Rj#ajf^I7(HBjhi#B%d|@ zOH1-u_V?@CUvq=(v&R2reZ$4wzeFa5w2P2-W<-@PiAga}lZ_(#e{b9>8vj@J?Vr6U z&u{}OqMS)_r7L&*c=OrcbG-kZ-i;%YMk2pr%s?q>;$V zw5^=T^6Vc({?0lHPbAu<-y^aj`v;NVB$5-kubVq@dG-$?f1gOoh2SKzvL#SXwSUe2 zLF6}yFNKOO(2Z#4c#yaUZEl7ITo3 zPqM!oC;!ffgky?EA}d>OEhn-(`v;M~Yqs#+Ewuk%C-T?qA4GnWNKWJ`Te#*V5{WF& z{`-miOYA?_W5yYGJ^PBS`!)N!5%Sk8T$hjLU){IAWZ@d8%oO25NS$$Wx5~!tv+VCT zZvW=K{i7FmQI>JMNy=F&s_cntjW|?ff4A}e7se#zMAAs)*A<5>&;CK=*IlWGNQ#rl z%5FDkB(fs=?5Hy-`k>@W(O%_d)$#qASh4t^XVGw9oV_8haYb0;1lG5n=s%` z4DN_+yYrhv;GR@&0?p1;Dn~&x{#?yYR7AW;io0v$mS?#A8t$blVsKA22`y5na2t8t zN)Wf;#65f^xFx9?Hzmfs75IKO2zzrSbF#>m=z_wl< z4t$?AndMt={UbWeZ}eqs6Ppml;`S4`$Ch0T8y04V&Bdm3UmgFAeN@CHly?!o8C&Z7 zD%Jm%-ts%QRldE7KqXJG0EI2AURLP?9f6-+$3am0VGCDX6 zw+^(o`3(V@Er1+)+Tm6{ajaeMe>fq$`O`S2R+Vi>Q!|l=} z`IMTN(0HDjm>=A5+)$!V$>HP%O|FI-ZJLfIl^pb|c=!di@xm67hUEl-tR=+ueaS;n$#JE7q-D&>h3}a6}pTI8MqQWO4VEi$> z{UniFaA;|q|LdgvPf6Gxndjpe@xkx^!m#{iApg^0sru`%;93yB4)pnqeLHVKd z|5~Nzp6&e;O8-r#BKadq|3kk1>GJiTE^%mr7O^uy{kiD=n~~%nlCOWMQ~B=)1s}`# z6H4Fvn~@~*L+iQQ#Q(P!HgN-l|7uQ;S}*-`3!8c0oIm~{0sEo$Tr7!?+Yut}$#sf* z`TtL-{SRONKX5yPla~KX>wlFm?s@Y+Az$CrdfZa_58N;NS86@JY4P8ViuBJ->wh@= zKln`8|CqD?sP_0P>3>S?zbRmUOznTnA^+KP$p7Ag;jgX#DFOTOAjThkto8r43F90j ze*XA_7#~fTgxfy+Ped+19>l;OR>!-q6_2I=x#wek9Tw@I3n@Hi zkExk4{sw(LNxc18?edpg{@!1F#WxEpUV!?3Kd@(cQLp@crpoV#|9J4M{4xIjF?&aR zuLsZY7lm_lwd1v_2$Pcfy4qXT$VgLcV??H7t29Z;VH)Hc&QYxEYg3S=-f=r#HZ!N6 zd$ancwtI}my9fft?rFBYEz@AJ;jZT?yKL&5$qHI|Axe5-?5UQ5VPp50T#46eTre}L zy<^>RdyAv@Ihtqhnfd^ug7Z(n}Ato(Rw zId&;8-vw_!aQ~ruA7(a;ckWsm=Wf6`_ilz26<8W-3iAJE=xOWi)z-3ZLA3B89*pKR0W$D5*?*<3W=;%@iih+#X9wm!CqRLS$^5eYJBE6S+U5teZT+yl-M&_fRZSf$OG9bgn1o5)iY)MJ zlkr7q$CzRZGc7A~EiFFVR6#4mYB?wV=MD`P35(tq9auV>BPo4DY0m@Jj{v2 z`ngmRljCBXSbS5@iTtE~lMb4Tw}}pkcvLKA+sNO zsmAvuIx;SnjaFG%es=gtF~j9aPP3`-yE0_=RmbyH5~jU?PqHg0vux;;~P^x8ir}F`|o{RWWi-0Yue4$)m78hitNRG z$|AdLh~~0um+58FKaKzF?7+~TlzLwSF1wvXv^h9r`={Ba_~g7u%BV#OTByRe+Hsss z)oO)YMd6F>LW)!fRq9WOas9Kgt(B$gpN##xam}gdJ8xa^AM9L(;rXPs6^e;+@TtO< zm#AlcM_~4oK5CSfRXy{)_GS~b%fFv(ZdOq7{gJcT)Qa!7NVBN&B9ldC(dAz^(q@Gf zC8oQW^({|c)XFTr{OkKZW*aN6b1=lLK&L!CZ=|+aieCA_i=s~sE^hziVAYjR57z3K zov)~D_BsQzV$<>pE}u3qJ6+L`MduC7hHx2ZJj?2|uO>%H9UL5^?8k=7&1^BXsTS%E zzAh1#RkL0zo<_53O?4~{HO8ZZds8j1dfe$;Vil#vczl|s%SH_P^w^MN*bNoo(?o-C zHnv}+>eD3s$mo#IYd4LI4MrQIWP@Yy&5bbjv*LQukzpUJG{&}=sN$mKv7wO>{;V=G zLKUNFp+2vP71!a)S%pT%qs@y#L*V|~Gq`+f%=d_Am2q5Mm6oWBCAx&kBO=fMR9e6S z3!oW|`|<)DX;nXVEA+D-X`CU(q$ zDIrX`;?Jlo)?kuvLx;FO#>oTwuglFUwQWj_Yo z*(WojY9>rTEeNPHzdK!;#I6TzD03&7NH-nIO$R&w4qNSUXIt{7+LBy>yCTee1l)ZR zd%WGoZP|||MNL*}GOKN8iy?Pbq+Qia!od3UA2RpGlGEjt*vTC>me{`L{CQcKm)YEV zY~@{)KFw+R$t(@Hnt=P{52q_Gv2z8t>&1EjxOVf`HDgMRvB&$^S342^RytQ3IX>7% z29|lSsLFM)ApVy=vXwRu6BGN`2IbfW_EV89?RRgRd12EtrhWCPl?4>LCKUVfj?;BZ zme}lPvh9HNPc>8BlikY6Zic+VRt~}!RyCnEN(|-brZ!TL-T~=r%7eSY8;`zg;_P%% zHnSpuvFo49ptY(LwNbhdZL1Wtj|J`AvY=hO(nQ8CswQYQM;^)h*(>t zU_JhTjJ6+!^+ab7l5iQ*_{SXYv1%B? z2=$}X-&4f2-XI6o2u+JppXaC_;GiBT0!~Yj1`+DVi9s;)0=)tajdbMYp=pojs8E$i zscBjQMD;dK>#yv2>)69T+J_lJnDK9W#N42nM9Vx*i_q^D?>{g}_W-p=0mG}$Zs*NCS? 
z%8GYHtQZ$d553Cuc#M#L*S9C^C40oASjd|VB&Twjcqto*kHAl5JtJ)4dxikgVSnwW z@hYjNRq|tnfg~9b3q#mJ->4WD_8S?jb1ZZaVbG6B9rV?<(ztL|tT2_2#lwa_St2_A z{$7l9;M*<6h3oW8k01NL^XCV}xL7*G#}OkFoz!V@vH0-U9?paHB`%tW5Ec{7S`SSV zP9PDyB0{7d7wG2>rALSTY3~Oq)4#w=)xp-r4{h4|QAcm%@R8MuB*(cd&R+-xKb}y5 zVn3Qr;RqB{Fl71T}oV;DYWhHv9GO~SeRGxFWs6A=sle~YQ8x_2 z_;swLA0A4%e=yjS%1`1!)aC?cbBjE6N7S@i2;52+de3M;%^nc|4! zPZR2*Ghyh6={Rx3_$+1CzTD?+fsNurHlGX*J3%P@_L2)Q#-i?S396H`M}|&5G>jdR zcT%<8Fxu*8LrmV67*XB6G)ci?H&0LA{i3H|hkxh|wlI(PUfpIiQGt?a#_O94>b#mm<=KDIX?%>cw!vFvP literal 0 HcmV?d00001 diff --git a/test_helpers/Cargo.toml b/test_helpers/Cargo.toml index 0568202f111..1b0882ef708 100644 --- a/test_helpers/Cargo.toml +++ b/test_helpers/Cargo.toml @@ -5,16 +5,19 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # In alphabetical order dotenvy = "0.15.7" parking_lot = "0.12" -tempfile = "3.8.0" -tracing-log = "0.1" +tempfile = "3.9.0" +tracing-log = "0.2" tracing-subscriber = { version = "0.3", features = ["env-filter"] } observability_deps = { path = "../observability_deps" } workspace-hack = { version = "0.1", path = "../workspace-hack" } -async-trait = { version = "0.1.73", optional = true } -tokio = { version = "1.32.0", optional = true, default_features = false, features = ["time"] } +async-trait = { version = "0.1.77", optional = true } +tokio = { version = "1.35.1", optional = true, default_features = false, features = ["time"] } [features] default = [] diff --git a/test_helpers_end_to_end/Cargo.toml b/test_helpers_end_to_end/Cargo.toml index e7f6104779d..64ad443179f 100644 --- a/test_helpers_end_to_end/Cargo.toml +++ b/test_helpers_end_to_end/Cargo.toml @@ -5,36 +5,43 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # In alphabetical order -arrow = { workspace = true, features = ["prettyprint"] } +arrow = { workspace = true } arrow-flight = { workspace = true } arrow_util = { path = "../arrow_util" } -assert_cmd = "2.0.12" +assert_cmd = "2.0.13" +assert_matches = "1.5.0" bytes = "1.5" data_types = { path = "../data_types" } dml = { path = "../dml" } futures = "0.3" generated_types = { path = "../generated_types" } -http = "0.2.9" +http = "0.2.11" hyper = "0.14" influxdb_iox_client = { path = "../influxdb_iox_client", features = ["flight", "format"] } ingester_query_grpc = { path = "../ingester_query_grpc" } +insta = { version = "1.34.0", features = ["yaml"] } iox_catalog = { path = "../iox_catalog" } +iox_query_params = { path = "../iox_query_params" } mutable_batch_lp = { path = "../mutable_batch_lp" } mutable_batch_pb = { path = "../mutable_batch_pb" } nix = { version = "0.27", default-features = false, features = ["signal"] } observability_deps = { path = "../observability_deps" } -once_cell = { version = "1.18", features = ["parking_lot"] } +once_cell = { version = "1.19", features = ["parking_lot"] } parking_lot = "0.12" -prost = "0.11" +prost = { workspace = true } rand = "0.8.3" -regex = "1.9" -reqwest = { version = "0.11", default-features = false, features = ["json", "rustls-tls"] } -snafu = "0.7" -sqlx = { version = "0.7.1", features = [ "runtime-tokio-rustls" , "postgres", "uuid" ] } -tempfile = "3.8.0" +regex = "1.10" +reqwest = { version = "0.11", default-features = false, features = ["json", "rustls-tls-native-roots"] } +serde_json = "1.0.111" +snafu = "0.8" +sqlx = { version = "0.7.3", features = [ "runtime-tokio-rustls" , "postgres", "uuid" ] } +tempfile = "3.9.0" 
 test_helpers = { path = "../test_helpers", features = ["future_timeout"] }
-tokio = { version = "1.32", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] }
+tokio = { version = "1.35", features = ["macros", "net", "parking_lot", "rt-multi-thread", "signal", "sync", "time"] }
 tokio-util = "0.7"
 tonic = { workspace = true }
 workspace-hack = { version = "0.1", path = "../workspace-hack" }
diff --git a/test_helpers_end_to_end/src/addrs.rs b/test_helpers_end_to_end/src/addrs.rs
index 2fc6249da80..69a0ff5479f 100644
--- a/test_helpers_end_to_end/src/addrs.rs
+++ b/test_helpers_end_to_end/src/addrs.rs
@@ -11,7 +11,14 @@ use std::{
 // running locally.
 static NEXT_PORT: AtomicU16 = AtomicU16::new(8090);
 
-// represents port on localhost to bind / connect to
+/// Socket type
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum SocketType {
+    Tcp,
+    Udp,
+}
+
+/// Represents port on localhost to bind / connect to
 #[derive(Debug, Clone)]
 pub struct Address {
     /// the actual address, on which to bind. Example `127.0.0.1:8089`
@@ -21,23 +28,33 @@ pub struct Address {
 }
 
 impl Address {
-    fn new() -> Self {
-        let bind_addr = Self::get_free_port().to_string();
-        let client_base = format!("http://{bind_addr}");
+    fn new(t: SocketType) -> Self {
+        let bind_addr = Self::get_free_port(t).to_string();
+        let protocol = match t {
+            SocketType::Tcp => "http",
+            SocketType::Udp => "udp",
+        };
+        let client_base = format!("{protocol}://{bind_addr}");
         Self {
             bind_addr: bind_addr.into(),
             client_base: client_base.into(),
         }
     }
-    fn get_free_port() -> SocketAddrV4 {
+
+    fn get_free_port(t: SocketType) -> SocketAddrV4 {
         let ip = std::net::Ipv4Addr::new(127, 0, 0, 1);
         loop {
             let port = NEXT_PORT.fetch_add(1, Ordering::SeqCst);
             let addr = SocketAddrV4::new(ip, port);
-            if std::net::TcpListener::bind(addr).is_ok() {
+            let is_working = match t {
+                SocketType::Tcp => std::net::TcpListener::bind(addr).is_ok(),
+                SocketType::Udp => std::net::UdpSocket::bind(addr).is_ok(),
+            };
+
+            if is_working {
                 return addr;
             }
         }
     }
@@ -57,26 +74,94 @@ impl Address {
 pub struct BindAddresses {
     router_http_api: std::sync::Mutex<Option<Address>>,
     router_grpc_api: std::sync::Mutex<Option<Address>>,
+    router_gossip_api: std::sync::Mutex<Option<Address>>,
+    querier_http_api: std::sync::Mutex<Option<Address>>,
     querier_grpc_api: std::sync::Mutex<Option<Address>>,
+    querier_gossip_api: std::sync::Mutex<Option<Address>>,
+    ingester_http_api: std::sync::Mutex<Option<Address>>,
     ingester_grpc_api: std::sync::Mutex<Option<Address>>,
+    ingester_gossip_api: std::sync::Mutex<Option<Address>>,
+    compactor_http_api: std::sync::Mutex<Option<Address>>,
     compactor_grpc_api: std::sync::Mutex<Option<Address>>,
+    compactor_gossip_api: std::sync::Mutex<Option<Address>>,
+    catalog_http_api: std::sync::Mutex<Option<Address>>,
+    catalog_grpc_api: std::sync::Mutex<Option<Address>>,
+    catalog_gossip_api: std::sync::Mutex<Option<Address>>,
+    parquet_cache_http_api: std::sync::Mutex<Option<Address>>,
 }
 
 impl BindAddresses {
     pub fn router_http_api(&self) -> Address {
-        get_or_allocate(&self.router_http_api)
+        get_or_allocate(&self.router_http_api, SocketType::Tcp)
     }
+
     pub fn router_grpc_api(&self) -> Address {
-        get_or_allocate(&self.router_grpc_api)
+        get_or_allocate(&self.router_grpc_api, SocketType::Tcp)
+    }
+
+    pub fn router_gossip_api(&self) -> Address {
+        get_or_allocate(&self.router_gossip_api, SocketType::Udp)
+    }
+
+    pub fn querier_http_api(&self) -> Address {
+        get_or_allocate(&self.querier_http_api, SocketType::Tcp)
     }
+
     pub fn querier_grpc_api(&self) -> Address {
-        get_or_allocate(&self.querier_grpc_api)
+        get_or_allocate(&self.querier_grpc_api, SocketType::Tcp)
+    }
+
+    pub fn querier_gossip_api(&self) -> Address {
+        get_or_allocate(&self.querier_gossip_api, SocketType::Udp)
+    }
+
+    pub fn ingester_http_api(&self) -> Address {
+        get_or_allocate(&self.ingester_http_api, SocketType::Tcp)
     }
+
     pub fn ingester_grpc_api(&self) -> Address {
-        get_or_allocate(&self.ingester_grpc_api)
+        get_or_allocate(&self.ingester_grpc_api, SocketType::Tcp)
+    }
+
+    pub fn ingester_gossip_api(&self) -> Address {
+        get_or_allocate(&self.ingester_gossip_api, SocketType::Udp)
     }
+
+    pub fn compactor_http_api(&self) -> Address {
+        get_or_allocate(&self.compactor_http_api, SocketType::Tcp)
+    }
+
     pub fn compactor_grpc_api(&self) -> Address {
-        get_or_allocate(&self.compactor_grpc_api)
+        get_or_allocate(&self.compactor_grpc_api, SocketType::Tcp)
+    }
+
+    pub fn compactor_gossip_api(&self) -> Address {
+        get_or_allocate(&self.compactor_gossip_api, SocketType::Udp)
+    }
+
+    pub fn catalog_http_api(&self) -> Address {
+        get_or_allocate(&self.catalog_http_api, SocketType::Tcp)
+    }
+
+    pub fn catalog_grpc_api(&self) -> Address {
+        get_or_allocate(&self.catalog_grpc_api, SocketType::Tcp)
+    }
+
+    pub fn catalog_gossip_api(&self) -> Address {
+        get_or_allocate(&self.catalog_gossip_api, SocketType::Udp)
+    }
+
+    pub fn all_gossip_apis(&self) -> Vec<Address>
{ + vec![ + self.router_gossip_api(), + self.ingester_gossip_api(), + self.compactor_gossip_api(), + self.querier_gossip_api(), + ] + } + + pub fn parquet_cache_http_api(&self) -> Address { + get_or_allocate(&self.parquet_cache_http_api, SocketType::Tcp) } } @@ -97,13 +182,16 @@ impl Display for BindAddresses { if let Some(addr) = self.compactor_grpc_api.lock().unwrap().as_ref() { write!(f, "compactor_grpc: {} ", addr.bind_addr)? } + if let Some(addr) = self.catalog_grpc_api.lock().unwrap().as_ref() { + write!(f, "catalog_grpc: {} ", addr.bind_addr)? + } Ok(()) } } -fn get_or_allocate(locked_addr: &std::sync::Mutex>) -> Address { +fn get_or_allocate(locked_addr: &std::sync::Mutex>, t: SocketType) -> Address { let mut locked_addr = locked_addr.lock().unwrap(); - let addr = locked_addr.take().unwrap_or_else(Address::new); + let addr = locked_addr.take().unwrap_or_else(|| Address::new(t)); *locked_addr = Some(addr.clone()); addr } diff --git a/test_helpers_end_to_end/src/client.rs b/test_helpers_end_to_end/src/client.rs index 53ce5f0b2ba..74e8ada9612 100644 --- a/test_helpers_end_to_end/src/client.rs +++ b/test_helpers_end_to_end/src/client.rs @@ -9,6 +9,7 @@ use influxdb_iox_client::{ connection::Connection, ingester::generated_types::{write_service_client::WriteServiceClient, WriteRequest}, }; +use iox_query_params::StatementParam; use mutable_batch_lp::lines_to_batches; use mutable_batch_pb::encode::encode_write; use std::fmt::Display; @@ -16,10 +17,10 @@ use tonic::IntoRequest; /// Writes the line protocol to the write_base/api/v2/write endpoint (typically on the router) pub async fn write_to_router( - line_protocol: impl Into, - org: impl AsRef, - bucket: impl AsRef, - write_base: impl AsRef, + line_protocol: impl Into + Send, + org: impl AsRef + Send, + bucket: impl AsRef + Send, + write_base: impl AsRef + Send, authorization: Option<&str>, ) -> Response { let client = Client::new(); @@ -46,7 +47,7 @@ pub async fn write_to_router( /// Writes the line protocol to the WriteService endpoint (typically on the ingester) pub async fn write_to_ingester( - line_protocol: impl Into, + line_protocol: impl Into + Send, namespace_id: NamespaceId, table_id: TableId, ingester_connection: Connection, @@ -80,8 +81,28 @@ pub async fn write_to_ingester( /// Runs a SQL query using the flight API on the specified connection. pub async fn try_run_sql( - sql_query: impl Into, - namespace: impl Into, + sql_query: impl Into + Send, + namespace: impl Into + Send, + querier_connection: Connection, + authorization: Option<&str>, + with_debug: bool, +) -> Result<(Vec, SchemaRef), influxdb_iox_client::flight::Error> { + try_run_sql_with_params( + sql_query, + namespace, + [], + querier_connection, + authorization, + with_debug, + ) + .await +} + +/// Runs a SQL query using the flight API on the specified connection. 
+pub async fn try_run_sql_with_params( + sql_query: impl Into + Send, + namespace: impl Into + Send, + params: impl IntoIterator + Send, querier_connection: Connection, authorization: Option<&str>, with_debug: bool, @@ -98,7 +119,12 @@ pub async fn try_run_sql( // Normally this would be done one per connection, not per query client.handshake().await?; - let mut stream = client.sql(namespace.into(), sql_query.into()).await?; + let mut stream = client + .query(namespace) + .sql(sql_query.into()) + .with_params(params) + .run() + .await?; let batches = (&mut stream).try_collect().await?; @@ -114,8 +140,25 @@ pub async fn try_run_sql( /// Runs a InfluxQL query using the flight API on the specified connection. pub async fn try_run_influxql( - influxql_query: impl Into, - namespace: impl Into, + influxql_query: impl Into + Send, + namespace: impl Into + Send, + querier_connection: Connection, + authorization: Option<&str>, +) -> Result<(Vec, SchemaRef), influxdb_iox_client::flight::Error> { + try_run_influxql_with_params( + influxql_query, + namespace, + [], + querier_connection, + authorization, + ) + .await +} + +pub async fn try_run_influxql_with_params( + influxql_query: impl Into + Send, + namespace: impl Into + Send, + params: impl IntoIterator + Send, querier_connection: Connection, authorization: Option<&str>, ) -> Result<(Vec, SchemaRef), influxdb_iox_client::flight::Error> { @@ -129,7 +172,10 @@ pub async fn try_run_influxql( client.handshake().await?; let mut stream = client - .influxql(namespace.into(), influxql_query.into()) + .query(namespace) + .influxql(influxql_query.into()) + .with_params(params) + .run() .await?; let batches = (&mut stream).try_collect().await?; @@ -148,8 +194,8 @@ pub async fn try_run_influxql( /// /// Use [`try_run_sql`] if you want to check the error manually. pub async fn run_sql( - sql: impl Into, - namespace: impl Into, + sql: impl Into + Send, + namespace: impl Into + Send, querier_connection: Connection, authorization: Option<&str>, with_debug: bool, @@ -165,12 +211,35 @@ pub async fn run_sql( .expect("Error executing sql query") } +/// Runs a SQL query using the flight API on the specified connection. +/// +/// Use [`try_run_sql`] if you want to check the error manually. +pub async fn run_sql_with_params( + sql: impl Into + Send, + namespace: impl Into + Send, + params: impl IntoIterator + Send, + querier_connection: Connection, + authorization: Option<&str>, + with_debug: bool, +) -> (Vec, SchemaRef) { + try_run_sql_with_params( + sql, + namespace, + params, + querier_connection, + authorization, + with_debug, + ) + .await + .expect("Error executing sql query") +} + /// Runs an InfluxQL query using the flight API on the specified connection. /// /// Use [`try_run_influxql`] if you want to check the error manually. pub async fn run_influxql( - influxql: impl Into + Clone + Display, - namespace: impl Into, + influxql: impl Into + Clone + Display + Send, + namespace: impl Into + Send, querier_connection: Connection, authorization: Option<&str>, ) -> (Vec, SchemaRef) { @@ -183,3 +252,24 @@ pub async fn run_influxql( .await .unwrap_or_else(|_| panic!("Error executing InfluxQL query: {influxql}")) } + +/// Runs an InfluxQL query using the flight API on the specified connection. +/// +/// Use [`try_run_influxql`] if you want to check the error manually. 
+pub async fn run_influxql_with_params( + influxql: impl Into + Clone + Display + Send, + namespace: impl Into + Send, + params: impl IntoIterator + Send, + querier_connection: Connection, + authorization: Option<&str>, +) -> (Vec, SchemaRef) { + try_run_influxql_with_params( + influxql.clone(), + namespace, + params, + querier_connection, + authorization, + ) + .await + .unwrap_or_else(|_| panic!("Error executing InfluxQL query: {influxql}")) +} diff --git a/test_helpers_end_to_end/src/config.rs b/test_helpers_end_to_end/src/config.rs index 91bc9fdcb37..7c45a5e276d 100644 --- a/test_helpers_end_to_end/src/config.rs +++ b/test_helpers_end_to_end/src/config.rs @@ -34,6 +34,9 @@ pub struct TestConfig { /// Which ports this server should use addrs: Arc, + + /// Wait for server to be ready during creation. + wait_for_ready: bool, } impl TestConfig { @@ -58,6 +61,7 @@ impl TestConfig { wal_dir: None, catalog_dir, addrs: Arc::new(BindAddresses::default()), + wait_for_ready: true, } } @@ -73,6 +77,45 @@ impl TestConfig { .with_catalog_dir(other.catalog_dir.as_ref().map(Arc::clone)) } + /// Create new catalog node w/o peers + fn new_catalog(dsn: Option, catalog_schema_name: String) -> Self { + Self::new(ServerType::Catalog, dsn, catalog_schema_name) + .with_env("INFLUXDB_IOX_CATALOG_CACHE_WARMUP_DELAY", "100ms") + } + + /// Create a triplet of catalog cache nodes. + pub fn catalog_nodes(dsn: impl Into) -> [Self; 3] { + let dsn = Some(dsn.into()); + let catalog_schema_name = random_catalog_schema_name(); + + let n0 = Self::new_catalog(dsn.clone(), catalog_schema_name.clone()); + let n1 = Self::new_catalog(dsn.clone(), catalog_schema_name.clone()); + let n2 = Self::new_catalog(dsn.clone(), catalog_schema_name.clone()); + + let n0 = n0.with_catalog_peers([ + n1.addrs().catalog_http_api().client_base(), + n2.addrs().catalog_http_api().client_base(), + ]); + let n1 = n1.with_catalog_peers([ + n0.addrs().catalog_http_api().client_base(), + n2.addrs().catalog_http_api().client_base(), + ]); + let n2 = n2.with_catalog_peers([ + n0.addrs().catalog_http_api().client_base(), + n1.addrs().catalog_http_api().client_base(), + ]); + + [n0, n1, n2] + } + + /// Create a minimal router configuration that doesn't connect to an ingester. If you need a + /// router that connects to an ingester, call `new_ingester` first and then pass the resulting + /// `TestConfig` to `new_router`. + pub fn router_only(dsn: impl Into) -> Self { + let dsn = Some(dsn.into()); + Self::new(ServerType::Router, dsn, random_catalog_schema_name()).with_new_object_store() + } + /// Create a minimal router2 configuration sharing configuration with the ingester2 config pub fn new_router(ingester_config: &TestConfig) -> Self { assert_eq!(ingester_config.server_type(), ServerType::Ingester); @@ -117,6 +160,7 @@ impl TestConfig { wal_dir: None, catalog_dir: ingester_config.catalog_dir.as_ref().map(Arc::clone), addrs: Arc::new(BindAddresses::default()), + wait_for_ready: ingester_config.wait_for_ready, } .with_existing_object_store(ingester_config) .with_new_wal() @@ -224,6 +268,11 @@ impl TestConfig { .with_env("INFLUXDB_IOX_SINGLE_TENANCY", "true") } + /// Enable partial writes. + pub fn with_partial_writes(self) -> Self { + self.with_env("INFLUXDB_IOX_PARTIAL_WRITES_ENABLED", "true") + } + // Get the catalog DSN URL if set. pub fn dsn(&self) -> &Option { &self.dsn @@ -323,6 +372,46 @@ impl TestConfig { .with_env("INFLUXDB_IOX_COMPACTION_SHARD_ID", shard_id.to_string()) } + /// Limit the number of concurrent queries. 
+ pub fn with_max_concurrent_queries(self, n: usize) -> Self { + self.with_env("INFLUXDB_IOX_MAX_CONCURRENT_QUERIES", n.to_string()) + } + + /// Set up a metadata signing key for bulk ingest. + pub fn with_bulk_ingest_metadata_signing_key(self, metadata_signing_key_file: &str) -> Self { + self.with_env( + "INFLUXDB_IOX_BULK_INGEST_METADATA_SIGNING_KEY_FILE", + metadata_signing_key_file, + ) + } + + /// Use a mock presigned URL generator rather than whatever object store may have been + /// configured. Allows for testing bulk ingest without needing S3. + pub fn with_mock_presigned_url_signer(self) -> Self { + self.with_env( + "INFLUXDB_IOX_BULK_INGEST_USE_MOCK_PRESIGNED_URL_SIGNER", + "true", + ) + } + + /// Register catalog peers. + pub fn with_catalog_peers(self, peers: I) -> Self + where + I: IntoIterator, + S: std::fmt::Display, + { + let peers = peers.into_iter().map(|s| s.to_string()).collect::>(); + self.with_env("INFLUXDB_IOX_CATALOG_CACHE_PEERS", peers.join(",")) + } + + /// Set [`wait_for_ready`](Self::wait_for_ready). + pub fn with_wait_for_ready(self, wait_for_ready: bool) -> Self { + Self { + wait_for_ready, + ..self + } + } + /// Get the test config's server type. #[must_use] pub fn server_type(&self) -> ServerType { @@ -351,6 +440,28 @@ impl TestConfig { pub fn ingester_base(&self) -> Arc { self.addrs().ingester_grpc_api().client_base() } + + /// Return a HTTP base that is usable for health and metrics. + /// + /// This depends on the [server type](Self::server_type). + #[must_use] + pub fn http_base(&self) -> Arc { + let addr = match self.server_type { + ServerType::AllInOne => self.addrs.router_http_api(), + ServerType::Ingester => self.addrs.ingester_http_api(), + ServerType::Router => self.addrs.router_http_api(), + ServerType::Querier => self.addrs.querier_http_api(), + ServerType::Compactor => self.addrs.compactor_http_api(), + ServerType::Catalog => self.addrs.catalog_http_api(), + ServerType::ParquetCache => self.addrs.parquet_cache_http_api(), + }; + addr.client_base() + } + + /// Wait for server to be ready during creation. 
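These new config knobs all reduce to `with_env` calls, so they chain like the rest of the `TestConfig` builder; the readiness getter they pair with follows below. A small sketch of combining them, assuming the usual `TestConfig` re-export from the crate root:

```rust
use test_helpers_end_to_end::TestConfig;

// Sketch only: a router config that tolerates partial writes and does not
// block fixture creation on the readiness probe.
fn example_router_config(dsn: &str) -> TestConfig {
    TestConfig::router_only(dsn)
        .with_partial_writes()
        .with_wait_for_ready(false)
}
```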
+ pub fn wait_for_ready(&self) -> bool { + self.wait_for_ready + } } fn random_catalog_schema_name() -> String { diff --git a/test_helpers_end_to_end/src/data_generator.rs b/test_helpers_end_to_end/src/data_generator.rs index d167fd86873..c070abd247f 100644 --- a/test_helpers_end_to_end/src/data_generator.rs +++ b/test_helpers_end_to_end/src/data_generator.rs @@ -1,6 +1,7 @@ use std::time::SystemTime; /// Manages a dataset for writing / reading +#[derive(Debug)] pub struct DataGenerator { ns_since_epoch: i64, line_protocol: String, diff --git a/test_helpers_end_to_end/src/database.rs b/test_helpers_end_to_end/src/database.rs index 4aa16baaa4b..8e242284e9a 100644 --- a/test_helpers_end_to_end/src/database.rs +++ b/test_helpers_end_to_end/src/database.rs @@ -11,7 +11,7 @@ use tokio::sync::Mutex; static DB_INITIALIZED: Lazy>> = Lazy::new(|| Mutex::new(BTreeSet::new())); /// Performs once-per-process database initialization, if necessary -pub async fn initialize_db(dsn: &str, schema_name: &str) { +pub(crate) async fn initialize_db(dsn: &str, schema_name: &str) { let mut init = DB_INITIALIZED.lock().await; // already done diff --git a/test_helpers_end_to_end/src/grpc.rs b/test_helpers_end_to_end/src/grpc.rs index 3cd14dae1e1..0ad3da3fe87 100644 --- a/test_helpers_end_to_end/src/grpc.rs +++ b/test_helpers_end_to_end/src/grpc.rs @@ -491,7 +491,7 @@ impl GrpcRequestBuilder { } } -pub fn field_ref_node(field_name: impl Into) -> Node { +pub(crate) fn field_ref_node(field_name: impl Into) -> Node { Node { node_type: NodeType::FieldRef.into(), children: vec![], @@ -499,7 +499,7 @@ pub fn field_ref_node(field_name: impl Into) -> Node { } } -pub fn tag_ref_node(tag_name: impl Into>) -> Node { +pub(crate) fn tag_ref_node(tag_name: impl Into>) -> Node { Node { node_type: NodeType::TagRef as i32, children: vec![], @@ -507,7 +507,7 @@ pub fn tag_ref_node(tag_name: impl Into>) -> Node { } } -pub fn string_value_node(value: impl Into) -> Node { +pub(crate) fn string_value_node(value: impl Into) -> Node { Node { node_type: NodeType::Literal as i32, children: vec![], @@ -515,7 +515,7 @@ pub fn string_value_node(value: impl Into) -> Node { } } -pub fn comparison_expression_node(lhs: Node, comparison: Comparison, rhs: Node) -> Node { +pub(crate) fn comparison_expression_node(lhs: Node, comparison: Comparison, rhs: Node) -> Node { Node { node_type: NodeType::ComparisonExpression as i32, children: vec![lhs, rhs], diff --git a/test_helpers_end_to_end/src/http_reverse_proxy.rs b/test_helpers_end_to_end/src/http_reverse_proxy.rs new file mode 100644 index 00000000000..5b58a93a7af --- /dev/null +++ b/test_helpers_end_to_end/src/http_reverse_proxy.rs @@ -0,0 +1,160 @@ +//! Poor-mans simulation of an HTTP/2 service that randomizes incoming requests to a number of backend services. + +use std::{ + net::{SocketAddr, TcpListener}, + sync::{Arc, Weak}, + thread::JoinHandle, +}; + +use http::{Request, Response}; +use hyper::{ + client::HttpConnector, + service::{make_service_fn, service_fn}, + Body, Client, Server, +}; +use rand::seq::SliceRandom; +use tokio_util::sync::CancellationToken; + +use crate::service_link::{LinkableService, LinkableServiceImpl}; + +/// A basic HTTP reverse proxy for use by end-to-end tests +/// +/// Intended to approximate a Kubernetes Service. +/// +/// # Implementation +/// This runs in a dedicated thread in its own tokio runtime. 
The reason is that we potentially share a single proxy +/// between multiple tests, but every test sets up its own tokio runtime and moving IO tasks between runtimes can cause blocking. +#[derive(Debug)] +pub struct HttpReverseProxy { + addr: SocketAddr, + shutdown: CancellationToken, + task: Option>, + links: LinkableServiceImpl, +} + +impl HttpReverseProxy { + pub fn new(backends: I) -> Self + where + I: IntoIterator, + S: ToString, + { + let client = Client::builder().http2_only(true).build_http(); + let inner = Arc::new(Inner { + backends: backends.into_iter().map(|s| s.to_string()).collect(), + client, + }); + assert!(!inner.backends.is_empty(), "need at least 1 backend"); + + let addr = SocketAddr::from(([127, 0, 0, 1], 0)); + + let make_service = make_service_fn(move |_conn| { + let inner = Arc::clone(&inner); + + async move { + Ok::<_, hyper::Error>(service_fn(move |req| { + let inner = Arc::clone(&inner); + + async move { inner.handle(req).await } + })) + } + }); + + let listener = TcpListener::bind(addr).unwrap(); + let addr = listener.local_addr().unwrap(); + + let shutdown = CancellationToken::new(); + let shutdown_captured = shutdown.clone(); + let task = std::thread::spawn(move || { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + + rt.block_on(async { + let server = Server::from_tcp(listener) + .unwrap() + .http2_only(true) + .serve(make_service); + + tokio::select! { + _ = shutdown_captured.cancelled() => {} + res = server => { + if let Err(e) = res { + eprintln!("server error: {}", e); + } + } + } + }) + }); + + Self { + addr, + shutdown, + task: Some(task), + links: Default::default(), + } + } + + pub fn addr(&self) -> SocketAddr { + self.addr + } +} + +impl Drop for HttpReverseProxy { + fn drop(&mut self) { + self.shutdown.cancel(); + + if self.task.take().expect("not joined yet").join().is_err() { + eprintln!("server task error, check logs"); + } + } +} + +impl LinkableService for HttpReverseProxy { + fn add_link_client(&self, client: Weak) { + self.links.add_link_client(client) + } + + fn remove_link_clients(&self) -> Vec> { + self.links.remove_link_clients() + } + + fn add_link_server(&self, server: Arc) { + self.links.add_link_server(server) + } + + fn remove_link_server(&self, server: Arc) { + self.links.remove_link_server(server) + } +} + +#[derive(Debug)] +struct Inner { + backends: Vec, + client: Client, +} + +impl Inner { + async fn handle(&self, req: Request) -> Result, hyper::Error> { + let uri = self.pick_backend(); + + let (mut parts, body) = req.into_parts(); + + // build URI + let mut uri = uri.to_owned(); + uri.push_str(parts.uri.path()); + if let Some(q) = parts.uri.query() { + uri.push('?'); + uri.push_str(q); + } + parts.uri = uri.parse().unwrap(); + + let req = Request::from_parts(parts, body); + self.client.request(req).await + } + + fn pick_backend(&self) -> &str { + let mut rng = rand::thread_rng(); + self.backends.choose(&mut rng).expect("not empty") + } +} diff --git a/test_helpers_end_to_end/src/lib.rs b/test_helpers_end_to_end/src/lib.rs index e6d1db77a07..8fa331c9ebb 100644 --- a/test_helpers_end_to_end/src/lib.rs +++ b/test_helpers_end_to_end/src/lib.rs @@ -16,9 +16,11 @@ mod data_generator; mod database; mod error; mod grpc; +mod http_reverse_proxy; mod mini_cluster; mod server_fixture; mod server_type; +mod service_link; pub mod snapshot_comparison; mod steps; mod udp_listener; @@ -30,6 +32,7 @@ pub use config::TestConfig; pub use data_generator::DataGenerator; pub use 
error::{check_flight_error, check_tonic_status}; pub use grpc::GrpcRequestBuilder; +pub use http_reverse_proxy::HttpReverseProxy; pub use mini_cluster::MiniCluster; pub use server_fixture::{ServerFixture, TestServer}; pub use server_type::{AddAddrEnv, ServerType}; diff --git a/test_helpers_end_to_end/src/mini_cluster.rs b/test_helpers_end_to_end/src/mini_cluster.rs index 5ab9833bc2c..49115f056e1 100644 --- a/test_helpers_end_to_end/src/mini_cluster.rs +++ b/test_helpers_end_to_end/src/mini_cluster.rs @@ -1,6 +1,8 @@ use crate::{ - dump_log_to_stdout, log_command, rand_id, server_type::AddAddrEnv, write_to_ingester, - write_to_router, ServerFixture, TestConfig, TestServer, + dump_log_to_stdout, log_command, rand_id, + server_type::AddAddrEnv, + service_link::{link_services, LinkableService}, + write_to_ingester, write_to_router, HttpReverseProxy, ServerFixture, TestConfig, TestServer, }; use arrow::{datatypes::SchemaRef, record_batch::RecordBatch}; use arrow_flight::{ @@ -50,6 +52,12 @@ pub struct MiniCluster { /// Standard optional compactor configuration, to be used on-demand compactor_config: Option, + /// Catalog reverse proxy. + catalog_reverse_proxy: Option>, + + /// Catalog cache servers. + catalog: Vec, + // Potentially helpful data org_id: String, bucket_id: String, @@ -97,6 +105,8 @@ impl MiniCluster { ingesters: Vec, querier: Option, compactor_config: Option, + catalog: Vec, + catalog_reverse_proxy: Option>, ) -> Self { let org_id = rand_id(); let bucket_id = rand_id(); @@ -107,6 +117,8 @@ impl MiniCluster { ingesters, querier, compactor_config, + catalog, + catalog_reverse_proxy, org_id, bucket_id, @@ -202,13 +214,24 @@ impl MiniCluster { /// querier. Save config for a compactor, but the compactor service should be run on-demand in /// tests using `compactor run-once` rather than using `run compactor`. pub async fn create_non_shared(database_url: String) -> Self { - let ingester_config = TestConfig::new_ingester(&database_url); + let catalog_configs = TestConfig::catalog_nodes(&database_url); + let catalog_reverse_proxy = Arc::new(HttpReverseProxy::new( + catalog_configs + .iter() + .map(|cfg| cfg.addrs().catalog_grpc_api().client_base()), + )); + + let ingester_config = + TestConfig::new_ingester(format!("http://{}", catalog_reverse_proxy.addr())); let router_config = TestConfig::new_router(&ingester_config); let querier_config = TestConfig::new_querier(&ingester_config); let compactor_config = TestConfig::new_compactor(&ingester_config); // Set up the cluster ==================================== Self::new() + .with_catalog(catalog_configs) + .await + .with_catalog_reverse_proxy(catalog_reverse_proxy) .with_ingester(ingester_config) .await .with_router(router_config) @@ -223,13 +246,26 @@ impl MiniCluster { /// compactor service should be run on-demand in tests using `compactor run-once` rather than /// using `run compactor`. 
pub async fn create_non_shared_never_persist(database_url: String) -> Self { - let ingester_config = TestConfig::new_ingester_never_persist(&database_url); + let catalog_configs = TestConfig::catalog_nodes(&database_url); + let catalog_reverse_proxy = Arc::new(HttpReverseProxy::new( + catalog_configs + .iter() + .map(|cfg| cfg.addrs().catalog_grpc_api().client_base()), + )); + + let ingester_config = TestConfig::new_ingester_never_persist(format!( + "http://{}", + catalog_reverse_proxy.addr() + )); let router_config = TestConfig::new_router(&ingester_config); let querier_config = TestConfig::new_querier(&ingester_config); let compactor_config = TestConfig::new_compactor(&ingester_config); // Set up the cluster ==================================== Self::new() + .with_catalog(catalog_configs) + .await + .with_catalog_reverse_proxy(catalog_reverse_proxy) .with_ingester(ingester_config) .await .with_router(router_config) @@ -247,9 +283,17 @@ impl MiniCluster { /// than using `run compactor`. pub async fn create_non_shared_with_authz( database_url: String, - authz_addr: impl Into + Clone, + authz_addr: impl Into + Clone + Send, ) -> Self { - let ingester_config = TestConfig::new_ingester(&database_url); + let catalog_configs = TestConfig::catalog_nodes(&database_url); + let catalog_reverse_proxy = Arc::new(HttpReverseProxy::new( + catalog_configs + .iter() + .map(|cfg| cfg.addrs().catalog_grpc_api().client_base()), + )); + + let ingester_config = + TestConfig::new_ingester(format!("http://{}", catalog_reverse_proxy.addr())); let router_config = TestConfig::new_router(&ingester_config).with_single_tenancy(authz_addr.clone()); let querier_config = @@ -258,6 +302,9 @@ impl MiniCluster { // Set up the cluster ==================================== Self::new_based_on_tenancy(true) + .with_catalog(catalog_configs) + .await + .with_catalog_reverse_proxy(catalog_reverse_proxy) .with_ingester(ingester_config) .await .with_router(router_config) @@ -280,20 +327,36 @@ impl MiniCluster { /// create a router with the specified configuration pub async fn with_router(mut self, router_config: TestConfig) -> Self { - self.router = Some(ServerFixture::create(router_config).await); + assert!(self.router.is_none()); + let fixture = ServerFixture::create(router_config).await; + self.add_catalog_reverse_proxy_client(fixture.strong()); + self.add_ingester_client(fixture.strong()); + self.router = Some(fixture); self } /// create an ingester with the specified configuration; pub async fn with_ingester(mut self, ingester_config: TestConfig) -> Self { - self.ingesters - .push(ServerFixture::create(ingester_config).await); + let fixture = ServerFixture::create(ingester_config).await; + self.add_catalog_reverse_proxy_client(fixture.strong()); + self.ingesters.push(fixture); self } + fn add_ingester_client(&self, client: Arc) { + for ingester in &self.ingesters { + let ingester = ingester.strong(); + link_services(ingester, Arc::clone(&client)); + } + } + /// create a querier with the specified configuration; pub async fn with_querier(mut self, querier_config: TestConfig) -> Self { - self.querier = Some(ServerFixture::create(querier_config).await); + assert!(self.querier.is_none()); + let fixture = ServerFixture::create(querier_config).await; + self.add_catalog_reverse_proxy_client(fixture.strong()); + self.add_ingester_client(fixture.strong()); + self.querier = Some(fixture); self } @@ -302,6 +365,34 @@ impl MiniCluster { self } + /// create an catalog with the specified configuration; + pub async fn with_catalog(mut self, 
catalog_configs: [TestConfig; 3]) -> Self { + assert!(self.catalog.is_empty()); + self.catalog = ServerFixture::create_multiple(catalog_configs).await; + self + } + + fn add_catalog_client(&self, client: Arc) { + for catalog in &self.catalog { + let catalog = catalog.strong(); + link_services(catalog, Arc::clone(&client)); + } + } + + /// Register catalog reverse proxy. + pub fn with_catalog_reverse_proxy(mut self, proxy: Arc) -> Self { + assert!(self.catalog_reverse_proxy.is_none()); + self.add_catalog_client(Arc::clone(&proxy) as _); + self.catalog_reverse_proxy = Some(proxy); + self + } + + fn add_catalog_reverse_proxy_client(&self, client: Arc) { + if let Some(proxy) = &self.catalog_reverse_proxy { + link_services(Arc::clone(proxy) as _, client); + } + } + /// Retrieve the underlying router server, if set pub fn router(&self) -> &ServerFixture { self.router.as_ref().expect("router not initialized") @@ -344,8 +435,10 @@ impl MiniCluster { /// /// [`GRACEFUL_SERVER_STOP_TIMEOUT`]: /// crate::server_fixture::GRACEFUL_SERVER_STOP_TIMEOUT - pub fn gracefully_stop_ingesters(&mut self) { - self.ingesters = vec![]; + pub async fn gracefully_stop_ingesters(&mut self) { + for ingester in self.ingesters.drain(..) { + ingester.shutdown().await; + } } /// Restart querier. @@ -485,7 +578,7 @@ impl MiniCluster { /// org/bucket pub async fn write_to_router( &self, - line_protocol: impl Into, + line_protocol: impl Into + Send, authorization: Option<&str>, ) -> Response { write_to_router( @@ -499,7 +592,11 @@ impl MiniCluster { } /// Write to the ingester using the gRPC interface directly, rather than through a router. - pub async fn write_to_ingester(&self, line_protocol: impl Into, table_name: &str) { + pub async fn write_to_ingester( + &self, + line_protocol: impl Into + Send, + table_name: &str, + ) { write_to_ingester( line_protocol, self.namespace_id().await, @@ -675,6 +772,8 @@ struct SharedServers { ingesters: Vec>, querier: Option>, compactor_config: Option, + catalog: Vec>, + catalog_reverse_proxy: Option>, } /// Deferred creation of a mini cluster @@ -683,6 +782,8 @@ struct CreatableMiniCluster { ingesters: Vec>, querier: Option>, compactor_config: Option, + catalog: Vec>, + catalog_reverse_proxy: Option>, } async fn create_if_needed(server: Option>) -> Option { @@ -693,6 +794,17 @@ async fn create_if_needed(server: Option>) -> Option> + Send, +) -> Vec { + servers + .into_iter() + .map(|server| async move { ServerFixture::create_from_existing(server).await }) + .collect::>() + .collect::>() + .await +} + impl CreatableMiniCluster { async fn create(self) -> MiniCluster { let Self { @@ -700,37 +812,36 @@ impl CreatableMiniCluster { ingesters, querier, compactor_config, + catalog, + catalog_reverse_proxy, } = self; let router_fixture = create_if_needed(router).await; - let ingester_fixtures = ingesters - .into_iter() - .map(|ingester| create_if_needed(Some(ingester))) - .collect::>() - .collect::>() - .await - .into_iter() - .flatten() - .collect(); + let ingester_fixtures = create_if_needed_many(ingesters).await; let querier_fixture = create_if_needed(querier).await; + let catalog_fixtures = create_if_needed_many(catalog).await; MiniCluster::new_from_fixtures( router_fixture, ingester_fixtures, querier_fixture, compactor_config, + catalog_fixtures, + catalog_reverse_proxy, ) } } impl SharedServers { /// Save the server processes in this shared servers as weak references - pub fn new(cluster: &MiniCluster) -> Self { + pub(crate) fn new(cluster: &MiniCluster) -> Self { Self { router: 
cluster.router.as_ref().map(|c| c.weak()), ingesters: cluster.ingesters.iter().map(|c| c.weak()).collect(), querier: cluster.querier.as_ref().map(|c| c.weak()), compactor_config: cluster.compactor_config.clone(), + catalog: cluster.catalog.iter().map(|c| c.weak()).collect(), + catalog_reverse_proxy: cluster.catalog_reverse_proxy.as_ref().map(Arc::downgrade), } } @@ -742,13 +853,11 @@ impl SharedServers { // aren't present so that the cluster is recreated correctly Some(CreatableMiniCluster { router: server_from_weak(self.router.as_ref())?, - ingesters: self - .ingesters - .iter() - .flat_map(|ingester| server_from_weak(Some(ingester)).unwrap()) - .collect(), + ingesters: servers_from_weak(&self.ingesters)?, querier: server_from_weak(self.querier.as_ref())?, compactor_config: self.compactor_config.clone(), + catalog: servers_from_weak(&self.catalog)?, + catalog_reverse_proxy: server_from_weak(self.catalog_reverse_proxy.as_ref())?, }) } } @@ -756,7 +865,7 @@ impl SharedServers { /// Returns None if there was a weak server but we couldn't upgrade. /// Returns Some(None) if there was no weak server /// Returns Some(Some(fixture)) if there was a weak server that we can upgrade and make a fixture from -fn server_from_weak(server: Option<&Weak>) -> Option>> { +fn server_from_weak(server: Option<&Weak>) -> Option>> { if let Some(server) = server.as_ref() { // return None if can't upgrade let server = server.upgrade()?; @@ -767,6 +876,20 @@ fn server_from_weak(server: Option<&Weak>) -> Option(servers: impl IntoIterator>) -> Option>> +where + T: 'a, +{ + let mut out = vec![]; + + for server in servers { + out.push(server.upgrade()?); + } + + Some(out) +} + static GLOBAL_SHARED_SERVERS: Lazy>> = Lazy::new(|| Mutex::new(None)); static GLOBAL_SHARED_SERVERS_NEVER_PERSIST: Lazy>> = Lazy::new(|| Mutex::new(None)); diff --git a/test_helpers_end_to_end/src/server_fixture.rs b/test_helpers_end_to_end/src/server_fixture.rs index 1e667885cee..03ce374de7e 100644 --- a/test_helpers_end_to_end/src/server_fixture.rs +++ b/test_helpers_end_to_end/src/server_fixture.rs @@ -16,13 +16,18 @@ use tempfile::NamedTempFile; use test_helpers::timeout::FutureTimeout; use tokio::sync::Mutex; -use crate::{database::initialize_db, dump_log_to_stdout, log_command, server_type::AddAddrEnv}; +use crate::{ + database::initialize_db, + dump_log_to_stdout, log_command, + server_type::AddAddrEnv, + service_link::{link_services, unlink_services, LinkableService, LinkableServiceImpl}, +}; use super::{addrs::BindAddresses, ServerType, TestConfig}; /// The duration of time a [`TestServer`] is given to gracefully shutdown after /// receiving a SIGTERM, before a SIGKILL is sent to kill it. -pub const GRACEFUL_SERVER_STOP_TIMEOUT: Duration = Duration::from_secs(5); +pub(crate) const GRACEFUL_SERVER_STOP_TIMEOUT: Duration = Duration::from_secs(5); /// Represents a server that has been started and is available for /// testing. @@ -45,6 +50,20 @@ impl ServerFixture { Self::create_from_existing(Arc::new(server)).await } + /// Create multiple, potentially interdependent sever fixtures concurrently because [`create](Self::create) only + /// returns when health is OK. 
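`with_catalog` relies on `ServerFixture::create_multiple`, documented above and defined next, because the three catalog nodes list each other as peers and only report healthy once started together. A sketch of the same call in isolation, assuming the `ServerFixture` and `TestConfig` re-exports:

```rust
use test_helpers_end_to_end::{ServerFixture, TestConfig};

// Sketch: start the three catalog cache nodes concurrently. Starting them
// one at a time can stall on the readiness check, because each node waits
// for its peers to come up.
async fn example(dsn: String) {
    let configs = TestConfig::catalog_nodes(dsn);
    let fixtures: Vec<ServerFixture> = ServerFixture::create_multiple(configs).await;
    assert_eq!(fixtures.len(), 3);
}
```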
+ pub async fn create_multiple( + test_configs: impl IntoIterator + Send, + ) -> Vec { + let test_configs = test_configs.into_iter().collect::>(); + let n_configs = test_configs.len(); + futures::stream::iter(test_configs) + .map(|cfg| async move { Self::create(cfg).await }) + .buffered(n_configs) + .collect::>() + .await + } + /// Create a new server fixture that shares the same TestServer, /// but has its own connections pub(crate) async fn create_from_existing(server: Arc) -> Self { @@ -62,21 +81,50 @@ impl ServerFixture { /// /// This will break all currently connected clients! pub async fn restart_server(self) -> Self { + // unlink clients because we are going to drop the server + let clients = unlink_services(Arc::clone(&self.server) as _); + // get the underlying server, if possible let mut server = match Arc::try_unwrap(self.server) { Ok(s) => s, Err(_) => panic!("Can not restart server as it is shared"), }; + // disconnect so server doesn't wait for our client + drop(self.connections); + server.restart().await; let connections = server.wait_until_ready().await; + let server = Arc::new(server); + + // relink clients + for client in clients { + link_services(Arc::clone(&server) as _, client); + } Self { - server: Arc::new(server), + server, connections, } } + /// Shutdown server in a clean way and wait for process to exit. + pub async fn shutdown(self) { + // unlink clients because we are going to drop the server + unlink_services(Arc::clone(&self.server) as _); + + // get the underlying server, if possible + let mut server = match Arc::try_unwrap(self.server) { + Ok(s) => s, + Err(_) => panic!("Can not restart server as it is shared"), + }; + + // disconnect so server doesn't wait for our client + drop(self.connections); + + server.stop().await; + } + pub fn connections(&self) -> &Connections { &self.connections } @@ -118,9 +166,24 @@ impl ServerFixture { self.server.addrs().querier_grpc_api().client_base() } + /// Return the http base URL for the catalog HTTP API + pub fn catalog_http_base(&self) -> Arc { + self.server.addrs().catalog_http_api().client_base() + } + + /// Return the grpc base URL for the catalog gRPC API + pub fn catalog_grpc_base(&self) -> Arc { + self.server.addrs().catalog_grpc_api().client_base() + } + /// Return log path for server process. 
- pub async fn log_path(&self) -> Box { - self.server.server_process.lock().await.log_path.clone() + pub fn log_path(&self) -> Box { + self.server.log_path.clone() + } + + /// Get a strong reference to the underlying `TestServer` + pub(crate) fn strong(&self) -> Arc { + Arc::clone(&self.server) } /// Get a weak reference to the underlying `TestServer` @@ -136,6 +199,7 @@ enum ServerState { Starting, Ready, Error, + Stopped, } /// Mananges some number of gRPC connections @@ -149,6 +213,9 @@ pub struct Connections { /// connection to querier gRPC, if available querier_grpc_connection: Option, + + /// connection to catalog gRPC, if available + catalog_grpc_connection: Option, } impl Connections { @@ -183,6 +250,14 @@ impl Connections { .clone() } + /// Return a channel connected to the gRPC API, panic'ing if not the correct type of server + pub fn catalog_grpc_connection(&self) -> Connection { + self.catalog_grpc_connection + .as_ref() + .expect("Server type does not have router") + .clone() + } + /// (re)establish channels to all gRPC services that were started with the specified test config async fn reconnect(&mut self, test_config: &TestConfig) -> Result<(), String> { let server_type = test_config.server_type(); @@ -223,6 +298,18 @@ impl Connections { _ => None, }; + self.catalog_grpc_connection = match server_type { + ServerType::Catalog => { + let client_base = test_config.addrs().catalog_grpc_api().client_base(); + Some( + grpc_channel(test_config, client_base.as_ref()) + .await + .map_err(|e| format!("Cannot connect to catalog at {client_base}: {e}"))?, + ) + } + _ => None, + }; + Ok(()) } } @@ -250,31 +337,38 @@ pub struct TestServer { /// Is the server ready to accept connections? ready: Mutex, + /// Path to log file. + log_path: Box, + /// Handle to the server process being controlled - server_process: Arc>, + server_process: Arc>>, /// Configuration values for starting the test server test_config: TestConfig, -} -#[derive(Debug)] -struct Process { - child: Child, - log_path: Box, + /// Service links. + links: LinkableServiceImpl, } impl TestServer { async fn new(test_config: TestConfig) -> Self { let ready = Mutex::new(ServerState::Started); - let server_process = Arc::new(Mutex::new( - Self::create_server_process(&test_config, None).await, - )); + let (_log_file, log_path) = NamedTempFile::new() + .expect("opening log file") + .keep() + .expect("expected to keep"); + + let server_process = Arc::new(Mutex::new(Some( + Self::create_server_process(&test_config, &log_path).await, + ))); Self { ready, + log_path: log_path.into_boxed_path(), server_process, test_config, + links: Default::default(), } } @@ -283,39 +377,57 @@ impl TestServer { self.test_config.addrs() } + /// Stop server. 
+ async fn stop(&mut self) { + let mut ready_guard = self.ready.lock().await; + let mut server_lock = self.server_process.lock().await; + + Self::stop_inner( + &mut ready_guard, + &mut server_lock, + self.test_config.server_type(), + ) + .await; + } + + async fn stop_inner( + ready: &mut ServerState, + server_process: &mut Option, + t: ServerType, + ) { + let server_process = server_process.take().expect("server process exists"); + tokio::task::spawn_blocking(move || { + kill_politely(server_process, Duration::from_secs(5), t); + }) + .await + .expect("kill politely worked"); + + *ready = ServerState::Stopped; + } + /// Restarts the tests server process, but does not reconnect clients async fn restart(&mut self) { let mut ready_guard = self.ready.lock().await; - let mut server_process = self.server_process.lock().await; - kill_politely(&mut server_process.child, Duration::from_secs(5)); - *server_process = - Self::create_server_process(&self.test_config, Some(server_process.log_path.clone())) - .await; + let mut server_lock = self.server_process.lock().await; + + Self::stop_inner( + &mut ready_guard, + &mut server_lock, + self.test_config.server_type(), + ) + .await; + + *server_lock = Some(Self::create_server_process(&self.test_config, &self.log_path).await); *ready_guard = ServerState::Started; } - async fn create_server_process( - test_config: &TestConfig, - log_path: Option>, - ) -> Process { + async fn create_server_process(test_config: &TestConfig, log_path: &Path) -> Child { // Create a new file each time and keep it around to aid debugging - let (log_file, log_path) = match log_path { - Some(log_path) => ( - OpenOptions::new() - .read(true) - .append(true) - .open(&log_path) - .expect("log file should still be there"), - log_path, - ), - None => { - let (log_file, log_path) = NamedTempFile::new() - .expect("opening log file") - .keep() - .expect("expected to keep"); - (log_file, log_path.into_boxed_path()) - } - }; + let log_file = OpenOptions::new() + .read(true) + .append(true) + .open(log_path) + .expect("log file should still be there"); let stdout_log_file = log_file .try_clone() @@ -362,9 +474,7 @@ impl TestServer { log_command(command); - let child = command.spawn().unwrap(); - - Process { child, log_path } + command.spawn().unwrap() } /// Polls the various services to ensure the server is @@ -387,6 +497,9 @@ impl TestServer { ServerState::Error => { panic!("Server was previously found to be in Error, aborting"); } + ServerState::Stopped => { + panic!("Server was stopped"); + } }; } @@ -416,14 +529,16 @@ impl TestServer { let server_process = Arc::clone(&self.server_process); let try_http_connect = async { let client = reqwest::Client::new(); - let url = format!("{}/health", self.addrs().router_http_api().client_base()); - let mut interval = tokio::time::interval(Duration::from_millis(1000)); + let url = format!("{}/health", self.test_config.http_base()); + let mut interval = tokio::time::interval(Duration::from_millis(100)); loop { if server_dead(server_process.as_ref()).await { break; } match client.get(&url).send().await { - Ok(resp) => { + Ok(resp) + if resp.status().is_success() || !self.test_config.wait_for_ready() => + { info!( "Successfully got a response from {:?} HTTP: {:?}", self.test_config.server_type(), @@ -431,6 +546,14 @@ impl TestServer { ); return; } + Ok(resp) => { + info!( + "Waiting for {:?} HTTP server to be up: {:?}", + self.test_config.server_type(), + resp + ); + return; + } Err(e) => { info!( "Waiting for {:?} HTTP server to be up: {}", @@ -471,7 
+594,7 @@ impl TestServer { pub async fn wait_for_grpc(&self, connections: &Connections) { let server_process = Arc::clone(&self.server_process); - let mut interval = tokio::time::interval(Duration::from_millis(1000)); + let mut interval = tokio::time::interval(Duration::from_millis(100)); let server_type = self.test_config.server_type(); loop { @@ -486,6 +609,20 @@ impl TestServer { `influxdb_iox compactor run-once` instead" ); } + ServerType::Catalog => { + if check_catalog_v2_service_health( + server_type, + connections.catalog_grpc_connection(), + self.test_config.wait_for_ready(), + ) + .await + { + return; + } + } + ServerType::ParquetCache => { + unimplemented!("ParquetCache server should not use grpc, only http"); + } ServerType::Router => { if check_catalog_service_health( server_type, @@ -544,6 +681,24 @@ impl TestServer { } } +impl LinkableService for TestServer { + fn add_link_client(&self, client: Weak) { + self.links.add_link_client(client) + } + + fn remove_link_clients(&self) -> Vec> { + self.links.remove_link_clients() + } + + fn add_link_server(&self, server: Arc) { + self.links.add_link_server(server) + } + + fn remove_link_server(&self, server: Arc) { + self.links.remove_link_server(server) + } +} + /// checks catalog service health, as a proxy for all gRPC /// services. Returns false if the service should be checked again async fn check_catalog_service_health(server_type: ServerType, connection: Connection) -> bool { @@ -568,6 +723,35 @@ async fn check_catalog_service_health(server_type: ServerType, connection: Conne } } +/// checks catalog service V2 health, as a proxy for all gRPC +/// services. Returns false if the service should be checked again +async fn check_catalog_v2_service_health( + server_type: ServerType, + connection: Connection, + wait_for_ready: bool, +) -> bool { + let mut health = influxdb_iox_client::health::Client::new(connection); + + match health + .check("influxdata.iox.catalog.v2.CatalogService") + .await + { + Ok(ready) => { + if ready || !wait_for_ready { + info!("CatalogService service {:?} is running", server_type); + true + } else { + info!("CatalogService {:?} is not running", server_type); + false + } + } + Err(e) => { + info!("CatalogService {:?} not yet healthy: {:?}", server_type, e); + false + } + } +} + /// checks the arrow service service health, returning false if the service should be checked again async fn check_arrow_service_health(server_type: ServerType, connection: Connection) -> bool { let mut health = influxdb_iox_client::health::Client::new(connection); @@ -606,24 +790,74 @@ impl Drop for TestServer { .try_lock() .expect("should be able to get a server process lock"); - server_dead_inner(server_lock.deref_mut()); - kill_politely(&mut server_lock.child, GRACEFUL_SERVER_STOP_TIMEOUT); + if let Some(server_process) = server_lock.take() { + let test_config = self.test_config.clone(); + let log_path = self.log_path.clone(); + let links = self.links.clone(); - dump_log_to_stdout( - &format!("{:?}", self.test_config.server_type()), - &server_lock.log_path, - ); + let kill_and_dump = move || { + kill_politely( + server_process, + GRACEFUL_SERVER_STOP_TIMEOUT, + test_config.server_type(), + ); + + dump_log_to_stdout(&format!("{:?}", test_config.server_type()), &log_path); + + // keep links til server is actually gone + drop(links); + + // keep test config til the very last because it contains the WAL dir + drop(test_config); + }; + + // if there's still a tokio runtime around, use that to help the shut down process, because our 
client + // connections need to interact with the HTTP/2 shutdown and we shall not block the runtime during that + match tokio::runtime::Handle::try_current() { + Ok(handle) => { + // tokio might decide to not schedule our future, in which case we still want to kill the child, so + // we wrap the kill method into a helper that is either executed within a tokio context or is + // executed when tokio drops it. + let mut kill_and_dump = ExecOnDrop(Some(Box::new(kill_and_dump))); + handle.spawn_blocking(move || { + kill_and_dump.maybe_exec(); + }); + } + Err(_) => { + kill_and_dump(); + } + } + } + } +} + +struct ExecOnDrop(Option>); + +impl ExecOnDrop { + fn maybe_exec(&mut self) { + if let Some(f) = self.0.take() { + f(); + } + } +} + +impl Drop for ExecOnDrop { + fn drop(&mut self) { + self.maybe_exec(); } } /// returns true if the server process has exited (for any reason), and /// prints what happened to stdout -async fn server_dead(server_process: &Mutex) -> bool { - server_dead_inner(server_process.lock().await.deref_mut()) +async fn server_dead(server_process: &Mutex>) -> bool { + match server_process.lock().await.deref_mut() { + Some(server_process) => server_dead_inner(server_process), + None => true, + } } -fn server_dead_inner(server_process: &mut Process) -> bool { - match server_process.child.try_wait() { +fn server_dead_inner(server_process: &mut Child) -> bool { + match server_process.try_wait() { Ok(None) => false, Ok(Some(status)) => { warn!("Server process exited: {}", status); @@ -637,7 +871,16 @@ fn server_dead_inner(server_process: &mut Process) -> bool { } /// Attempt to kill a child process politely. -fn kill_politely(child: &mut Child, wait: Duration) { +fn kill_politely(mut child: Child, wait: Duration, t: ServerType) { + if server_dead_inner(&mut child) { + // fast path + return; + } + + kill_politely_inner(&mut child, wait, t); +} + +fn kill_politely_inner(child: &mut Child, wait: Duration, t: ServerType) { use nix::{ sys::{ signal::{self, Signal}, @@ -652,23 +895,23 @@ fn kill_politely(child: &mut Child, wait: Duration) { let wait_errored = match signal::kill(pid, Signal::SIGTERM) { Ok(()) => wait_timeout(pid, wait).is_err(), Err(e) => { - info!("Error sending SIGTERM to child: {e}"); + info!("Error sending SIGTERM to child ({t:?}): {e}"); true } }; if wait_errored { // timeout => kill it - info!("Cannot terminate child politely, using SIGKILL..."); + warn!("Cannot terminate child ({t:?}) politely, using SIGKILL..."); if let Err(e) = signal::kill(pid, Signal::SIGKILL) { - info!("Error sending SIGKILL to child: {e}"); + info!("Error sending SIGKILL to child ({t:?}): {e}"); } if let Err(e) = waitpid(pid, None) { - info!("Cannot wait for child: {e}"); + info!("Cannot wait for child ({t:?}): {e}"); } } else { - info!("Killed child politely"); + info!("Killed child ({t:?}) politely"); } } diff --git a/test_helpers_end_to_end/src/server_type.rs b/test_helpers_end_to_end/src/server_type.rs index 3cd4a346031..ab23f8217b9 100644 --- a/test_helpers_end_to_end/src/server_type.rs +++ b/test_helpers_end_to_end/src/server_type.rs @@ -7,6 +7,8 @@ pub enum ServerType { Router, Querier, Compactor, + Catalog, + ParquetCache, } impl ServerType { @@ -18,6 +20,8 @@ impl ServerType { Self::Router => "router", Self::Querier => "querier", Self::Compactor => "compactor", + Self::Catalog => "catalog", + Self::ParquetCache => "parquet-cache", } } } @@ -66,12 +70,25 @@ fn addr_envs(server_type: ServerType, addrs: &BindAddresses) -> Vec<(&'static st ServerType::Ingester => vec![ ( 
"INFLUXDB_IOX_BIND_ADDR", - addrs.router_http_api().bind_addr().to_string(), + addrs.ingester_http_api().bind_addr().to_string(), ), ( "INFLUXDB_IOX_GRPC_BIND_ADDR", addrs.ingester_grpc_api().bind_addr().to_string(), ), + ( + "INFLUXDB_IOX_GOSSIP_BIND_ADDR", + addrs.ingester_gossip_api().bind_addr().to_string(), + ), + ( + "INFLUXDB_IOX_GOSSIP_SEED_LIST", + addrs + .all_gossip_apis() + .into_iter() + .map(|a| a.bind_addr().to_string()) + .collect::>() + .join(","), + ), ], ServerType::Router => vec![ ( @@ -86,26 +103,79 @@ fn addr_envs(server_type: ServerType, addrs: &BindAddresses) -> Vec<(&'static st "INFLUXDB_IOX_INGESTER_ADDRESSES", addrs.ingester_grpc_api().bind_addr().to_string(), ), + ( + "INFLUXDB_IOX_GOSSIP_BIND_ADDR", + addrs.router_gossip_api().bind_addr().to_string(), + ), + ( + "INFLUXDB_IOX_GOSSIP_SEED_LIST", + addrs + .all_gossip_apis() + .into_iter() + .map(|a| a.bind_addr().to_string()) + .collect::>() + .join(","), + ), ], ServerType::Querier => vec![ ( "INFLUXDB_IOX_BIND_ADDR", - addrs.router_http_api().bind_addr().to_string(), + addrs.querier_http_api().bind_addr().to_string(), ), ( "INFLUXDB_IOX_GRPC_BIND_ADDR", addrs.querier_grpc_api().bind_addr().to_string(), ), + ( + "INFLUXDB_IOX_GOSSIP_BIND_ADDR", + addrs.querier_gossip_api().bind_addr().to_string(), + ), + ( + "INFLUXDB_IOX_GOSSIP_SEED_LIST", + addrs + .all_gossip_apis() + .into_iter() + .map(|a| a.bind_addr().to_string()) + .collect::>() + .join(","), + ), ], ServerType::Compactor => vec![ ( "INFLUXDB_IOX_BIND_ADDR", - addrs.router_http_api().bind_addr().to_string(), + addrs.compactor_http_api().bind_addr().to_string(), ), ( "INFLUXDB_IOX_GRPC_BIND_ADDR", addrs.compactor_grpc_api().bind_addr().to_string(), ), + ( + "INFLUXDB_IOX_GOSSIP_BIND_ADDR", + addrs.compactor_gossip_api().bind_addr().to_string(), + ), + ( + "INFLUXDB_IOX_GOSSIP_SEED_LIST", + addrs + .all_gossip_apis() + .into_iter() + .map(|a| a.bind_addr().to_string()) + .collect::>() + .join(","), + ), + ], + ServerType::Catalog => vec![ + ( + "INFLUXDB_IOX_BIND_ADDR", + addrs.catalog_http_api().bind_addr().to_string(), + ), + ( + "INFLUXDB_IOX_GRPC_BIND_ADDR", + addrs.catalog_grpc_api().bind_addr().to_string(), + ), ], + ServerType::ParquetCache => vec![( + "INFLUXDB_IOX_BIND_ADDR", + addrs.parquet_cache_http_api().bind_addr().to_string(), + )], } } diff --git a/test_helpers_end_to_end/src/service_link.rs b/test_helpers_end_to_end/src/service_link.rs new file mode 100644 index 00000000000..e649f340b19 --- /dev/null +++ b/test_helpers_end_to_end/src/service_link.rs @@ -0,0 +1,99 @@ +//! Helpers to ensure service links are respected during shutdown. +//! +//! This does NOT affect correctness of the tests but often speeds them up because clients (like the ingester +//! communicating with the catalog) no longer get stuck on retries during the shutdown phase (and would be killed after +//! a timeout). +use std::sync::{Arc, Weak}; + +use parking_lot::Mutex; + +/// An abstract service that can be linked in a client-server relationship +pub(crate) trait LinkableService: std::fmt::Debug + Send + Sync { + /// Add new known client. + /// + /// **NOTE: This does NOT perform the opposite operation ([`add_link_server`](Self::add_link_server)) for the + /// client. Use [`link_services`] instead.** + fn add_link_client(&self, client: Weak); + + /// Unlink all clients from this service. + /// + /// **NOTE: This does NOT perform the opposite operation ([`remove_link_server`](Self::remove_link_server)) for the + /// returned clients. 
Use [`unlink_services`] instead.** + fn remove_link_clients(&self) -> Vec>; + + /// Add new known server that should be kept alive until the client is gone. + /// + /// **NOTE: This does NOT perform the opposite operation ([`add_link_client`](Self::add_link_client)) for the + /// client. Use [`link_services`] instead.** + fn add_link_server(&self, server: Arc); + + /// Remove given server. + /// + /// The server will no longer kept alive. This is a no-op if the server is unknown. + /// + /// **NOTE: This does NOT perform the opposite operation ([`remove_link_clients`](Self::remove_link_clients)) for the + /// server. Use [`unlink_services`] instead.** + fn remove_link_server(&self, server: Arc); +} + +/// Simple implementation of [`LinkableService`] that can be used as a struct member. +/// +/// Using this as a struct member and NOT directly is important so that the tracked [`Arc`]s use the actual service +/// struct, not this helper. +#[derive(Debug, Default)] +pub(crate) struct LinkableServiceImpl { + clients: Mutex>>, + servers: Mutex>>, +} + +impl LinkableService for LinkableServiceImpl { + fn add_link_client(&self, client: Weak) { + self.clients.lock().push(client); + } + + fn remove_link_clients(&self) -> Vec> { + let mut guard = self.clients.lock(); + guard + .drain(..) + .filter_map(|client| client.upgrade()) + .collect() + } + + fn add_link_server(&self, server: Arc) { + self.servers.lock().push(server); + } + + fn remove_link_server(&self, server: Arc) { + self.servers + .lock() + .retain(|server2| !Arc::ptr_eq(&server, server2)); + } +} + +impl Clone for LinkableServiceImpl { + fn clone(&self) -> Self { + let clients = self.clients.lock(); + let server = self.servers.lock(); + Self { + clients: Mutex::new(clients.clone()), + servers: Mutex::new(server.clone()), + } + } +} + +/// Cross-link server and client. +pub(crate) fn link_services(server: Arc, client: Arc) { + server.add_link_client(Arc::downgrade(&client)); + client.add_link_server(server); +} + +/// Unlink clients from a given server so it is no longer kept alive. +/// +/// The known clients are returned so they can potentially be re-linked. 
+pub(crate) fn unlink_services(server: Arc) -> Vec> { + let clients = server.remove_link_clients(); + for client in &clients { + client.remove_link_server(Arc::clone(&server)); + } + clients +} diff --git a/test_helpers_end_to_end/src/snapshot_comparison.rs b/test_helpers_end_to_end/src/snapshot_comparison.rs index 6803a35beb0..1aeced7322b 100644 --- a/test_helpers_end_to_end/src/snapshot_comparison.rs +++ b/test_helpers_end_to_end/src/snapshot_comparison.rs @@ -9,12 +9,11 @@ use arrow_util::test_util::{sort_record_batch, Normalizer, REGEX_UUID}; use influxdb_iox_client::format::influxql::{write_columnar, Options, TableBorders}; use once_cell::sync::Lazy; use regex::{Captures, Regex}; -use snafu::{OptionExt, ResultExt, Snafu}; +use snafu::{OptionExt, Snafu}; use sqlx::types::Uuid; use std::collections::HashMap; use std::{ fmt::{Display, Formatter}, - fs, path::{Path, PathBuf}, }; use tonic::Code; @@ -31,28 +30,6 @@ pub enum Error { #[snafu(context(false))] MakingOutputPath { source: OutputPathError }, - - #[snafu(display("Could not write to output file '{:?}': {}", output_path, source))] - WritingToOutputFile { - output_path: PathBuf, - source: std::io::Error, - }, - - #[snafu(display("Could not read expected file '{:?}': {}", path, source))] - ReadingExpectedFile { - path: PathBuf, - source: std::io::Error, - }, - - #[snafu(display( - "Contents of output '{:?}' does not match contents of expected '{:?}'", - output_path, - expected_path, - ))] - OutputMismatch { - output_path: PathBuf, - expected_path: PathBuf, - }, } pub type Result = std::result::Result; @@ -146,96 +123,51 @@ impl Display for Language { pub async fn run( cluster: &mut MiniCluster, - input_path: PathBuf, + input_file_path: PathBuf, setup_name: String, contents: String, language: Language, ) -> Result<()> { // create output and expected output - let output_path = make_output_path(&input_path)?; - let expected_path = { - let mut p = input_path.clone(); - let ext = p - .extension() - .expect("input path missing extension") - .to_str() - .expect("input path extension is not valid UTF-8"); - p.set_extension(format!("{ext}.expected")); - p - }; + let test_name = input_file_path + .file_name() + .expect("input path missing file path") + .to_str() + .expect("input path file path is not valid UTF-8"); + + let output_path = input_file_path.parent().context(NoParentSnafu { + path: &input_file_path, + })?; + let output_path = make_absolute(output_path); - println!("Running case in {input_path:?}"); - println!(" writing output to {output_path:?}"); - println!(" expected output in {expected_path:?}"); + println!("Running case in {input_file_path:?}"); + println!("Producing output in {output_path:?}"); println!("Processing contents:\n{contents}"); let queries = TestQueries::from_lines(contents.lines(), language); + //Build up the test output line by line let mut output = vec![]; output.push(format!("-- Test Setup: {setup_name}")); for q in queries.iter() { + q.add_comments(&mut output); output.push(format!("-- {}: {}", language, q.text())); q.add_description(&mut output); let results = run_query(cluster, q).await?; output.extend(results); } - fs::write(&output_path, output.join("\n")).context(WritingToOutputFileSnafu { - output_path: &output_path, - })?; - - // Now, compare to expected results - let expected_data = fs::read_to_string(&expected_path).context(ReadingExpectedFileSnafu { - path: &expected_path, - })?; - let expected_contents: Vec<_> = expected_data.lines().map(|s| s.to_string()).collect(); - - if expected_contents != output 
{ - let expected_path = make_absolute(&expected_path); - let output_path = make_absolute(&output_path); + // Configure insta to send the results to query_tests/out/.sql.snap + let mut settings = insta::Settings::clone_current(); + settings.set_snapshot_path(output_path); + settings.set_prepend_module_to_snapshot(false); + settings.bind(|| { + let test_output = output.join("\n"); + insta::assert_snapshot!(test_name, test_output); // panic on failure + }); - if std::env::var("CI") - .map(|value| value == "true") - .unwrap_or(false) - { - // In CI, print out the contents because it's inconvenient to access the files and - // you're not going to update the files there. - println!("Expected output does not match actual output"); - println!( - "Diff: \n\n{}", - String::from_utf8( - std::process::Command::new("diff") - .arg("-du") - .arg(&expected_path) - .arg(&output_path) - .output() - .unwrap() - .stdout - ) - .unwrap() - ); - } else { - // When you're not in CI, print out instructions for analyzing the content or updating - // the snapshot. - println!("Expected output does not match actual output"); - println!(" expected output in {expected_path:?}"); - println!(" actual output in {output_path:?}"); - println!("Possibly helpful commands:"); - println!(" # See diff"); - println!(" diff -du {expected_path:?} {output_path:?}"); - println!(" # Update expected"); - println!(" cp -f {output_path:?} {expected_path:?}"); - } - - OutputMismatchSnafu { - output_path, - expected_path, - } - .fail() - } else { - Ok(()) - } + Ok(()) } #[derive(Debug, Snafu)] @@ -250,41 +182,6 @@ pub enum OutputPathError { NoParent { path: PathBuf }, } -/// Return output path for input path. -/// -/// This converts `some/prefix/in/foo.sql` (or other file extensions) to `some/prefix/out/foo.sql.out`. 
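With the hand-rolled `.expected` comparison removed above, snapshot checking now follows the `insta` pattern introduced earlier in this hunk. A condensed sketch of that pattern, using only the `insta` APIs that appear in the replacement code:

```rust
// Sketch: redirect insta snapshots next to the input file instead of the
// crate's default snapshot directory, mirroring the replacement above.
fn assert_case_snapshot(case_dir: &std::path::Path, test_name: &str, output: &str) {
    let mut settings = insta::Settings::clone_current();
    settings.set_snapshot_path(case_dir);
    settings.set_prepend_module_to_snapshot(false);
    settings.bind(|| {
        // Panics (failing the test) if `output` differs from the stored
        // `<test_name>.snap`; `cargo insta review` accepts intended changes.
        insta::assert_snapshot!(test_name, output);
    });
}
```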
-fn make_output_path(input: &Path) -> Result { - let stem = input.file_stem().context(NoFileStemSnafu { path: input })?; - let ext = input - .extension() - .context(MissingFileExtSnafu { path: input })?; - - // go two levels up (from file to dir, from dir to parent dir) - let parent = input.parent().context(NoParentSnafu { path: input })?; - let parent = parent.parent().context(NoParentSnafu { path: parent })?; - let mut out = parent.to_path_buf(); - - // go one level down (from parent dir to out-dir) - out.push("out"); - - // make best effort attempt to create output directory if it - // doesn't exist (it does not on a fresh checkout) - if !out.exists() { - if let Err(e) = std::fs::create_dir(&out) { - panic!("Could not create output directory {out:?}: {e}"); - } - } - - // set file name and ext - out.push(stem); - out.set_extension(format!( - "{}.out", - ext.to_str().expect("extension is not valid UTF-8") - )); - - Ok(out) -} - /// Return the absolute path to `path`, regardless of if it exists on the local filesystem fn make_absolute(path: &Path) -> PathBuf { let mut absolute = std::env::current_dir().expect("cannot get current working directory"); diff --git a/test_helpers_end_to_end/src/snapshot_comparison/queries.rs b/test_helpers_end_to_end/src/snapshot_comparison/queries.rs index 259f88ec721..1008af4a8df 100644 --- a/test_helpers_end_to_end/src/snapshot_comparison/queries.rs +++ b/test_helpers_end_to_end/src/snapshot_comparison/queries.rs @@ -4,13 +4,16 @@ use arrow_util::test_util::Normalizer; /// A query to run with optional annotations #[derive(Debug, PartialEq, Eq, Default)] -pub struct Query { +pub(crate) struct Query { /// Describes how query text should be normalized normalizer: Normalizer, /// Specifies the query language of `text`. language: Language, + /// Comments that precede the query + comments: Vec, + /// The query string text: String, } @@ -22,58 +25,75 @@ impl Query { Self { normalizer: Normalizer::new(), language: Language::Sql, + comments: vec![], text, } } - pub fn text(&self) -> &str { + pub(crate) fn text(&self) -> &str { &self.text } - pub fn language(&self) -> Language { + pub(crate) fn language(&self) -> Language { self.language } - pub fn with_sorted_compare(mut self) -> Self { + /// Add a comment to the query + #[cfg(test)] + pub(crate) fn with_comment(mut self, comment: impl Into) -> Self { + self.comments.push(comment.into()); + self + } + + pub(crate) fn with_sorted_compare(mut self) -> Self { self.normalizer.sorted_compare = true; self } - pub fn with_normalized_uuids(mut self) -> Self { + pub(crate) fn with_normalized_uuids(mut self) -> Self { self.normalizer.normalized_uuids = true; self } - pub fn with_normalize_metrics(mut self) -> Self { + pub(crate) fn with_normalize_metrics(mut self) -> Self { self.normalizer.normalized_metrics = true; self } - pub fn with_normalize_filters(mut self) -> Self { + pub(crate) fn with_normalize_filters(mut self) -> Self { self.normalizer.normalized_filters = true; self } - pub fn with_no_table_borders(mut self) -> Self { + pub(crate) fn with_no_table_borders(mut self) -> Self { self.normalizer.no_table_borders = true; self } /// Take the output of running the query and apply the specified normalizations to them - pub fn normalize_results(&self, results: Vec, language: Language) -> Vec { + pub(crate) fn normalize_results( + &self, + results: Vec, + language: Language, + ) -> Vec { language.normalize_results(&self.normalizer, results) } - /// Adds information on what normalizations were applied to the input - pub fn 
add_description(&self, output: &mut Vec) { + /// Adds any comments from the input to the output + pub(crate) fn add_comments(&self, output: &mut Vec) { + output.extend_from_slice(&self.comments); + } + + /// Adds information to the output about what normalizations were applied + pub(crate) fn add_description(&self, output: &mut Vec) { self.normalizer.add_description(output) } } #[derive(Debug, Default)] struct QueryBuilder { - pub language: Language, - pub query: Query, + pub(crate) language: Language, + pub(crate) query: Query, } impl QueryBuilder { @@ -83,6 +103,9 @@ impl QueryBuilder { ..Default::default() } } + fn push_comment(&mut self, s: &str) { + self.query.comments.push(s.to_string()) + } fn push_str(&mut self, s: &str) { self.query.text.push_str(s) @@ -108,13 +131,13 @@ impl QueryBuilder { /// Poor man's parser to find all the SQL queries in an input file #[derive(Debug, PartialEq, Eq)] -pub struct TestQueries { +pub(crate) struct TestQueries { queries: Vec, } impl TestQueries { /// find all queries (more or less a fancy split on `;` - pub fn from_lines(lines: I, language: Language) -> Self + pub(crate) fn from_lines(lines: I, language: Language) -> Self where I: IntoIterator, S: AsRef, @@ -150,6 +173,10 @@ impl TestQueries { _ => {} } } + } else if line.starts_with("-- IOX_SETUP: ") { + // ignore setup lines + } else if line.starts_with("--") { + builder.push_comment(line); } if line.starts_with("--") { @@ -183,7 +210,7 @@ impl TestQueries { } // Get an iterator over the queries - pub fn iter(&self) -> impl Iterator { + pub(crate) fn iter(&self) -> impl Iterator { self.queries.iter() } } @@ -208,8 +235,8 @@ select * from bar; queries, TestQueries { queries: vec![ - Query::new("select * from foo;"), - Query::new("select * from bar;"), + Query::new("select * from foo;").with_comment("-- This is a test"), + Query::new("select * from bar;").with_comment("-- another comment"), ] } ) @@ -228,7 +255,7 @@ select * from bar TestQueries { queries: vec![ Query::new("select * from foo;"), - Query::new("select * from bar") + Query::new("select * from bar").with_comment("-- no ending semi colon"), ] } ) @@ -290,8 +317,14 @@ select * from waz; TestQueries { queries: vec![ Query::new("select * from foo;"), - Query::new("select * from bar;").with_sorted_compare(), - Query::new("select * from baz;"), + Query::new("select * from bar;") + .with_comment( + "-- The second query should be compared to expected after sorting" + ) + .with_sorted_compare(), + Query::new("select * from baz;").with_comment( + "-- Since this query is not annotated, it should not use exected sorted" + ), Query::new("select * from baz2;"), Query::new("select * from waz;").with_sorted_compare(), ] @@ -324,7 +357,10 @@ select * from foo; assert_eq!( queries, TestQueries { - queries: vec![Query::new("select * from foo;")] + queries: vec![ + // Note the --IOX_COMPARE is not treated as a comment + Query::new("select * from foo;") + ] } ) } diff --git a/test_helpers_end_to_end/src/steps.rs b/test_helpers_end_to_end/src/steps.rs index 8315870ffad..c727de40125 100644 --- a/test_helpers_end_to_end/src/steps.rs +++ b/test_helpers_end_to_end/src/steps.rs @@ -1,19 +1,23 @@ use crate::snapshot_comparison::Language; use crate::{ - check_flight_error, run_influxql, run_sql, snapshot_comparison, try_run_influxql, try_run_sql, - MiniCluster, + check_flight_error, run_influxql, run_influxql_with_params, run_sql, run_sql_with_params, + snapshot_comparison, try_run_influxql, try_run_influxql_with_params, try_run_sql, + try_run_sql_with_params, 
MiniCluster, }; use arrow::record_batch::RecordBatch; use arrow_util::assert_batches_sorted_eq; use futures::future::BoxFuture; use http::StatusCode; +use iox_query_params::StatementParam; use observability_deps::tracing::info; +use std::collections::HashMap; use std::{path::PathBuf, time::Duration}; use test_helpers::assert_contains; const MAX_QUERY_RETRY_TIME_SEC: u64 = 20; /// Test harness for end to end tests that are comprised of several steps +#[allow(missing_debug_implementations)] pub struct StepTest<'a, S> { cluster: &'a mut MiniCluster, @@ -22,6 +26,7 @@ pub struct StepTest<'a, S> { } /// The test state that is passed to custom steps +#[derive(Debug)] pub struct StepTestState<'a> { /// The mini cluster cluster: &'a mut MiniCluster, @@ -154,12 +159,14 @@ impl<'a> StepTestState<'a> { /// }.boxed() /// }); /// ``` -pub type FCustom = Box Fn(&'b mut StepTestState) -> BoxFuture<'b, ()> + Send + Sync>; +pub type FCustom = + Box Fn(&'b mut StepTestState<'_>) -> BoxFuture<'b, ()> + Send + Sync>; /// Function to do custom validation on metrics. Expected to panic on validation failure. -pub type MetricsValidationFn = Box; +pub(crate) type MetricsValidationFn = Box, String) + Send + Sync>; /// Possible test steps that a test can perform +#[allow(missing_debug_implementations)] pub enum Step { /// Writes the specified line protocol to the `/api/v2/write` /// endpoint, assert the data was written successfully @@ -170,6 +177,8 @@ pub enum Step { WriteLineProtocolExpectingError { line_protocol: String, expected_error_code: StatusCode, + expected_error_message: String, + expected_line_number: Option, }, /// Writes the specified line protocol to the `/api/v2/write` endpoint @@ -217,6 +226,16 @@ pub enum Step { expected: Vec<&'static str>, }, + /// Run SQL query using the FlightSQL interface, replacing `$placeholder` variables + /// with the supplied parameters. Then verify that the + /// results match the expected results using the + /// `assert_batches_eq!` macro + QueryWithParams { + sql: String, + params: HashMap, + expected: Vec<&'static str>, + }, + /// Read the SQL queries in the specified file and verify that the results match the expected /// results in the corresponding expected file QueryAndCompare { @@ -233,6 +252,16 @@ pub enum Step { expected_message: String, }, + /// Run SQL query using the FlightSQL interface, replacing `$placeholder` variables + /// with the supplied parameters. Then verify that the + /// request returns the expected error code and message + QueryWithParamsExpectingError { + sql: String, + params: HashMap, + expected_error_code: tonic::Code, + expected_message: String, + }, + /// Run a SQL query using the FlightSQL interface authorized by the /// authorization header. Verify that the /// results match the expected results using the `assert_batches_eq!` @@ -271,6 +300,15 @@ pub enum Step { expected: Vec<&'static str>, }, + /// Run an InfluxQL query using the FlightSQL interface, replacing `$placeholder` variables + /// in the query text with values provided by the params HashMap. 
Then verify that the + /// results match the expected results using the `assert_batches_eq!` macro + InfluxQLQueryWithParams { + query: String, + params: HashMap, + expected: Vec<&'static str>, + }, + /// Read the InfluxQL queries in the specified file and verify that the results match the /// expected results in the corresponding expected file InfluxQLQueryAndCompare { @@ -287,6 +325,16 @@ pub enum Step { expected_message: String, }, + /// Run InfluxQL query using the FlightSQL interface, replacing `$placeholder` variables + /// with the supplied parameters. Then verify that the + /// request returns the expected error code and message + InfluxQLWithParamsExpectingError { + query: String, + params: HashMap, + expected_error_code: tonic::Code, + expected_message: String, + }, + /// Run an InfluxQL query using the FlightSQL interface including an /// authorization header. Verify that the results match the expected /// results using the `assert_batches_eq!` macro. @@ -332,7 +380,7 @@ impl AsRef for Step { impl<'a, S> StepTest<'a, S> where - S: AsRef, + S: AsRef + Send, { /// Create a new test that runs each `step`, in sequence, against /// `cluster` panic'ing if any step fails @@ -382,6 +430,8 @@ where Step::WriteLineProtocolExpectingError { line_protocol, expected_error_code, + expected_error_message, + expected_line_number, } => { info!( "====Begin writing line protocol expecting error to v2 HTTP API:\n{}", @@ -389,6 +439,40 @@ where ); let response = state.cluster.write_to_router(line_protocol, None).await; assert_eq!(response.status(), *expected_error_code); + + let body: serde_json::Value = serde_json::from_slice( + &hyper::body::to_bytes(response.into_body()) + .await + .expect("should be able to read response body"), + ) + .expect("response body should be valid json"); + + assert_matches::assert_matches!( + body["message"], + serde_json::Value::String(ref s) if s.contains(expected_error_message), + "error message did not match: expected '{}' to contain '{}'", + body["message"], + expected_error_message + ); + + match expected_line_number { + Some(line) => { + assert_matches::assert_matches!( + body["line"], + serde_json::Value::Number(ref n) if n == &serde_json::Number::from(*line), + "error line did not match: expected '{}' to be '{}'", + body["line"], + line + ); + } + None => { + assert!( + !body.as_object().unwrap().contains_key("line"), + "error line should not be present" + ); + } + }; + info!("====Done writing line protocol expecting error"); } Step::WriteLineProtocolWithAuthorization { @@ -466,6 +550,27 @@ where assert_batches_sorted_eq!(expected, &batches); info!("====Done running"); } + Step::QueryWithParams { + sql, + params, + expected, + } => { + info!("====Begin running SQL query: {}", sql); + info!("params: {:?}", params); + // run query + let (mut batches, schema) = run_sql_with_params( + sql, + state.cluster.namespace(), + params.clone(), + state.cluster.querier().querier_grpc_connection(), + None, + false, + ) + .await; + batches.push(RecordBatch::new_empty(schema)); + assert_batches_sorted_eq!(expected, &batches); + info!("====Done running"); + } Step::QueryAndCompare { input_path, setup_name, @@ -507,6 +612,29 @@ where info!("====Done running"); } + Step::QueryWithParamsExpectingError { + sql, + params, + expected_error_code, + expected_message, + } => { + info!("====Begin running SQL query expected to error: {}", sql); + + let err = try_run_sql_with_params( + sql, + state.cluster().namespace(), + params.clone(), + 
state.cluster().querier().querier_grpc_connection(), + None, + false, + ) + .await + .unwrap_err(); + + check_flight_error(err, *expected_error_code, Some(expected_message)); + + info!("====Done running"); + } Step::QueryWithAuthorization { sql, authorization, @@ -569,6 +697,26 @@ where assert_batches_sorted_eq!(expected, &batches); info!("====Done running"); } + Step::InfluxQLQueryWithParams { + query, + expected, + params, + } => { + info!("====Begin running InfluxQL query: {}", query); + info!("params: {:?}", params); + // run query + let (mut batches, schema) = run_influxql_with_params( + query, + state.cluster.namespace(), + params.clone(), + state.cluster.querier().querier_grpc_connection(), + None, + ) + .await; + batches.push(RecordBatch::new_empty(schema)); + assert_batches_sorted_eq!(expected, &batches); + info!("====Done running"); + } Step::InfluxQLQueryAndCompare { input_path, setup_name, @@ -612,6 +760,31 @@ where info!("====Done running"); } + Step::InfluxQLWithParamsExpectingError { + query, + params, + expected_error_code, + expected_message, + } => { + info!( + "====Begin running InfluxQL query expected to error: {}", + query + ); + info!("params: {:?}", params); + let err = try_run_influxql_with_params( + query, + state.cluster().namespace(), + params.clone(), + state.cluster().querier().querier_grpc_connection(), + None, + ) + .await + .unwrap_err(); + + check_flight_error(err, *expected_error_code, Some(expected_message)); + + info!("====Done running"); + } Step::InfluxQLQueryWithAuthorization { query, authorization, @@ -650,7 +823,7 @@ where Step::GracefulStopIngesters => { info!("====Gracefully stop all ingesters"); - state.cluster_mut().gracefully_stop_ingesters(); + state.cluster_mut().gracefully_stop_ingesters().await; } Step::VerifiedMetrics(verify) => { info!("====Begin validating metrics"); diff --git a/test_helpers_end_to_end/src/udp_listener.rs b/test_helpers_end_to_end/src/udp_listener.rs index 7a47c0fdac2..02da4149b83 100644 --- a/test_helpers_end_to_end/src/udp_listener.rs +++ b/test_helpers_end_to_end/src/udp_listener.rs @@ -32,6 +32,7 @@ impl ToString for Message { } } +#[derive(Debug)] pub struct UdpCapture { socket_addr: std::net::SocketAddr, join_handle: tokio::task::JoinHandle<()>, @@ -117,7 +118,7 @@ impl UdpCapture { // wait for a message to appear that passes `pred` or the timeout expires pub async fn wait_for
<P>
(&self, pred: P) where - P: FnMut(&Message) -> bool + Copy, + P: FnMut(&Message) -> bool + Copy + Send, { let end = Instant::now() + Duration::from_secs(MAX_WAIT_TIME_SEC); diff --git a/tokio_metrics_bridge/Cargo.toml b/tokio_metrics_bridge/Cargo.toml index 7b2faeeff59..9a43a1a3a18 100644 --- a/tokio_metrics_bridge/Cargo.toml +++ b/tokio_metrics_bridge/Cargo.toml @@ -5,10 +5,13 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] metric = { path = "../metric" } parking_lot = "0.12.1" -tokio = { version = "1.32", features = ["macros", "net", "parking_lot", "rt-multi-thread", "sync", "time"] } +tokio = { version = "1.35", features = ["macros", "net", "parking_lot", "rt-multi-thread", "sync", "time"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] diff --git a/tokio_watchdog/Cargo.toml b/tokio_watchdog/Cargo.toml new file mode 100644 index 00000000000..c050a9295b9 --- /dev/null +++ b/tokio_watchdog/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "tokio_watchdog" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] +metric = { path = "../metric" } +observability_deps = { path = "../observability_deps" } +tokio = { version = "1.35", features = ["macros", "net", "parking_lot", "rt-multi-thread", "sync", "time"] } +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] +test_helpers = { path = "../test_helpers" } diff --git a/tokio_watchdog/src/lib.rs b/tokio_watchdog/src/lib.rs new file mode 100644 index 00000000000..e7e2d759e2e --- /dev/null +++ b/tokio_watchdog/src/lib.rs @@ -0,0 +1,231 @@ +//! Monitors if the tokio runtime still looks healthy. +#![deny(rustdoc::broken_intra_doc_links, rust_2018_idioms)] +#![warn( + missing_copy_implementations, + missing_docs, + clippy::explicit_iter_loop, + // See https://github.com/influxdata/influxdb_iox/pull/1671 + clippy::future_not_send, + clippy::use_self, + clippy::clone_on_ref_ptr, + clippy::todo, + clippy::dbg_macro, + unused_crate_dependencies +)] + +use observability_deps::tracing::warn; + +// Workaround for "unused crate" lint false positives. +use workspace_hack as _; + +use std::time::{Duration, Instant}; + +use metric::{DurationHistogram, Registry, U64Counter}; +use tokio::{ + runtime::Handle, + sync::mpsc::{channel, error::TryRecvError}, +}; + +/// Tokio watchdog config. +#[allow(missing_debug_implementations)] +pub struct WatchdogConfig<'a> { + handle: &'a Handle, + metric_registry: &'a Registry, + runtime_name: &'static str, + tick_duration: Duration, + warn_threshold: Duration, + new_thread_hook: Option>, +} + +impl<'a> WatchdogConfig<'a> { + /// Create new config for given runtime handle and metric registry. + #[must_use] + pub fn new(handle: &'a Handle, metric_registry: &'a Registry) -> Self { + Self { + handle, + metric_registry, + runtime_name: "tokio", + tick_duration: Duration::from_millis(100), + warn_threshold: Duration::from_millis(100), + new_thread_hook: None, + } + } + + /// Set runtime name. + #[must_use] + pub fn with_runtime_name(self, name: &'static str) -> Self { + Self { + runtime_name: name, + ..self + } + } + + /// Set tick duration. + /// + /// The tick duration determines how often the alive check will be performed. + #[must_use] + pub fn with_tick_duration(self, d: Duration) -> Self { + Self { + tick_duration: d, + ..self + } + } + + /// Set warn duration. 
+ /// + /// Determines how long the watchdog waits after each check before it detects a hang. + #[must_use] + pub fn with_warn_duration(self, d: Duration) -> Self { + Self { + warn_threshold: d, + ..self + } + } + + /// Sets a hook that is called when the watchdog thread is created. + /// + /// The hook is called from the new thread. + #[must_use] + pub fn with_new_thread_hook(self, f: F) -> Self + where + F: FnOnce() + Send + 'static, + { + Self { + new_thread_hook: Some(Box::new(f)), + ..self + } + } + + /// Install watchdog. + /// + /// # Panic + /// Panics if the sum of [tick duration](Self::with_tick_duration) and [warn duration](Self::with_warn_duration) is zero. + pub fn install(self) { + let Self { + handle, + metric_registry, + runtime_name, + tick_duration, + warn_threshold, + new_thread_hook, + } = self; + + assert!( + !(tick_duration + warn_threshold).is_zero(), + "sum of tick and warn duration must be non-zero" + ); + + let (tx_request, mut rx_request) = channel::(1); + let (tx_response, mut rx_response) = channel::(1); + + let metric_latency = metric_registry + .register_metric::( + "tokio_watchdog_response_time", + "Response time of the tokio watchdog task", + ) + .recorder(&[("runtime", runtime_name)]); + let metric_hang = metric_registry + .register_metric::( + "tokio_watchdog_hangs", + "Number of hangs detected by the tokio watchdog", + ) + .recorder(&[("runtime", runtime_name)]); + + handle.spawn(async move { + loop { + let Some(start) = rx_request.recv().await else { + return; + }; + + if tx_response.try_send(start.elapsed()).is_err() { + return; + } + } + }); + + std::thread::Builder::new() + .name(format!("tokio watchdog {runtime_name}")) + .spawn(move || { + if let Some(hook) = new_thread_hook { + hook(); + } + + loop { + std::thread::sleep(tick_duration); + + if tx_request.try_send(Instant::now()).is_err() { + return; + } + + std::thread::sleep(warn_threshold); + + let d = match rx_response.try_recv() { + Ok(d) => d, + Err(TryRecvError::Empty) => { + warn!(runtime = runtime_name, "tokio starts hanging",); + metric_hang.inc(1); + + let Some(d) = rx_response.blocking_recv() else { + return; + }; + warn!( + runtime = runtime_name, + hang_secs = d.as_secs_f64(), + "tokio stops hanging", + ); + d + } + Err(TryRecvError::Disconnected) => { + return; + } + }; + + metric_latency.record(d); + } + }) + .expect("start watchdog thread"); + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use test_helpers::tracing::TracingCapture; + + use super::*; + + #[tokio::test] + #[should_panic(expected = "sum of tick and warn duration must be non-zero")] + async fn test_panic_zero_duration() { + let registry = Registry::default(); + WatchdogConfig::new(&Handle::current(), ®istry) + .with_tick_duration(Duration::ZERO) + .with_warn_duration(Duration::ZERO) + .install(); + } + + #[tokio::test] + async fn test() { + let capture = Arc::new(TracingCapture::new()); + let registry = Registry::default(); + let tick_duration = Duration::from_millis(100); + let warn_threshold = Duration::from_millis(200); + + let capture2 = Arc::clone(&capture); + WatchdogConfig::new(&Handle::current(), ®istry) + .with_tick_duration(tick_duration) + .with_warn_duration(warn_threshold) + .with_new_thread_hook(move || { + capture2.register_in_current_thread(); + }) + .install(); + + std::thread::sleep(warn_threshold * 2); + tokio::time::sleep(tick_duration * 2).await; + + let logs = capture.to_string(); + assert!(logs.contains("tokio starts hanging")); + assert!(logs.contains("tokio stops hanging")); + } +} 
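The new tokio_watchdog crate introduced above is wired in by the caller. A sketch of typical usage, using only the API shown in the patch; the runtime name and thresholds are placeholders, and this must run inside a tokio runtime so that `Handle::current()` works:

    use std::{sync::Arc, time::Duration};

    use metric::Registry;
    use tokio::runtime::Handle;
    use tokio_watchdog::WatchdogConfig;

    fn install_watchdog(registry: &Arc<Registry>) {
        // Ping the current runtime every 100ms; if a reply does not arrive
        // within the extra 1s warn threshold, log a warning and bump the
        // `tokio_watchdog_hangs` counter.
        WatchdogConfig::new(&Handle::current(), registry)
            .with_runtime_name("io")
            .with_tick_duration(Duration::from_millis(100))
            .with_warn_duration(Duration::from_secs(1))
            .install();
    }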
diff --git a/tower_trailer/Cargo.toml b/tower_trailer/Cargo.toml new file mode 100644 index 00000000000..e2da6380911 --- /dev/null +++ b/tower_trailer/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "tower_trailer" +description = "Allow to send HTTP/2 trailer using a tower layer" +version.workspace = true +authors.workspace = true +edition.workspace = true +license.workspace = true + +[lints] +workspace = true + +[dependencies] +futures = "0.3" +http = "0.2" +http-body = "0.4" +parking_lot = "0.12" +pin-project = "1.1" +tower = "0.4" +workspace-hack = { version = "0.1", path = "../workspace-hack" } + +[dev-dependencies] diff --git a/tower_trailer/src/lib.rs b/tower_trailer/src/lib.rs new file mode 100644 index 00000000000..153d0c1fec0 --- /dev/null +++ b/tower_trailer/src/lib.rs @@ -0,0 +1,194 @@ +#![deny(rustdoc::broken_intra_doc_links, rustdoc::bare_urls, rust_2018_idioms)] +#![warn( + missing_debug_implementations, + clippy::explicit_iter_loop, + clippy::use_self, + clippy::clone_on_ref_ptr, + // See https://github.com/influxdata/influxdb_iox/pull/1671 + clippy::future_not_send, + clippy::todo, + clippy::dbg_macro, + unused_crate_dependencies +)] + +use std::future::Future; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use futures::ready; +use http::{Request, Response}; +use http_body::SizeHint; +use parking_lot::Mutex; +use pin_project::pin_project; +use tower::{Layer, Service}; + +// Workaround for "unused crate" lint false positives. +use workspace_hack as _; + +// re-export public types +pub use http::HeaderMap; + +/// Layer that installs [`Trailers`] as a [request extension](Request::extensions). +#[derive(Debug, Clone, Default)] +#[allow(missing_copy_implementations)] +pub struct TrailerLayer; + +impl Layer for TrailerLayer { + type Service = TrailerService; + + fn layer(&self, service: S) -> Self::Service { + TrailerService { service } + } +} + +#[derive(Debug, Clone)] +pub struct TrailerService { + service: S, +} + +impl Service> for TrailerService +where + S: Service, Response = Response>, + ResBody: http_body::Body, +{ + type Response = Response>; + type Error = S::Error; + type Future = WrappedFuture; + + fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll> { + self.service.poll_ready(cx) + } + + fn call(&mut self, mut request: Request) -> Self::Future { + let trailers = Trailers::new(); + let callbacks = trailers.callbacks.clone(); + let existing = request.extensions_mut().insert(trailers); + assert!( + existing.is_none(), + "trailer layer/service installed multiple times" + ); + + WrappedFuture { + callbacks, + inner: self.service.call(request), + } + } +} + +#[pin_project] +#[derive(Debug)] +pub struct WrappedFuture { + callbacks: SharedCallbacks, + #[pin] + inner: F, +} + +impl Future for WrappedFuture +where + F: Future, Error>>, + ResBody: http_body::Body, +{ + type Output = Result>, Error>; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let result: Result, Error> = + ready!(self.as_mut().project().inner.poll(cx)); + + match result { + Ok(response) => Poll::Ready(Ok(response.map(|body| WrappedBody { + callbacks: self.callbacks.clone(), + inner: body, + }))), + Err(e) => Poll::Ready(Err(e)), + } + } +} + +#[pin_project] +#[derive(Debug)] +pub struct WrappedBody { + callbacks: SharedCallbacks, + #[pin] + inner: B, +} + +impl http_body::Body for WrappedBody { + type Data = B::Data; + type Error = B::Error; + + fn poll_data( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll>> { + 
self.as_mut().project().inner.poll_data(cx) + } + + fn poll_trailers( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll, Self::Error>> { + let result: Result, Self::Error> = + ready!(self.as_mut().project().inner.poll_trailers(cx)); + + let res = match result { + Ok(trailers) => { + let mut trailers = trailers.unwrap_or_default(); + + for callback in self.callbacks.0.lock().iter() { + callback(&mut trailers); + } + + Ok((!trailers.is_empty()).then_some(trailers)) + } + Err(e) => Err(e), + }; + Poll::Ready(res) + } + + fn is_end_stream(&self) -> bool { + self.inner.is_end_stream() + } + + fn size_hint(&self) -> SizeHint { + self.inner.size_hint() + } +} + +type TrailerCallback = Box Fn(&'a mut HeaderMap) + Send>; + +#[derive(Clone, Default)] +struct SharedCallbacks(Arc>>); + +impl std::fmt::Debug for SharedCallbacks { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("SharedCallbacks").field(&"...").finish() + } +} + +/// Handle to manage trailers of a HTTP response. +#[derive(Clone, Debug)] +pub struct Trailers { + callbacks: SharedCallbacks, +} + +impl Trailers { + /// Private constructor. + /// + /// It is pointless / a potential bug to construct this type outside this crate, because it will NOT be hooked up + /// into the layer. + fn new() -> Self { + Self { + callbacks: Default::default(), + } + } + + /// Register callback that is called when the trailers are sent. + pub fn add_callback(&self, f: F) + where + for<'a> F: Fn(&'a mut HeaderMap) + Send + 'static, + { + let mut guard = self.callbacks.0.lock(); + guard.push(Box::new(f)); + } +} diff --git a/trace/Cargo.toml b/trace/Cargo.toml index 1e431dabf56..1f09b8a3c07 100644 --- a/trace/Cargo.toml +++ b/trace/Cargo.toml @@ -6,6 +6,9 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] chrono = { version = "0.4", default-features = false } observability_deps = { path = "../observability_deps" } diff --git a/trace/src/lib.rs b/trace/src/lib.rs index 3a12d9be4ce..6e352ff209e 100644 --- a/trace/src/lib.rs +++ b/trace/src/lib.rs @@ -35,7 +35,7 @@ pub trait TraceCollector: std::fmt::Debug + Send + Sync { } /// A basic trace collector that prints to stdout -#[derive(Debug)] +#[derive(Debug, Copy, Clone)] pub struct LogTraceCollector {} impl LogTraceCollector { diff --git a/trace/src/span.rs b/trace/src/span.rs index 1ed1d549faf..d7a0de1670d 100644 --- a/trace/src/span.rs +++ b/trace/src/span.rs @@ -58,27 +58,26 @@ impl Span { } /// Record an event on this `Span` - pub fn event(&mut self, meta: impl Into>) { - let event = SpanEvent { - time: Utc::now(), - msg: meta.into(), - }; - self.events.push(event) + pub fn event(&mut self, event: SpanEvent) { + self.events.push(event); } /// Record success on this `Span` setting the status if it isn't already set - pub fn ok(&mut self, meta: impl Into>) { - self.event(meta); - if self.status == SpanStatus::Unknown { - self.status = SpanStatus::Ok; - } + pub fn ok(&mut self, msg: impl Into>) { + self.event(SpanEvent::new(msg)); + self.status(SpanStatus::Ok); } /// Record an error on this `Span` setting the status if it isn't already set - pub fn error(&mut self, meta: impl Into>) { - self.event(meta); + pub fn error(&mut self, msg: impl Into>) { + self.event(SpanEvent::new(msg)); + self.status(SpanStatus::Err); + } + + /// Set status of `Span` + pub fn status(&mut self, status: SpanStatus) { if self.status == SpanStatus::Unknown { - self.status = SpanStatus::Err; + self.status = status; } 
} @@ -110,6 +109,25 @@ pub struct SpanEvent { pub time: DateTime, pub msg: Cow<'static, str>, + + pub metadata: HashMap, MetaValue>, +} + +impl SpanEvent { + /// Create new event. + pub fn new(msg: impl Into>) -> Self { + Self { + time: Utc::now(), + msg: msg.into(), + // assume no metadata by default + metadata: HashMap::with_capacity(0), + } + } + + /// Set meta data. + pub fn set_metadata(&mut self, key: impl Into>, value: impl Into) { + self.metadata.insert(key.into(), value.into()); + } } /// Values that can be stored in a Span's metadata and events @@ -183,9 +201,9 @@ impl SpanRecorder { } /// Record an event on the contained `Span` if any - pub fn event(&mut self, meta: impl Into>) { + pub fn event(&mut self, event: SpanEvent) { if let Some(span) = self.span.as_mut() { - span.event(meta) + span.event(event); } } @@ -203,6 +221,13 @@ impl SpanRecorder { } } + /// Set status of contained `Span` if any + pub fn status(&mut self, status: SpanStatus) { + if let Some(span) = self.span.as_mut() { + span.status(status); + } + } + /// Take the contents of this recorder returning a new recorder /// /// From this point on `self` will behave as if it were created with no span diff --git a/trace_exporters/Cargo.toml b/trace_exporters/Cargo.toml index 53a6c0ec318..177ad961fc7 100644 --- a/trace_exporters/Cargo.toml +++ b/trace_exporters/Cargo.toml @@ -6,17 +6,20 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] async-trait = "0.1" clap = { version = "4", features = ["derive", "env"] } futures = "0.3" iox_time = { path = "../iox_time" } observability_deps = { path = "../observability_deps" } -snafu = "0.7" +snafu = "0.8" thrift = { version = "0.17.0" } -tokio = { version = "1.32", features = ["macros", "parking_lot", "rt", "sync"] } +tokio = { version = "1.35", features = ["macros", "parking_lot", "rt", "sync"] } trace = { path = "../trace" } workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] -chrono = { version = "0.4", default-features = false, features = ["clock"] } \ No newline at end of file +chrono = { version = "0.4", default-features = false, features = ["clock"] } diff --git a/trace_exporters/src/jaeger.rs b/trace_exporters/src/jaeger.rs index 1e6e4fbdb5e..d02c68a82e8 100644 --- a/trace_exporters/src/jaeger.rs +++ b/trace_exporters/src/jaeger.rs @@ -33,6 +33,11 @@ impl JaegerTag { value: value.into(), } } + + /// Key. + pub fn key(&self) -> &str { + &self.key + } } impl From for jaeger::Tag { @@ -169,7 +174,83 @@ impl AsyncExport for JaegerAgentExporter { self.rate_limiter.send().await; if let Err(e) = self.client.emit_batch(batch) { - error!(%e, "error writing batch to jaeger agent") + let e = NiceThriftError::from(e); + + // not a user-visible error but only a monitoring outage, print on info level + // Ref: https://github.com/influxdata/influxdb_iox/issues/9726 + info!(%e, "error writing batch to jaeger agent") + } + } +} + +/// Thrift error formatting is messy, try better. +/// +/// See . 
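With the Span and SpanEvent changes a few hunks above, callers now construct the event (optionally attaching metadata) before recording it, and set the status explicitly. A small sketch, assuming a SpanRecorder is already in scope; the event name and metadata key are illustrative:

    use trace::span::{SpanEvent, SpanRecorder, SpanStatus};

    fn record_get(recorder: &mut SpanRecorder, size: i64) {
        // Events now carry their own metadata map instead of just a message.
        let mut evt = SpanEvent::new("object store get");
        evt.set_metadata("size", size);
        recorder.event(evt);
        // Status is set via the new helper; it still only overrides Unknown.
        recorder.status(SpanStatus::Ok);
    }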
+#[derive(Debug)] +struct NiceThriftError(thrift::Error); + +impl From for NiceThriftError { + fn from(e: thrift::Error) -> Self { + Self(e) + } +} + +impl std::fmt::Display for NiceThriftError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match &self.0 { + thrift::Error::Transport(e) => { + let kind = match e.kind { + thrift::TransportErrorKind::Unknown => "unknown", + thrift::TransportErrorKind::NotOpen => "not open", + thrift::TransportErrorKind::AlreadyOpen => "already open", + thrift::TransportErrorKind::TimedOut => "timed out", + thrift::TransportErrorKind::EndOfFile => "end of file", + thrift::TransportErrorKind::NegativeSize => "negative size message", + thrift::TransportErrorKind::SizeLimit => "message too long", + _ => "unknown variant", + }; + + write!(f, "transport: {}: {}", kind, e.message) + } + thrift::Error::Protocol(e) => { + let kind = match e.kind { + thrift::ProtocolErrorKind::Unknown => "unknown", + thrift::ProtocolErrorKind::InvalidData => "bad data", + thrift::ProtocolErrorKind::NegativeSize => "negative message size", + thrift::ProtocolErrorKind::SizeLimit => "message too long", + thrift::ProtocolErrorKind::BadVersion => "invalid thrift version", + thrift::ProtocolErrorKind::NotImplemented => "not implemented", + thrift::ProtocolErrorKind::DepthLimit => "maximum skip depth reached", + _ => "unknown variant", + }; + + write!(f, "protocol: {}: {}", kind, e.message) + } + thrift::Error::Application(e) => { + let kind = match e.kind { + thrift::ApplicationErrorKind::Unknown => "unknown", + thrift::ApplicationErrorKind::UnknownMethod => "unknown service method", + thrift::ApplicationErrorKind::InvalidMessageType => { + "wrong message type received" + } + thrift::ApplicationErrorKind::WrongMethodName => { + "unknown method reply received" + } + thrift::ApplicationErrorKind::BadSequenceId => "out of order sequence id", + thrift::ApplicationErrorKind::MissingResult => "missing method result", + thrift::ApplicationErrorKind::InternalError => "remote service threw exception", + thrift::ApplicationErrorKind::ProtocolError => "protocol error", + thrift::ApplicationErrorKind::InvalidTransform => "invalid transform", + thrift::ApplicationErrorKind::InvalidProtocol => "invalid protocol requested", + thrift::ApplicationErrorKind::UnsupportedClientType => { + "unsupported protocol client" + } + _ => "unknown variant", + }; + + write!(f, "application: {}: {}", kind, e.message) + } + thrift::Error::User(e) => write!(f, "user: {e}"), } } } @@ -243,11 +324,13 @@ mod tests { use crate::thrift::agent::{AgentSyncHandler, AgentSyncProcessor}; use chrono::{TimeZone, Utc}; use iox_time::SystemProvider; + use std::borrow::Cow; + use std::collections::HashMap; use std::sync::{Arc, Mutex}; use thrift::server::TProcessor; use thrift::transport::TBufferChannel; use trace::ctx::{SpanContext, SpanId, TraceId}; - use trace::span::{SpanEvent, SpanStatus}; + use trace::span::{MetaValue, SpanEvent, SpanStatus}; struct TestHandler { batches: Arc>>, @@ -382,9 +465,11 @@ mod tests { span.events = vec![SpanEvent { time: Utc.timestamp_nanos(200000), msg: "hello".into(), + metadata: HashMap::from([(Cow::from("evt_md"), MetaValue::Int(42))]), }]; span.start = Some(Utc.timestamp_nanos(100000)); span.end = Some(Utc.timestamp_nanos(300000)); + span.metadata = HashMap::from([(Cow::from("span_md"), MetaValue::Int(1337))]); exporter.export(vec![span.clone(), span.clone()]).await; exporter.export(vec![span.clone()]).await; @@ -452,14 +537,18 @@ mod tests { let logs = 
b1_s0.logs.as_ref().unwrap(); assert_eq!(logs.len(), 1); assert_eq!(logs[0].timestamp, 200); - assert_eq!(logs[0].fields.len(), 1); + assert_eq!(logs[0].fields.len(), 2); assert_eq!(logs[0].fields[0].key.as_str(), "event"); assert_eq!(logs[0].fields[0].v_str.as_ref().unwrap().as_str(), "hello"); + assert_eq!(logs[0].fields[1].key.as_str(), "evt_md"); + assert_eq!(logs[0].fields[1].v_long.unwrap(), 42); let tags = b1_s0.tags.as_ref().unwrap(); - assert_eq!(tags.len(), 1); + assert_eq!(tags.len(), 2); assert_eq!(tags[0].key.as_str(), "ok"); assert!(tags[0].v_bool.unwrap()); + assert_eq!(tags[1].key.as_str(), "span_md"); + assert_eq!(tags[1].v_long.unwrap(), 1337); } #[test] diff --git a/trace_exporters/src/jaeger/span.rs b/trace_exporters/src/jaeger/span.rs index f6234f5e3cc..d4aa44f4544 100644 --- a/trace_exporters/src/jaeger/span.rs +++ b/trace_exporters/src/jaeger/span.rs @@ -56,12 +56,15 @@ impl TryFrom for jaeger::Span { let tags = match s.metadata.is_empty() { true => None, - false => Some( - s.metadata - .into_iter() - .map(|(name, value)| tag_from_meta(name.to_string(), value)) - .collect(), - ), + false => { + let mut md = s.metadata.into_iter().collect::>(); + md.sort_by(|(k1, _v1), (k2, _v2)| k1.cmp(k2)); + Some( + md.into_iter() + .map(|(name, value)| tag_from_meta(name.to_string(), value)) + .collect(), + ) + } }; let logs = match s.events.is_empty() { @@ -115,11 +118,14 @@ impl TryFrom for jaeger::Log { type Error = String; fn try_from(event: SpanEvent) -> Result { + let mut md = event.metadata.into_iter().collect::>(); + md.sort_by(|(k1, _v1), (k2, _v2)| k1.cmp(k2)); + Ok(Self { timestamp: event.time.timestamp_nanos_opt().ok_or_else(|| { format!("timestamp cannot be represented as nanos: {}", event.time) })? / 1000, - fields: vec![jaeger::Tag { + fields: std::iter::once(jaeger::Tag { key: "event".to_string(), v_type: jaeger::TagType::String, v_str: Some(event.msg.to_string()), @@ -127,7 +133,9 @@ impl TryFrom for jaeger::Log { v_bool: None, v_long: None, v_binary: None, - }], + }) + .chain(md.into_iter().map(|(k, v)| tag_from_meta(k.to_string(), v))) + .collect(), }) } } diff --git a/trace_exporters/src/lib.rs b/trace_exporters/src/lib.rs index a2f07a2db83..b1a5337b95e 100644 --- a/trace_exporters/src/lib.rs +++ b/trace_exporters/src/lib.rs @@ -10,6 +10,7 @@ clippy::dbg_macro, unused_crate_dependencies )] +#![allow(unreachable_pub)] // Workaround for "unused crate" lint false positives. use workspace_hack as _; @@ -205,8 +206,23 @@ fn jaeger_exporter(config: &TracingConfig) -> Result> { )?; // Use any specified static span tags. 
- if let Some(tags) = &config.traces_jaeger_tags { - jaeger = jaeger.with_tags(tags); + let mut tags = config + .traces_jaeger_tags + .as_ref() + .cloned() + .unwrap_or_default(); + + // add hostname + const TAG_HOSTNAME: &str = "hostname"; + if !tags.iter().any(|t| t.key() == TAG_HOSTNAME) { + if let Ok(hostname) = std::env::var("HOSTNAME") { + tags.push(JaegerTag::new(TAG_HOSTNAME, hostname)); + } + } + + // commit tags + if !tags.is_empty() { + jaeger = jaeger.with_tags(&tags); } Ok(Arc::new(AsyncExporter::new(jaeger))) diff --git a/trace_http/Cargo.toml b/trace_http/Cargo.toml index 89c90c53e95..691a8aecdc5 100644 --- a/trace_http/Cargo.toml +++ b/trace_http/Cargo.toml @@ -6,18 +6,22 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] +bytes = "1.5" trace = { path = "../trace" } futures = "0.3" hashbrown = { workspace = true } http = "0.2" http-body = "0.4" -itertools = "0.11" +itertools = "0.12" metric = { path = "../metric" } observability_deps = { path = "../observability_deps" } parking_lot = "0.12" pin-project = "1.1" -snafu = "0.7" +snafu = "0.8" tower = "0.4" workspace-hack = { version = "0.1", path = "../workspace-hack" } diff --git a/trace_http/src/classify.rs b/trace_http/src/classify.rs index eb53df97c3f..4b00bdc5757 100644 --- a/trace_http/src/classify.rs +++ b/trace_http/src/classify.rs @@ -6,39 +6,59 @@ use std::borrow::Cow; /// e.g. a request that encounters both a ClientErr and a ServerErr will /// be recorded as a ServerErr #[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd)] -pub enum Classification { +pub(crate) enum Classification { /// Successful request Ok, - /// The request was to an unrecognised path + + /// The request was to an unrecognized path /// /// This is used by the metrics collection to avoid generating a new set of metrics /// for a request path that doesn't correspond to a valid route PathNotFound, - /// The request was unsuccessful but it was not the fault of the service + + /// Method was not allowed. + MethodNotAllowed, + + /// The request was unsuccessful (4XX) but it was not the fault of the service ClientErr, - /// The request was unsuccessful and it was the fault of the service + + /// The request was unsuccessful (5XX) and it was the fault of the service ServerErr, + + /// The request produced a response that is not 2XX Ok, 4XX ClientErr or 5XX + /// ServerErr. 
This is unexpected and likely shouldn't happen + UnexpectedResponse, } -pub fn classify_response(response: &http::Response) -> (Cow<'static, str>, Classification) { +pub(crate) fn classify_response( + response: &http::Response, +) -> (Cow<'static, str>, Classification) { let status = response.status(); - match status { - http::StatusCode::OK | http::StatusCode::CREATED | http::StatusCode::NO_CONTENT => { - classify_headers(Some(response.headers())) - } - http::StatusCode::BAD_REQUEST => ("bad request".into(), Classification::ClientErr), - // This is potentially over-zealous but errs on the side of caution - http::StatusCode::NOT_FOUND => ("not found".into(), Classification::PathNotFound), - http::StatusCode::TOO_MANY_REQUESTS => { - ("too many requests".into(), Classification::ClientErr) - } - http::StatusCode::INTERNAL_SERVER_ERROR => { - ("internal server error".into(), Classification::ServerErr) + + if status.is_success() { + classify_headers(Some(response.headers())) + } else if status.is_client_error() { + match status { + http::StatusCode::NOT_FOUND => ("not found".into(), Classification::PathNotFound), + http::StatusCode::METHOD_NOT_ALLOWED => ( + "method not allowed".into(), + Classification::MethodNotAllowed, + ), + _ => ( + format!("unexpected 4XX status code: {status}").into(), + Classification::ClientErr, + ), } - _ => ( - format!("unexpected status code: {status}").into(), + } else if status.is_server_error() { + ( + format!("unexpected 5XX status code: {status}").into(), Classification::ServerErr, - ), + ) + } else { + ( + format!("unexpected non-error status code: {status}").into(), + Classification::UnexpectedResponse, + ) } } @@ -47,7 +67,7 @@ pub fn classify_response(response: &http::Response) -> (Cow<'static, str>, /// /// [1]: https://grpc.github.io/grpc/core/md_doc_statuscodes.html /// [2]: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Trailer -pub fn classify_headers( +pub(crate) fn classify_headers( headers: Option<&http::header::HeaderMap>, ) -> (Cow<'static, str>, Classification) { match headers.and_then(|headers| headers.get("grpc-status")) { diff --git a/trace_http/src/lib.rs b/trace_http/src/lib.rs index 2aeb0398138..06c26bba040 100644 --- a/trace_http/src/lib.rs +++ b/trace_http/src/lib.rs @@ -16,5 +16,5 @@ use workspace_hack as _; mod classify; pub mod ctx; -mod metrics; +pub mod metrics; pub mod tower; diff --git a/trace_http/src/metrics.rs b/trace_http/src/metrics.rs index 32cbbd3eb6a..e32035a05d9 100644 --- a/trace_http/src/metrics.rs +++ b/trace_http/src/metrics.rs @@ -1,60 +1,128 @@ use crate::classify::Classification; use hashbrown::HashMap; +use http::Method; use metric::{Attributes, DurationHistogram, Metric, ResultMetric, U64Counter}; use parking_lot::{MappedMutexGuard, Mutex, MutexGuard}; use std::sync::Arc; use std::time::Instant; -/// `MetricsCollection` is used to retrieve `MetricsRecorder` for instrumenting http requests +/// The family of [`RequestMetrics`] to publish +#[derive(Debug, Copy, Clone)] +pub enum MetricFamily { + HttpServer, + GrpcServer, + HttpClient, + GrpcClient, +} + +#[derive(Debug, PartialEq, Eq, Hash)] +struct MetricsKey { + /// request path or None for 404 responses + path: Option, + + /// method or None for invalid methods + method: Option, +} + +/// Metrics collected for HTTP/gRPC requests #[derive(Debug)] -pub struct MetricsCollection { - /// Whether this `MetricCollection` should publish to grpc_request* or http_request* - is_grpc: bool, +pub struct RequestMetrics { + /// Whether this `MetricCollection` + 
family: MetricFamily, /// Metric registry for registering new metrics metric_registry: Arc, - /// Metrics keyed by request path or None for 404 responses - metrics: Mutex, Metrics>>, + /// Metrics. + metrics: Mutex>, + + /// Maximum path segments. + max_path_segments: Option, } -impl MetricsCollection { - pub fn new(metric_registry: Arc, is_grpc: bool) -> Self { +impl RequestMetrics { + pub fn new(metric_registry: Arc, family: MetricFamily) -> Self { Self { - is_grpc, + family, metric_registry, metrics: Default::default(), + max_path_segments: None, } } + /// Restrict metric paths to `segments` + pub fn with_max_path_segments(mut self, segments: usize) -> Self { + self.max_path_segments = Some(segments); + self + } + /// Gets the `MetricsRecorder` for a given http request - pub fn recorder(self: &Arc, request: &http::Request) -> MetricsRecorder { + pub(crate) fn recorder(self: &Arc, request: &http::Request) -> MetricsRecorder { MetricsRecorder { metrics: Arc::clone(self), start_instant: Instant::now(), path: Some(request.uri().path().to_string()), + method: Some(request.method().clone()), classification: None, } } - fn request_metrics(&self, path: Option) -> MappedMutexGuard<'_, Metrics> { + fn request_metrics( + &self, + path: Option, + method: Option, + ) -> MappedMutexGuard<'_, Metrics> { + // method is only important for HTTP / non-gRPC + let method = match self.family { + MetricFamily::HttpServer | MetricFamily::HttpClient => method, + MetricFamily::GrpcServer | MetricFamily::GrpcClient => None, + }; + MutexGuard::map(self.metrics.lock(), |metrics| { + let key = MetricsKey { path, method }; let (_, request_metrics) = - metrics.raw_entry_mut().from_key(&path).or_insert_with(|| { - let attributes = match path.as_ref() { - Some(path) => Attributes::from([("path", path.clone().into())]), - None => Attributes::from([]), - }; + metrics.raw_entry_mut().from_key(&key).or_insert_with(|| { + let mut attributes = Attributes::from([]); + if let Some(path) = &key.path { + attributes.insert("path", truncate_path(path, self.max_path_segments)); + } + if let Some(method) = &key.method { + attributes.insert("method", method.to_string()); + } + if let (Some(path), Some(method)) = (&key.path, &key.method) { + // help Grafana because you can only repeat a single variable, not a cross-product of the two + attributes.insert( + "method_path", + format!("{} {}", method, truncate_path(path, self.max_path_segments)), + ); + } let metrics = - Metrics::new(self.metric_registry.as_ref(), attributes, self.is_grpc); - (path, metrics) + Metrics::new(self.metric_registry.as_ref(), attributes, self.family); + + (key, metrics) }); request_metrics }) } } +fn truncate_path(path: &str, segments: Option) -> String { + let search = || { + let s = segments?; + let mut indices = path.match_indices('/'); + for _ in 0..s { + indices.next(); + } + let end = indices.next()?.0; + if end + 1 == path.len() { + return None; + } + Some(format!("{}/*", &path[..end])) + }; + search().unwrap_or_else(|| path.to_string()) +} + /// The request metrics for a specific set of attributes (e.g. 
path) #[derive(Debug)] struct Metrics { @@ -69,10 +137,16 @@ struct Metrics { } impl Metrics { - fn new(registry: &metric::Registry, attributes: impl Into, is_grpc: bool) -> Self { - let (counter, duration) = match is_grpc { - true => ("grpc_requests", "grpc_request_duration"), - false => ("http_requests", "http_request_duration"), + fn new( + registry: &metric::Registry, + attributes: impl Into, + family: MetricFamily, + ) -> Self { + let (counter, duration) = match family { + MetricFamily::GrpcServer => ("grpc_requests", "grpc_request_duration"), + MetricFamily::HttpServer => ("http_requests", "http_request_duration"), + MetricFamily::GrpcClient => ("grpc_client_requests", "grpc_client_request_duration"), + MetricFamily::HttpClient => ("http_client_requests", "http_client_request_duration"), }; let counter: Metric = @@ -98,20 +172,25 @@ impl Metrics { /// A `MetricsRecorder` is used to record metrics for a given http request #[derive(Debug)] -pub struct MetricsRecorder { - metrics: Arc, +pub(crate) struct MetricsRecorder { + metrics: Arc, start_instant: Instant, path: Option, + method: Option, classification: Option, } impl MetricsRecorder { /// Sets the classification of this request if not already set - pub fn set_classification(&mut self, classification: Classification) { + pub(crate) fn set_classification(&mut self, classification: Classification) { if matches!(classification, Classification::PathNotFound) { // Don't want to pollute metrics with invalid paths self.path = None } + if matches!(classification, Classification::MethodNotAllowed) { + // Don't want to pollute metrics with invalid methods + self.method = None + } self.classification = Some(match self.classification { Some(existing) => existing.max(classification), @@ -122,7 +201,9 @@ impl MetricsRecorder { impl Drop for MetricsRecorder { fn drop(&mut self) { - let metrics = self.metrics.request_metrics(self.path.take()); + let metrics = self + .metrics + .request_metrics(self.path.take(), self.method.take()); let duration = self.start_instant.elapsed(); match self.classification { @@ -130,7 +211,9 @@ impl Drop for MetricsRecorder { metrics.request_count.ok.inc(1); metrics.request_duration.ok.record(duration); } - Some(Classification::ClientErr) | Some(Classification::PathNotFound) => { + Some(Classification::ClientErr) + | Some(Classification::PathNotFound) + | Some(Classification::MethodNotAllowed) => { metrics.request_count.client_error.inc(1); metrics.request_duration.client_error.record(duration); } @@ -138,7 +221,30 @@ impl Drop for MetricsRecorder { metrics.request_count.server_error.inc(1); metrics.request_duration.server_error.record(duration); } + Some(Classification::UnexpectedResponse) => { + metrics.request_count.unexpected_response.inc(1); + metrics + .request_duration + .unexpected_response + .record(duration); + } None => metrics.aborted_count.inc(1), } } } + +#[cfg(test)] +mod tests { + use super::*; + #[test] + fn test_truncate() { + assert_eq!(truncate_path("/health", Some(1)), "/health"); + assert_eq!(truncate_path("/api/v2/write", Some(3)), "/api/v2/write"); + assert_eq!(truncate_path("/api/v2/write/", Some(3)), "/api/v2/write/"); + assert_eq!(truncate_path("/api/v2/write", Some(2)), "/api/v2/*"); + assert_eq!(truncate_path("/v1/p/000000000000053e", Some(2)), "/v1/p/*"); + assert_eq!(truncate_path("/a/b/c/d/e/f", None), "/a/b/c/d/e/f"); + assert_eq!(truncate_path("/a/b/c/d/e/f/", None), "/a/b/c/d/e/f/"); + assert_eq!(truncate_path("/v1/p/", Some(2)), "/v1/p/"); + } +} diff --git a/trace_http/src/tower.rs 
b/trace_http/src/tower.rs index 120f416cc9c..bfba5e1d9cd 100644 --- a/trace_http/src/tower.rs +++ b/trace_http/src/tower.rs @@ -18,6 +18,7 @@ use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::task::{Context, Poll}; +use bytes::Buf; use futures::ready; use http::{HeaderValue, Request, Response}; use http_body::SizeHint; @@ -25,11 +26,12 @@ use pin_project::{pin_project, pinned_drop}; use tower::{Layer, Service}; use observability_deps::tracing::{error, warn}; +use trace::span::{SpanEvent, SpanStatus}; use trace::{span::SpanRecorder, TraceCollector}; use crate::classify::{classify_headers, classify_response, Classification}; use crate::ctx::{RequestLogContext, RequestLogContextExt, TraceHeaderParser}; -use crate::metrics::{MetricsCollection, MetricsRecorder}; +use crate::metrics::{MetricsRecorder, RequestMetrics}; /// `TraceLayer` implements `tower::Layer` and can be used to decorate a /// `tower::Service` to collect information about requests flowing through it @@ -43,7 +45,7 @@ use crate::metrics::{MetricsCollection, MetricsRecorder}; #[derive(Debug, Clone)] pub struct TraceLayer { trace_header_parser: TraceHeaderParser, - metrics: Arc, + metrics: Arc, collector: Option>, name: Arc, } @@ -52,14 +54,13 @@ impl TraceLayer { /// Create a new tower [`Layer`] for tracing pub fn new( trace_header_parser: TraceHeaderParser, - metric_registry: Arc, + metrics: Arc, collector: Option>, - is_grpc: bool, name: &str, ) -> Self { Self { trace_header_parser, - metrics: Arc::new(MetricsCollection::new(metric_registry, is_grpc)), + metrics, collector, name: name.into(), } @@ -74,7 +75,7 @@ impl Layer for TraceLayer { service, collector: self.collector.clone(), metrics: Arc::clone(&self.metrics), - trace_header_parser: self.trace_header_parser.clone(), + trace_header_parser: Some(self.trace_header_parser.clone()), name: Arc::clone(&self.name), } } @@ -84,12 +85,30 @@ impl Layer for TraceLayer { #[derive(Debug, Clone)] pub struct TraceService { service: S, - trace_header_parser: TraceHeaderParser, + trace_header_parser: Option, collector: Option>, - metrics: Arc, + metrics: Arc, name: Arc, } +impl TraceService { + /// Create a new [`TraceService`] for instrumenting a client + pub fn new_client( + service: S, + metrics: Arc, + collector: Option>, + name: &str, + ) -> Self { + Self { + service, + trace_header_parser: None, + metrics, + collector, + name: name.into(), + } + } +} + impl Service> for TraceService where S: Service, Response = Response>, @@ -106,23 +125,22 @@ where fn call(&mut self, mut request: Request) -> Self::Future { let metrics_recorder = Some(self.metrics.recorder(&request)); - let request_ctx = match self - .trace_header_parser - .parse(self.collector.as_ref(), request.headers()) - { - Ok(Some(ctx)) => { - let ctx = RequestLogContext::new(ctx); + let request_ctx = self.trace_header_parser.as_ref().and_then(|parser| { + match parser.parse(self.collector.as_ref(), request.headers()) { + Ok(Some(ctx)) => { + let ctx = RequestLogContext::new(ctx); - request.extensions_mut().insert(ctx.clone()); + request.extensions_mut().insert(ctx.clone()); - Some(ctx) - } - Ok(None) => None, - Err(e) => { - error!(%e, "error extracting trace context from request"); - None + Some(ctx) + } + Ok(None) => None, + Err(e) => { + error!(%e, "error extracting trace context from request"); + None + } } - }; + }); let span = request_ctx.as_ref().and_then(|ctx| { let ctx = ctx.ctx(); @@ -196,7 +214,7 @@ where metrics_recorder.set_classification(Classification::Ok); span_recorder.ok("request 
processed with empty response") } - false => span_recorder.event("request processed"), + false => span_recorder.event(SpanEvent::new("request processed")), }, (error, c) => { metrics_recorder.set_classification(c); @@ -292,16 +310,29 @@ impl http_body::Body for TracedBody { let projected = self.as_mut().project(); let span_recorder = projected.span_recorder; let metrics_recorder = projected.metrics_recorder; + match &result { - Ok(_) => match projected.inner.is_end_stream() { - true => { - metrics_recorder.set_classification(Classification::Ok); - span_recorder.ok("returned body data and no trailers"); - projected.was_done_data.store(true, Ordering::SeqCst); - projected.was_ready_trailers.store(true, Ordering::SeqCst); + Ok(body) => { + let size = body.remaining() as i64; + match projected.inner.is_end_stream() { + true => { + metrics_recorder.set_classification(Classification::Ok); + + let mut evt = SpanEvent::new("returned body data and no trailers"); + evt.set_metadata("size", size); + span_recorder.event(evt); + span_recorder.status(SpanStatus::Ok); + + projected.was_done_data.store(true, Ordering::SeqCst); + projected.was_ready_trailers.store(true, Ordering::SeqCst); + } + false => { + let mut evt = SpanEvent::new("returned body data"); + evt.set_metadata("size", size); + span_recorder.event(evt); + } } - false => span_recorder.event("returned body data"), - }, + } Err(_) => { metrics_recorder.set_classification(Classification::ServerErr); span_recorder.error("error getting body"); @@ -309,6 +340,7 @@ impl http_body::Body for TracedBody { projected.was_ready_trailers.store(true, Ordering::SeqCst); } } + Poll::Ready(Some(result)) } diff --git a/tracker/Cargo.toml b/tracker/Cargo.toml index 3143c12e35d..d058226cbaa 100644 --- a/tracker/Cargo.toml +++ b/tracker/Cargo.toml @@ -6,23 +6,26 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] futures = "0.3" hashbrown = { workspace = true } -lock_api = "0.4.10" +lock_api = "0.4.11" metric = { path = "../metric" } observability_deps = { path = "../observability_deps" } parking_lot = "0.12" pin-project = "1.1" iox_time = { path = "../iox_time" } -tokio = { version = "1.32", features = ["macros", "parking_lot", "sync", "time"] } -tokio-util = { version = "0.7.9" } +tokio = { version = "1.35", features = ["macros", "parking_lot", "sync", "time"] } +tokio-util = { version = "0.7.10" } trace = { path = "../trace"} workspace-hack = { version = "0.1", path = "../workspace-hack" } -sysinfo = "0.29.10" +sysinfo = "0.30.5" [dev-dependencies] -tempfile = "3.8.0" +tempfile = "3.9.0" # Need the multi-threaded executor for testing -tokio = { version = "1.32", features = ["macros", "parking_lot", "rt-multi-thread", "time"] } +tokio = { version = "1.35", features = ["macros", "parking_lot", "rt-multi-thread", "time"] } test_helpers = { path = "../test_helpers", features = ["future_timeout"] } diff --git a/tracker/src/async_semaphore.rs b/tracker/src/async_semaphore.rs index 6377c95bf73..3b8ce7b37a7 100644 --- a/tracker/src/async_semaphore.rs +++ b/tracker/src/async_semaphore.rs @@ -1,5 +1,12 @@ //! Tooling to track/instrument [`tokio::sync::Semaphore`]s. 
-use std::{future::Future, marker::PhantomData, sync::Arc, task::Poll, time::Instant}; +use std::{ + future::Future, + marker::PhantomData, + ops::Deref, + sync::Arc, + task::Poll, + time::{Duration, Instant}, +}; use futures::{future::BoxFuture, FutureExt}; use metric::{Attributes, DurationHistogram, MakeMetricObserver, U64Counter, U64Gauge}; @@ -284,8 +291,8 @@ impl<'a> Future for InstrumentedAsyncSemaphoreAcquire<'a> { this.metrics.permits_acquired.inc(*this.n as u64); this.metrics.holders_acquired.inc(1); - let elapsed = this.t_start.elapsed(); - this.metrics.acquire_duration.record(elapsed); + let acquire_duration = this.t_start.elapsed(); + this.metrics.acquire_duration.record(acquire_duration); // reset "pending" metrics if we've reported any if *this.reported_pending { @@ -308,6 +315,7 @@ impl<'a> Future for InstrumentedAsyncSemaphoreAcquire<'a> { inner: permit, n: *this.n, metrics: Arc::clone(this.metrics), + acquire_duration, span_recorder, })) } @@ -380,6 +388,9 @@ pub struct InstrumentedAsyncOwnedSemaphorePermit { /// Metrics. metrics: Arc, + /// The time it took to acquire this permit. + acquire_duration: Duration, + /// Span recorder for the entire semaphore interaction. /// /// No direct interaction, will be exported during drop (aka the end of the span will be set). @@ -387,6 +398,13 @@ pub struct InstrumentedAsyncOwnedSemaphorePermit { span_recorder: SpanRecorder, } +impl InstrumentedAsyncOwnedSemaphorePermit { + /// The time it took to acquire this permit. + pub fn acquire_duration(&self) -> Duration { + self.acquire_duration + } +} + impl Drop for InstrumentedAsyncOwnedSemaphorePermit { fn drop(&mut self) { self.metrics.holders_acquired.dec(1); @@ -406,6 +424,14 @@ pub struct InstrumentedAsyncSemaphorePermit<'a> { phantom: PhantomData<&'a ()>, } +impl<'a> Deref for InstrumentedAsyncSemaphorePermit<'a> { + type Target = InstrumentedAsyncOwnedSemaphorePermit; + + fn deref(&self) -> &Self::Target { + &self.owned_permit + } +} + #[cfg(test)] mod tests { use std::time::Duration; @@ -611,6 +637,7 @@ mod tests { ); let p1 = semaphore.acquire_many(5, None).await.unwrap(); + let p1_duration = p1.acquire_duration(); let fut = semaphore.acquire_many(6, None); pin!(fut); @@ -619,9 +646,12 @@ mod tests { tokio::time::sleep(Duration::from_millis(10)).await; drop(p1); - fut.await.unwrap(); + let p2 = fut.await.unwrap(); + let acquire_duration_method = p1_duration + p2.acquire_duration(); + let acquire_duration_metric = metrics.acquire_duration.fetch().total; - assert!(metrics.acquire_duration.fetch().total >= Duration::from_millis(10)); + assert_eq!(acquire_duration_method, acquire_duration_metric); + assert!(acquire_duration_method >= Duration::from_millis(10)); } #[tokio::test] diff --git a/tracker/src/disk_metric.rs b/tracker/src/disk_metric.rs index 4267a169dae..261b4d664b3 100644 --- a/tracker/src/disk_metric.rs +++ b/tracker/src/disk_metric.rs @@ -3,7 +3,7 @@ use std::path::PathBuf; use std::time::Duration; use metric::{Attributes, U64Gauge}; -use sysinfo::{DiskExt, RefreshKind, System, SystemExt}; +use sysinfo::Disks; use tokio::sync::watch; /// The interval at which disk metrics are updated. @@ -53,10 +53,10 @@ pub struct DiskSpaceMetrics { available_disk_space: U64Gauge, total_disk_space: U64Gauge, - /// The [`System`] containing the disk list at construction time. - system: System, + /// The [`Disks`] containing the disk list at construction time. 
+ disks: Disks, - /// The index into [`System::disks()`] for the disk containing the observed + /// The index into [`Disks::list()`] for the disk containing the observed /// directory. disk_idx: usize, @@ -92,13 +92,14 @@ impl DiskSpaceMetrics { .recorder(attributes); // Load the disk stats once, and refresh them later. - let system = System::new_with_specifics(RefreshKind::new().with_disks_list()); + let mut disks = Disks::new(); + disks.refresh_list(); // Resolve the mount point once. // The directory path may be `/path/to/dir` and the mount point is `/`. let (disk_idx, initial_disk) = loop { - if let Some((idx, disk)) = system - .disks() + if let Some((idx, disk)) = disks + .list() .iter() .enumerate() .find(|(_idx, disk)| disk.mount_point() == directory) @@ -120,7 +121,7 @@ impl DiskSpaceMetrics { Self { available_disk_space, total_disk_space, - system, + disks, disk_idx, snapshot_tx, }, @@ -135,8 +136,8 @@ impl DiskSpaceMetrics { interval.tick().await; let disk = self - .system - .disks_mut() + .disks + .list_mut() .get_mut(self.disk_idx) .expect("disk list never refreshed so should not change"); diff --git a/tracker/src/lock.rs b/tracker/src/lock.rs index 0f067dfdc22..98b37619d07 100644 --- a/tracker/src/lock.rs +++ b/tracker/src/lock.rs @@ -2,7 +2,8 @@ use std::sync::Arc; use metric::{Attributes, DurationCounter, Metric, U64Counter}; -type RawRwLock = InstrumentedRawRwLock; +type RawRwLock = InstrumentedRawLock; +type RawMutex = InstrumentedRawLock; /// An instrumented Read-Write Lock pub type RwLock = lock_api::RwLock; @@ -12,6 +13,11 @@ pub type MappedRwLockReadGuard<'a, T> = lock_api::MappedRwLockReadGuard<'a, RawR pub type MappedRwLockWriteGuard<'a, T> = lock_api::MappedRwLockWriteGuard<'a, RawRwLock, T>; pub type RwLockUpgradableReadGuard<'a, T> = lock_api::RwLockUpgradableReadGuard<'a, RawRwLock, T>; +/// An instrumented mutex +pub type Mutex = lock_api::Mutex; +pub type MutexGuard<'a, T> = lock_api::MutexGuard<'a, RawMutex, T>; +pub type MappedMutexGuard<'a, T> = lock_api::MappedMutexGuard<'a, RawMutex, T>; + #[derive(Debug)] pub struct LockMetrics { exclusive_count: U64Counter, @@ -86,9 +92,26 @@ impl LockMetrics { pub fn new_lock_raw( self: &Arc, t: T, - ) -> lock_api::RwLock, T> { + ) -> lock_api::RwLock, T> { lock_api::RwLock::const_new( - InstrumentedRawRwLock { + InstrumentedRawLock { + inner: R::INIT, + metrics: Some(Arc::clone(self)), + }, + t, + ) + } + + pub fn new_mutex(self: &Arc, t: T) -> Mutex { + self.new_mutex_raw(t) + } + + pub fn new_mutex_raw( + self: &Arc, + t: T, + ) -> lock_api::Mutex, T> { + lock_api::Mutex::const_new( + InstrumentedRawLock { inner: R::INIT, metrics: Some(Arc::clone(self)), }, @@ -102,7 +125,7 @@ impl LockMetrics { /// /// This is a raw lock implementation that wraps another and instruments it #[derive(Debug)] -pub struct InstrumentedRawRwLock { +pub struct InstrumentedRawLock { inner: R, /// Stores the tracking data if any @@ -126,7 +149,7 @@ pub struct InstrumentedRawRwLock { /// exists. 
/// /// This is done by delegating to the wrapped RawRwLock implementation -unsafe impl lock_api::RawRwLock for InstrumentedRawRwLock { +unsafe impl lock_api::RawRwLock for InstrumentedRawLock { const INIT: Self = Self { inner: R::INIT, metrics: None, @@ -229,7 +252,7 @@ unsafe impl lock_api::RawRwLock for Instrumented /// /// This is done by delegating to the wrapped RawRwLock implementation unsafe impl lock_api::RawRwLockUpgrade - for InstrumentedRawRwLock + for InstrumentedRawLock { fn lock_upgradable(&self) { match &self.metrics { @@ -292,6 +315,54 @@ unsafe impl lock_api::RawRwLockUpgrade } } +/// # Safety +/// +/// Implementations of this trait must ensure that the `Mutex` is actually +/// exclusive: an exclusive lock can't be acquired while another exclusive +/// lock exists. +/// +/// This is done by delegating to the wrapped RawMutex implementation +unsafe impl lock_api::RawMutex for InstrumentedRawLock { + const INIT: Self = Self { + inner: R::INIT, + metrics: None, + }; + + type GuardMarker = R::GuardMarker; + + fn lock(&self) { + match &self.metrics { + Some(shared) => { + // Early return if possible - Instant::now is not necessarily cheap + if self.try_lock() { + return; + } + + let now = std::time::Instant::now(); + self.inner.lock(); + let elapsed = now.elapsed(); + shared.exclusive_count.inc(1); + shared.exclusive_wait.inc(elapsed); + } + None => self.inner.lock(), + } + } + + fn try_lock(&self) -> bool { + let ret = self.inner.try_lock(); + if let Some(shared) = &self.metrics { + if ret { + shared.exclusive_count.inc(1); + } + } + ret + } + + unsafe fn unlock(&self) { + self.inner.unlock() + } +} + #[cfg(test)] mod tests { // Clippy isn't recognizing the explicit drops; none of these locks are actually being held @@ -302,7 +373,7 @@ mod tests { use std::time::Duration; #[test] - fn test_counts() { + fn test_rwlock_counts() { let metrics = Arc::new(LockMetrics::new_unregistered()); let lock = metrics.new_lock(32); @@ -319,6 +390,21 @@ mod tests { assert_eq!(metrics.shared_count.fetch(), 2); } + #[test] + fn test_mutex_counts() { + let metrics = Arc::new(LockMetrics::new_unregistered()); + let mutex = metrics.new_mutex(32); + + let g = mutex.lock(); + drop(g); + + let g = mutex.lock(); + drop(g); + + assert_eq!(metrics.exclusive_count.fetch(), 2); + assert_eq!(metrics.shared_count.fetch(), 0); + } + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test_shared_wait_time() { let metrics = Arc::new(LockMetrics::new_unregistered()); @@ -366,6 +452,29 @@ mod tests { assert!(metrics.exclusive_wait.fetch() < Duration::from_millis(200)); } + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_mutex_wait_time() { + let metrics = Arc::new(LockMetrics::new_unregistered()); + let l1 = Arc::new(metrics.new_mutex(32)); + let l2 = Arc::clone(&l1); + + let g = l1.lock(); + let join = tokio::spawn(async move { + let _g = l2.lock(); + }); + + std::thread::sleep(Duration::from_millis(100)); + std::mem::drop(g); + + join.await.unwrap(); + + assert_eq!(metrics.exclusive_count.fetch(), 2); + assert_eq!(metrics.shared_count.fetch(), 0); + assert_eq!(metrics.shared_wait.fetch(), Duration::ZERO); + assert!(metrics.exclusive_wait.fetch() > Duration::from_millis(80)); + assert!(metrics.exclusive_wait.fetch() < Duration::from_millis(200)); + } + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn test_multiple() { let metrics = Arc::new(LockMetrics::new_unregistered()); diff --git a/tracker/src/task.rs b/tracker/src/task.rs index 
9fbc0df9b37..1631f936abd 100644 --- a/tracker/src/task.rs +++ b/tracker/src/task.rs @@ -204,7 +204,7 @@ impl TaskResult { } /// The status of the tracked task -#[derive(Debug, Clone, Eq, PartialEq)] +#[derive(Debug, Clone, Eq, PartialEq, Copy)] pub enum TaskStatus { /// More futures can be registered Creating, diff --git a/tracker/src/task/history.rs b/tracker/src/task/history.rs index 42db815f854..3ef8e2e9d91 100644 --- a/tracker/src/task/history.rs +++ b/tracker/src/task/history.rs @@ -86,7 +86,7 @@ struct SizeLimitedHashMap { } impl SizeLimitedHashMap { - pub fn new(capacity: usize) -> Self { + pub(crate) fn new(capacity: usize) -> Self { Self { values: HashMap::with_capacity(capacity), ring: Vec::with_capacity(capacity), @@ -96,14 +96,14 @@ impl SizeLimitedHashMap { } /// Get the value associated with a specific key - pub fn get(&self, key: &K) -> Option<&V> { + pub(crate) fn get(&self, key: &K) -> Option<&V> { self.values.get(key) } /// Returns an iterator to all values stored within the ring buffer /// /// Note: the order is not guaranteed - pub fn values(&self) -> impl Iterator + '_ { + pub(crate) fn values(&self) -> impl Iterator + '_ { self.values.values() } @@ -116,7 +116,7 @@ impl SizeLimitedHashMap { /// from the buffer. /// /// This returns the replaced value (if any). - pub fn push(&mut self, key: K, value: V) -> Option<(K, V)> { + pub(crate) fn push(&mut self, key: K, value: V) -> Option<(K, V)> { if let Entry::Occupied(occupied) = self.values.entry(key) { // If already exists - replace existing value occupied.replace_entry(value); diff --git a/trogging/Cargo.toml b/trogging/Cargo.toml index 3c7c6960815..b4f547c785f 100644 --- a/trogging/Cargo.toml +++ b/trogging/Cargo.toml @@ -6,12 +6,15 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] clap = { version = "4", features = ["derive", "env"], optional = true } logfmt = { path = "../logfmt" } observability_deps = { path = "../observability_deps" } -thiserror = "1.0.48" -tracing-log = "0.1" +thiserror = "1.0.56" +tracing-log = "0.2" tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } [dev-dependencies] diff --git a/wal/Cargo.toml b/wal/Cargo.toml index 789080c5bcf..1a9fc0a0dd7 100644 --- a/wal/Cargo.toml +++ b/wal/Cargo.toml @@ -5,8 +5,11 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # In alphabetical order -byteorder = "1.3.4" +byteorder = "1.5.0" crc32fast = "1.2.0" data_types = { path = "../data_types" } generated_types = { path = "../generated_types" } @@ -15,10 +18,10 @@ mutable_batch = { version = "0.1.0", path = "../mutable_batch" } mutable_batch_pb = { version = "0.1.0", path = "../mutable_batch_pb" } observability_deps = { path = "../observability_deps" } parking_lot = "0.12" -prost = "0.11" -snafu = "0.7" -snap = "1.0.0" -tokio = { version = "1.32", features = ["macros", "fs", "io-util", "parking_lot", "rt-multi-thread", "sync", "time"] } +prost = { workspace = true } +snafu = "0.8" +snap = "1.1.1" +tokio = { version = "1.35", features = ["macros", "fs", "io-util", "parking_lot", "rt-multi-thread", "sync", "time"] } workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] # In alphabetical order diff --git a/wal/src/blocking/reader.rs b/wal/src/blocking/reader.rs index 582fcda42b8..c0e9dcac174 100644 --- a/wal/src/blocking/reader.rs +++ b/wal/src/blocking/reader.rs @@ -11,8 +11,10 @@ use std::{ path::{Path, PathBuf}, }; 
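// Editorial sketch (not part of the upstream patch): the tracker lock.rs hunks
// above add an instrumented `Mutex` alongside the existing instrumented
// `RwLock`, sharing the same `LockMetrics` (exclusive count / wait time). A
// minimal usage example in the style of the new `test_mutex_counts` test; the
// import path and the surrounding function are assumptions for illustration,
// while `LockMetrics::new_unregistered` and `new_mutex` come from the diff.
use std::sync::Arc;
use tracker::lock::LockMetrics;

fn mutex_example() {
    let metrics = Arc::new(LockMetrics::new_unregistered());
    let counter = metrics.new_mutex(0_u64);

    // Each lock() records an exclusive acquisition; wait time is only
    // measured when the initial try_lock fails (i.e. under contention).
    *counter.lock() += 1;
}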
+/// A closed segment file reader over an `R`, tracking the number of compressed +/// bytes read. #[derive(Debug)] -pub struct ClosedSegmentFileReader(R); +pub struct ClosedSegmentFileReader(R, u64); impl ClosedSegmentFileReader> { pub fn from_path(path: impl AsRef) -> Result { @@ -28,7 +30,7 @@ where R: Read, { pub fn new(f: R) -> Self { - Self(f) + Self(f, 0) } fn read_array(&mut self) -> Result<[u8; N]> { @@ -36,6 +38,7 @@ where self.0 .read_exact(&mut data) .context(UnableToReadArraySnafu { length: N })?; + self.1 += N as u64; Ok(data) } @@ -66,6 +69,16 @@ where let (actual_compressed_len, actual_checksum) = decompressing_read.into_inner().checksum(); + // Track the size of the entry header and total amount of compressed + // data successfully read so far by the reader. The header values are + // tracked here to avoid continuously counting bytes read from a + // corrupted segment where no further entries can be read. + // + // This accounting is done before checksum/length mismatch, if the data has still + // been read in successfully. + self.1 += 2 * std::mem::size_of::() as u64; + self.1 += actual_compressed_len; + ensure!( expected_len == actual_compressed_len, LengthMismatchSnafu { @@ -100,6 +113,12 @@ where Ok(None) } + + /// Returns the total amount of bytes successfully read from this reader's + /// underlying file, in bytes. + pub fn bytes_read(&self) -> u64 { + self.1 + } } struct CrcReader { @@ -208,6 +227,7 @@ mod tests { let entry = reader.one_entry().unwrap(); assert!(entry.is_none()); + assert_eq!(reader.bytes_read(), segment_file.size_bytes()); } #[test] @@ -236,11 +256,17 @@ mod tests { let entry = reader.one_entry().unwrap(); assert!(entry.is_none()); + assert_eq!(reader.bytes_read(), segment_file.size_bytes()); } #[test] fn unsuccessful_read_too_short_len() { let mut segment_file = FakeSegmentFile::new(); + + // The bad entry will prevent any entries being read, thus the + // no bytes can be reported as successfully read. + let want_bytes_read = segment_file.size_bytes(); + let bad_entry_input = FakeSegmentEntry::new(b"hello"); let good_length = bad_entry_input.compressed_len(); let bad_entry_input = bad_entry_input.with_compressed_len(good_length - 1); @@ -260,14 +286,22 @@ mod tests { assert_matches!(read_fail, Err(Error::UnableToReadData { source: e }) => { assert_matches!(e.kind(), std::io::ErrorKind::UnexpectedEof); }); + assert_eq!(reader.bytes_read(), want_bytes_read); // Trying to continue reading will fail as well, see: // assert_error!(reader.one_entry(), Error::UnableToReadData { .. }); + // Ensure no magical bean counting occurs when stuck unable to read data. + assert_eq!(reader.bytes_read(), want_bytes_read); } #[test] fn unsuccessful_read_too_long_len() { let mut segment_file = FakeSegmentFile::new(); + + // The bad entry will prevent any entries being read, thus the + // no bytes can be reported as successfully read. + let want_bytes_read = segment_file.size_bytes(); + let bad_entry_input = FakeSegmentEntry::new(b"hello"); let good_length = bad_entry_input.compressed_len(); let bad_entry_input = bad_entry_input.with_compressed_len(good_length + 1); @@ -287,14 +321,18 @@ mod tests { assert_matches!(read_fail, Err(Error::UnableToReadData { source: e }) => { assert_matches!(e.kind(), std::io::ErrorKind::UnexpectedEof); }); + assert_eq!(reader.bytes_read(), want_bytes_read); // Trying to continue reading will fail as well, see: // assert_error!(reader.one_entry(), Error::UnableToReadData { .. }); + // Also no magical bean counting when cannot read more. 
+ assert_eq!(reader.bytes_read(), want_bytes_read); } #[test] fn unsuccessful_read_checksum_mismatch() { let mut segment_file = FakeSegmentFile::new(); + let bad_entry_input = FakeSegmentEntry::new(b"hello"); let good_checksum = bad_entry_input.checksum(); let bad_entry_input = bad_entry_input.with_checksum(good_checksum + 1); @@ -320,6 +358,7 @@ mod tests { let entry = reader.one_entry().unwrap(); assert!(entry.is_none()); + assert_eq!(reader.bytes_read(), segment_file.size_bytes()); } #[derive(Debug)] @@ -356,6 +395,23 @@ mod tests { f } + + fn size_bytes(&self) -> u64 { + std::mem::size_of::() as u64 + + std::mem::size_of::() as u64 + + self + .entries + .iter() + .map(|e| { + // Each entry is sized by the two 4 byte + // header values (checksum and compressed_len) + // as well as the length of the compressed data. + (std::mem::size_of::() + + std::mem::size_of::() + + e.compressed_data().len()) as u64 + }) + .sum::() + } } #[derive(Debug, Clone, PartialEq)] diff --git a/wal/src/lib.rs b/wal/src/lib.rs index 145d1cf0f5f..0801021e16b 100644 --- a/wal/src/lib.rs +++ b/wal/src/lib.rs @@ -24,6 +24,11 @@ use std::{ time::Duration, }; +use hashbrown::HashMap; +use parking_lot::Mutex; +use snafu::prelude::*; +use tokio::{sync::watch, task::JoinHandle}; + use data_types::{sequence_number_set::SequenceNumberSet, NamespaceId, TableId}; use generated_types::{ google::{FieldViolation, OptionalField}, @@ -31,13 +36,9 @@ use generated_types::{ sequenced_wal_op::Op as WalOp, SequencedWalOp as ProtoSequencedWalOp, }, }; -use hashbrown::HashMap; use mutable_batch::MutableBatch; use mutable_batch_pb::decode::decode_database_batch; use observability_deps::tracing::info; -use parking_lot::Mutex; -use snafu::prelude::*; -use tokio::{sync::watch, task::JoinHandle}; use writer_thread::WriterIoThreadHandle; use crate::blocking::{ @@ -235,7 +236,7 @@ impl Wal { /// /// Similarly, editing or deleting files within a `Wal`'s root directory via some other /// mechanism is not supported. - pub async fn new(root: impl Into) -> Result> { + pub async fn new(root: impl Into + Send) -> Result> { let root = root.into(); info!(wal_dir=?root, "Initalizing Write Ahead Log (WAL)"); tokio::fs::create_dir_all(&root) @@ -550,7 +551,7 @@ pub struct ClosedSegmentFileReader { } impl Iterator for ClosedSegmentFileReader { - type Item = Result>; + type Item = Result<(Vec, u64)>; /// Read the next batch of sequenced WAL operations from the file fn next(&mut self) -> Option { @@ -558,6 +559,7 @@ impl Iterator for ClosedSegmentFileReader { .next_batch() .context(UnableToReadNextOpsSnafu) .transpose() + .map(|result| result.map(|batch| (batch, self.bytes_read()))) } } @@ -567,6 +569,12 @@ impl ClosedSegmentFileReader { self.id } + /// Returns the total number of bytes successfully read by the underlying file reader + /// from disk. + pub fn bytes_read(&self) -> u64 { + self.file.bytes_read() + } + /// Open the segment file and read its header, ensuring it is a segment file and reading its id. pub fn from_path(path: impl AsRef) -> Result { let path = path.as_ref(); @@ -629,7 +637,7 @@ impl Iterator for WriteOpEntryDecoder { self.reader .next()? 
.context(FailedToReadWalSnafu) - .map(|batch| { + .map(|(batch, _)| { batch .into_iter() .filter_map(|sequenced_op| match sequenced_op.op { @@ -680,6 +688,7 @@ mod tests { use std::io::Write; use assert_matches::assert_matches; + use data_types::{NamespaceId, SequenceNumber, TableId}; use dml::DmlWrite; use generated_types::influxdata::{ @@ -730,7 +739,7 @@ mod tests { let ops: Vec = wal .reader_for_segment(closed.id) .expect("should be able to open reader for closed WAL segment") - .flat_map(|batch| batch.expect("failed to read WAL op batch")) + .flat_map(|batch| batch.expect("failed to read WAL op batch").0) .collect(); assert_eq!(vec![op1, op2, op3, op4], ops); @@ -863,15 +872,9 @@ mod tests { assert_eq!(wal_entries.len(), 2); let write_op_entries = wal_entries.into_iter().flatten().collect::>(); assert_eq!(write_op_entries.len(), 3); - assert_matches!(write_op_entries.first(), Some(got_op1) => { - assert_op_shape(got_op1, &w1); - }); - assert_matches!(write_op_entries.get(1), Some(got_op2) => { - assert_op_shape(got_op2, &w2); - }); - assert_matches!(write_op_entries.get(2), Some(got_op3) => { - assert_op_shape(got_op3, &w3); - }); + assert_op_shape(&write_op_entries[0], &w1); + assert_op_shape(&write_op_entries[1], &w2); + assert_op_shape(&write_op_entries[2], &w3); } #[tokio::test] @@ -916,7 +919,7 @@ mod tests { // error is thrown assert_matches!(decoder.next(), Some(Ok(batch)) => { assert_eq!(batch.len(), 1); - assert_op_shape(batch.first().unwrap(), &good_write); + assert_op_shape(&batch[0], &good_write); }); assert_matches!( decoder.next(), diff --git a/wal/tests/end_to_end.rs b/wal/tests/end_to_end.rs index 331c9b49bfd..aa53c8c85fa 100644 --- a/wal/tests/end_to_end.rs +++ b/wal/tests/end_to_end.rs @@ -1,3 +1,4 @@ +use assert_matches::assert_matches; use data_types::{NamespaceId, SequenceNumber, TableId}; use dml::DmlWrite; use generated_types::influxdata::{ @@ -61,7 +62,7 @@ async fn crud() { // Can read the written entries from the closed segment, ensuring that the // per-partition sequence numbers are preserved. let mut reader = wal.reader_for_segment(closed_segment_details.id()).unwrap(); - let mut op = reader.next().unwrap().unwrap(); + let (mut op, _) = reader.next().unwrap().unwrap(); let mut got_sequence_numbers = op .remove(0) .table_write_sequence_numbers @@ -69,7 +70,7 @@ async fn crud() { .collect::>(); got_sequence_numbers.sort(); assert_eq!(got_sequence_numbers, Vec::::from([42, 43]),); - let mut op = reader.next().unwrap().unwrap(); + let (mut op, bytes_read) = reader.next().unwrap().unwrap(); let mut got_sequence_numbers = op .remove(0) .table_write_sequence_numbers @@ -78,6 +79,11 @@ async fn crud() { got_sequence_numbers.sort(); assert_eq!(got_sequence_numbers, Vec::::from([44, 45]),); + // Ensure that all entries have been read and the total bytes read reflect + // the segment size. + assert_matches!(reader.next(), None); + assert_eq!(bytes_read, closed_segment_details.size()); + // Can delete a segment, leaving no closed segments again wal.delete(closed_segment_details.id()).await.unwrap(); let closed = wal.closed_segments(); @@ -114,7 +120,7 @@ async fn replay() { // Can read the written entries from the previously closed segment // ensuring the per-partition sequence numbers are preserved. 
let mut reader = wal.reader_for_segment(closed_segment_ids[0]).unwrap(); - let mut op = reader.next().unwrap().unwrap(); + let (mut op, _) = reader.next().unwrap().unwrap(); let mut got_sequence_numbers = op .remove(0) .table_write_sequence_numbers @@ -125,7 +131,7 @@ async fn replay() { // Can read the written entries from the previously open segment let mut reader = wal.reader_for_segment(closed_segment_ids[1]).unwrap(); - let mut op = reader.next().unwrap().unwrap(); + let (mut op, _) = reader.next().unwrap().unwrap(); let mut got_sequence_numbers = op .remove(0) .table_write_sequence_numbers diff --git a/wal_inspect/Cargo.toml b/wal_inspect/Cargo.toml index 26bb9cbe754..4224b2761ba 100644 --- a/wal_inspect/Cargo.toml +++ b/wal_inspect/Cargo.toml @@ -5,13 +5,16 @@ authors.workspace = true edition.workspace = true license.workspace = true +[lints] +workspace = true + [dependencies] # In alphabetical order data_types = { version = "0.1.0", path = "../data_types" } hashbrown.workspace = true mutable_batch = { version = "0.1.0", path = "../mutable_batch" } parquet_to_line_protocol = { version = "0.1.0", path = "../parquet_to_line_protocol" } schema = { version = "0.1.0", path = "../schema" } -thiserror = "1.0.48" +thiserror = "1.0.56" workspace-hack = { version = "0.1", path = "../workspace-hack" } [dev-dependencies] # In alphabetical order @@ -20,5 +23,5 @@ generated_types = { version = "0.1.0", path = "../generated_types" } mutable_batch_lp = { path = "../mutable_batch_lp" } mutable_batch_pb = { version = "0.1.0", path = "../mutable_batch_pb" } test_helpers = { path = "../test_helpers" } -tokio = { version = "1.32", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } +tokio = { version = "1.35", features = ["macros", "parking_lot", "rt-multi-thread", "sync", "time"] } wal = { version = "0.1.0", path = "../wal" } diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 29c9fdd5194..ce4996cb7a8 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -17,10 +17,9 @@ license.workspace = true ### BEGIN HAKARI SECTION [dependencies] ahash = { version = "0.8" } -arrow = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5", features = ["dyn_cmp_dict", "prettyprint"] } -arrow-array = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5", default-features = false, features = ["chrono-tz"] } -arrow-flight = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5", features = ["flight-sql-experimental"] } -arrow-string = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5", default-features = false, features = ["dyn_cmp_dict"] } +arrow-array = { version = "49", default-features = false, features = ["chrono-tz"] } +arrow-cast = { version = "49", default-features = false, features = ["prettyprint"] } +arrow-ipc = { version = "49", features = ["lz4"] } bitflags = { version = "2", default-features = false, features = ["std"] } byteorder = { version = "1" } bytes = { version = "1" } @@ -29,8 +28,6 @@ clap = { version = "4", features = ["derive", "env", "string"] } clap_builder = { version = "4", default-features = false, features = ["color", "env", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8" } crypto-common = { version = "0.1", default-features = false, features = ["std"] } -datafusion = { git = 
"https://github.com/apache/arrow-datafusion.git", rev = "81f33b0e27f5694348cd953a937203d835b57178" } -datafusion-optimizer = { git = "https://github.com/apache/arrow-datafusion.git", rev = "81f33b0e27f5694348cd953a937203d835b57178", default-features = false, features = ["crypto_expressions", "regex_expressions", "unicode_expressions"] } digest = { version = "0.10", features = ["mac", "std"] } either = { version = "1", features = ["serde"] } fixedbitset = { version = "0.4" } @@ -44,7 +41,9 @@ futures-task = { version = "0.3", default-features = false, features = ["std"] } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } hashbrown = { version = "0.14", features = ["raw"] } +hyper = { version = "0.14", features = ["full"] } indexmap = { version = "2" } +itertools-a6292c17cd707f01 = { package = "itertools", version = "0.11" } libc = { version = "0.2", features = ["extra_traits"] } lock_api = { version = "0.4", features = ["arc_lock"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -52,29 +51,28 @@ md-5 = { version = "0.10" } memchr = { version = "2" } nom = { version = "7" } num-traits = { version = "0.2", features = ["i128", "libm"] } -object_store = { version = "0.7", default-features = false, features = ["aws", "azure", "gcp"] } +object_store = { version = "0.8", default-features = false, features = ["aws", "azure", "gcp"] } once_cell = { version = "1", features = ["parking_lot"] } parking_lot = { version = "0.12", features = ["arc_lock"] } -parquet = { git = "https://github.com/alamb/arrow-rs.git", rev = "7c236c06bfb78c0c877055c1617d9373971511a5", features = ["experimental", "object_store"] } petgraph = { version = "0.6" } phf_shared = { version = "0.11" } proptest = { version = "1", default-features = false, features = ["std"] } -prost = { version = "0.11" } -prost-types = { version = "0.11" } +prost-5ef9efb8ec2df382 = { package = "prost", version = "0.12" } +prost-a6292c17cd707f01 = { package = "prost", version = "0.11" } +prost-types-5ef9efb8ec2df382 = { package = "prost-types", version = "0.12" } +prost-types-a6292c17cd707f01 = { package = "prost-types", version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } rand_core = { version = "0.6", default-features = false, features = ["std"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "dfa-search", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } -regex-syntax-c38e5c1d305a1b54 = { package = "regex-syntax", version = "0.8" } -regex-syntax-ca01ad9e24f5d932 = { package = "regex-syntax", version = "0.7" } +regex-syntax = { version = "0.8" } reqwest = { version = "0.11", default-features = false, features = ["json", "rustls-tls", "stream"] } -ring = { version = "0.16", default-features = false, features = ["std"] } -rustls = { version = "0.21", default-features = false, features = ["dangerous_configuration", "logging", "tls12"] } +ring = { version = "0.17", features = ["std"] } +rustls = { version = "0.21", features = ["dangerous_configuration"] } serde = { version = "1", features = ["derive", "rc"] } serde_json = { version = "1", features = ["raw_value"] } sha2 = { version = "0.10" } similar = { version = "2", features = ["inline"] } -sqlparser = { version = "0.37", features = ["visitor"] } sqlx = { version = "0.7", features = ["postgres", "runtime-tokio-rustls", "sqlite", "uuid"] } sqlx-core = { 
version = "0.7", features = ["_rt-tokio", "_tls-rustls", "any", "json", "migrate", "offline", "uuid"] } sqlx-postgres = { version = "0.7", default-features = false, features = ["any", "json", "migrate", "offline", "uuid"] } @@ -84,7 +82,6 @@ thrift = { version = "0.17" } tokio = { version = "1", features = ["full", "tracing"] } tokio-stream = { version = "0.1", features = ["fs", "net"] } tokio-util = { version = "0.7", features = ["codec", "compat", "io"] } -tonic = { version = "0.9", features = ["tls-webpki-roots"] } tower = { version = "0.4", features = ["balance", "buffer", "limit", "timeout", "util"] } tracing = { version = "0.1", features = ["log", "max_level_trace", "release_max_level_trace"] } tracing-core = { version = "0.1" } @@ -116,7 +113,8 @@ getrandom = { version = "0.2", default-features = false, features = ["std"] } hashbrown = { version = "0.14", features = ["raw"] } heck = { version = "0.4", features = ["unicode"] } indexmap = { version = "2" } -itertools = { version = "0.10" } +itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" } +itertools-a6292c17cd707f01 = { package = "itertools", version = "0.11" } lock_api = { version = "0.4", features = ["arc_lock"] } log = { version = "0.4", default-features = false, features = ["std"] } md-5 = { version = "0.10" } @@ -127,14 +125,17 @@ once_cell = { version = "1", features = ["parking_lot"] } parking_lot = { version = "0.12", features = ["arc_lock"] } petgraph = { version = "0.6" } phf_shared = { version = "0.11" } -prost = { version = "0.11" } -prost-types = { version = "0.11" } +prost-5ef9efb8ec2df382 = { package = "prost", version = "0.12" } +prost-a6292c17cd707f01 = { package = "prost", version = "0.11" } +prost-types-5ef9efb8ec2df382 = { package = "prost-types", version = "0.12" } +prost-types-a6292c17cd707f01 = { package = "prost-types", version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } rand_core = { version = "0.6", default-features = false, features = ["std"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "dfa-search", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } -regex-syntax-c38e5c1d305a1b54 = { package = "regex-syntax", version = "0.8" } -rustls = { version = "0.21", default-features = false, features = ["dangerous_configuration", "logging", "tls12"] } +regex-syntax = { version = "0.8" } +ring = { version = "0.17", features = ["std"] } +rustls = { version = "0.21", features = ["dangerous_configuration"] } serde = { version = "1", features = ["derive", "rc"] } serde_json = { version = "1", features = ["raw_value"] } sha2 = { version = "0.10" } @@ -144,7 +145,7 @@ sqlx-macros-core = { version = "0.7", features = ["_rt-tokio", "_tls-rustls", "j sqlx-postgres = { version = "0.7", default-features = false, features = ["any", "json", "migrate", "offline", "uuid"] } sqlx-sqlite = { version = "0.7", default-features = false, features = ["any", "json", "migrate", "offline", "uuid"] } syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full"] } -syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit", "visit-mut"] } +syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit-mut"] } tokio = { version = "1", features = ["full", "tracing"] } tokio-stream = { version = "0.1", features = ["fs", "net"] } tracing = { version = "0.1", features = ["log", "max_level_trace", 
"release_max_level_trace"] } @@ -156,48 +157,40 @@ uuid = { version = "1", features = ["v4"] } [target.x86_64-unknown-linux-gnu.dependencies] nix = { version = "0.27", features = ["fs", "signal", "user"] } -rustls = { version = "0.21" } spin = { version = "0.9" } [target.x86_64-unknown-linux-gnu.build-dependencies] libc = { version = "0.2", features = ["extra_traits"] } nix = { version = "0.27", features = ["fs", "signal", "user"] } -rustls = { version = "0.21" } spin = { version = "0.9" } [target.x86_64-apple-darwin.dependencies] nix = { version = "0.27", features = ["fs", "signal", "user"] } -rustls = { version = "0.21" } spin = { version = "0.9" } [target.x86_64-apple-darwin.build-dependencies] libc = { version = "0.2", features = ["extra_traits"] } nix = { version = "0.27", features = ["fs", "signal", "user"] } -rustls = { version = "0.21" } spin = { version = "0.9" } [target.aarch64-apple-darwin.dependencies] nix = { version = "0.27", features = ["fs", "signal", "user"] } -rustls = { version = "0.21" } spin = { version = "0.9" } [target.aarch64-apple-darwin.build-dependencies] libc = { version = "0.2", features = ["extra_traits"] } nix = { version = "0.27", features = ["fs", "signal", "user"] } -rustls = { version = "0.21" } spin = { version = "0.9" } [target.x86_64-pc-windows-msvc.dependencies] -rustls = { version = "0.21" } spin = { version = "0.9" } -winapi = { version = "0.3", default-features = false, features = ["cfg", "combaseapi", "consoleapi", "errhandlingapi", "evntrace", "fileapi", "handleapi", "heapapi", "ifdef", "in6addr", "inaddr", "ioapiset", "iphlpapi", "lmaccess", "lmapibuf", "lmcons", "memoryapi", "minwinbase", "minwindef", "netioapi", "ntlsa", "ntsecapi", "ntstatus", "objidl", "oleauto", "pdh", "powerbase", "processenv", "psapi", "rpcdce", "sddl", "securitybaseapi", "shellapi", "std", "synchapi", "sysinfoapi", "wbemcli", "winbase", "wincon", "windef", "winerror", "winioctl", "winnt", "winsock2", "wtypesbase"] } -windows-sys-b21d60becc0929df = { package = "windows-sys", version = "0.52", features = ["Win32_Foundation", "Win32_NetworkManagement_IpHelper", "Win32_Networking_WinSock", "Win32_Storage_FileSystem", "Win32_System_Com", "Win32_System_Console", "Win32_System_Diagnostics_Debug", "Win32_System_Threading", "Win32_UI_Input_KeyboardAndMouse", "Win32_UI_Shell"] } +winapi = { version = "0.3", default-features = false, features = ["cfg", "combaseapi", "consoleapi", "errhandlingapi", "evntrace", "fileapi", "handleapi", "heapapi", "ifdef", "in6addr", "inaddr", "ioapiset", "iphlpapi", "lmaccess", "lmapibuf", "lmcons", "memoryapi", "minwinbase", "minwindef", "netioapi", "ntlsa", "ntsecapi", "ntstatus", "objidl", "oleauto", "pdh", "powerbase", "processenv", "psapi", "rpcdce", "sddl", "securitybaseapi", "shellapi", "std", "synchapi", "sysinfoapi", "wbemcli", "winbase", "wincon", "windef", "winerror", "winioctl", "winnt", "winsock2"] } +windows-sys-b21d60becc0929df = { package = "windows-sys", version = "0.52", features = ["Win32_Foundation", "Win32_NetworkManagement_IpHelper", "Win32_Networking_WinSock", "Win32_Security_Authentication_Identity", "Win32_Security_Credentials", "Win32_Security_Cryptography", "Win32_Storage_FileSystem", "Win32_System_Com", "Win32_System_Console", "Win32_System_Diagnostics_Debug", "Win32_System_Memory", "Win32_System_Threading", "Win32_UI_Input_KeyboardAndMouse", "Win32_UI_Shell"] } windows-sys-c8eced492e86ede7 = { package = "windows-sys", version = "0.48", features = ["Win32_Foundation", "Win32_Networking_WinSock", "Win32_Security", 
"Win32_Storage_FileSystem", "Win32_System_Console", "Win32_System_Diagnostics_Debug", "Win32_System_IO", "Win32_System_Pipes", "Win32_System_Registry", "Win32_System_SystemServices", "Win32_System_Threading", "Win32_System_Time", "Win32_System_WindowsProgramming", "Win32_UI_Shell"] } [target.x86_64-pc-windows-msvc.build-dependencies] -rustls = { version = "0.21" } spin = { version = "0.9" } -windows-sys-b21d60becc0929df = { package = "windows-sys", version = "0.52", features = ["Win32_Foundation", "Win32_NetworkManagement_IpHelper", "Win32_Networking_WinSock", "Win32_Storage_FileSystem", "Win32_System_Com", "Win32_System_Console", "Win32_System_Diagnostics_Debug", "Win32_System_Threading", "Win32_UI_Input_KeyboardAndMouse", "Win32_UI_Shell"] } +windows-sys-b21d60becc0929df = { package = "windows-sys", version = "0.52", features = ["Win32_Foundation", "Win32_NetworkManagement_IpHelper", "Win32_Networking_WinSock", "Win32_Security_Authentication_Identity", "Win32_Security_Credentials", "Win32_Security_Cryptography", "Win32_Storage_FileSystem", "Win32_System_Com", "Win32_System_Console", "Win32_System_Diagnostics_Debug", "Win32_System_Memory", "Win32_System_Threading", "Win32_UI_Input_KeyboardAndMouse", "Win32_UI_Shell"] } windows-sys-c8eced492e86ede7 = { package = "windows-sys", version = "0.48", features = ["Win32_Foundation", "Win32_Networking_WinSock", "Win32_Security", "Win32_Storage_FileSystem", "Win32_System_Console", "Win32_System_Diagnostics_Debug", "Win32_System_IO", "Win32_System_Pipes", "Win32_System_Registry", "Win32_System_SystemServices", "Win32_System_Threading", "Win32_System_Time", "Win32_System_WindowsProgramming", "Win32_UI_Shell"] } ### END HAKARI SECTION